# RDD in PySpark

Resilient Distributed Dataset

In [7]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below, registered under the
# application name 'RDD'.
spark = (
    SparkSession.builder
    .appName('RDD')
    .getOrCreate()
)

In [8]:
# Two small two-element records kept in an ordinary Python list.
data = [(1, 'sai'), (2, 'pawan')]

# Turn the local list into an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

# collect() gathers the RDD's elements back into a Python list so they can
# be printed here.
print(rdd.collect())

[(1, 'sai'), (2, 'pawan')]


In [9]:
# RDD -> DataFrame, shown two ways; both produce the same two-column table.

# Option 1: ask the RDD itself for a DataFrame, naming the columns.
df = rdd.toDF(schema=['_id', 'name'])
df.show()

# Option 2: hand the RDD and the column names to the SparkSession.
df = spark.createDataFrame(rdd, schema=['_id', 'name'])
df.show()

# DataFrame -> RDD: the `.rdd` attribute yields the rows as Row objects.
print(df.rdd.collect())

+---+-----+
|_id| name|
+---+-----+
|  1|  sai|
|  2|pawan|
+---+-----+

+---+-----+
|_id| name|
+---+-----+
|  1|  sai|
|  2|pawan|
+---+-----+

[Row(_id=1, name='sai'), Row(_id=2, name='pawan')]


# Map Transformation on RDD objects

In [10]:
# Three-element records: an integer followed by two strings.
data = [(1, 'sai', 'pawan'), (2, 'pawan', 'sai')]

# Replace the earlier RDD with one built from these records.
rdd = spark.sparkContext.parallelize(data)


# Using a normal (named) function
def concatinate_name(tuple):
    """Return the record extended with the concatenation of items 1 and 2.

    Args:
        tuple: A tuple with at least three elements, where the elements at
            index 1 and index 2 can be added together with ``+``.

    Returns:
        A new tuple made of the input followed by one extra element,
        ``tuple[1] + tuple[2]``. The input itself is not modified.
    """
    # NOTE(review): the parameter name shadows the built-in `tuple` inside
    # this function; it is left unchanged so existing callers keep working.
    joined = tuple[1] + tuple[2]
    return tuple + (joined,)
# map() runs the named function once per record; collect() returns the
# transformed records as a list.
print(rdd.map(concatinate_name).collect())

# Using a lambda function: the same transformation written inline.
print(rdd.map(lambda row: row + (row[1] + row[2],)).collect())


[(1, 'sai', 'pawan', 'saipawan'), (2, 'pawan', 'sai', 'pawansai')]
[(1, 'sai', 'pawan', 'saipawan'), (2, 'pawan', 'sai', 'pawansai')]


# FlatMap in RDD

In [11]:
# Each input element is a string of space-separated words.
data = ['sai pawan', 'd sai']
rdd = spark.sparkContext.parallelize(data)

# split(' ') turns each string into a list of words; flatMap() merges those
# per-element lists into one flat sequence instead of a list of lists.
print(rdd.flatMap(lambda line: line.split(' ')).collect())

['sai', 'pawan', 'd', 'sai']
