In [1]:
import findspark
# Runs before the pyspark import in the next cell; presumably this is what makes the local
# Spark installation importable - confirm against the findspark documentation.
findspark.init()

In [2]:
from pyspark import SparkContext

# Shared entry point that the cells below use to build RDDs via sc.parallelize(...).
# getOrCreate() is called instead of the constructor, so re-running this cell should reuse an
# already-running context rather than start a second one (per the method name - confirm in the PySpark docs).
sc = SparkContext.getOrCreate()

In [3]:
# A bare expression on the last line of a cell makes the notebook display that object's repr.
sc

#### RDD Transformations
1. Common transformations include `map()`, `filter()`, and `flatMap()`.
2. A transformation changes the data from one form to another.
3. Transformations are lazily executed: the work is delayed until an action (such as `collect()`) is called.


In [4]:
# filter() keeps only the elements for which the predicate is true (here: the even numbers).
rdd = sc.parallelize(list(range(1, 11)))
rdd.filter(lambda value: value % 2 == 0).collect()

[2, 4, 6, 8, 10]

In [5]:
# map() applies the given function to every element of the RDD, one output per input.
x_map = sc.parallelize(list(range(1, 11)))
y_map = x_map.map(lambda value: 2 * value)

print(f'Values of x_map: {x_map.collect()}')
print(f'Values of y_map: {y_map.collect()}')


Values of x_map: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Values of y_map: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]


In [6]:
# flatMap() applies the function to every element and then flattens the results, so a single
# input element can turn into several output elements (four per input in this example).
x_flatMap = sc.parallelize(list(range(1, 11)))
y_flatMap = x_flatMap.flatMap(lambda value: (value, 2 * value, 3 * value, value + 200))

print(f'Values of x_flatMap: {x_flatMap.collect()}')
print(f'Values of y_flatMap: {y_flatMap.collect()}')

Values of x_flatMap: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Values of y_flatMap: [1, 2, 3, 201, 2, 4, 6, 202, 3, 6, 9, 203, 4, 8, 12, 204, 5, 10, 15, 205, 6, 12, 18, 206, 7, 14, 21, 207, 8, 16, 24, 208, 9, 18, 27, 209, 10, 20, 30, 210]


In [9]:
# mapPartitions() applies a function to each partition of the RDD as a whole.
# The RDD is explicitly split into 3 partitions so the per-partition results are easy to follow.
x = sc.parallelize(list(range(1, 11)), numSlices=3)
def f(iterator):
    """Yield exactly one value: the sum of all elements produced by ``iterator``."""
    total = 0
    for value in iterator:
        total = total + value
    yield total

# f (defined above in this cell) yields one sum per call, so y ends up with one total per partition.
y= x.mapPartitions(f)

# collect() returns the elements of each RDD as a Python list so they can be printed.
print(f'Values of x: {x.collect()}')
print(f'Values of y: {y.collect()}')

Values of x: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Values of y: [6, 15, 34]


In [10]:
# glom() gathers the elements of each partition into a single list, so collect() returns one
# list per partition and the partition boundaries become visible.
# x and y here are the RDDs built in the mapPartitions cell above.
print(f'Values of x: {x.glom().collect()}')
print(f'Values of y: {y.glom().collect()}')

Values of x: [[1, 2, 3], [4, 5, 6], [7, 8, 9, 10]]
Values of y: [[6], [15], [34]]


In [11]:
# mapPartitionsWithIndex() applies a function to each partition of the RDD, like mapPartitions(),
# but the function also receives the integer index of the partition it is working on.
x = sc.parallelize(list(range(1, 11)), numSlices=3)
def f(splitIndex, iterator):
    """Yield a single ``(splitIndex, total)`` pair, where total is the sum of ``iterator``."""
    partition_total = sum(iterator)
    yield splitIndex, partition_total

# f receives (partition index, iterator), so every element of y is an (index, partition sum) pair.
y = x.mapPartitionsWithIndex(f)

# collect() returns the elements of each RDD as a Python list so they can be printed.
print(f'Values of x: {x.collect()}')
print(f'Values of y: {y.collect()}')

Values of x: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Values of y: [(0, 6), (1, 15), (2, 34)]


In [15]:
# sample() returns a new RDD that is a random sample of the original RDD.
# Its three parameters are:
#   withReplacement - True samples with replacement (an element may appear more than once,
#                     as 4 does in the output below); False samples without replacement.
#   fraction        - the fraction of the data to sample; the result size is not exact.
#   seed            - seed for the random number generator.
x = sc.parallelize(list(range(1, 11)))
y = x.sample(withReplacement=True, fraction=0.5, seed=1)

print(x.collect())
print(y.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[3, 4, 4, 5, 6, 10]


In [13]:
# union() takes another RDD and returns a new RDD that combines the elements of both.
rdd_1 = sc.parallelize(list(range(1, 11)))
rdd_2 = sc.parallelize(list(range(11, 21)))

rdd_1.union(rdd_2).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [16]:
# intersection() returns a new RDD containing only the elements that appear in both RDDs.

rdd_1 = sc.parallelize(list(range(1, 11)))
rdd_2 = sc.parallelize([1, 2, 3] + list(range(11, 21)))
rdd_1.intersection(rdd_2).collect()

[1, 2, 3]

In [17]:
# distinct() returns a new RDD with the duplicate elements of the original RDD removed.
# Note that the result is not sorted (see the order of the output below).
rdd = sc.parallelize(list(range(1, 11)) * 2)
rdd.distinct().collect()

[8, 1, 9, 2, 10, 3, 4, 5, 6, 7]

In [19]:
# groupBy() groups the elements of the RDD by the key the given function computes
# (here: the first item of each tuple).
# collect() returns (key, ResultIterable) pairs: the grouped elements sit inside the
# ResultIterable, so it has to be iterated (e.g. with list(...)) to see them.
x = sc.parallelize([('A', 1), ('B', 2), ('C', 3), ('A', 4), ('B', 5), ('C', 6)])
x.groupBy(lambda pair: pair[0]).collect()


[('B', <pyspark.resultiterable.ResultIterable at 0x1d193a13eb0>),
 ('C', <pyspark.resultiterable.ResultIterable at 0x1d193cdeb30>),
 ('A', <pyspark.resultiterable.ResultIterable at 0x1d193cdec80>)]