In [1]:
import os
import sys
# Point the kernel at the local Spark install and put its bundled Python
# archives at the very front of sys.path (py4j first, then pyspark) so they
# are searched before anything else.
os.environ['SPARK_HOME'] = 'D:/spark330hdp3sc3'
os.environ['PYLIB'] = '{}/python/lib'.format(os.environ['SPARK_HOME'])
sys.path[0:0] = [
    '{}/{}'.format(os.environ['PYLIB'], archive)
    for archive in ('py4j-0.10.9.5-src.zip', 'pyspark.zip')
]

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [3]:
# Build (or reuse) the Hive-enabled SparkSession used by the whole notebook.
# The warehouse location must be set through `spark.sql.warehouse.dir`; the
# previously used `spark.warehouse.dir` is not a Spark setting, so the value
# was never applied.
spark = (
    SparkSession.builder
    .appName('SparkTransformations')
    .config('spark.sql.warehouse.dir', 'D:/tmp')
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Sanity check: the session built in the previous cell is bound to `spark`,
# so its version can be read straight away.
spark.version

'3.3.0'

In [5]:
# Grab the SparkContext behind the session; the RDD examples below are built through `sc`.
sc = spark.sparkContext

In [6]:
# Version as reported by the SparkContext itself.
sc.version

'3.3.0'

In [7]:
# RDD.map(f: Callable[[T], U], preservesPartitioning: bool = False) → pyspark.rdd.RDD[U]
# map applies f to every element; here each letter is paired with the number 1.
a_coll = list('abc')
a_coll_rdd = sc.parallelize(a_coll)
a_coll_rdd.map(lambda letter: (letter, 1)).collect()

[('a', 1), ('b', 1), ('c', 1)]

In [8]:
# RDD.filter(f: Callable[[T], bool]) → pyspark.rdd.RDD[T]
# filter keeps only the elements for which the predicate is true.
no_rdd = sc.parallelize(range(100))
print(
    'Filtering to print only odd numbers \n',
    no_rdd.filter(lambda n: n % 2 == 1).collect(),
)

Filtering to print only odd numbers 
 [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99]


In [9]:
# RDD.flatMap(f: Callable[[T], Iterable[U]], preservesPartitioning: bool = False) → pyspark.rdd.RDD[U]
# Each element expands to three values, and flatMap flattens them into one RDD.
ott_rdd = sc.parallelize([1, 2, 3])
print('Flatmapping illustration \n')
ott_rdd.flatMap(lambda n: [n, 100 * n, 42]).collect()

Flatmapping illustration 



[1, 100, 42, 2, 200, 42, 3, 300, 42]

In [10]:
'''
RDD.groupBy(f: Callable[[T], K], numPartitions: Optional[int] = None, 
partitionFunc: Callable[[K], int] = <function portable_hash>) → pyspark.rdd.RDD[Tuple[K, Iterable[T]]]
'''
# Split no_rdd by remainder modulo 2. The grouped result is kept in
# group_by_collect for the next cell; this cell only prints the banner.
print('RDD groupBy transformation ')
group_by_collect = no_rdd.groupBy(lambda x: x % 2).collect()

RDD groupBy transformation 


In [11]:
# Each grouped value is an iterable, so it is turned into a list to make the
# group members visible when printed.
print([(x[0], list(x[1])) for x in group_by_collect])

[(0, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98]), (1, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99])]


In [12]:
# A utility function that makes a list of (key, iterable) tuples displayable
def printTupleList(tlist):
    """Return a new list of (key, list(values)) pairs built from `tlist`.

    Despite the name, nothing is printed: the converted list is returned so
    the notebook can display it as the cell result.

    Args:
        tlist: iterable of indexable items whose element 0 is the key and
            whose element 1 is an iterable of values.

    Returns:
        A list of 2-tuples (key, values-as-list), in the order of `tlist`.
    """
    converted = []
    for entry in tlist:
        converted.append((entry[0], list(entry[1])))
    return converted

In [13]:
# Create an RDD of (key, value) pairs with repeated keys to illustrate the groupByKey transformation
pow_lt3_rdd = sc.parallelize([(1, 1), (1, 1), (1, 1), (2, 2), (2, 4), (2, 8), (3, 3), (3, 9), (3, 27)])

In [14]:
'''
RDD.groupByKey(numPartitions: Optional[int] = None, 
partitionFunc: Callable[[K], int] = <function portable_hash>) → pyspark.rdd.RDD[Tuple[K, Iterable[V]]]
'''
# Group the values of pow_lt3_rdd by key. printTupleList turns each grouped
# iterable into a list so the cell displays readable (key, [values]) pairs.
print('groupByKey transformation\n')
printTupleList(pow_lt3_rdd.groupByKey().collect())

groupByKey transformation



[(1, [1, 1, 1]), (2, [2, 4, 8]), (3, [3, 9, 27])]

In [15]:
'''
RDD.reduceByKey(func: Callable[[V, V], V], numPartitions: Optional[int] = None, 
partitionFunc: Callable[[K], int] = <function portable_hash>) →  pyspark.rdd.RDD[Tuple[K, V]]
'''
# Collapse the values of each key into one by repeated addition, i.e. a
# per-key sum over pow_lt3_rdd.
print('reduceByKey transformation')
pow_lt3_rdd.reduceByKey(lambda x, y: x + y).collect()

reduceByKey transformation


[(1, 3), (2, 14), (3, 39)]

In [16]:
# Number of partitions a_coll_rdd was split into (parallelize was called without an explicit count).
a_coll_rdd.getNumPartitions()

8

In [18]:
# RDD.glom() → pyspark.rdd.RDD[List[T]]
# glom gathers the elements of each partition into a list, which shows how the
# three letters are spread across the partitions (most of them are empty).
a_coll_rdd.glom().collect()

[[], [], ['a'], [], [], ['b'], [], ['c']]

# MapPartitions

In [19]:
# Helper for RDD.mapPartitionsWithIndex, which calls it with (partition index, iterator).
# (Plain RDD.mapPartitions passes only the iterator, so this helper does not fit it.)
def show_partitions(idx, itera):
    """Yield one string describing a partition and the elements it holds.

    Args:
        idx: index of the partition.
        itera: iterator over the elements of that partition; it is consumed.

    Yields:
        A single string of the form 'index: <idx> , elements: <list>'.
    """
    elements = list(itera)
    yield f'index: {idx!s} , elements: {elements!s}'

In [21]:
'''
RDD.mapPartitionsWithIndex(f: Callable[[int, Iterable[T]], Iterable[U]], 
preservesPartitioning: bool = False) → pyspark.rdd.RDD[U]
'''
# show_partitions receives (partition index, iterator) and yields one summary
# string per partition, so the result reveals where each element lives.
print('mapPartitionsWithIndex ')
a_coll_rdd.mapPartitionsWithIndex(show_partitions).collect()

mapPartitionsWithIndex 


['index: 0 , elements: []',
 'index: 1 , elements: []',
 "index: 2 , elements: ['a']",
 'index: 3 , elements: []',
 'index: 4 , elements: []',
 "index: 5 , elements: ['b']",
 'index: 6 , elements: []',
 "index: 7 , elements: ['c']"]

In [22]:
def f(sidx, itr):
    """Yield only the partition index `sidx`; the partition iterator `itr` is ignored."""
    yield sidx

In [23]:
# f yields only the partition index, so collecting gives one entry per partition.
a_coll_rdd.mapPartitionsWithIndex(f).collect()

[0, 1, 2, 3, 4, 5, 6, 7]

In [24]:
# mapPartitions hands the function an iterator over the elements of one partition
def map_partitions_function(itera):
    """Yield a single comma-separated string built from one partition.

    Args:
        itera: iterator over the elements of one partition; it is consumed.

    Yields:
        One string: the elements joined with ', ' ('' for an empty partition).
    """
    # str() each element so that partitions of non-string values (e.g. ints)
    # can be joined as well; for string elements the result is unchanged.
    yield ', '.join(str(element) for element in itera)

In [25]:
# Each partition is reduced to one comma-joined string; empty partitions give ''.
a_coll_rdd.mapPartitions(map_partitions_function).collect()

['', '', 'a', '', '', 'b', '', 'c']

# Pair RDD Operations - Joins

In [26]:
# RDD.union(other: pyspark.rdd.RDD[U]) → pyspark.rdd.RDD[Union[T, U]]
# union concatenates the two RDDs without removing duplicates: 3 is present in
# both inputs and appears twice in the result.
print('Union of two rdds:\n')
xu = sc.parallelize([1, 2, 3], 2)
yu = sc.parallelize([3, 4], 1)
print("union: " , xu.union(yu).collect())

Union of two rdds:

union:  [1, 2, 3, 3, 4]


In [27]:
'''
RDD.join(other: pyspark.rdd.RDD[Tuple[K, U]], numPartitions: Optional[int] = None) → 
pyspark.rdd.RDD[Tuple[K, Tuple[V, U]]][source]

'''
# Inner join on the key: every value for a key in xj is paired with every
# value for the same key in yj, so "a" (one value vs. two) produces two rows.
xj = sc.parallelize([("a", 1), ("b", 2)])
yj = sc.parallelize([("a", 3), ("a", 4), ("b", 5)])
print(xj.join(yj).collect())

[('b', (2, 5)), ('a', (1, 3)), ('a', (1, 4))]


In [28]:
# Keys present on only one side survive an outer join with None filling the
# missing side: "c" exists only in xoj, "d" only in yoj.
print("Left Outer Join")
xoj = sc.parallelize([("a", 1), ("b", 2), ("c", 7)])
yoj = sc.parallelize([("a", 3), ("a", 4), ("b", 5), ("d", 4)])
# leftOuterJoin keeps every key of xoj.
print(xoj.leftOuterJoin(yoj).collect())
print("Right Outer Join")
# rightOuterJoin keeps every key of yoj.
print(xoj.rightOuterJoin(yoj).collect())

Left Outer Join
[('b', (2, 5)), ('c', (7, None)), ('a', (1, 3)), ('a', (1, 4))]
Right Outer Join
[('b', (2, 5)), ('a', (1, 3)), ('a', (1, 4)), ('d', (None, 4))]


In [29]:
print("Full Outer Join")
# fullOuterJoin keeps the keys of both RDDs, with None for whichever side is missing.
print(xoj.fullOuterJoin(yoj).collect())

Full Outer Join
[('b', (2, 5)), ('c', (7, None)), ('a', (1, 3)), ('a', (1, 4)), ('d', (None, 4))]


In [30]:
print('Distinct\n')
# NOTE(review): the draw is unseeded, so the values shown (and therefore the
# distinct result) differ from run to run.
rpt_list = np.random.randint(0, 5, 10)  # 10 integers drawn from [0, 5)
print('Randomly generated list with repeat elements ', rpt_list)
rpt_rdd = sc.parallelize(rpt_list)
# distinct drops the repeated values.
print(rpt_rdd.distinct().collect())

Distinct

Randomly generated list with repeat elements  [4 3 2 4 2 2 4 2 4 2]
[2, 3, 4]


In [31]:
# RDD.coalesce(numPartitions: int, shuffle: bool = False) → pyspark.rdd.RDD[T]
print("coalesce to reduce number of partitions")
xCoalesce = sc.parallelize(range(10), 4)  # start with 4 partitions
print(xCoalesce.getNumPartitions())
yCoalesce = xCoalesce.coalesce(2)  # shrink to 2; shuffle is left at its default
print(yCoalesce.getNumPartitions())


coalesce to reduce number of partitions
4
2


In [32]:
# RDD.keyBy(f: Callable[[T], K]) → pyspark.rdd.RDD[Tuple[K, T]]
# keyBy pairs every element with the key that f computes for it.
print('keyBy transformation')
new_no_rdd = sc.parallelize(range(10))
keyed_by_evenness = new_no_rdd.keyBy(lambda n: n % 2 == 0)
print(keyed_by_evenness.collect())
# Swap the boolean key for a readable label.
print(keyed_by_evenness.map(lambda kv: ('even' if kv[0] else 'odd', kv[1])).collect())

keyBy transformation
[(True, 0), (False, 1), (True, 2), (False, 3), (True, 4), (False, 5), (True, 6), (False, 7), (True, 8), (False, 9)]
[('even', 0), ('odd', 1), ('even', 2), ('odd', 3), ('even', 4), ('odd', 5), ('even', 6), ('odd', 7), ('even', 8), ('odd', 9)]


# Custom Partitioner Spark RDD

In [33]:
def part_function(k):
    """Pick the target partition for key `k`.

    Returns:
        0 when `k` compares less than 'H', otherwise 1.
    """
    if k < 'H':
        return 0
    return 1

In [34]:
# Start from 3 partitions, then repartition into 2 with part_function choosing
# the target partition from each key; show_partitions displays the outcome.
x_part = sc.parallelize([('J', "James"), ('F', "Fred"), ('A', "Anna"), ('J', "John")], 3)
x_part.partitionBy(2, part_function).mapPartitionsWithIndex(show_partitions).collect()

["index: 0 , elements: [('F', 'Fred'), ('A', 'Anna')]",
 "index: 1 , elements: [('J', 'James'), ('J', 'John')]"]

In [35]:
# Two RDDs of four elements each, both created with the default partitioning.
azip = sc.parallelize(range(4))
bzip = sc.parallelize([1, 4, 9, 16])

# zip pairs the elements of the two RDDs positionally.
# NOTE(review): RDD.zip presumably needs both RDDs to have matching partition
# layouts — confirm against the API docs before reusing with other inputs.
print("Zipping RDDs")
print(azip.zip(bzip).collect())

Zipping RDDs
[(0, 1), (1, 4), (2, 9), (3, 16)]


# Sorting RDDs

In [37]:
# 10 random integers from [0, 20), used by the sorting examples that follow.
# NOTE(review): the draw is unseeded, so the values differ from run to run.
rand_rdd = sc.parallelize(np.random.randint(0, 20, 10))
print(rand_rdd.collect())

[0, 11, 3, 1, 1, 14, 12, 7, 14, 4]


In [38]:
# RDD.sortBy(keyfunc: Callable[[T], S], ascending: bool = True, numPartitions: Optional[int] = None) → RDD[T]
# Sorting on the identity key orders the elements themselves; ascending is the default.
print('Simple sort ascending')
rand_rdd.sortBy(lambda x: x).collect()

Simple sort ascending


[0, 1, 1, 3, 4, 7, 11, 12, 14, 14]

In [39]:
print('Simple sort descending')
# Ask sortBy for descending order directly instead of negating the key:
# negation only works for numeric elements, whereas ascending=False works for
# any orderable key.
rand_rdd.sortBy(lambda x: x, ascending=False).collect()

Simple sort descending


[14, 14, 12, 11, 7, 4, 3, 1, 1, 0]

In [40]:
# Three-field records (first name, last name, centuries), with repeated values
# in every field so that multi-key sort orders can be told apart.
comp_rdd = sc.parallelize([("arjun", "tendulkar", 5),
    ("sachin", "tendulkar", 102), ("vachin", "tendulkar", 102),
    ("rahul", "dravid", 74), ("vahul", "dravid", 74),
    ("rahul", "shavid", 74), ("vahul", "shavid", 74),
    ("jacques", "kallis", 92), ("ricky", "ponting", 84), ("jacques", "zaalim", 92),
    ("sachin", "vendulkar", 102)])

In [41]:
# Sort on the second field (index 1). The positional False is sortBy's
# `ascending` argument, so the order is descending.
comp_rdd.sortBy(lambda x: x[1], False).collect()

[('jacques', 'zaalim', 92),
 ('sachin', 'vendulkar', 102),
 ('arjun', 'tendulkar', 5),
 ('sachin', 'tendulkar', 102),
 ('vachin', 'tendulkar', 102),
 ('rahul', 'shavid', 74),
 ('vahul', 'shavid', 74),
 ('ricky', 'ponting', 84),
 ('jacques', 'kallis', 92),
 ('rahul', 'dravid', 74),
 ('vahul', 'dravid', 74)]

In [42]:
# Convert the RDD of tuples to a DataFrame, naming the three columns.
comp_df = comp_rdd.toDF(['fname', 'lname', 'centuries'])
comp_df.show()

+-------+---------+---------+
|  fname|    lname|centuries|
+-------+---------+---------+
|  arjun|tendulkar|        5|
| sachin|tendulkar|      102|
| vachin|tendulkar|      102|
|  rahul|   dravid|       74|
|  vahul|   dravid|       74|
|  rahul|   shavid|       74|
|  vahul|   shavid|       74|
|jacques|   kallis|       92|
|  ricky|  ponting|       84|
|jacques|   zaalim|       92|
| sachin|vendulkar|      102|
+-------+---------+---------+



In [43]:
# NOTE(review): this wildcard import sits mid-notebook and binds every public
# name of pyspark.sql.functions in the notebook namespace; only `desc` is used
# by this cell. Confirm nothing else relies on it before narrowing the import.
from pyspark.sql.functions import *
# Order by lname ascending, then by fname and centuries descending within ties.
comp_df.sort('lname', desc('fname'), desc('centuries')).show()

+-------+---------+---------+
|  fname|    lname|centuries|
+-------+---------+---------+
|  vahul|   dravid|       74|
|  rahul|   dravid|       74|
|jacques|   kallis|       92|
|  ricky|  ponting|       84|
|  vahul|   shavid|       74|
|  rahul|   shavid|       74|
| vachin|tendulkar|      102|
| sachin|tendulkar|      102|
|  arjun|tendulkar|        5|
| sachin|vendulkar|      102|
|jacques|   zaalim|       92|
+-------+---------+---------+



# RDD Aggregation
### A sequence operation, which runs as a combiner of sorts over the elements of each partition
### A combining operation, which reduces the per-partition tuples to a single tuple

In [44]:
# RDD.aggregate(zeroValue: U, seqOp: Callable[[U, T], U], combOp: Callable[[U, U], U]) → U
def seq_op(data, elem):
    """Fold one element into a running (sum, count) pair.

    Args:
        data: the accumulator; data[0] is the sum so far, data[1] the count so far.
        elem: the next element to fold in.

    Returns:
        A new tuple (data[0] + elem, data[1] + 1).
    """
    running_sum = data[0] + elem
    running_count = data[1] + 1
    return (running_sum, running_count)

def comb_op(d1, d2):
    """Merge two (sum, count) accumulators by adding them position by position.

    Args:
        d1: first accumulator, indexed as (sum, count).
        d2: second accumulator, indexed as (sum, count).

    Returns:
        A new tuple (d1[0] + d2[0], d1[1] + d2[1]).
    """
    merged_sum = d1[0] + d2[0]
    merged_count = d1[1] + d2[1]
    return (merged_sum, merged_count)

# Spread 1..12 over six partitions so the per-partition step is easy to follow.
ardd = sc.parallelize(range(1, 13), 6)
print('First take a look at the partitions of the rdd')
print(ardd.mapPartitionsWithIndex(show_partitions).collect())
# The (0, 0) zero value is the starting (sum, count) accumulator.
print(ardd.aggregate((0,0), seq_op, comb_op))

# Same aggregation again, with the two operations written inline as lambdas.
print(ardd.aggregate((0, 0), 
                    lambda data, elem: (data[0] + elem, data[1] + 1),
                    lambda d1, d2: (d1[0] + d2[0], d1[1] + d2[1])))

First take a look at the partitions of the rdd
['index: 0 , elements: [1, 2]', 'index: 1 , elements: [3, 4]', 'index: 2 , elements: [5, 6]', 'index: 3 , elements: [7, 8]', 'index: 4 , elements: [9, 10]', 'index: 5 , elements: [11, 12]']
(78, 12)
(78, 12)


 ### Tree aggregation operates in exactly the same way as aggregate, except for one critical difference: there is an intermediate aggregation step in which data from some partitions is sent to executors to be aggregated.
 ### So in the above case, with six partitions, aggregate will send the results of all six partitions to the driver, whereas with tree aggregate three will go to one executor and three to another, and the driver will receive the aggregations from 2 rather than 6.
 ### Where there are a large number of partitions, tree aggregate performs significantly better than vanilla aggregate.

In [45]:
# Same zero value and operations as the plain aggregate call, so the (sum, count) result is the same.
print(ardd.treeAggregate((0,0), seq_op, comb_op))


(78, 12)


In [48]:
# (key, value) pairs with repeated keys, used by the by-key aggregation examples below.
pow_2_to_4_rdd = sc.parallelize([(1, 1), (2, 4), (3, 9), (1, 1), (2, 8), (3, 27),
      (1, 1), (2, 16), (3, 81), (4, 256)])


In [49]:
# Build a (sum, count) pair per key:
#   1st function - turns the first value seen for a key into (value, 1)
#   2nd function - folds a further value into an existing (sum, count)
#   3rd function - merges two (sum, count) pairs belonging to the same key
pow_2_to_4_rdd.combineByKey(lambda x: (x, 1),
                lambda acc, vlu: (acc[0] + vlu, acc[1] + 1),  
                lambda v1, v2: (v1[0] + v2[0], v1[1] + v2[1])).collect()

[(1, (3, 3)), (2, (28, 3)), (3, (117, 3)), (4, (256, 1))]

### aggregateByKey also works at the partition level: we need a zero value to seed the aggregation, and a sequence operation and a combining operation are then applied in turn

In [50]:
'''
RDD.aggregateByKey(zeroValue: U, seqFunc: Callable[[U, V], U], combFunc: Callable[[U, U], U], 
numPartitions: Optional[int] = None, partitionFunc: Callable[[K], int] = <function portable_hash>) →
pyspark.rdd.RDD[Tuple[K, U]][source]
'''
# A zero value of 0 with addition for both functions gives a per-key sum.
pow_2_to_4_rdd.aggregateByKey(0,
                   lambda x, y: x + y, 
                   lambda a, b: a + b
                  ).collect()

[(1, 3), (2, 28), (3, 117), (4, 256)]