In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark import sql

In [2]:
# Build the Spark entry points that every later cell relies on.
conf = SparkConf()
# getOrCreate() returns the already-running SparkContext when there is one,
# so re-executing this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point bound to the context above.
sqlContext = sql.SQLContext(sc)

In [3]:
# Local sample values 1..5, then the same values distributed as an RDD.
data = list(range(1, 6))
distData = sc.parallelize(data)

In [4]:
# Transformation: map(func) -- one output element per input element.

# Pair every element with the constant 1: x -> (x, 1).
map_rdd = distData.map(lambda elem: (elem, 1))
# collect() triggers the job, but it is not the cell's last expression,
# so its result is not displayed.
map_rdd.collect()
# The displayed value is the Python type of the transformed RDD.
type(map_rdd)

pyspark.rdd.PipelinedRDD

In [5]:
# Transformation: filter(func) -- keep the elements for which func is true.

# Only the element equal to 2 passes the predicate.
filter_rdd = distData.filter(lambda elem: elem == 2)

filter_rdd.collect()

[2]

In [6]:
# Transformation: flatMap(func) -- func returns a sequence per element and
# the sequences are flattened into a single RDD.

# Each x yields the tuple (x, 1), which is flattened into the two elements x and 1.
fMap_rdd = distData.flatMap(lambda elem: (elem, 1))
# Runs the job; the result is not the last expression, so it is not displayed.
fMap_rdd.collect()
# The displayed value is the Python type of the transformed RDD.
type(fMap_rdd)

pyspark.rdd.PipelinedRDD

In [7]:
# Transformation: union(other) -- concatenate two RDDs; duplicates are kept.

# Two identical ranges covering 1..8.
ds1 = range(1, 9)
ds2 = range(1, 9)
u01_rdd, u02_rdd = sc.parallelize(ds1), sc.parallelize(ds2)
union_rdd = u01_rdd.union(u02_rdd)
union_rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]

In [8]:
# Transformation: intersection(other) -- elements present in both RDDs.

# 1..8 and 5..14 overlap on 5..8.
ds1 = range(1, 9)
ds2 = range(5, 15)
i01_rdd, i02_rdd = sc.parallelize(ds1), sc.parallelize(ds2)
intersection_rdd = i01_rdd.intersection(i02_rdd)
intersection_rdd.collect()

[5, 6, 7, 8]

In [9]:
# Transformation: subtract(other) -- elements of the first RDD that do not
# appear in the second.

s01_rdd, s02_rdd = sc.parallelize(['A', 'B']), sc.parallelize(['B', 'C'])
# 'B' is in both inputs, so only 'A' is left.
subtract_rdd = s01_rdd.subtract(s02_rdd)
subtract_rdd.collect()

['A']

In [10]:
# Transformation: distinct() -- drop duplicate elements.

# Reuses union_rdd from the union cell, which holds every value 1..8 twice.
a = union_rdd.distinct()
a.collect()

[1, 2, 3, 4, 5, 6, 7, 8]

In [23]:
# Transformation: groupByKey() -- gather all values that share a key.

g_rdd = sc.parallelize([
    ('k', 5), ('s', 3), ('s', 4), ('p', 7), ('p', 5), ('t', 8), ('k', 6),
])
# mapValues(list) turns each grouped iterable into a plain list so the
# collected result is readable.
grp = g_rdd.groupByKey().mapValues(list)

grp.collect()

[('p', [7, 5]), ('k', [5, 6]), ('s', [3, 4]), ('t', [8])]

In [12]:
# Transformation: reduceByKey(func) -- merge the values of each key with func.

r_rdd = sc.parallelize([('k',5),('s',3),('s',4),('p',7),('p',5),('t',8),('k',6)])
# Reduce the RDD built in this cell. The previous version reduced g_rdd from
# the groupByKey cell, leaving r_rdd unused and making this cell depend on
# another cell having been run first.
red = r_rdd.reduceByKey(lambda left, right: left + right)

red.collect()

[('p', 12), ('k', 11), ('s', 7), ('t', 8)]

In [13]:
# Transformation: aggregateByKey(zeroValue, seqFunc, combFunc) -- fold the
# values of each key starting from zeroValue.

a_rdd = sc.parallelize([('k',5),('s',3),('s',4),('p',7),('p',5),('t',8),('k',6)])
# Aggregate the RDD built in this cell. The previous version aggregated g_rdd
# from the groupByKey cell, leaving a_rdd unused.
#   zeroValue 0.0 -> per-key sums come out as floats
#   seqFunc       -> folds one value into the running accumulator
#   combFunc      -> merges two partial accumulators
# Neither function receives the key, hence the acc/value parameter names.
agg = a_rdd.aggregateByKey(
    0.0,
    lambda acc, value: acc + value,
    lambda acc_left, acc_right: acc_left + acc_right,
).sortByKey()

agg.collect()

[('k', 11.0), ('p', 12.0), ('s', 7.0), ('t', 8.0)]

In [14]:
# Transformation: join(other)
#
# Called on datasets of type (K, V) and (K, W); returns a dataset of
# (K, (V, W)) pairs, one for every combination of values sharing a key.

j01_data = sc.parallelize([('A', 1), ('b', 2), ('c', 3)])
j02_data = sc.parallelize([('A', 4), ('A', 6), ('b', 7), ('c', 3), ('c', 8)])
join_rdd = j01_data.join(j02_data)

# Sort the collected pairs locally before displaying them.
sorted(join_rdd.collect())

[('A', (1, 4)), ('A', (1, 6)), ('b', (2, 7)), ('c', (3, 3)), ('c', (3, 8))]

In [33]:
# Transformation: cogroup(other)
#
# Called on datasets of type (K, V) and (K, W); returns a dataset of
# (K, (Iterable<V>, Iterable<W>)) tuples.

c01_data = sc.parallelize([('A', 1), ('b', 2), ('c', 3)])
c02_data = sc.parallelize([('A', 4), ('A', 6), ('b', 7), ('c', 3), ('c', 8)])
cgrp_rdd = c01_data.cogroup(c02_data)

# The grouped values come back as ResultIterable objects, so the display shows
# their reprs rather than their contents; wrap them in list() to inspect them.
cgrp_rdd.collect()


[('A',
  (<pyspark.resultiterable.ResultIterable at 0x7fbe0054c650>,
   <pyspark.resultiterable.ResultIterable at 0x7fbe0066eed0>)),
 ('c',
  (<pyspark.resultiterable.ResultIterable at 0x7fbe0066e350>,
   <pyspark.resultiterable.ResultIterable at 0x7fbe00660ad0>)),
 ('b',
  (<pyspark.resultiterable.ResultIterable at 0x7fbe00660e50>,
   <pyspark.resultiterable.ResultIterable at 0x7fbe0062f2d0>))]

In [16]:
# Transformation: coalesce(numPartitions) -- reduce the number of partitions.

# Start from an RDD explicitly split into 3 partitions.
c_rdd = sc.parallelize(["jan", "feb", "mar", "april", "may", "jun"], 3)
coalesce_rdd = c_rdd.coalesce(2)

# Partition counts before and after coalescing.
print(c_rdd.getNumPartitions())
print(coalesce_rdd.getNumPartitions())


3
2


In [17]:
# Transformation: repartition(numPartitions) -- redistribute the data into
# the requested number of partitions.

# Reuses c_rdd (3 partitions) from the coalesce cell and asks for 5.
repartition_rdd = c_rdd.repartition(5)

repartition_rdd.getNumPartitions()

5

In [18]:
# RDD of two-field records: a number and a department name.
rdd = sc.parallelize([(31, "Sales"), (33, "Engineering")])

# Keep only the second field of every record.
rdd.map(lambda record: record[1]).collect()
# Not executed here -- DataFrame conversion left for reference:
#df = rdd.toDF(['ID', 'DEPT'])

['Sales', 'Engineering']