#### RDD Transformations

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .appName("My Notebook Spark App")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/16 18:06:27 WARN Utils: Your hostname, spark-master, resolves to a loopback address: 127.0.1.1; using 10.168.136.115 instead (on interface ens3)
25/07/16 18:06:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/16 18:06:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### 1. Map - returns a new RDD by passing each element of the source through a function

In [2]:
# The SparkContext taken from the session is the handle used to build RDDs.
sc = spark.sparkContext
rdd = sc.parallelize(["a", "b", "c"])

In [3]:
def test_func1(x):
    return x + x
# Build a new RDD by passing every element of rdd through test_func1.
rdd2 = rdd.map(test_func1)

In [4]:
# collect() returns all elements of the RDD to the driver as a Python list.
rdd2.collect()

                                                                                

['aa', 'bb', 'cc']

In [6]:
# Same transformation with an inline lambda: square each number.
rdd = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = rdd.map(lambda n: n * n)
rdd2.collect()

[1, 4, 9, 16, 25]

#### 2. Filter - returns a new RDD with the elements for which the function returns true

In [8]:
def greater_than_2(x):
    """Predicate for ``filter``: True only when ``x`` is strictly greater than 2."""
    threshold = 2
    return x > threshold
# Keep only the elements for which greater_than_2 returns True.
# NOTE(review): rdd is presumably still the [1, 2, 3, 4, 5] RDD from the
# previous cell (the recorded output matches) - confirm on a fresh kernel.
rdd3 = rdd.filter(greater_than_2)
rdd3.collect()

[3, 4, 5]

#### 3. flatMap - Map + Flatten

Apply a function to every element of the RDD and then flatten the results into a single RDD

In [9]:
# Each sentence is split into words; flatMap flattens the per-sentence lists
# into one RDD of words.
rdd = sc.parallelize(["hello world", "spark example for flatMap"])
rdd2 = rdd.flatMap(lambda sentence: sentence.split())
rdd2.collect()

['hello', 'world', 'spark', 'example', 'for', 'flatMap']

In [None]:
#### 4. mapPartitions - Apply a function to each partition

**The end result is the same as Map**, but with the following advantages:

1. The function is called only once per partition (instead of once per element)
2. If a DB connection is required, a single connection can be opened per partition and reused for all of its elements

In [10]:
# Spread the numbers 0-9 over two partitions.
rdd = sc.parallelize(range(10), numSlices=2)

In [11]:
# Confirm the RDD was split into the requested number of partitions.
rdd.getNumPartitions()

2

In [14]:
def part_multiplier(partition):
    """Double every element of a single partition.

    Args:
        partition: iterable over the elements of one partition.

    Returns:
        list with each element multiplied by 2, in the original order.
    """
    doubled = []
    for value in partition:
        doubled.append(value * 2)
    return doubled

In [15]:
# part_multiplier receives each partition's elements as one iterable, so it is
# called once per partition rather than once per element.
rdd2 = rdd.mapPartitions(part_multiplier)
rdd2.collect()

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

#### 5. mapPartitionsWithIndex
Function receives index of the partition in addition to the partition
1. Helps in debugging
2. If we need to apply logic based on partition numbers

In [19]:
def give_partition_index(index, partition):
    """Tag every element with the index of the partition that holds it.

    Args:
        index: partition number supplied by ``mapPartitionsWithIndex``.
        partition: iterable over the elements of that partition.

    Returns:
        list of strings of the form ``"<index><element>"``.
    """
    tagged = []
    for elem in partition:
        tagged.append(str(index) + str(elem))
    return tagged

# Four letters over two partitions; each element gets prefixed with the index
# of the partition it lives in.
rdd = sc.parallelize(["a", "b", "c", "d"], numSlices=2)
rdd2 = rdd.mapPartitionsWithIndex(give_partition_index)
rdd2.collect()

['0a', '0b', '1c', '1d']

#### 6. Sample
Returns a random sample of the data

Used in:

1. Machine learning
2. Debugging

In [3]:
rdd = sc.parallelize(range(100), 2)
# sample(withReplacement, fraction, seed): fraction is the probability of
# keeping each element, so the sample size is only approximately
# fraction * count. A fixed seed makes the sample reproducible when the
# notebook is re-run.
rdd2 = rdd.sample(False, 0.1, seed=42)
rdd2.collect()

                                                                                

[7, 20, 22, 30, 37, 41, 43, 56, 66, 92]

#### 7. Union
Combines two datasets

Example scenarios,

1. Combining daily logs for a week

In [9]:
# union() returns the elements of both RDDs; 10 is in neither range, so it is
# absent from the result.
rdd1 = sc.parallelize(range(10), numSlices=2)
rdd2 = sc.parallelize(range(11, 20), numSlices=2)
rdd1.union(rdd2).collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19]

#### 8. intersection

Returns common elements

In [10]:
# The two ranges overlap on 5-9; the recorded result shows that the output
# order is not sorted.
rdd1 = sc.parallelize(range(10), numSlices=2)
rdd2 = sc.parallelize(range(5, 20), numSlices=2)
rdd1.intersection(rdd2).collect()

[8, 5, 9, 6, 7]

#### 9. Distinct

In [14]:
# Input with repeated strings ('a' and 'b' appear twice) plus two integers.
rdd1 = sc.parallelize(['a', 'b', 'a', 'c', 'b', 1, 5])

In [15]:
# distinct() drops the repeated elements, leaving each value once.
rdd1.distinct().collect()

['b', 'a', 'c', 1, 5]

#### 10. groupByKey

1. Expensive transformation
2. May require shuffling
3. If an aggregation is to be performed, reduceByKey is the better choice

In [17]:
# (key, value) pairs; key "a" appears twice.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
# Group all values that share a key.
rdd2 = rdd.groupByKey()

In [19]:
# Convert each key's grouped values to a list so they print readably.
rdd2.mapValues(list).collect()

[('b', [1]), ('a', [1, 1])]

#### 11. ReduceByKey

In [21]:
def funtest(a, b):
    """Reducer for ``reduceByKey``: combine two values of a key with ``+``."""
    total = a + b
    return total

In [22]:
# Same (key, value) pairs as in the groupByKey example.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
# Merge the values of each key pairwise with funtest (a + b).
rdd2 = rdd.reduceByKey(funtest)

In [23]:
# One (key, reduced_value) pair per distinct key.
rdd2.collect()

[('b', 1), ('a', 2)]

#### 12. AggregateByKey

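1. Lets you apply one function (the sequence function) to merge values within a partition and a different function (the combine function) to merge results across partitions
2. Takes an initial zero value that the sequence function starts from for each key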

In [None]:
# Example 1: average value per key

In [25]:
# (key, value) pairs whose per-key average is computed below.
rdd = sc.parallelize([("a", 2), ("b", 3), ("a", 4), ("b", 1), ("b", 5)])

In [37]:
def seq_fun(x, y):
    """Fold one value into a per-key ``[sum, count]`` accumulator.

    Args:
        x: two-element list ``[running_sum, running_count]``; updated in place.
        y: the next value seen for the key.

    Returns:
        The same list ``x`` after adding ``y`` to the sum and 1 to the count.
    """
    x[0] += y
    x[1] += 1
    return x

def comb_fun(x, y):
    """Merge two ``[sum, count]`` accumulators that belong to the same key.

    Args:
        x: accumulator ``[sum, count]``; updated in place.
        y: accumulator ``[sum, count]`` to merge into ``x``.

    Returns:
        The same list ``x`` holding the element-wise totals.
    """
    x[0] += y[0]
    x[1] += y[1]
    return x

In [38]:
# Zero value [0, 0] is the starting [sum, count]; seq_fun folds values in,
# comb_fun merges the per-partition accumulators.
rdd2 = rdd.aggregateByKey([0, 0], seq_fun, comb_fun)

In [39]:
# Each key maps to its [sum, count] accumulator.
rdd2.collect()

[('b', [9, 3]), ('a', [6, 2])]

In [42]:
# Turn each [sum, count] accumulator into an average. Only the value changes,
# so mapValues is used: it leaves the keys untouched and keeps the
# partitioning produced by aggregateByKey.
rdd3 = rdd2.mapValues(lambda acc: acc[0] / acc[1])

In [43]:
# (key, average) pairs.
rdd3.collect()

[('b', 3.0), ('a', 3.0)]

In [None]:
# Example 2: top 2 scores per key

In [56]:
# (user, score) pairs split across 2 partitions.
scores = sc.parallelize(
    [
        ("u1", 95), ("u1", 99), ("u1", 87), ("u2", 88), ("u2", 91), ("u2", 95),
        ("u1", 195), ("u1", 9), ("u1", 287), ("u2", 8), ("u2", 191), ("u2", 95),
    ],
    2,
)

In [57]:
def seq_fun(x, y):
    """Fold one score into a per-key ``[highest, second_highest]`` accumulator.

    NOTE: this reuses the name ``seq_fun`` from the averaging example earlier
    in the notebook, so running this cell replaces that definition.

    Args:
        x: two-element list ``[highest, second_highest]``; updated in place.
        y: the next score seen for the key.

    Returns:
        The same list ``x`` after taking ``y`` into account.
    """
    if y > x[0]:
        # New maximum: the previous maximum becomes the runner-up.
        x[0], x[1] = y, x[0]
    elif y > x[1]:
        # Not a new maximum, but it beats the current runner-up.
        x[1] = y
    return x

def comb_fun(x, y):
    """Merge two per-key ``[highest, second_highest]`` accumulators.

    NOTE: this reuses the name ``comb_fun`` from the averaging example earlier
    in the notebook, so running this cell replaces that definition.

    Args:
        x: accumulator ``[highest, second_highest]``; updated in place.
        y: accumulator ``[highest, second_highest]`` to merge into ``x``.

    Returns:
        The same list ``x`` holding the two largest of the four scores
        (assuming each accumulator is ordered highest first).
    """
    other_best = y[0]

    # Both entries of x beat everything in y: nothing to merge.
    if x[1] > other_best:
        return x

    if x[0] > other_best:
        # x keeps the top spot; y's best becomes the runner-up.
        x[1] = other_best
    else:
        # y's best takes the top spot; x's old best drops to runner-up.
        x[0], x[1] = other_best, x[0]

    # y's second score may still beat the runner-up chosen above.
    if x[1] < y[1]:
        x[1] = y[1]
    return x

In [59]:
# Zero value [0, 0] is the starting [highest, second_highest] for every key.
scores_2 = scores.aggregateByKey([0, 0], seq_fun, comb_fun)

In [60]:
# Each user maps to [highest, second_highest] score.
scores_2.collect()

[('u2', [191, 95]), ('u1', [287, 195])]

#### sortByKey

In [61]:
# (key, value) pairs with string keys, deliberately out of order.
tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]

In [63]:
# Keys are strings, so the digit keys '1' and '2' sort before the letters.
sc.parallelize(tmp).sortByKey().collect()

[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]

#### Join

In [66]:
# Only key "a" is present in both RDDs; "b" and "c" each appear on one side.
rdd1 = sc.parallelize([("a", 1), ("b", 4)])
rdd2 = sc.parallelize([("a", 2), ("c", 3)])

In [67]:
# Inner join: only keys found in both RDDs are returned.
rdd1.join(rdd2).collect()

[('a', (1, 2))]

In [68]:
# Left outer join: every key of rdd1 is kept; a missing match in rdd2 is None.
rdd1.leftOuterJoin(rdd2).collect()

[('b', (4, None)), ('a', (1, 2))]

#### Cogroup

In [87]:
# Two pair RDDs sharing the keys "a" and "b", each with several values per key.
rdd1 = sc.parallelize([("a", 1), ("b", 4), ("a", 7), ("b", 9)])
rdd2 = sc.parallelize([("a", 2), ("a", 3), ("a", 5), ("b", 2), ("b", 3)])

In [90]:
# For every key, pair up the values from rdd1 with the values from rdd2.
rdd3 = rdd1.cogroup(rdd2)

In [95]:
# Materialise both value groups of each key as lists so they print readably.
rdd3.mapValues(lambda groups: (list(groups[0]), list(groups[1]))).collect()

[('b', ([4, 9], [2, 3])), ('a', ([1, 7], [2, 3, 5]))]

#### Cartesian

In [96]:
# Every element of rdd1 paired with every element of rdd2 (4 x 5 = 20 pairs).
rdd4 = rdd1.cartesian(rdd2)

In [97]:
# Show all (rdd1_element, rdd2_element) pairs.
rdd4.collect()

[(('a', 1), ('a', 2)),
 (('a', 1), ('a', 3)),
 (('a', 1), ('a', 5)),
 (('a', 1), ('b', 2)),
 (('a', 1), ('b', 3)),
 (('b', 4), ('a', 2)),
 (('b', 4), ('a', 3)),
 (('b', 4), ('a', 5)),
 (('b', 4), ('b', 2)),
 (('b', 4), ('b', 3)),
 (('a', 7), ('a', 2)),
 (('b', 9), ('a', 2)),
 (('a', 7), ('a', 3)),
 (('a', 7), ('a', 5)),
 (('b', 9), ('a', 3)),
 (('b', 9), ('a', 5)),
 (('a', 7), ('b', 2)),
 (('a', 7), ('b', 3)),
 (('b', 9), ('b', 2)),
 (('b', 9), ('b', 3))]

#### Coalesce

1. Used only to reduce the number of partitions
2. Work is done by the workers, not by the driver
3. Doesn't retain the data distribution
4. If the data distribution is needed, use `shuffle=True` or `repartition`

In [98]:
# Start with the numbers 0-9 spread over four partitions.
rdd = sc.parallelize(range(10), numSlices=4)

In [99]:
# Partition count before coalescing.
rdd.getNumPartitions()

4

In [100]:
# Reduce the RDD to a single partition.
rdd2 = rdd.coalesce(1)

In [101]:
# Partition count after coalescing.
rdd2.getNumPartitions()

1

In [102]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()