# Chapter 3: Programming with RDDs (Python)

In [7]:
from pyspark.sql import SparkSession
# Build the Spark session for this notebook (master "local[*]") and keep a
# handle on its SparkContext, which the RDD examples below use via `sc`.
spark = (
    SparkSession.builder
    .appName("Programming with RDDs")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

## Create RDD

Create an RDD from a Python list:

In [8]:
# Materialise the integers 0..9 as a plain Python list, then hand the list
# to the SparkContext to turn it into an RDD.
first_ten_integers = [n for n in range(10)]
numeric_rdd = sc.parallelize(first_ten_integers)

In [9]:
# collect() brings every element of the RDD back as a Python list.
numeric_values = numeric_rdd.collect()
print("Numeric RDD (from list): {0}".format(numeric_values))

Numeric RDD (from list): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


Create an RDD from an external text file:

In [10]:
# Load the README as an RDD of strings (one element per line of the file).
readme_path = "../data/README.md"
text_rdd = sc.textFile(readme_path)

In [11]:
# take(10) fetches only the first ten elements instead of the whole RDD.
first_ten_lines = text_rdd.take(10)
print("Text RDD (from external file): {0}".format(first_ten_lines))

Text RDD (from external file): ['# Apache Spark', '', 'Spark is a fast and general cluster computing system for Big Data. It provides', 'high-level APIs in Scala, Java, Python, and R, and an optimized engine that', 'supports general computation graphs for data analysis. It also supports a', 'rich set of higher-level tools including Spark SQL for SQL and DataFrames,', 'MLlib for machine learning, GraphX for graph processing,', 'and Spark Streaming for stream processing.', '', '<http://spark.apache.org/>']


## RDD Actions

`collect()`, `take()`: both were already demonstrated in the previous section (Create RDD).


In [12]:
# Two small integer RDDs shared by the examples that follow.
# rdd1 deliberately contains repeated values (three 1s).
rdd1 = sc.parallelize(
    [1, 1, 2, 3, 33, 1, 4, 5, 8, 6]
)
rdd2 = sc.parallelize(
    [1, 2, 9, 8]
)

`count()`, `countByValue()`

In [13]:
# count(): total number of elements in the RDD.
element_total = rdd1.count()
print("count(): {0}".format(element_total))

# countByValue(): mapping from each distinct value to how often it occurs.
value_frequencies = rdd1.countByValue()
print("countByValue(): {0}".format(value_frequencies))

count(): 10
countByValue(): defaultdict(<class 'int'>, {1: 3, 2: 1, 3: 1, 33: 1, 4: 1, 5: 1, 8: 1, 6: 1})


`takeOrdered()`, `takeSample()`

In [14]:
# takeOrdered(num): the `num` smallest elements, in ascending order.
print("takeOrdered: {0}".format(rdd2.takeOrdered(num=3)))

# takeSample() draws elements at random. Without a seed the cell prints a
# different sample on each run, so the notebook is not reproducible under
# "Restart & Run All". A fixed seed makes the sample repeatable.
SAMPLE_SEED = 42
print("takeSample: {0}".format(
    rdd2.takeSample(withReplacement=False, num=2, seed=SAMPLE_SEED)))

takeOrdered: [1, 2, 8]
takeSample: [8, 1]


`reduce()`, `fold()`

In [15]:
def add_pair(left, right):
    """Return the sum of two values; the combiner for reduce() and fold()."""
    return left + right


# reduce() combines the elements pairwise with the given function.
total_by_reduce = rdd1.reduce(add_pair)
print("Sum of list using reduce(): {0}".format(total_by_reduce))

# fold() does the same but starts from an explicit zero value (0 for a sum).
total_by_fold = rdd1.fold(0, add_pair)
print("Sum of list using fold(): {0}".format(total_by_fold))

Sum of list using reduce(): 64
Sum of list using fold(): 64


Calculating the average using `reduce()`:

In [16]:
# The average needs two separate actions here: one to sum the elements
# and one to count them.
element_sum = rdd1.reduce(lambda a, b: a + b)
element_count = rdd1.count()
avg1 = element_sum / element_count
print("Average calculated using reduce(): {0}".format(avg1))

Average calculated using reduce(): 6.4


Calculating average using `aggregate()`:

In [17]:
def add_value(acc, value):
    """Fold one element into a (running_sum, running_count) accumulator."""
    running_sum, running_count = acc
    return (running_sum + value, running_count + 1)


def merge_accumulators(acc1, acc2):
    """Combine two (sum, count) accumulators into one."""
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])


# aggregate() computes the sum and the count together, starting from the
# zero accumulator (0, 0).
sum_values, count = rdd1.aggregate((0, 0), add_value, merge_accumulators)

avg2 = sum_values / count
print("Average calculated using aggregate(): {0}".format(avg2))

Average calculated using aggregate(): 6.4


## Basic RDD Transformations

`map()`

In [18]:
# map() applies the function to every element; here each number n becomes
# the pair (n, 2n).
rdd_map = numeric_rdd.map(lambda n: (n, 2 * n))
mapped_pairs = rdd_map.collect()
print("RDD obtained using map(): {0}".format(mapped_pairs))

RDD obtained using map(): [(0, 0), (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18)]


`flatMap()`

In [19]:
# flatMap() splits every line on single spaces and flattens the per-line
# lists into one RDD of tokens.
rdd_flat_map = text_rdd.flatMap(lambda line: line.split(" "))
first_ten_tokens = rdd_flat_map.take(10)
print("RDD obtained using flatMap(): {0}".format(first_ten_tokens))

RDD obtained using flatMap(): ['#', 'Apache', 'Spark', '', 'Spark', 'is', 'a', 'fast', 'and', 'general']


`filter()`

In [20]:
# Turn each line into its list of space-separated tokens, then keep only the
# lines whose token list contains the exact token "Spark". Tokens such as
# "Spark," do not match.
tokenized_lines = text_rdd.map(lambda line: line.split(" "))
lines_spark = tokenized_lines.filter(lambda tokens: "Spark" in tokens)
spark_line_total = lines_spark.count()
print("Number of lines that contains the word 'Spark': {0}".format(spark_line_total))

Number of lines that contains the word 'Spark': 16


In [21]:
# Flatten the text into individual tokens, then keep the tokens that equal
# "Python" once any commas are stripped (so "Python," is counted too).
all_tokens = text_rdd.flatMap(lambda line: line.split(" "))
words_python = all_tokens.filter(lambda token: token.replace(",", "") == "Python")
python_word_total = words_python.count()
print("Number of times that the word 'Python' appears: {0}".format(python_word_total))

Number of times that the word 'Python' appears: 4


`distinct()`

In [22]:
# distinct() removes the duplicate values from rdd1.
unique_values = rdd1.distinct().collect()
print("RDD from distinct(): {0}".format(unique_values))

RDD from distinct(): [4, 8, 1, 33, 5, 2, 6, 3]


## Pseudo-Set Operations

`union()`

In [23]:
# union() concatenates the two RDDs; duplicates are kept.
rdd_union = rdd1.union(rdd2)
union_values = rdd_union.collect()
print("RDD from union(): {0}".format(union_values))

RDD from union(): [1, 1, 2, 3, 33, 1, 4, 5, 8, 6, 1, 2, 9, 8]


`subtract()`

In [24]:
# subtract() keeps the elements of rdd1 whose value does not appear in rdd2.
rdd_subtract = rdd1.subtract(rdd2)
subtract_values = rdd_subtract.collect()
print("RDD from subtract(): {0}".format(subtract_values))

RDD from subtract(): [33, 3, 4, 5, 6]


`intersection()`

In [25]:
# intersection() keeps the values present in both RDDs, without duplicates.
rdd_intersection = rdd1.intersection(rdd2)
intersection_values = rdd_intersection.collect()
print("RDD from intersection(): {0}".format(intersection_values))

RDD from intersection(): [8, 1, 2]


`cartesian()`

In [26]:
# cartesian() pairs every element of rdd1 with every element of rdd2
# (10 x 4 = 40 pairs here).
rdd_cartesian = rdd1.cartesian(rdd2)
cartesian_pairs = rdd_cartesian.collect()
print("RDD from cartesian(): {0}".format(cartesian_pairs))

RDD from cartesian(): [(1, 1), (1, 1), (1, 2), (1, 2), (1, 9), (1, 9), (1, 8), (1, 8), (2, 1), (3, 1), (2, 2), (3, 2), (2, 9), (3, 9), (2, 8), (3, 8), (33, 1), (1, 1), (33, 2), (1, 2), (33, 9), (1, 9), (33, 8), (1, 8), (4, 1), (5, 1), (8, 1), (6, 1), (4, 2), (5, 2), (8, 2), (6, 2), (4, 9), (5, 9), (8, 9), (6, 9), (4, 8), (5, 8), (8, 8), (6, 8)]
