## Spark Basics

### Creating RDDs

First, initialize `findspark` so that `pyspark` can be found, then import `pyspark` and create a `SparkContext`.

In [1]:
# findspark.init() runs before the `import pyspark` in the next cell.
# NOTE(review): per findspark's docs it adds the local Spark installation to
# sys.path so that pyspark becomes importable — confirm for this environment.
import findspark
findspark.init()

In [2]:
import pyspark

# getOrCreate() is used rather than constructing SparkContext() directly.
# NOTE(review): per the PySpark docs it returns the already-running context if
# there is one, which keeps this cell safe to re-run — confirm.
sc = pyspark.SparkContext.getOrCreate()

#### Example 1: Creating an RDD from a list of numbers

In [3]:
# The integers 1..9 as a plain Python list (range's upper bound is exclusive).
# list(range(...)) replaces the identity comprehension, which only copied
# each element unchanged.
data = list(range(1, 10))
print(data)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [4]:
# Turn the local list `data` into an RDD using the SparkContext `sc`.
myRDD = sc.parallelize(data)

In [5]:
# collect() brings the RDD's elements back as a Python list, which is then printed.
# NOTE(review): fine for this tiny demo; on large RDDs collect() pulls every
# element to the driver, so prefer take(n) there.
print(myRDD.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [6]:
# count() reports how many elements the RDD holds (9 for the 1..9 list).
print(myRDD.count())

9


#### Example 2: Creating RDDs from key-value pairs (tuples)

In [7]:
# Sample (key, value) tuples. The keys 'a', 'b' and 'c' each appear more than
# once so that the per-key operations in the following cells have values to
# combine. One line per key for readability.
kv = [
    ('a', 7), ('a', 2),
    ('b', 2), ('b', 4),
    ('c', 1), ('c', 2), ('c', 3), ('c', 4),
]
print(kv)

[('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)]


In [8]:
# Build an RDD from the list of (key, value) tuples, then print its collected
# contents to confirm the pairs are unchanged.
rdd2 = sc.parallelize(kv)
print(rdd2.collect())

[('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)]


In [9]:
# Sum the values that share a key: reduceByKey combines each key's values
# pairwise with the two-argument function it is given (here, addition).
rdd3 = rdd2.reduceByKey(lambda total, value: total + value)
print(rdd3.collect())

[('a', 9), ('b', 6), ('c', 10)]


In [10]:
# groupByKey() yields one (key, values) pair per distinct key. The values are
# ResultIterable objects, so printing the collected result shows only their
# default repr rather than the grouped numbers themselves.
rdd4 = rdd2.groupByKey()
print(rdd4.collect())

[('a', <pyspark.resultiterable.ResultIterable object at 0x0000023A1D3E6280>), ('b', <pyspark.resultiterable.ResultIterable object at 0x0000023A1D3C7700>), ('c', <pyspark.resultiterable.ResultIterable object at 0x0000023A1D3C7D00>)]


In [11]:
# Turn each key's grouped values into a list so they display readably.
# mapValues() applies the function to the value of every (key, value) pair and
# leaves the key untouched, replacing the manual (x[0], list(x[1])) rebuild.
rdd4.mapValues(list).collect()

[('a', [7, 2]), ('b', [2, 4]), ('c', [1, 2, 3, 4])]