In [1]:
# Build small in-memory RDDs with sc.parallelize.
#   This way we don't have to access the file system yet.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the already-running SparkContext when this cell is re-executed;
# constructing a second SparkContext in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("ex01"))
# For the DataFrame API, a session can be obtained with:
#   spark = SparkSession.builder.getOrCreate()

In [10]:
# Distribute a small list of integers (5 appears twice) as an RDD.
intRDD = sc.parallelize([3, 1, 2, 5, 5])

In [11]:
# Materialize intRDD as a Python list to inspect its contents.
intRDD.collect()

[3, 1, 2, 5, 5]

In [7]:
# RDD of fruit names; 'Apple' appears twice.
strRDD = sc.parallelize(['Apple', 'Orange', 'Banana', 'Grape', 'Apple'])

In [8]:
# Materialize strRDD as a Python list to inspect its contents.
strRDD.collect()

['Apple', 'Orange', 'Banana', 'Grape', 'Apple']

In [9]:
def square(x):
    """Return x multiplied by itself."""
    product = x * x
    return product

In [12]:
# Apply the named function square to every element.
intRDD.map(square).collect()

[9, 1, 4, 25, 25]

In [13]:
# Same transformation as map(square), written inline as a lambda.
intRDD.map(lambda n: n * n).collect()

[9, 1, 4, 25, 25]

In [14]:
# Prefix every fruit name with a label.
strRDD.map(lambda name: 'Fruit: ' + name).collect()

['Fruit: Apple',
 'Fruit: Orange',
 'Fruit: Banana',
 'Fruit: Grape',
 'Fruit: Apple']

In [15]:
# Keep only the elements smaller than 3.
intRDD.filter(lambda n: n < 3).collect()

[1, 2]

In [16]:
# Keep only the elements strictly between 1 and 5.
intRDD.filter(lambda n: 1 < n < 5).collect()

[3, 2]

In [17]:
# Keep only the names that contain the substring 'ra'.
strRDD.filter(lambda name: 'ra' in name).collect()

['Orange', 'Grape']

In [18]:
# Drop duplicate values; note the result order below differs from the input order.
intRDD.distinct().collect()

[2, 3, 1, 5]

In [20]:
# Drop duplicate values; the second 'Apple' is removed.
strRDD.distinct().collect()

['Apple', 'Orange', 'Banana', 'Grape']

In [21]:
# Split intRDD into a list of two RDDs using weights 0.4 and 0.6.
# No seed is passed; the two runs of this split in the notebook give different results.
isRDD = intRDD.randomSplit([0.4, 0.6])

In [22]:
# Elements that landed in the first (weight 0.4) part.
isRDD[0].collect()

[3]

In [23]:
# Elements that landed in the second (weight 0.6) part.
isRDD[1].collect()

[1, 2, 5, 5]

In [24]:
# Repeat the same unseeded split; compare the outputs with the earlier run.
isRDD = intRDD.randomSplit([0.4, 0.6])

In [25]:
# First (weight 0.4) part of the repeated split.
isRDD[0].collect()

[1, 2]

In [26]:
# Second (weight 0.6) part of the repeated split.
isRDD[1].collect()

[3, 5, 5]

In [28]:
# Group the integers by parity. The order of the collected (key, values)
# pairs is not guaranteed, so sort by key: index 0 is then always 'even'
# and index 1 always 'odd', which the following cells rely on.
# Note: despite its name, gRDD is a plain Python list, not an RDD.
gRDD = sorted(
    intRDD.groupBy(lambda x: 'even' if (x % 2 == 0) else 'odd').collect(),
    key=lambda pair: pair[0],
)

In [29]:
# Each collected group is a (key, values) pair; sort the values for a stable display.
print(gRDD[0][0], sorted(gRDD[0][1]))

even [2]


In [30]:
# Second group: its key followed by its sorted values.
print(gRDD[1][0], sorted(gRDD[1][1]))

odd [1, 3, 5, 5]


In [31]:
# Two more small RDDs used as operands for the set-style operations below.
intRDD2 = sc.parallelize([5, 6])
intRDD3 = sc.parallelize([2, 7])

In [32]:
# Concatenate the three RDDs; duplicates are kept (5 appears three times below).
intRDD.union(intRDD2).union(intRDD3).collect()

[3, 1, 2, 5, 5, 5, 6, 2, 7]

In [33]:
# Values present in both RDDs; 5 is reported once although intRDD holds it twice.
intRDD.intersection(intRDD2).collect()

[5]

In [34]:
# Values of intRDD that do not occur in intRDD2; both 5s are removed.
intRDD.subtract(intRDD2).collect()

[1, 2, 3]

In [35]:
# Pair every element of intRDD with every element of intRDD2 (5 x 2 = 10 pairs).
intRDD.cartesian(intRDD2).collect()

[(3, 5),
 (1, 5),
 (3, 6),
 (1, 6),
 (2, 5),
 (5, 5),
 (5, 5),
 (2, 6),
 (5, 6),
 (5, 6)]

In [36]:
# First element of the RDD.
intRDD.first()

3

In [37]:
# First three elements, in the RDD's own order (not sorted).
intRDD.take(3)

[3, 1, 2]

In [38]:
# Three smallest elements, in ascending order.
intRDD.takeOrdered(3)

[1, 2, 3]

In [39]:
# Negating the sort key reverses the ordering, yielding the three largest elements.
intRDD.takeOrdered(3, key=lambda n: -n)

[5, 5, 3]

In [40]:
# Summary statistics (count, mean, stdev, max, min) from a single call.
intRDD.stats()

(count: 5, mean: 3.2, stdev: 1.6, max: 5.0, min: 1.0)

In [41]:
# Smallest element.
intRDD.min()

1

In [42]:
# Largest element.
intRDD.max()

5

In [43]:
# Standard deviation; the 1.6 shown below matches the population form
# (divide by N): sqrt(12.8 / 5) = 1.6.
intRDD.stdev()

1.6

In [44]:
# Number of elements, duplicates included.
intRDD.count()

5

In [45]:
# Sum of all elements.
intRDD.sum()

16

In [46]:
# Arithmetic mean (16 / 5).
intRDD.mean()

3.2

In [47]:
# Key-value RDD: each element is a (key, value) 2-tuple; key 3 appears twice.
kvRDD = sc.parallelize([(3, 4), (3, 6), (5, 6), (1, 2)])
kvRDD.collect()

[(3, 4), (3, 6), (5, 6), (1, 2)]

In [48]:
# First component of every pair; duplicate keys are kept.
kvRDD.keys().collect()

[3, 3, 5, 1]

In [49]:
# Second component of every pair.
kvRDD.values().collect()

[4, 6, 6, 2]

In [50]:
# Keep the pairs whose key (index 0) is below 5.
kvRDD.filter(lambda pair: pair[0] < 5).collect()

[(3, 4), (3, 6), (1, 2)]

In [51]:
# Keep the pairs whose value (index 1) is below 5.
kvRDD.filter(lambda pair: pair[1] < 5).collect()

[(3, 4), (1, 2)]

In [52]:
# Square each value while leaving the keys untouched.
kvRDD.mapValues(lambda value: value * value).collect()

[(3, 16), (3, 36), (5, 36), (1, 4)]

In [53]:
# Sort the pairs by key, smallest key first.
kvRDD.sortByKey(ascending=True).collect()

[(1, 2), (3, 4), (3, 6), (5, 6)]

In [54]:
# With no argument the result matches ascending=True.
kvRDD.sortByKey().collect()

[(1, 2), (3, 4), (3, 6), (5, 6)]

In [55]:
# Sort the pairs by key, largest key first.
kvRDD.sortByKey(ascending=False).collect()

[(5, 6), (3, 4), (3, 6), (1, 2)]

In [57]:
# Sum the values that share a key: the two key-3 values (4 and 6) become 10.
kvRDD.reduceByKey(lambda left, right: left + right).collect()

[(3, 10), (5, 6), (1, 2)]

In [61]:
# Single-pair RDD used as the right-hand side of the joins below.
kvRDD2 = sc.parallelize([(3, 8)])

In [62]:
# Inspect the contents of kvRDD2.
kvRDD2.collect()

[(3, 8)]

In [63]:
# Inner join on key: only key 3 exists in both RDDs, and each kvRDD pair
# with that key is combined with the value 8 from kvRDD2.
kvRDD.join(kvRDD2).collect()

[(3, (4, 8)), (3, (6, 8))]

In [65]:
# Keep every pair of kvRDD; keys without a match in kvRDD2 get None on the right.
kvRDD.leftOuterJoin(kvRDD2).collect()

[(5, (6, None)), (1, (2, None)), (3, (4, 8)), (3, (6, 8))]

In [66]:
# Keep every key of kvRDD2; its only key (3) also exists in kvRDD, so the
# result equals the inner join here.
kvRDD.rightOuterJoin(kvRDD2).collect()

[(3, (4, 8)), (3, (6, 8))]

In [67]:
# Drop the pairs of kvRDD whose key appears in kvRDD2 (both key-3 pairs).
kvRDD.subtractByKey(kvRDD2).collect()

[(5, 6), (1, 2)]

In [68]:
# First (key, value) pair of the RDD.
kvRDD.first()

(3, 4)

In [69]:
# First three pairs, in the RDD's own order.
kvRDD.take(3)

[(3, 4), (3, 6), (5, 6)]

In [70]:
# Keep the first pair so its key and value can be indexed separately.
kvFirst = kvRDD.first()

In [71]:
# Index 0 of the pair is the key.
kvFirst[0]

3

In [72]:
# Index 1 of the pair is the value.
kvFirst[1]

4

In [73]:
# Number of pairs per key, returned as a dict-like mapping (key 3 occurs twice).
kvRDD.countByKey()

defaultdict(int, {3: 2, 5: 1, 1: 1})

In [74]:
# Collect the pairs into a Python dict. kvRDD has two pairs with key 3, but
# the dict holds a single entry for it (3: 6 in the output below), so the
# pair (3, 4) is not represented.
kv = kvRDD.collectAsMap()
kv

{3: 6, 5: 6, 1: 2}

In [75]:
# Confirm that collectAsMap() produced a plain dict.
type(kv)

dict

In [76]:
# Ordinary dict lookup; only one value is available for key 3.
kv[3]

6

In [77]:
# lookup() on the RDD returns a list with every value stored under key 3.
kvRDD.lookup(3)

[4, 6]

In [78]:
# A key with a single pair still yields a (one-element) list.
kvRDD.lookup(5)

[6]

In [79]:
# (id, name) pairs with unique ids, used to build a lookup table.
kvFruit = sc.parallelize([(1, 'apple'), (2, 'orange'), (3, 'banana'), (4, 'grape')])

In [80]:
# Collect the (id, name) pairs into a dict keyed by id and show it.
fruitMap = kvFruit.collectAsMap()
print(fruitMap)

{1: 'apple', 2: 'orange', 3: 'banana', 4: 'grape'}


In [81]:
# RDD of ids to translate into fruit names.
fruitIds = sc.parallelize([2, 4, 1, 3])
print(fruitIds.collect())

[2, 4, 1, 3]


In [82]:
# Translate each id to its name; the lambda refers to the fruitMap dict directly.
fruitNames = fruitIds.map(lambda fruit_id: fruitMap[fruit_id]).collect()
print(fruitNames)

['orange', 'grape', 'apple', 'banana']


In [83]:
# Wrap the lookup dict in a broadcast variable; its contents are read through .value.
bcFruitMap = sc.broadcast(fruitMap)

In [84]:
# Same id-to-name translation, this time reading the dict from the broadcast variable.
fruitNames = fruitIds.map(lambda fruit_id: bcFruitMap.value[fruit_id]).collect()
print(fruitNames)

['orange', 'grape', 'apple', 'banana']


In [85]:
# Accumulator for the running sum; the float start value makes the total a float.
total = sc.accumulator(0.0)

In [86]:
# Accumulator for the element count, starting from integer 0.
num = sc.accumulator(0)

In [87]:
def _add_to_accumulators(i):
    """Add i to the running total and bump the element counter by one."""
    total.add(i)
    num.add(1)


# Visit every element once, purely for the accumulator side effects.
intRDD.foreach(_add_to_accumulators)

In [88]:
# Derive the mean from the two accumulators and report all three figures.
avg = total.value / num.value
print('total={}'.format(total.value))
print('num={}'.format(num.value))
print('avg={}'.format(avg))

total=16.0
num=5
avg=3.2


In [89]:
# Fresh RDD used to demonstrate persist() / unpersist().
intRddMemory = sc.parallelize([3, 1, 2, 5, 5])

In [90]:
# Mark the RDD for caching with persist()'s default storage level;
# the call's return value is displayed below as the RDD's repr.
intRddMemory.persist()

ParallelCollectionRDD[160] at parallelize at PythonRDD.scala:194

In [91]:
# The flag is True once persist() has been called.
intRddMemory.is_cached

True

In [92]:
# Undo the persist() request made earlier.
intRddMemory.unpersist()

ParallelCollectionRDD[160] at parallelize at PythonRDD.scala:194

In [93]:
# The flag is back to False after unpersist().
intRddMemory.is_cached

False

In [94]:
# Second RDD, persisted below with an explicitly chosen storage level.
intRddMemoryAndDisk = sc.parallelize([3, 1, 2, 5, 5])

In [95]:
import pyspark

In [96]:
# Request the MEMORY_AND_DISK storage level explicitly instead of relying on persist()'s default.
intRddMemoryAndDisk.persist(pyspark.StorageLevel.MEMORY_AND_DISK)

ParallelCollectionRDD[161] at parallelize at PythonRDD.scala:194

In [97]:
# The flag is True after persisting with an explicit storage level as well.
intRddMemoryAndDisk.is_cached

True