In [3]:
!pip3 install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=d074eb0174aaaafb696cae80afd10abb9b6a3c3cbd476b79dc748cb949546fd1
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [4]:
# Load Spark engine
!pip3 install -q findspark
import findspark
findspark.init()

In [5]:
# Linking with Spark
from pyspark import SparkContext, SparkConf

In [6]:
# Initializing Spark
conf = SparkConf().setAppName("RDD_practice").setMaster("local[*]")
sc = SparkContext(conf=conf)
print(sc)

<SparkContext master=local[*] appName=RDD_practice>


In [None]:
# Create RDD and Basic Operations
# Two ways to create RDD:
#1. Parallezing an existing collection in your driver program
#2. Referencing a dataset in an external storage system like HDFS, HBase or an data source offering a Hadoop InputFormat


In [7]:
import random
randomlist = random.sample(range(0,40),10)
print(randomlist)

[23, 17, 10, 38, 7, 36, 39, 19, 21, 27]


In [8]:
#create RDD
rdd1 = sc.parallelize(randomlist,4)
rdd1.collect()

[23, 17, 10, 38, 7, 36, 39, 19, 21, 27]

In [9]:
# Data distribution in partitions
rdd1.getNumPartitions()
print(rdd1.glom().collect()) # glom shows how data is partitioned

print(rdd1.glom().take(2)) # two partitions are

[[23, 17], [10, 38], [7, 36], [39, 19, 21, 27]]
[[23, 17], [10, 38]]


In [10]:
# print last partition
rdd1.glom().collect()[3]

[39, 19, 21, 27]

In [11]:
rdd1.count()

10

In [12]:
rdd1.first()

23

In [13]:
rdd1.top(2)

[39, 38]

In [14]:
rdd1.distinct().take(4)

[36, 17, 21, 10]

In [15]:
#map
# map()is a transformation operation that applies the specified function to each element of the RDD and returns a new RDD.
# def myfunc(item):
#   return (item+1)*3

# rdd_map = rdd1.map(myfunc)
# rdd_map.collect()

rdd_map = rdd1.map(lambda item:(item+1)*3)
rdd_map.collect()

[72, 54, 33, 117, 24, 111, 120, 60, 66, 84]

In [16]:
rdd_map.glom().collect()

[[72, 54], [33, 117], [24, 111], [120, 60, 66, 84]]

In [17]:
#filter - returns a new dataset
rdd_filter = rdd1.filter(lambda x:x%3==0)
rdd_filter.collect()

[36, 39, 21, 27]

In [18]:
rdd_filter.glom().collect()

[[], [], [36], [39, 21, 27]]

In [19]:
#flatMap()
rdd_flatmap = rdd1.flatMap(lambda x:[x+2,x+5])
rdd_flatmap.collect()

[25, 28, 19, 22, 12, 15, 40, 43, 9, 12, 38, 41, 41, 44, 21, 24, 23, 26, 29, 32]

In [20]:
#descriptive statistics
print([rdd1.max(),rdd1.min(),rdd1.mean(),rdd1.stdev(),rdd1.sum()])

[39, 7, 23.7, 10.686907878334125, 237]


In [21]:
#mapPartitions()
# mapPartitions()is a transformation operation that applies the specified function to each partition of the RDD and returns a new RDD.
def myfunc(partition):
  sum=0

  for item in partition:
    sum=sum+item
# return gives single retuen while yield acts as a generator
  yield sum

rdd1.mapPartitions(myfunc).collect()

[40, 48, 43, 106]

In [22]:
#Advance RDD Transformations and Actions

In [29]:
#union
print(rdd1.collect())
rdd2 = sc.parallelize([1,14,20,20,28,10,13,3],2)
print(rdd2.collect())

rdd_union = rdd1.union(rdd2)
print(rdd_union.collect())

print(rdd_union.getNumPartitions())

[23, 17, 10, 38, 7, 36, 39, 19, 21, 27]
[1, 14, 20, 20, 28, 10, 13, 3]
[23, 17, 10, 38, 7, 36, 39, 19, 21, 27, 1, 14, 20, 20, 28, 10, 13, 3]
6


In [32]:
#intersection
rdd_intersection = rdd1.intersection(rdd2)
rdd_intersection.collect()

[10]

In [34]:
#find empty partitions
counter=0
for item in rdd_intersection.glom().collect():
  if len(item) == 0:
    counter = counter+1

counter

5

In [35]:
#coalesce(numPartitions)
rdd_intersection.coalesce(1).glom().collect()

[[10]]

In [36]:
#takeSample(withreplacement, num,[seed])
rdd1.takeSample(False,5)

[19, 36, 10, 27, 7]

In [40]:
#reduce
rdd1.reduce(lambda x,y:x+y)

237

In [41]:
#reduceByKey()
rdd_rbk = sc.parallelize([(1,4),(7,10),(5,7),(1,12),(7,12),(7,1),(9,1),(7,4)],2)
print(rdd_rbk.glom().collect())

rdd_rbk.reduceByKey(lambda x,y:x+y).collect()


[[(1, 4), (7, 10), (5, 7), (1, 12)], [(7, 12), (7, 1), (9, 1), (7, 4)]]


[(1, 16), (7, 27), (5, 7), (9, 1)]

In [44]:
#sortByKey()
rdd_rbk.reduceByKey(lambda x,y:x+y).sortByKey(False).collect()

[(9, 1), (7, 27), (5, 7), (1, 16)]

In [45]:
#countByKey()
rdd_rbk.countByKey()

defaultdict(int, {1: 2, 7: 4, 5: 1, 9: 1})

In [50]:
#groupBykeys()
rdd_group = rdd_rbk.groupByKey()
rdd_group.getNumPartitions()

for item in rdd_group.collect():
  print(item[0],[values for values in item[1]])

1 [4, 12]
7 [10, 12, 1, 4]
5 [7]
9 [1]


In [51]:
#lookup(key)
rdd_rbk.lookup(7)

[10, 12, 1, 4]

In [52]:
# cache:
# By default, each transformed RDD may be recomputed each time you run an action on it.
# We can also use persist an RDD in memory using the persist method.

rdd_rbk.persist()

#Persistance

ParallelCollectionRDD[77] at readRDDFromFile at PythonRDD.scala:289