In [1]:
spark

1. Create a new session in spark - spark RDD 

In [8]:
sc.stop()

In [9]:
from pyspark import SparkConf, SparkContext
config = SparkConf().setMaster("local[2]").setAppName("RDDSession")
sc=SparkContext(conf = config)

In [10]:
sc

2. Create spark session for dataframe and spark sql
3. getOrCreate() - get the running spark session or create it if the session is not running

In [11]:
spark = SparkSession.builder.appName("SQL Session").getOrCreate()

#CREATE RDD
"Create RDD using sc.parallelize() using python collection like numpy ,array,list,tuples etc.."

In [12]:
rdd1= sc.parallelize([10,20,30,40,50,60,70,80,90])
type(rdd1)

pyspark.rdd.RDD

In [13]:
print(rdd1)

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195


In [14]:
rdd1.collect()

[10, 20, 30, 40, 50, 60, 70, 80, 90]

In [15]:
print("number of elements in rdd is " , rdd1.count())

number of elements in rdd is  9


In [12]:
#number of rdd partitions - will be decided by the number of workers
print("the number of rdd partiotions are : ",rdd1.getNumPartitions())


the number of rdd partiotions are :  2


In [13]:
#this is used to represent the actual data present in each partition(worker node)
rdd1.glom().collect()

[[10, 20, 30, 40], [50, 60, 70, 80, 90]]

In [14]:
rdd2= rdd1.map(lambda val : val*2)
rdd2.collect()

[20, 40, 60, 80, 100, 120, 140, 160, 180]

In [15]:
#take() - used to take any specific values
rdd2.take(5)

[20, 40, 60, 80, 100]

In [16]:
rdd3 = rdd1.map(lambda val:str(val)+"Number")
print(rdd3.collect())

['10Number', '20Number', '30Number', '40Number', '50Number', '60Number', '70Number', '80Number', '90Number']


In [17]:
rdd4 = sc.parallelize(range(1,31))

In [18]:
rdd5=rdd4.filter(lambda val : val<10)
rdd5.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [19]:
#create a rdd b/w 1 to 50 and show the list of even no.
rdd6 = sc.parallelize(range(1,51))
rdd7= rdd6.filter(lambda x:x%2==0).map(lambda val:str(val)+" even")
print(rdd7.collect())

['2 even', '4 even', '6 even', '8 even', '10 even', '12 even', '14 even', '16 even', '18 even', '20 even', '22 even', '24 even', '26 even', '28 even', '30 even', '32 even', '34 even', '36 even', '38 even', '40 even', '42 even', '44 even', '46 even', '48 even', '50 even']


In [20]:
city_rdd = sc.parallelize(["Delhi,Kolkata,Kochi,Vizag,Varkala,Chennai,Bangalore,Pune",
                           "Dubai,New York,Berlin,Noida,Bangalore,Vizag",
                           "Venice,Dehradun,Munnar,Mumbai,Kochi,Kottayam",
                           "London,Paris,Melbourne,Bali,Abu Dhabi"])

In [21]:
city_rdd_map=city_rdd.map(lambda val : val.split(","))
print(city_rdd_map.collect())

[['Delhi', 'Kolkata', 'Kochi', 'Vizag', 'Varkala', 'Chennai', 'Bangalore', 'Pune'], ['Dubai', 'New York', 'Berlin', 'Noida', 'Bangalore', 'Vizag'], ['Venice', 'Dehradun', 'Munnar', 'Mumbai', 'Kochi', 'Kottayam'], ['London', 'Paris', 'Melbourne', 'Bali', 'Abu Dhabi']]


In [22]:
city_names=city_rdd_map.flatMap(lambda val:val)
print(city_names.collect())

['Delhi', 'Kolkata', 'Kochi', 'Vizag', 'Varkala', 'Chennai', 'Bangalore', 'Pune', 'Dubai', 'New York', 'Berlin', 'Noida', 'Bangalore', 'Vizag', 'Venice', 'Dehradun', 'Munnar', 'Mumbai', 'Kochi', 'Kottayam', 'London', 'Paris', 'Melbourne', 'Bali', 'Abu Dhabi']


In [23]:
rdd1 = sc.parallelize([1, 2, 3, 4, 5, 6])
rdd2 = sc.parallelize([7, 8, 9, 10, 11, 12])
# union - Applies to two or more RDD
rdd3 = rdd1.union(rdd2)
rdd3.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [24]:
distinct_rdd = city_names.distinct()
print(distinct_rdd.collect())

['Kochi', 'Chennai', 'Dubai', 'Dehradun', 'Mumbai', 'Kottayam', 'Bali', 'Delhi', 'Kolkata', 'Vizag', 'Varkala', 'Bangalore', 'Pune', 'New York', 'Berlin', 'Noida', 'Venice', 'Munnar', 'London', 'Paris', 'Melbourne', 'Abu Dhabi']


In [16]:
#wide transformation - one prtition is not self sufficient.
food =[('fries',2),('dosa',1),('sandwich',3),('biriyani',140),('chapati',3),('puri',2),
       ('sandwich',4),('payasam',2),('maggi',2),('dosa',3),('puri',4),('fries',5)]
food_rdd = sc.parallelize(food)


In [17]:
food_rdd.glom().collect()

[[('fries', 2),
  ('dosa', 1),
  ('sandwich', 3),
  ('biriyani', 140),
  ('chapati', 3),
  ('puri', 2)],
 [('sandwich', 4),
  ('payasam', 2),
  ('maggi', 2),
  ('dosa', 3),
  ('puri', 4),
  ('fries', 5)]]

In [18]:
food_rdd.reduceByKey(lambda x,y :x+y).collect()

[('fries', 7),
 ('sandwich', 7),
 ('biriyani', 140),
 ('puri', 6),
 ('dosa', 4),
 ('chapati', 3),
 ('payasam', 2),
 ('maggi', 2)]

In [19]:
food_rdd.getNumPartitions()

2

In [20]:
food_rdd=food_rdd.repartition(4)

#### repartition():
    * This method is used to increased partitions of rdd. repartition() shuffles all data values across multiple workers to increase number of partitions. Increasing number of partitions will make data process parallely.

In [21]:
food_rdd.getNumPartitions()

4

#### coalesce()
    * This method is used to decrease number of partitions. It merge existing RDD Partitions into less number of Partitions. coalesce doesnot uses shuffling.

In [22]:
food_rdd = food_rdd.coalesce(2)


In [23]:
food_rdd.getNumPartitions()

2

In [24]:
# groupBy()
number_rdd = sc.parallelize(range(1,31))
# group rdd values by odd and even number
grouped_rdd = number_rdd.groupBy(lambda val : val % 2)
grouped_rdd.collect()

[(0, <pyspark.resultiterable.ResultIterable at 0x7fabec343a90>),
 (1, <pyspark.resultiterable.ResultIterable at 0x7fabec343940>)]

In [25]:
#collect and print group()
for key,group in grouped_rdd.collect():
    print(f"{key}:{list(group)}")

0:[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
1:[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]


In [26]:
#join() - join two RDD based on their key which requires shuffling the data to ensure that all keys are paired
#same key goes to same worker
order1 = sc.parallelize([('Fries',2),('Puri',3),('PaniPuri',4),('Dosa',1),('Vada',2),('Fries',2),('PaniPuri',5),('Biryani',5)])
order2 = sc.parallelize([('Juice',3),('Idly',3),('PaniPuri',6),('Dosa',4),('Biryani',1)])

In [27]:
join_rdd=order1.join(order2)

In [29]:
print(join_rdd.collect())

[('Dosa', (1, 4)), ('PaniPuri', (4, 6)), ('PaniPuri', (5, 6)), ('Biryani', (5, 1))]


In [30]:

# order1 is leftRDD and order2 is rightRDD and here, leftOuterJoin will keep left RDD complete and common keys from right RDD.
order1.leftOuterJoin(order2).collect()

[('Dosa', (1, 4)),
 ('Vada', (2, None)),
 ('PaniPuri', (4, 6)),
 ('PaniPuri', (5, 6)),
 ('Puri', (3, None)),
 ('Biryani', (5, 1)),
 ('Fries', (2, None)),
 ('Fries', (2, None))]

In [31]:
order1.rightOuterJoin(order2).collect()

[('Dosa', (1, 4)),
 ('Idly', (None, 3)),
 ('PaniPuri', (4, 6)),
 ('PaniPuri', (5, 6)),
 ('Biryani', (5, 1)),
 ('Juice', (None, 3))]

In [32]:
order1.fullOuterJoin(order2).collect()

[('Dosa', (1, 4)),
 ('Vada', (2, None)),
 ('Idly', (None, 3)),
 ('PaniPuri', (4, 6)),
 ('PaniPuri', (5, 6)),
 ('Puri', (3, None)),
 ('Biryani', (5, 1)),
 ('Fries', (2, None)),
 ('Fries', (2, None)),
 ('Juice', (None, 3))]

### Task 1 : wordcount example using RDD

In [33]:
# Create RDD using File - sc.textFile() , in this "file://" will be used as prefix to the filepath.
rdd = sc.textFile('file:///home/hadoop/Downloads/Harry Potter and the Deathly Hallows.txt')

In [43]:
import string
import re
rdd_word1=rdd.flatMap(lambda val : val.split(" "))
#remove punctions
rdd_word_clean = rdd_word1.map(lambda word : word.translate(str.maketrans('','',string.punctuation)))
#convert all words to lowercase words and map all the words with 1
rdd_word2= rdd_word_clean.map(lambda word:(word.lower(),1))
#reduceByKey to count the words
rdd_word3 = rdd_word2.reduceByKey(lambda x,y :x+y)
#filter words appearing more than 10 times
rdd_word4= rdd_word3.filter(lambda word:word[1]>10)
#sort by words freq in descending order
rdd_word5=rdd_word4.sortBy(lambda val : val[1],ascending = False)
rdd_word5.collect()


[('the', 10281),
 ('and', 5316),
 ('to', 4869),
 ('of', 4124),
 ('he', 3889),
 ('a', 3525),
 ('was', 2741),
 ('harry', 2640),
 ('his', 2614),
 ('it', 2354),
 ('in', 2209),
 ('had', 1997),
 ('said', 1925),
 ('that', 1887),
 ('you', 1683),
 ('as', 1411),
 ('at', 1404),
 ('i', 1289),
 ('him', 1242),
 ('with', 1129),
 ('not', 1117),
 ('they', 1097),
 ('on', 1075),
 ('hermione', 1048),
 ('her', 1021),
 ('ron', 1010),
 ('for', 996),
 ('but', 977),
 ('she', 925),
 ('—', 841),
 ('from', 840),
 ('have', 792),
 ('them', 788),
 ('…', 756),
 ('were', 755),
 ('be', 750),
 ('out', 712),
 ('into', 655),
 ('all', 652),
 ('could', 641),
 ('up', 633),
 ('there', 614),
 ('what', 584),
 ('been', 583),
 ('“i', 536),
 ('wand', 533),
 ('is', 521),
 ('we', 506),
 ('back', 504),
 ('who', 494),
 ('did', 476),
 ('—”', 472),
 ('their', 465),
 ('then', 460),
 ('would', 447),
 ('so', 447),
 ('dumbledore', 443),
 ('like', 438),
 ('this', 437),
 ('if', 433),
 ('by', 426),
 ('looked', 424),
 ('know', 421),
 ('one', 40

In [36]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [44]:
rdd_word5.saveAsTextFile("hdfs://localhost:9000/wordcount/spark_output/")

In [45]:
!hdfs dfs -cat /wordcount/spark_output/part-00000

('the', 10281)
('and', 5316)
('to', 4869)
('of', 4124)
('he', 3889)
('a', 3525)
('was', 2741)
('harry', 2640)
('his', 2614)
('it', 2354)
('in', 2209)
('had', 1997)
('said', 1925)
('that', 1887)
('you', 1683)
('as', 1411)
('at', 1404)
('i', 1289)
('him', 1242)
('with', 1129)
('not', 1117)
('they', 1097)
('on', 1075)
('hermione', 1048)
('her', 1021)
('ron', 1010)
('for', 996)
('but', 977)
('she', 925)
('—', 841)
('from', 840)
('have', 792)
('them', 788)
('…', 756)
('were', 755)
('be', 750)
('out', 712)
('into', 655)
('all', 652)
('could', 641)
('up', 633)
('there', 614)
('what', 584)
('been', 583)
('“i', 536)
('wand', 533)
('is', 521)
('we', 506)
('back', 504)
('who', 494)
('did', 476)
('—”', 472)
('their', 465)
('then', 460)
('would', 447)
('so', 447)
('dumbledore', 443)
('like', 438)
('this', 437)
('if', 433)
('by', 426)
('looked', 424)
('know', 421)
('one', 404)
('over', 403)
('now', 394)
('do', 391)
('an', 381)
('me', 369)
('no', 352)
('around', 351)
('are', 350)
('about', 350)
('mor

In [46]:
rdd.cache()

file:///home/hadoop/Downloads/Harry Potter and the Deathly Hallows.txt MapPartitionsRDD[56] at textFile at NativeMethodAccessorImpl.java:0

In [47]:
from pyspark import StorageLevel
rdd.persist(StorageLevel.MEMORY_ONLY)


file:///home/hadoop/Downloads/Harry Potter and the Deathly Hallows.txt MapPartitionsRDD[56] at textFile at NativeMethodAccessorImpl.java:0

In [49]:
rdd.unpersist()
rdd.persist(StorageLevel.DISK_ONLY)


file:///home/hadoop/Downloads/Harry Potter and the Deathly Hallows.txt MapPartitionsRDD[56] at textFile at NativeMethodAccessorImpl.java:0