In [5]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz

In [6]:
# Unpack the downloaded Spark tarball into the current working directory.
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
# Install findspark quietly (-q); it is imported a few cells below.
!pip install -q findspark

In [11]:
import os 
# Export the Java and Spark install locations as environment variables of this process.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.4.7-bin-hadoop2.7",
)

In [12]:
# findspark is the helper package installed with pip earlier in this notebook.
import findspark
# Called with no arguments; presumably it locates Spark through the SPARK_HOME variable
# set in the previous cell and makes pyspark importable — confirm against the findspark docs.
findspark.init()

In [13]:
# Create a SparkSession, the gateway for writing a Spark program.
# The session uses the "local[*]" master and is named "RDDProgram";
# getOrCreate() reuses an already running session when there is one.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("RDDProgram")
    .getOrCreate()
)

In [14]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

`appName` sets a name for the application, which will be shown in the Spark web UI.
    

`config` sets a config option. Options set using this method are automatically propagated to both SparkConf and SparkSession's own configuration.

`master` is a Spark, Mesos or YARN cluster URL, or a special “local” string to run in local mode.

`getOrCreate()` gets an existing SparkSession or, if there is no existing one, creates a new one based on the options set in this builder.

In [15]:
# Take the SparkContext from the session; `sc` is what the cells below use to create RDDs.
sc = spark.sparkContext

* Spark introduces the concept of an RDD (Resilient Distributed Dataset): an
 immutable, fault-tolerant, distributed collection of objects that can be operated on
 in parallel.

* An RDD can contain any type of object and is created by loading an
 external dataset or distributing a collection from the driver program.

In [16]:
# Evaluate the SparkContext as the cell's last expression so the notebook displays it.
sc

* Spark introduces the concept of an RDD (Resilient Distributed Dataset): an
 immutable, fault-tolerant, distributed collection of objects that can be operated on
 in parallel.

* An RDD can contain any type of object and is created by loading an
 external dataset or distributing a collection from the driver program.

### Creating RDD in Pyspark

#####  There are three ways to create an RDD in Spark.

* Parallelizing an already existing collection in the driver program.
* Referencing a dataset in an external storage system (e.g. HDFS, HBase, a shared file system).
* Creating an RDD from already existing RDDs.

In [17]:
# Way 1: parallelize an in-memory list of (subject, marks) pairs from the driver.
rdd1 = sc.parallelize(
    [
        ("maths", 92),
        ("english", 75),
        ("SCiences", 85),
        ("Social", 90),
    ]
)

In [18]:
# Echoing the RDD shows only its description (see output), not its elements.
rdd1

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [19]:
# collect() returns the RDD's elements to the driver as a Python list.
rdd1.collect()

[('maths', 92), ('english', 75), ('SCiences', 85), ('Social', 90)]

In [20]:
# Colab-specific: mount Google Drive at /content/drive so the data files read later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
# List the top level of the mounted Drive to confirm the mount worked.
# NOTE(review): the captured listing exposes personal file names; consider clearing this output before sharing the notebook.
!ls "/content/drive/My Drive"

 05_06_cpee_hyd
 05_class_injestion_other_tools.odp
 20170212_Batch21_CSE7219c_GNQ_Set2.pdf
 20170306_Batch34_CSE7322c_CuTe.xls.gsheet
 20171126_Batch31_CSE7322c_Day01_ClassNotes.odp
 20171203_Batch31_CSE7322c_Day02_ClassNotes.odp
 2017_12_03_big_data-20171203T053932Z-001.zip
 20171210_Batch31_CSE7322c_Day03_ClassNotes.odp
 20171217_Batch31_CSE7322c_Day04_ClassNotes.odp
 20180519_Batch38_CSE7323c_BIOnHadoop_Application1.zip
 2018_12_08_big_data_48
 2018_12_08_big_data_48.zip
 20190120_Batch48_CSE7322c_CUTe_BigData_PartA.docx
 20190120_Batch48_CSE7322c_CUTe_BigData_PartB.docx
 20190525_Batch57_CSE7322c_Spark_Recap_Revised.pptx
 20190929_Batch67_MiTH_VIVA_PanelAllocations_1_Panel4.gsheet
 2019-Nov-10_DrSriramMurthys_TextMining_Class2_Batch74.gdoc
 2019-Oct_Research-Status.docx
 2019-Oct_Research-Status_Jayant_Kumar_Mulmoodi_NLP.gdoc
 2019-Oct_Research-Status_Jayant_Kumar_Mulmoodi_YARN.gdoc
 20200102_Version1_CloudComputing_ver2.3_Final_withSAS.pdf
 20200102_Version1_CloudComputing_ver2.3

#### Creating an RDD by reading a file

In [22]:
# Way 2: build an RDD from a text file on the mounted Drive; each element is one line of the file.
rdd2 = sc.textFile(
    "/content/drive/My Drive/BigData/spark2020/temp_data.txt"
)

In [23]:
# Show the raw lines: each one is a tab-separated string (see output).
# collect() brings every element back to the driver, so on a large file prefer take(n) to peek at a few rows.
rdd2.collect()

['1901\t-78\t1',
 '1901\t-72\t1',
 '1901\t-94\t1',
 '1901\t-61\t1',
 '1901\t-56\t1',
 '1901\t-28\t1',
 '1901\t-67\t1',
 '1901\t-33\t1',
 '1901\t-28\t1',
 '1901\t-33\t1',
 '1901\t-44\t1',
 '1901\t-39\t1',
 '1901\t0\t1',
 '1901\t6\t1',
 '1901\t0\t1',
 '1901\t6\t1',
 '1901\t6\t1',
 '1901\t-11\t1',
 '1901\t-33\t1',
 '1901\t-50\t1',
 '1901\t-44\t1',
 '1901\t-28\t1',
 '1901\t-33\t1',
 '1901\t-33\t1',
 '1901\t-50\t1',
 '1901\t-33\t1',
 '1901\t-28\t1',
 '1901\t-44\t1',
 '1901\t-44\t1',
 '1901\t-44\t1',
 '1901\t-39\t1',
 '1901\t-50\t1',
 '1901\t-44\t1',
 '1901\t-39\t1',
 '1901\t-33\t1',
 '1901\t-22\t1',
 '1901\t0\t1',
 '1901\t-6\t1',
 '1901\t-17\t1',
 '1901\t-44\t1',
 '1901\t-39\t1',
 '1901\t-33\t1',
 '1901\t-6\t1',
 '1901\t17\t1',
 '1901\t22\t1',
 '1901\t28\t1',
 '1901\t28\t1',
 '1901\t11\t1',
 '1901\t-17\t1',
 '1901\t-28\t1',
 '1901\t-56\t1',
 '1901\t-44\t1',
 '1901\t-44\t1',
 '1901\t-67\t1',
 '1901\t-44\t1',
 '1901\t-39\t1',
 '1901\t-22\t1',
 '1901\t-22\t1',
 '1901\t-22\t1',
 '1901\t-39\t1',

In [24]:
# Way 3: derive a new RDD from an existing one by splitting every line on the tab character.
rdd3 = rdd2.map(
    lambda line: line.split("\t")
)

In [25]:
# Inspect the Python type of the RDD produced by map().
type(rdd3)

pyspark.rdd.PipelinedRDD

In [26]:
# Each element is now a list of string fields rather than one tab-separated string (see output).
# collect() brings every element back to the driver, so on a large file prefer take(n) to peek at a few rows.
rdd3.collect()

[['1901', '-78', '1'],
 ['1901', '-72', '1'],
 ['1901', '-94', '1'],
 ['1901', '-61', '1'],
 ['1901', '-56', '1'],
 ['1901', '-28', '1'],
 ['1901', '-67', '1'],
 ['1901', '-33', '1'],
 ['1901', '-28', '1'],
 ['1901', '-33', '1'],
 ['1901', '-44', '1'],
 ['1901', '-39', '1'],
 ['1901', '0', '1'],
 ['1901', '6', '1'],
 ['1901', '0', '1'],
 ['1901', '6', '1'],
 ['1901', '6', '1'],
 ['1901', '-11', '1'],
 ['1901', '-33', '1'],
 ['1901', '-50', '1'],
 ['1901', '-44', '1'],
 ['1901', '-28', '1'],
 ['1901', '-33', '1'],
 ['1901', '-33', '1'],
 ['1901', '-50', '1'],
 ['1901', '-33', '1'],
 ['1901', '-28', '1'],
 ['1901', '-44', '1'],
 ['1901', '-44', '1'],
 ['1901', '-44', '1'],
 ['1901', '-39', '1'],
 ['1901', '-50', '1'],
 ['1901', '-44', '1'],
 ['1901', '-39', '1'],
 ['1901', '-33', '1'],
 ['1901', '-22', '1'],
 ['1901', '0', '1'],
 ['1901', '-6', '1'],
 ['1901', '-17', '1'],
 ['1901', '-44', '1'],
 ['1901', '-39', '1'],
 ['1901', '-33', '1'],
 ['1901', '-6', '1'],
 ['1901', '17', '1'],
 ['

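### RDDs support two types of operations:
* Transformations are operations (such as map, filter, join, union, and so on) that are performed on an RDD and which yield a new RDD containing the result.
* Actions are operations (such as collect, count, first, take, and so on) that run a computation on an RDD and return a value to the driver program.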

* Transformations in Spark are “lazy”, meaning that they do not compute their results right away.
* They just “remember” the operation to be performed and the dataset (e.g., a file) on which the operation is to be performed.
* The transformations are only actually computed when an action is called, and the result is then returned to the driver program.
* This design enables Spark to run more efficiently. For example, if a big file was transformed in various ways and then passed to the `first` action, Spark would only process and return the result for the first line, rather than do the work for the entire file.

In [27]:
# map(): apply a function to every element; here each of 10, 20, ..., 50 is squared.
intRdd = sc.parallelize(list(range(10, 60, 10)))
mapRdd = intRdd.map(lambda value: value ** 2)

In [28]:
# Trigger the lazy map() and fetch the squared values.
mapRdd.collect()

[100, 400, 900, 1600, 2500]

In [29]:
# filter(): keep only the elements for which the predicate is true; here the odd numbers in 9..20.
numRdd = sc.parallelize(list(range(9, 21)))
oddRdd = numRdd.filter(lambda n: n % 2 == 1)
oddRdd.collect()

[9, 11, 13, 15, 17, 19]

In [30]:
# Same source RDD, opposite predicate: keep the even numbers.
evenRdd = numRdd.filter(
    lambda n: n % 2 == 0
)
evenRdd.collect()

[10, 12, 14, 16, 18, 20]

In [31]:
# Pair RDD of (device, 1) records: 4 x 'comp' and 7 x 'tab', in the same order as written out by hand.
x = sc.parallelize(
    [('comp', 1), ('tab', 1)]
    + [('comp', 1)] * 3
    + [('tab', 1)] * 6
)

In [32]:
# Show the raw (key, 1) pairs before they are aggregated.
x.collect()

[('comp', 1),
 ('tab', 1),
 ('comp', 1),
 ('comp', 1),
 ('comp', 1),
 ('tab', 1),
 ('tab', 1),
 ('tab', 1),
 ('tab', 1),
 ('tab', 1),
 ('tab', 1)]

In [33]:
# reduceByKey(): merge the values of each key with the given function; here summing the 1s per device.
y = x.reduceByKey(
    lambda left, right: left + right
)
y.collect()

[('comp', 4), ('tab', 7)]

In [34]:
# map() yields exactly one output per input, so each number becomes a whole range object.
# Note: collect() is called right away, so mapRddint is a plain Python list, not an RDD.
mapRddint = (
    sc.parallelize([3, 4, 5])
    .map(lambda upper: range(1, upper))
    .collect()
)

In [35]:
# Three inputs -> three range objects (one per input element).
mapRddint

[range(1, 3), range(1, 4), range(1, 5)]

In [36]:
# flatMap() flattens the iterable returned for each input, so the ranges are merged into one list of numbers.
# Note: collect() is called right away, so flatMapRddInt is a plain Python list, not an RDD.
flatMapRddInt = (
    sc.parallelize([3, 4, 5])
    .flatMap(lambda upper: range(1, upper))
    .collect()
)

In [37]:
# Same inputs as the map() example, but the per-element ranges are flattened into a single list.
flatMapRddInt

[1, 2, 1, 2, 3, 1, 2, 3, 4]

In [38]:
# Four short sentences used to contrast map() with flatMap() on text.
sentence = [
    'Welecome to Big Data.',
    'The batch is 88',
    'Module is 7246o',
    'Rdd are constructed',
]
sentRdd = sc.parallelize(sentence)


In [39]:
# Echoing the RDD shows only its description, not the sentences themselves.
sentRdd

ParallelCollectionRDD[19] at parallelize at PythonRDD.scala:195

In [40]:
# map(): one list of words per sentence, so the result is a list of lists.
(
    sentRdd
    .map(lambda text: text.split(' '))
    .collect()
)

[['Welecome', 'to', 'Big', 'Data.'],
 ['The', 'batch', 'is', '88'],
 ['Module', 'is', '7246o'],
 ['Rdd', 'are', 'constructed']]

In [41]:
# flatMap(): the per-sentence word lists are flattened into a single RDD of words.
wordsRdd = sentRdd.flatMap(
    lambda text: text.split(' ')
)
wordsRdd.collect()

['Welecome',
 'to',
 'Big',
 'Data.',
 'The',
 'batch',
 'is',
 '88',
 'Module',
 'is',
 '7246o',
 'Rdd',
 'are',
 'constructed']

In [43]:
# Pair RDD of (professor, 1) records used for the groupByKey() example.
profRdd = sc.parallelize(
    [
        ('proff1', 1),
        ('proff2', 1),
        ('proff1', 1),
        ('proff1', 1),
        ('proff2', 1),
        ('proff3', 1),
    ]
)

In [45]:
# groupByKey() gathers the values of each key into one iterable.
# The grouped values come back as ResultIterable objects (see output), so their contents are not visible yet.
profRdd.groupByKey().collect()

[('proff1', <pyspark.resultiterable.ResultIterable at 0x7fef5803e518>),
 ('proff3', <pyspark.resultiterable.ResultIterable at 0x7fef5803e550>),
 ('proff2', <pyspark.resultiterable.ResultIterable at 0x7fef5803e5c0>)]

In [46]:
# Turn each grouped iterable into a list so the values are readable in the output.
# mapValues(list) touches only the value of each (key, values) pair, which replaces the
# hand-written lambda that rebuilt the whole tuple just to convert its second element.
profRdd.groupByKey().mapValues(list).collect()

[('proff1', [1, 1, 1]), ('proff3', [1]), ('proff2', [1, 1])]

In [48]:
# Plain (non-pair) RDD of names, used for the groupBy() example.
proffRdd1 = sc.parallelize(
    [
        'proff1', 'proff2', 'proff1',
        'proff1', 'proff2', 'proff3',
        'proff1', 'proff2', 'proff3',
        'student1', 'student2', 'student3',
    ]
)

In [50]:
# groupBy(): derive the key from each element (here its first character) and group the elements under it.
result = (
    proffRdd1
    .groupBy(lambda name: name[0])
    .collect()
)

In [51]:
# The groups are ResultIterable objects (see output); the next cell expands them.
result

[('p', <pyspark.resultiterable.ResultIterable at 0x7fef58060cf8>),
 ('s', <pyspark.resultiterable.ResultIterable at 0x7fef58060048>)]

In [52]:
# Materialise every group as a list so its members can be read.
[
    (initial, list(members))
    for initial, members in result
]

[('p',
  ['proff1',
   'proff2',
   'proff1',
   'proff1',
   'proff2',
   'proff3',
   'proff1',
   'proff2',
   'proff3']),
 ('s', ['student1', 'student2', 'student3'])]

In [53]:
# Build a pair RDD keyed by the length of each device name: (len(name), name).
deviceRdd = sc.parallelize(
    ['tab', 'computer', 'mobile', 'router', 'mouseclick']
)
pairRdd = deviceRdd.map(lambda device: (len(device), device))

In [54]:
# Show the (length, name) pairs.
pairRdd.collect()

[(3, 'tab'), (8, 'computer'), (6, 'mobile'), (6, 'router'), (10, 'mouseclick')]

In [55]:
# mapValues(): transform only the value of each pair and leave the key (the name length) untouched.
result = pairRdd.mapValues(
    lambda device_name: "Device Name is " + device_name
)

In [56]:
# Keys are unchanged; every value now carries the "Device Name is " prefix.
result.collect()

[(3, 'Device Name is tab'),
 (8, 'Device Name is computer'),
 (6, 'Device Name is mobile'),
 (6, 'Device Name is router'),
 (10, 'Device Name is mouseclick')]

In [57]:
# Two (brand, model) pair RDDs for the join examples.
# Note: rdd1 and rdd2 are rebound here; from this cell on they no longer refer to
# the marks RDD and the temperature-file RDD created earlier in the notebook.
rdd1 = sc.parallelize(
    [
        ("Mercedes", "E-Class"),
        ("Toyota", "Corolla"),
        ("Renault", "Duster"),
    ]
)
rdd2 = sc.parallelize(
    [
        ("Mercedes", "S-Class"),
        ("Toyota", "Fortuner"),
        ("Suzuki", "Mayona"),
    ]
)
# join(): inner join on the key.
innerJoinRdd = rdd1.join(rdd2)

In [58]:
# Only brands present in both RDDs appear (Mercedes, Toyota); each value is a (left model, right model) tuple.
innerJoinRdd.collect()

[('Mercedes', ('E-Class', 'S-Class')), ('Toyota', ('Corolla', 'Fortuner'))]

In [59]:
# Left outer join: every key of rdd1 is kept; a key with no match in rdd2
# gets None on the right-hand side (see 'Renault' in the output).
leftOuterJoinRdd = rdd1.leftOuterJoin(rdd2)
leftOuterJoinRdd.collect()

[('Mercedes', ('E-Class', 'S-Class')),
 ('Renault', ('Duster', None)),
 ('Toyota', ('Corolla', 'Fortuner'))]

In [60]:
# union(): combine both RDDs into one; the output holds all six pairs, rdd1's first and then rdd2's.
unionRdd = rdd1.union(rdd2)
unionRdd.collect()

[('Mercedes', 'E-Class'),
 ('Toyota', 'Corolla'),
 ('Renault', 'Duster'),
 ('Mercedes', 'S-Class'),
 ('Toyota', 'Fortuner'),
 ('Suzuki', 'Mayona')]

In [61]:
# Action: return just the first element of the RDD.
unionRdd.first()

('Mercedes', 'E-Class')

In [62]:
# Action: return the first 2 elements as a list.
unionRdd.take(2)

[('Mercedes', 'E-Class'), ('Toyota', 'Corolla')]

In [63]:
# Action: return the 4 smallest elements in ascending order.
# The pairs come back ordered by brand and then by model (see output).
unionRdd.takeOrdered(4)

[('Mercedes', 'E-Class'),
 ('Mercedes', 'S-Class'),
 ('Renault', 'Duster'),
 ('Suzuki', 'Mayona')]

### Word Count Example

In [64]:
# Load the word-count input from the mounted Drive; each element is one line of text.
inputRdd = sc.textFile(
    "/content/drive/My Drive/BigData/spark2020/input.txt"
)

In [65]:
# Peek at the first two lines of the input file.
inputRdd.take(2)

['Hello All', 'Test practice']

In [66]:
# Number of elements in the RDD, i.e. the number of lines read from the file.
inputRdd.count()

2

In [67]:
# Word count: split each line into words, pair every word with 1, then sum the 1s per word.
# split() without an argument splits on any run of whitespace and never returns empty strings,
# so repeated spaces, tabs or blank lines do not add a bogus '' "word" to the counts
# (which split(' ') would do).
wordRdd = (
    inputRdd
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

In [68]:
# Trigger the pipeline and fetch the (word, count) pairs.
wordRdd.collect()

[('Hello', 1), ('Test', 1), ('All', 1), ('practice', 1)]

In [69]:
# Sort the (word, count) pairs by their key, i.e. by word.
# In the output further down, the capitalised words sort ahead of 'practice'.
sortedWords = wordRdd.sortByKey()

In [70]:
# Fetch the pairs in key-sorted order.
sortedWords.collect()

[('All', 1), ('Hello', 1), ('Test', 1), ('practice', 1)]