In [1]:
import os
import sys
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.4-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

In [2]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.master("local[*]").appName("RDD Commands") \
.config(conf = SparkConf()).getOrCreate()


 AppName Sets a name for the application, which will be shown in the Spark web UI.
    

 Config Sets a config option. Options set using this method are automatically propagated to both SparkConf and SparkSession‘s own configuration.

master is a Spark, Mesos or YARN cluster URL, or a special “local” string to run in local modemaster 

getOrCreate() Gets an existing SparkSession or, if there is no existing one, creates a new one based on the options set in this builder.

In [5]:

sc = spark.sparkContext


In [6]:
sc

* Spark introduces the concept of an RDD (Resilient Distributed Dataset), an
 immutable fault-tolerant, distributed collection of objects that can be operated on
 in parallel. 

* An RDD can contain any type of object and is created by loading an
 external dataset or distributing a collection from the driver program.

### Creating RDD in Pyspark

#####  There are three ways to create an RDD in Spark.

* Parallelizing already existing collection in driver program.
* Referencing a dataset in an external storage system (e.g. HDFS, Hbase, shared file system).
* Creating RDD from already existing RDDs.

In [6]:
rdd1 = sc.parallelize([("maths",52),("english",75),("science",82),
                       ("computer",65),("maths",85)])

In [7]:
rdd1.collect()


[('maths', 52),
 ('english', 75),
 ('science', 82),
 ('computer', 65),
 ('maths', 85)]

In [8]:
! cat temp_data.txt | head -10

1901	-78	1
1901	-72	1
1901	-94	1
1901	-61	1
1901	-56	1
1901	-28	1
1901	-67	1
1901	-33	1
1901	-28	1
1901	-33	1
cat: write error: Broken pipe


In [7]:
rdd2 = sc.textFile("file:///home/jayantm/Batches/Batch41/sparkrdd/temp_data.txt") 

In [None]:
rdd2.collect()

In [13]:
rdd3 = rdd2.map(lambda s : s.split('\t'))

In [14]:
type(rdd3)

pyspark.rdd.PipelinedRDD

In [15]:
rdd3.collect()

[[u'1901', u'-78', u'1'],
 [u'1901', u'-72', u'1'],
 [u'1901', u'-94', u'1'],
 [u'1901', u'-61', u'1'],
 [u'1901', u'-56', u'1'],
 [u'1901', u'-28', u'1'],
 [u'1901', u'-67', u'1'],
 [u'1901', u'-33', u'1'],
 [u'1901', u'-28', u'1'],
 [u'1901', u'-33', u'1'],
 [u'1901', u'-44', u'1'],
 [u'1901', u'-39', u'1'],
 [u'1901', u'0', u'1'],
 [u'1901', u'6', u'1'],
 [u'1901', u'0', u'1'],
 [u'1901', u'6', u'1'],
 [u'1901', u'6', u'1'],
 [u'1901', u'-11', u'1'],
 [u'1901', u'-33', u'1'],
 [u'1901', u'-50', u'1'],
 [u'1901', u'-44', u'1'],
 [u'1901', u'-28', u'1'],
 [u'1901', u'-33', u'1'],
 [u'1901', u'-33', u'1'],
 [u'1901', u'-50', u'1'],
 [u'1901', u'-33', u'1'],
 [u'1901', u'-28', u'1'],
 [u'1901', u'-44', u'1'],
 [u'1901', u'-44', u'1'],
 [u'1901', u'-44', u'1'],
 [u'1901', u'-39', u'1'],
 [u'1901', u'-50', u'1'],
 [u'1901', u'-44', u'1'],
 [u'1901', u'-39', u'1'],
 [u'1901', u'-33', u'1'],
 [u'1901', u'-22', u'1'],
 [u'1901', u'0', u'1'],
 [u'1901', u'-6', u'1'],
 [u'1901', u'-17', u'1'],

### RDDs support two types of operations:
* Transformations are operations (such as map, filter, join, union, and so on) that are performed on an RDD and which yield a new RDD containing the result.

* Actions are operations (such as reduce, count, first, and so on) that return a value after running a computation on an RDD.

* Transformations in Spark are “lazy”, meaning that they do not compute their results right away. 
* They just “remember” the operation to be performed and the dataset (e.g., file) to which the operation is to be    performed. 
* The transformations are only actually computed when an action is called and the result is returned to the driver program. 
* This design enables Spark to run more efficiently. For example, if a big file was transformed in various ways and passed to first action, Spark would only process and return the result for the first line, rather than do the work for the entire file.

### Transformations

* coalesce() - Return a new RDD that is reduced into numPartitions partitions.

* Map Transformation

In [16]:
intRdd = sc.parallelize([10, 20, 30, 40, 50])    

In [17]:
mapRDD = intRdd.map(lambda x : x**2)
mapRDD.collect()

[100, 400, 900, 1600, 2500]

* Filter(Transformation):
    
* The filter operation evaluates a Boolean function for each data item of the RDD
 and puts the items for which the function returned true into the resulting RDD. Filter
 is a Transformation. Collect is an Action.

In [18]:
numRdd = sc.parallelize([11,12,13,14,15,16,17,18])
filterRdd1 = numRdd.filter(lambda x : x%2 == 1)
filterRdd1.collect()

[11, 13, 15, 17]

In [19]:
filterRdd2 = numRdd.filter(lambda x : x%2 == 0)
filterRdd2.collect()

[12, 14, 16, 18]

* ReduceByKey (Transformation):
* Spark RDD reduceByKey function merges the values for each key using an associative reduce function. Basically reduceByKey function works only for RDDs which contains key and value pairs kind of elements (i.e. RDDs having tuple or Map as a data element).

In [20]:
x = sc.parallelize([("comp", 1), ("tab", 1), ("comp", 1), ("comp", 1),
("tab", 1), ("tab", 1), ("tab", 1), ("tab", 1)])

In [None]:
x.collect()

In [22]:
y = x.reduceByKey(lambda a, b: a + b)

In [23]:
y.collect()

[('comp', 3), ('tab', 5)]

* flatMap (Transformation) :
* Spark flatMap function returns a new RDD by first applying a function to all elements of this RDD, and then flattening the results.

In [24]:
sc.parallelize([3,4,5]).map(lambda x: range(1,x)).collect()

[[1, 2], [1, 2, 3], [1, 2, 3, 4]]

In [25]:
sc.parallelize([3,4,5]).flatMap(lambda x: range(1,x)).collect()

[1, 2, 1, 2, 3, 1, 2, 3, 4]

In [26]:
sentRdd = sc.parallelize(["Welcome to Batch 36.", "This is Lab Session","We are doing pySpark Activity"])

In [27]:
sentRdd.map(lambda x: x.split(' ')).collect()

[['Welcome', 'to', 'Batch', '36.'],
 ['This', 'is', 'Lab', 'Session'],
 ['We', 'are', 'doing', 'pySpark', 'Activity']]

In [28]:
wordlist = sentRdd.flatMap(lambda x: x.split(' ')).collect()

In [29]:
type(wordlist)

list

In [30]:
wordlist

['Welcome',
 'to',
 'Batch',
 '36.',
 'This',
 'is',
 'Lab',
 'Session',
 'We',
 'are',
 'doing',
 'pySpark',
 'Activity']

* groupByKey(Transformation):
* Spark groupByKey function returns a new RDD. The returned RDD gives back an object which allows to iterate over the results. The results of groupByKey returns a list by calling list() on values.

In [31]:
example = sc.parallelize([('x',1), ('x',1), ('y', 1), ('z', 1)])

In [32]:
example.collect()

[('x', 1), ('x', 1), ('y', 1), ('z', 1)]

In [33]:
example.groupByKey().collect()

[('y', <pyspark.resultiterable.ResultIterable at 0x31ca1d0>),
 ('x', <pyspark.resultiterable.ResultIterable at 0x31ca650>),
 ('z', <pyspark.resultiterable.ResultIterable at 0x31ca6d0>)]

In [34]:
itRdd = example.groupByKey()

In [35]:
itRdd.map(lambda x : (x[0], list(x[1]))).collect()

[('y', [1]), ('x', [1, 1]), ('z', [1])]

* groupBy (Transformation) :
* groupBy function returns an RDD of grouped items. This operation will return the new RDD which basically is made up with a KEY (which is a group) and list of items of that group (in a form of Iterator). Order of element within the group may not same when you apply the same operation on the same RDD over and over.

In [36]:
namesRdd = sc.parallelize(["Joseph", "Jimmy", "Tina","Thomas","James","Cory","Christine", "Jackeline", "Juan"])

In [37]:
namesRdd.collect()

['Joseph',
 'Jimmy',
 'Tina',
 'Thomas',
 'James',
 'Cory',
 'Christine',
 'Jackeline',
 'Juan']

In [38]:
result =namesRdd.groupBy(lambda word: word[0]).collect()

In [39]:
result

[('C', <pyspark.resultiterable.ResultIterable at 0x29f32d0>),
 ('J', <pyspark.resultiterable.ResultIterable at 0x317c3d0>),
 ('T', <pyspark.resultiterable.ResultIterable at 0x317cc10>)]

In [40]:
[(x, list(y)) for (x, y) in result]

[('C', ['Cory', 'Christine']),
 ('J', ['Joseph', 'Jimmy', 'James', 'Jackeline', 'Juan']),
 ('T', ['Tina', 'Thomas'])]

In [96]:
# sorted([(x, sorted(y)) for (x, y) in result])

* mapValues (Transformation) :
* Apply a function to each value of a pair RDD without changing the key.

In [43]:
namesRdd = sc.parallelize(["dog", "tiger", "lion", "cat", "panther","eagle"])
pairRdd = namesRdd.map(lambda x :(len(x), x))

In [44]:
pairRdd.collect()

[(3, 'dog'),
 (5, 'tiger'),
 (4, 'lion'),
 (3, 'cat'),
 (7, 'panther'),
 (5, 'eagle')]

In [45]:
result = pairRdd.mapValues(lambda y: "Animal name is " + y)
result.collect()

[(3, 'Animal name is dog'),
 (5, 'Animal name is tiger'),
 (4, 'Animal name is lion'),
 (3, 'Animal name is cat'),
 (7, 'Animal name is panther'),
 (5, 'Animal name is eagle')]

* join (pair Rdd Transformation): 

In [46]:
rdd1 = sc.parallelize([("red",20),("red",30),("blue", 100)])
rdd2 = sc.parallelize([("red",40),("red",50),("yellow", 10000)])

In [47]:
rdd1.join(rdd2).collect()

[('red', (20, 40)), ('red', (20, 50)), ('red', (30, 40)), ('red', (30, 50))]

* inner join and outer join (Transformation)

In [48]:
rdd1 = sc.parallelize([("Mercedes", "E-Class"), ("Toyota", "Corolla"),("Renault", "Duster")])
rdd2 = sc.parallelize([("Mercedes", "C-Class"), ("Toyota", "Prius"),("Toyota", "Etios")])

In [49]:
innerJoinRdd = rdd1.join(rdd2)
innerJoinRdd.collect()

[('Mercedes', ('E-Class', 'C-Class')),
 ('Toyota', ('Corolla', 'Prius')),
 ('Toyota', ('Corolla', 'Etios'))]

In [50]:
outerJoinRdd = rdd1.leftOuterJoin(rdd2)
outerJoinRdd.collect()

[('Renault', ('Duster', None)),
 ('Mercedes', ('E-Class', 'C-Class')),
 ('Toyota', ('Corolla', 'Prius')),
 ('Toyota', ('Corolla', 'Etios'))]

* Union:
* Combines the values in various Rdds to form a cohesive unit

In [51]:
d1= [('k1', 1), ('k2', 2), ('k3', 5)]
d2= [('k1', 3), ('k2',4), ('k4', 8)]

In [52]:
d1_RDD = sc.parallelize(d1)
d2_RDD = sc.parallelize(d2)

In [53]:
d3_union = d1_RDD.union(d2_RDD)

In [54]:
d3_union.collect()

[('k1', 1), ('k2', 2), ('k3', 5), ('k1', 3), ('k2', 4), ('k4', 8)]

* collect (Action):
* Collect action returns the results or the value. When an action is called transformations are executed.

In [55]:
rdd1 = sc.textFile('file:///home/jayantm/Batches/Batch41/sparkrdd/input.txt')

In [56]:
rdd1.collect()

[u'Hello All', u'Test practice']

In [57]:
rdd1.first()

u'Hello All'

In [58]:
rdd1.take(2)

[u'Hello All', u'Test practice']

* takeOrdered(Action):
* Orders the data items of the RDD using their inherent implicit ordering function and returns the first n items as an array.

In [59]:
rdd1 = sc.parallelize(["dog", "cat", "ape", "salmon", "gnu"])
rdd1.takeOrdered(3)

['ape', 'cat', 'dog']

* reduce (Action):
* This function provides the well-known reduce functionality in Spark. Please note that any function f you provide, should be commutative in order to generate reproducible results.

In [60]:
intVals = range(1,15)
numRdd = sc.parallelize(intVals)
cSum = numRdd.reduce(lambda a, b: a + b)

In [61]:
cSum

105

### Word Count Example

In [62]:
textdata =sc.textFile('file:///home/jayantm/Batches/Batch41/sparkrdd/washingtonData.txt')

In [63]:
textdata.collect()

[u"The Washington National Opera was established in 1957 as the Opera Society of Washington by Day Thorpe, the music critic of the now defunct Washington Star, but then the most influential Washington newspaper of its day. Paul Callaway, the choirmaster and organist of the Washington National Cathedral, was its first music director. Together, the two set out to seek funding and they found support from Gregory and Peggy Smith who provided $10,000 as seed money for a production of Mozart's Die Entf\xfchrung aus dem Serail which would be performed following the end of their summer season (which Calloway conducted) by the Washington Symphony Orchestra.",
 u'',
 u'Characteristic of Thorpe and Calloway\'s early years was a rejection of cuts to the scores, a rejection of opera in English, and a rejection of expensive scenery as well as of "fat sopranos" and "self-centered tenors".[2]',
 u'',
 u'The pair set out to seek a new public and, beginning with the first production of Die Entf\xfchrung

In [64]:
### Calculating the word count
# creating tokens by spliting the text based on space and flatenning the data
# assigning value 1 to each token
# count the occurences of the word

In [65]:
word_tokens = textdata.flatMap(lambda line: line.split(' '))\
                        .map(lambda word: (word, 1))\
                        .reduceByKey(lambda a, b: a + b) 

In [66]:
# Displaying the word and count
word_tokens.collect()

[(u'', 6),
 (u'when', 1),
 (u'Smith', 1),
 (u'not', 1),
 (u"'company'", 1),
 (u"Calloway's", 1),
 (u'White', 1),
 (u'Fe.', 1),
 (u'time,', 1),
 (u'ballet', 1),
 (u'Stravinsky', 4),
 (u'had', 4),
 (u'onwards:', 1),
 (u'aus', 1),
 (u'follow.', 1),
 (u'years,', 1),
 (u'regularly', 2),
 (u'"Capital', 1),
 (u'Magic', 2),
 (u'Santa', 2),
 (u'very', 1),
 (u'downs', 1),
 (u'early', 3),
 (u'1957', 2),
 (u'English,', 1),
 (u'they', 1),
 (u'press', 1),
 (u'Old', 1),
 (u'However,', 4),
 (u'bringing', 1),
 (u'opera', 5),
 (u'venue', 1),
 (u'"There', 1),
 (u'Four', 1),
 (u"singers'", 1),
 (u'Cathedral,', 1),
 (u'small', 1),
 (u'Paul', 2),
 (u'Carlo', 1),
 (u'A', 1),
 (u'financial', 1),
 (u'Washington', 8),
 (u"University's", 1),
 (u'some', 1),
 (u'Some', 1),
 (u'1960s', 1),
 (u"Callaway's", 1),
 (u'Auditorium,', 1),
 (u'Serail', 1),
 (u'excerpts', 1),
 (u'funding', 1),
 (u'for', 2),
 (u'Successful', 1),
 (u'day.', 1),
 (u'arranged', 1),
 (u'Manticore.', 1),
 (u'expensive', 1),
 (u'illnesses.', 1),
 

In [67]:
# displaying the number of words
word_tokens.count()

293

In [68]:
# Sorting the words and count based on key
sorted = word_tokens.sortByKey()

In [69]:
sorted.collect()

[(u'', 6),
 (u'"Capital', 1),
 (u'"Sparkle', 2),
 (u'"There', 1),
 (u'"a', 1),
 (u'"fat', 1),
 (u'"ill-starred"', 1),
 (u'"self-centered', 1),
 (u'"the', 1),
 (u'$10,000', 1),
 (u"'company'", 1),
 (u'(along', 1),
 (u'(conducted', 1),
 (u'(which', 1),
 (u'-', 2),
 (u'1956', 1),
 (u'1957', 2),
 (u'1957,', 1),
 (u'1958', 1),
 (u'1960s', 1),
 (u'1960s.', 1),
 (u'1961', 1),
 (u'1966', 1),
 (u'31', 1),
 (u'A', 1),
 (u'Administrator', 1),
 (u'Ariadne', 1),
 (u'Artistic', 1),
 (u'Auditorium,', 1),
 (u'Bliss', 1),
 (u'But', 1),
 (u'By', 1),
 (u"Callaway's", 1),
 (u'Callaway,', 1),
 (u'Calloway', 1),
 (u"Calloway's", 1),
 (u'Carlo', 1),
 (u'Cathedral,', 1),
 (u'Characteristic', 1),
 (u'Day', 1),
 (u'December', 2),
 (u'Die', 2),
 (u'Each', 1),
 (u'English,', 1),
 (u'English-language', 1),
 (u'Entf\xfchrung', 2),
 (u'Erwartung', 1),
 (u'Erwartung)', 1),
 (u'Fe', 1),
 (u'Fe.', 1),
 (u'Fidelio;', 1),
 (u'Flute', 2),
 (u'Four', 1),
 (u'George', 1),
 (u'Gian', 1),
 (u'Gorgon,', 1),
 (u'Gregory', 1),
 