In [0]:
# SparkConf holds the application's configuration (app name, master URL, other Spark properties);
# SparkContext is the entry point to the cluster and is what creates RDDs.
from pyspark import SparkConf, SparkContext


In [0]:
# Application configuration: setAppName sets the application's name to 'Master'.
conf = SparkConf()
conf = conf.setAppName('Master')

In [0]:
# getOrCreate hands back the SparkContext that is already running, or creates one from `conf` if none exists.
sc = SparkContext.getOrCreate(conf)

In [0]:
# Read test.txt as an RDD of strings, one element per line of the file.
text = sc.textFile('/FileStore/tables/test.txt')

In [0]:
# collect() is an action: it triggers the pending (lazy) transformations and returns all elements to the driver as a list.
text.collect()

Out[10]: ['1 2 3 4 5 6 ', '7 8 9 10 11 12', '13 14 15 16 17 ']

In [0]:
# map() applies the function to every element; here 'Teniola' is appended to each line.
text2 = text.map(lambda line: line + 'Teniola')
text2.collect()

Out[11]: ['1 2 3 4 5 6 Teniola', '7 8 9 10 11 12Teniola', '13 14 15 16 17 Teniola']

In [0]:
def foo(x):
    """Split a line on single spaces and return the list of pieces.

    The separator is exactly one space, so a trailing space produces a
    final empty string (e.g. '1 2 ' -> ['1', '2', '']).
    """
    pieces = x.split(' ')
    return pieces
# map(foo) turns each line into a list of tokens, so the result is an RDD of lists (one list per line).
text3 = text.map(foo)
text3.collect()

Out[12]: [['1', '2', '3', '4', '5', '6', ''],
 ['7', '8', '9', '10', '11', '12'],
 ['13', '14', '15', '16', '17', '']]

In [0]:
# Read input.txt as an RDD of strings, one element per line.
# NOTE(review): the name `input` shadows Python's builtin input(); later cells refer to this
# variable, so renaming it means updating those cells as well.
input = sc.textFile('/FileStore/tables/input.txt')
input.collect()

Out[13]: ['Hi how are you?', 'my name is Teniola', 'Great']

In [0]:
def len_words(data):
    """Return the length of every word in a line of text.

    Question marks are removed before measuring, so 'you?' counts as 3.
    Words are separated by single spaces; empty pieces produced by leading,
    trailing or repeated spaces (or by an empty line) are skipped so they do
    not show up as zero-length "words".

    Args:
        data: one line of text (str).

    Returns:
        A list of ints, one per word, in order of appearance.
    """
    cleaned = data.replace('?', '')
    # `if word` drops the '' pieces that split(' ') yields around extra spaces.
    return [len(word) for word in cleaned.split(' ') if word]
# Map every line to the list of its word lengths using len_words.
input1 = input.map(len_words)
input1.collect()

Out[14]: [[2, 3, 3, 3], [2, 4, 2, 7], [5]]

In [0]:
# Same word-length mapping written inline; '?' is not removed here, so 'you?' counts as 4.
input2 = input.map(lambda line: list(map(len, line.split(' '))))
input2.collect()

Out[15]: [[2, 3, 3, 4], [2, 4, 2, 7], [5]]

In [0]:
## FLATMAP: flatMap()
## flatMap applies a function to every element and then flattens the results, so one input element can produce zero or more output elements => rdd.flatMap(lambda x: x.split())

In [0]:
# flatMap splits every line on spaces and flattens the pieces into a single RDD of words.
flat_input = input.flatMap(lambda line: line.split(' '))
flat_input.collect()

Out[17]: ['Hi', 'how', 'are', 'you?', 'my', 'name', 'is', 'Teniola', 'Great']

In [0]:
# NOTE(review): this redefines `foo`, replacing the split-on-space version defined in an earlier cell.
# None of the visible cells after this one call it — consider renaming or removing it.
def foo(x):
    """Return True for any input."""
    return True

In [0]:
## filter() keeps only the elements for which the predicate returns True,
## i.e. it is used to remove elements from the RDD
text4 = text.filter(lambda line: line != '13 14 15 16 17 ')
text4.collect()

Out[19]: ['1 2 3 4 5 6 ', '7 8 9 10 11 12']

In [0]:
# Read filter.txt as an RDD of strings, one element per line; used by the filter/distinct/groupByKey cells.
filter_rdd = sc.textFile('/FileStore/tables/filter.txt')
filter_rdd.collect()

Out[20]: ['This mango company animal',
 'Cat dog ant mci laptop',
 'chair switch mobile am charger cover',
 'amanda any alarm ant']

In [0]:
# Flatten the lines of filter_rdd into one RDD of individual words.
# NOTE(review): `flamapped_rdd` looks like a typo for "flatmapped"; other cells use this exact
# spelling, so a rename has to be applied to all of them together.
flamapped_rdd = filter_rdd.flatMap(lambda line: line.split(' '))

In [0]:
def filter_map(data):
    """Predicate for RDD.filter: keep words that do NOT start with 'a' or 'c'.

    The comparison is case-insensitive, so 'Cat' and 'ant' are both rejected.

    Args:
        data: a single word (str).

    Returns:
        False when the word starts with 'a' or 'c' (any case), True otherwise.
    """
    # str.startswith accepts a tuple of prefixes, so one lower() call covers both letters.
    return not data.lower().startswith(('a', 'c'))

In [0]:
# Keep only the words accepted by filter_map (those not starting with 'a' or 'c', case-insensitive).
# filter() returns a new RDD; flamapped_rdd itself still holds every word.
flatmapped_rdd = flamapped_rdd.filter(filter_map)
flatmapped_rdd.collect()

Out[23]: ['This', 'mango', 'dog', 'mci', 'laptop', 'switch', 'mobile']

In [0]:
# The unfiltered RDD for comparison: it still contains every word, including those starting with 'a'/'c'.
flamapped_rdd.collect()

Out[24]: ['This',
 'mango',
 'company',
 'animal',
 'Cat',
 'dog',
 'ant',
 'mci',
 'laptop',
 'chair',
 'switch',
 'mobile',
 'am',
 'charger',
 'cover',
 'amanda',
 'any',
 'alarm',
 'ant']

In [0]:
## the same filter written as a lambda instead of a named function

# Drop every word that starts with 'a' or 'c' (case-insensitive).
filter_by_lambda = flamapped_rdd.filter(
    lambda word: not word.lower().startswith(('a', 'c'))
)
filter_by_lambda.collect()

Out[25]: ['This', 'mango', 'dog', 'mci', 'laptop', 'switch', 'mobile']

In [0]:
## RDD distinct()
## distinct() returns the distinct elements of an RDD (duplicates removed)
(
    filter_rdd
    .flatMap(lambda line: line.split(' '))
    .distinct()
    .collect()
)

Out[26]: ['mango',
 'Cat',
 'ant',
 'mci',
 'laptop',
 'chair',
 'switch',
 'mobile',
 'am',
 'This',
 'company',
 'animal',
 'dog',
 'charger',
 'cover',
 'amanda',
 'any',
 'alarm']

In [0]:
## RDD groupByKey()
## groupByKey creates groups of values based on the keys of an RDD
## for groupByKey to work properly the data must be tuples of the form (k, v)
## mapValues(list) is usually used to turn each group's values into a list

# Pair each word with its length, then group the lengths by word.
(
    filter_rdd
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, len(word)))
    .groupByKey()
    .mapValues(list)
    .collect()
)

Out[27]: [('mango', [5]),
 ('Cat', [3]),
 ('ant', [3, 3]),
 ('mci', [3]),
 ('laptop', [6]),
 ('chair', [5]),
 ('switch', [6]),
 ('mobile', [6]),
 ('am', [2]),
 ('This', [4]),
 ('company', [7]),
 ('animal', [6]),
 ('dog', [3]),
 ('charger', [7]),
 ('cover', [5]),
 ('amanda', [6]),
 ('any', [3]),
 ('alarm', [5])]

In [0]:
### reduceByKey is used to combine the values of each key in an RDD
## For reduceByKey to work properly the data must be in the format (k, v), e.g. rdd.reduceByKey(lambda x, y: x + y). reduceByKey needs a function that merges two values, while groupByKey does not.

In [0]:
# Show the raw lines again before counting their tokens with reduceByKey.
text.collect()

Out[29]: ['1 2 3 4 5 6 ', '7 8 9 10 11 12', '13 14 15 16 17 ']

In [0]:
# Count occurrences of each token: emit (token, 1) pairs and sum the ones per key.
# Splitting on a single space turns trailing spaces into '' tokens, which are counted too.
(
    text
    .flatMap(lambda line: line.split(' '))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda total, one: total + one)
    .collect()
)

Out[30]: [('1', 1),
 ('4', 1),
 ('', 2),
 ('8', 1),
 ('9', 1),
 ('10', 1),
 ('12', 1),
 ('14', 1),
 ('16', 1),
 ('17', 1),
 ('2', 1),
 ('3', 1),
 ('5', 1),
 ('6', 1),
 ('7', 1),
 ('11', 1),
 ('13', 1),
 ('15', 1)]

In [0]:
# Read reduce_quiz.txt as an RDD of strings, one element per line.
# NOTE(review): `reduce` is also the name of functools.reduce; a more descriptive name would
# avoid confusion, but the word-count cell refers to this variable, so rename both together.
reduce = sc.textFile('/FileStore/tables/reduce_quiz.txt')
reduce.collect()

Out[31]: ['This mango company animal',
 'cat mango ant ant laptop',
 'chair switch mobile am charger cover',
 'amanda any alarm ant this charger ',
 'amanda any laptop animal']

In [0]:
# Word count: split into words, drop the '' pieces left by trailing spaces,
# emit (word, 1) pairs and sum the ones per word. Counting is case-sensitive.
(
    reduce
    .flatMap(lambda line: line.split(' '))
    .filter(lambda word: word != '')
    .map(lambda word: (word, 1))
    .reduceByKey(lambda total, one: total + one)
    .collect()
)

Out[32]: [('mango', 2),
 ('cat', 1),
 ('ant', 3),
 ('laptop', 2),
 ('chair', 1),
 ('switch', 1),
 ('mobile', 1),
 ('am', 1),
 ('this', 1),
 ('This', 1),
 ('company', 1),
 ('animal', 2),
 ('charger', 2),
 ('cover', 1),
 ('amanda', 2),
 ('any', 2),
 ('alarm', 1)]

In [0]:
## count() returns the number of elements in the RDD; count is an action
input.flatMap(lambda line: line.split(' ')).count()

Out[35]: 9

In [0]:
## countByValue() counts how many times each distinct value occurs in the RDD. countByValue is an action: rdd.countByValue() returns a dictionary of value -> count pairs.

In [0]:
## Saving an RDD as text: saveAsTextFile() is used to write an RDD out as text => rdd.saveAsTextFile('path/output_dir'). The path names an output directory that will contain one part file per partition.

In [0]:
## repartition(), coalesce(): both change the number of partitions of an RDD