In [42]:
from pyspark import SparkContext
# Stop a SparkContext left over from an earlier run of this cell, if any.
# On a fresh kernel `sc` does not exist yet, so look it up defensively
# instead of calling `sc.stop()` unconditionally (which raises NameError).
_previous_sc = globals().get("sc")
if _previous_sc is not None:
    _previous_sc.stop()

# Local-mode context named "name"; the text file is read as an RDD of lines.
sc = SparkContext("local", "name")
rdd = sc.textFile("files.txt")

### MAP and FlatMAP

In [43]:
def test_fun(lines):
    lines=lines.lower()
    lines=lines.split(' ')
    return lines
# map applies test_fun to every line, so each RDD element becomes the list
# of words returned for that line (one list per line, not one word each).
map_rdd1 = rdd.map(test_fun)
map_rdd1.take(5)

[['spark',
  'has',
  'certain',
  'operations',
  'which',
  'can',
  'be',
  'performed',
  'on',
  'rdd.',
  'an',
  'operation',
  'is',
  'a',
  'method,',
  'which',
  'can',
  'be',
  'applied',
  'on',
  'a',
  'rdd',
  'to',
  'accomplish',
  'certain',
  'task.',
  'rdd',
  'supports',
  'two',
  'types',
  'of',
  'operations,',
  'which',
  'are',
  'action',
  'and',
  'transformation.',
  'an',
  'operation',
  'can',
  'be',
  'something',
  'as',
  'simple',
  'as',
  'sorting,',
  'filtering',
  'and',
  'summarizing',
  'data.spark',
  'is',
  'used',
  'in',
  'fastest',
  'compared',
  'to',
  'hive',
  'engine']]

In [44]:
# flatMap applies test_fun to every line and flattens the returned lists,
# so the RDD elements are individual words rather than lists of words.
rdd2= rdd.flatMap(test_fun)
rdd2.take(5)

['spark', 'has', 'certain', 'operations', 'which']

### Filter :

Here we remove stop words such as "is", "am", "are", "a", "the" and "for" from the file. To remove the stop words, we can use a “filter” transformation, which returns a new RDD containing only the elements that satisfy the given condition.

In [45]:
# Words to drop from the stream of tokens.
stopword =['is','am' ,'a', 'are','the','for']
# Keep only the tokens that are not in the stop-word list. Matching is exact,
# so tokens with attached punctuation (e.g. 'rdd.') pass through unchanged.
filter_rdd = rdd2.filter(lambda x :  x not in stopword)
filter_rdd.take(10)

['spark',
 'has',
 'certain',
 'operations',
 'which',
 'can',
 'be',
 'performed',
 'on',
 'rdd.']

### groupBy : 

The “groupBy” transformation groups the data in the original RDD. It creates a set of key-value pairs, where the key is the output of a user function and the value is all the items for which the function yields this key.

In [46]:
# Group the words by their first three characters (w[0:3]); a word shorter
# than three characters is used whole as its own key.
groupby_rdd = filter_rdd.groupBy(lambda w : w[0:3])
# Each value is shown as a ResultIterable object; wrap it in list(...) to
# see the words grouped under a key.
groupby_rdd.take(2)

[('spa', <pyspark.resultiterable.ResultIterable at 0x106cc8a58>),
 ('has', <pyspark.resultiterable.ResultIterable at 0x106cc8470>)]

In [49]:
# Pair every word with the count 1 so the values can be aggregated per word.
map_data = filter_rdd.map(lambda y : (y,1))
# groupByKey gathers all the 1s emitted for a word under that word; each
# value is displayed as a ResultIterable object.
groupby_data = map_data.groupByKey()
groupby_data.take(10)

[('spark', <pyspark.resultiterable.ResultIterable at 0x106cc8a20>),
 ('has', <pyspark.resultiterable.ResultIterable at 0x106cc8cc0>),
 ('certain', <pyspark.resultiterable.ResultIterable at 0x106cc8780>),
 ('operations', <pyspark.resultiterable.ResultIterable at 0x106cc8b38>),
 ('which', <pyspark.resultiterable.ResultIterable at 0x106cc8240>),
 ('can', <pyspark.resultiterable.ResultIterable at 0x106cc8c18>),
 ('be', <pyspark.resultiterable.ResultIterable at 0x106cc84a8>),
 ('performed', <pyspark.resultiterable.ResultIterable at 0x106cc8ba8>),
 ('on', <pyspark.resultiterable.ResultIterable at 0x106cc8320>),
 ('rdd.', <pyspark.resultiterable.ResultIterable at 0x106cc88d0>)]

In [50]:
# Print the first ten (word, values) pairs. The values print as
# ResultIterable objects; use list(values) to see their contents.
print([(word, values) for word, values in groupby_data.take(10)])

[('spark', <pyspark.resultiterable.ResultIterable object at 0x106cd81d0>), ('has', <pyspark.resultiterable.ResultIterable object at 0x106cd8240>), ('certain', <pyspark.resultiterable.ResultIterable object at 0x106cd8358>), ('operations', <pyspark.resultiterable.ResultIterable object at 0x106cd8438>), ('which', <pyspark.resultiterable.ResultIterable object at 0x106cd82e8>), ('can', <pyspark.resultiterable.ResultIterable object at 0x106cd84e0>), ('be', <pyspark.resultiterable.ResultIterable object at 0x106cd8668>), ('performed', <pyspark.resultiterable.ResultIterable object at 0x106cd8048>), ('on', <pyspark.resultiterable.ResultIterable object at 0x106cd8828>), ('rdd.', <pyspark.resultiterable.ResultIterable object at 0x106cd8128>)]


In [55]:
# Keep only the tokens equal to 'which' and return them as a list; the
# length of that list is the number of occurrences of the word.
filter_rdd.filter(lambda x : x =='which').collect()

['which', 'which', 'which']

#### Count each word in the file:

The “mapValues” transformation is like “map”, with one difference: “map” can be applied to any RDD and, on a pair RDD, its function can access both the key and the value, whereas “mapValues” transforms only the values by applying a function and leaves the keys unaffected. For example, in the code below I apply sum, which calculates the sum (the count) for each word.

In [63]:
# Each key's value is the group of 1s collected by groupByKey; mapValues(sum)
# adds them up and already yields (word, count) pairs, so no extra map step
# is needed to reshape the result.
fregency_check = groupby_data.mapValues(sum)
fregency_check.take(5)

[('spark', 1), ('has', 1), ('certain', 2), ('operations', 1), ('which', 3)]

### Count each word in the file using reduceByKey:

The “reduceByKey” transformation first combines the values for each key within every partition, so each partition holds only one value per key. Then, after shuffling, the executors apply the same operation in the reduce phase; in my case this is a sum (`lambda x, y: x + y`).



In [66]:
# reduceByKey adds up the 1s of each word pairwise and already yields
# (word, count) pairs, so no extra map step is needed to reshape the result.
reduce_key_frquency = map_data.reduceByKey(lambda x,y: x+y)
reduce_key_frquency.take(5)

[('spark', 1), ('has', 1), ('certain', 2), ('operations', 1), ('which', 3)]

In contrast, the “groupByKey” transformation does not combine the values for each key within each partition; it shuffles the data directly and then merges the values for each key. Because “groupByKey” requires a lot of shuffling to get the answer, it is better to use “reduceByKey” when a large amount of data would otherwise be shuffled.



### mapPartitions

The “mapPartitions” transformation is like a map transformation, but it runs separately on each partition of an RDD. Here it is used to count the frequencies of the words ‘which’ and ‘certain’ in each partition of the RDD.

I create a function that counts the frequency of these words and then pass that function to the mapPartitions transformation.


In [73]:
def word_frquency_check(iterator):
    """Count how often 'which' and 'certain' occur in an iterable of words.

    Returns a 2-tuple ``(which_count, certain_count)``. When the function is
    passed to ``mapPartitions`` the returned tuple is itself iterated, so the
    two counts appear as two separate elements of the partition (this is why
    the recorded output is ``[[3, 2]]``).
    """
    which_total = 0
    certain_total = 0

    # Single pass: the iterator can only be consumed once.
    for token in iterator:
        which_total += 1 if token == 'which' else 0
        certain_total += 1 if token == 'certain' else 0

    return which_total, certain_total
# Run the counter once per partition; glom() gathers the elements of each
# partition into a list, so the result has one inner list per partition.
filter_rdd.mapPartitions(word_frquency_check).glom().collect()

[[3, 2]]

### To check the number of partitions in the RDD:

filter_rdd.getNumPartitions()

In [72]:
# Number of partitions that filter_rdd is split into.
filter_rdd.getNumPartitions()

1

### coalesce:

Coalesce is used to reduce the number of partitions in an RDD; it creates a new RDD.

In [77]:
# Request a single partition; the result is a new RDD bound to reduce_part.
# filter_rdd already reported 1 partition, so the count does not change here.
reduce_part=filter_rdd.coalesce(1)
reduce_part.take(5)

['spark', 'has', 'certain', 'operations', 'which']

# Actions: 

I have already covered some actions in the practice above:
1. Collect 
2. Take
3. getNumPartitions


### Reduce :

A reduce action is used for aggregating all the elements of an RDD by applying a pairwise user function.



In [82]:
# range(1, 2000) yields the integers 1..1999 (the upper bound is exclusive).
rdd_4 = sc.parallelize(range(1,2000))
# Pairwise addition folds the whole RDD into one total: 1999 * 2000 / 2.
reduce_check = rdd_4.reduce(lambda x,y: x+y)
reduce_check

1999000