In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
conf = SparkConf().setAppName('SparkApp').setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [26]:
txt_file = sc.textFile("./data/sample.txt")

In [27]:
print(txt_file.collect())

['Master Chief Petty Officer John-117 or Master Chief is a fictional character and the protagonist in the Halo multimedia franchise', 'Master Chief is a playable character in the series of science fiction first-person shooter video games Halo', 'Master Chief is a towering supersoldier known as a Spartan raised and trained from childhood for combat']


#### Q1: Convert all words in a rdd to lowercase and split the lines of a document using space.

In [28]:
splitted_line_rdd = txt_file.map(lambda x: x.lower().split())

words_rdd = splitted_line_rdd.flatMap(lambda data: [x for x in data])

In [37]:
print(words_rdd.collect()) # printing all words
print("\nSize :: %d" % list_of_words.count())

['master', 'chief', 'petty', 'officer', 'john-117', 'or', 'master', 'chief', 'is', 'a', 'fictional', 'character', 'and', 'the', 'protagonist', 'in', 'the', 'halo', 'multimedia', 'franchise', 'master', 'chief', 'is', 'a', 'playable', 'character', 'in', 'the', 'series', 'of', 'science', 'fiction', 'first-person', 'shooter', 'video', 'games', 'halo', 'master', 'chief', 'is', 'a', 'towering', 'supersoldier', 'known', 'as', 'a', 'spartan', 'raised', 'and', 'trained', 'from', 'childhood', 'for', 'combat']

Size :: 54


#### Q2: Next, I want to remove the words, which are not necessary to analyze this text. We call these words as “stop words”; Stop words do not add much value in a text. For example, “is”, “am”, “are” and “the” are few examples of stop words.

In [34]:
stop_words_list = sc.textFile("./data/stop_words.txt").collect()
no_stopwords_rdd = list_of_words.filter(lambda x: x not in stop_words_list)

In [36]:
print(no_stopwords_rdd.collect()) # printing only the first 20 words
print("\nSize :: %d" % no_stopwords_rdd.count())

['master', 'chief', 'petty', 'officer', 'john-117,', 'master', 'chief,', 'fictional', 'character', 'protagonist', 'halo', 'multimedia', 'franchise.', 'master', 'chief', 'playable', 'character', 'series', 'science', 'fiction', 'first-person', 'shooter', 'video', 'games', 'halo.', 'master', 'chief', 'towering', 'supersoldier', 'known', 'spartan,', 'raised', 'trained', 'childhood', 'combat.']

Size :: 35


#### Q3: After getting the results into `no_stopwords_rdd`, we want to group the words in `no_stopwords_rdd` based on which letters they start with. For example, suppose I want to group each word of `no_stopwords_rdd` based on first 3 characters.

In [51]:
grouped_rdd = no_stopwords_rdd.groupBy(lambda x: x[:3])

print(" :: Groups and words ::")
{k: list(v) for (k,v) in grouped_list.collect()}

 :: Groups and words ::


{'mas': ['master', 'master', 'master', 'master'],
 'chi': ['chief', 'chief,', 'chief', 'chief', 'childhood'],
 'pet': ['petty'],
 'off': ['officer'],
 'joh': ['john-117,'],
 'fic': ['fictional', 'fiction'],
 'cha': ['character', 'character'],
 'pro': ['protagonist'],
 'hal': ['halo', 'halo.'],
 'mul': ['multimedia'],
 'fra': ['franchise.'],
 'pla': ['playable'],
 'ser': ['series'],
 'sci': ['science'],
 'fir': ['first-person'],
 'sho': ['shooter'],
 'vid': ['video'],
 'gam': ['games'],
 'tow': ['towering'],
 'sup': ['supersoldier'],
 'kno': ['known'],
 'spa': ['spartan,'],
 'rai': ['raised'],
 'tra': ['trained'],
 'com': ['combat.']}

#### Q4: What if we want to calculate how many times each word is coming in corpus ?

In [57]:
print(" :: Groups and number of words ::")
{k: len(list(v)) for (k,v) in grouped_list.collect()}

 :: Groups and number of words ::


{'mas': 4,
 'chi': 5,
 'pet': 1,
 'off': 1,
 'joh': 1,
 'fic': 2,
 'cha': 2,
 'pro': 1,
 'hal': 2,
 'mul': 1,
 'fra': 1,
 'pla': 1,
 'ser': 1,
 'sci': 1,
 'fir': 1,
 'sho': 1,
 'vid': 1,
 'gam': 1,
 'tow': 1,
 'sup': 1,
 'kno': 1,
 'spa': 1,
 'rai': 1,
 'tra': 1,
 'com': 1}

#### Q5: How do I perform a task (say count the words ‘spark’ and ‘apache’ in rdd3) separatly on each partition and get the output of the task performed in these partition ?

#### Q6: What if I want to work with samples instead of full data ?

#### Q7: What if I want to create a RDD which contains all the elements (a.k.a. union) of two RDDs ?

#### Q8: If we want to join the two pair RDDs based on their key.

#### Q9: How to calculate distinct elements in a RDD ?

#### Q10: What if I want to reduce the number of partition of a RDD and get the result in a new RDD?

#### Q11: How do I find out number of parition in RDD ?

#### Q13: Count the number of elements in RDD.

#### Q14: Find the maximum, minimum, sum, variance and standard deviation of “num_rdd”.