In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
conf = SparkConf().setAppName('SparkApp').setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [69]:
txt_file = sc.textFile("./data/sample.txt")

In [70]:
print(txt_file.collect())

['Master Chief Petty Officer John-117, or Master Chief, is a fictional character and the protagonist in the Halo multimedia franchise.', 'Master Chief is a playable character in the series of science fiction first-person shooter video games Halo.', 'Master Chief is a towering supersoldier known as a Spartan, raised and trained from childhood for combat.']


#### Q1: Convert all words in a rdd to lowercase and split the lines of a document using space.

In [71]:
splitted_line_list = txt_file.map(lambda x: x.lower().split()).collect()

list_of_words = sc.parallelize([item for sublist in splitted_line_list for item in sublist])

print(list_of_words.take(20)) # printing only the first 20 words
print("\nSize of list :: %d" % list_of_words.count())

['master', 'chief', 'petty', 'officer', 'john-117,', 'or', 'master', 'chief,', 'is', 'a', 'fictional', 'character', 'and', 'the', 'protagonist', 'in', 'the', 'halo', 'multimedia', 'franchise.']

Size of list :: 54


#### Q2: Next, I want to remove the words, which are not necessary to analyze this text. We call these words as “stop words”; Stop words do not add much value in a text. For example, “is”, “am”, “are” and “the” are few examples of stop words.

In [72]:
stop_words_list = sc.textFile("./data/stop_words.txt").collect()
list_without_stopwords = list_of_words.filter(lambda x: x not in stop_words_list)

print(list_without_stopwords.take(20)) # printing only the first 20 words
print("\nSize of list :: %d" % list_without_stopwords.count())

['master', 'chief', 'petty', 'officer', 'john-117,', 'master', 'chief,', 'fictional', 'character', 'protagonist', 'halo', 'multimedia', 'franchise.', 'master', 'chief', 'playable', 'character', 'series', 'science', 'fiction']

Size of list :: 35


#### Q3: After getting the results into `list_without_stopwords`, we want to group the words in rdd3 based on which letters they start with. For example, suppose I want to group each word of rdd3 based on first 3 characters.

In [95]:
list_without_stopwords.glom().collect()

[['master',
  'chief',
  'petty',
  'officer',
  'john-117,',
  'master',
  'chief,',
  'fictional',
  'character',
  'protagonist',
  'halo',
  'multimedia',
  'franchise.',
  'master',
  'chief',
  'playable',
  'character',
  'series',
  'science',
  'fiction',
  'first-person',
  'shooter',
  'video',
  'games',
  'halo.',
  'master',
  'chief',
  'towering',
  'supersoldier',
  'known',
  'spartan,',
  'raised',
  'trained',
  'childhood',
  'combat.']]

In [98]:
grouped_list = list_without_stopwords.groupBy(lambda x: x[:3])
# [(k, list(v)) for (k, v) in grouped_list.take(x) for x in range(1, grouped_list.count()+1)]
# grouped_list.take(1)
[grouped_list.take(x) for x in range(1, grouped_list.count()+1)]

[[('mas', <pyspark.resultiterable.ResultIterable at 0x7f7816b39400>)],
 [('mas', <pyspark.resultiterable.ResultIterable at 0x7f7816abe370>),
  ('chi', <pyspark.resultiterable.ResultIterable at 0x7f7816abe4f0>)],
 [('mas', <pyspark.resultiterable.ResultIterable at 0x7f7816abe130>),
  ('chi', <pyspark.resultiterable.ResultIterable at 0x7f7816abe070>),
  ('pet', <pyspark.resultiterable.ResultIterable at 0x7f7816abed00>)],
 [('mas', <pyspark.resultiterable.ResultIterable at 0x7f7816abe940>),
  ('chi', <pyspark.resultiterable.ResultIterable at 0x7f7816abe430>),
  ('pet', <pyspark.resultiterable.ResultIterable at 0x7f7816abe580>),
  ('off', <pyspark.resultiterable.ResultIterable at 0x7f781776a070>)],
 [('mas', <pyspark.resultiterable.ResultIterable at 0x7f7816abe760>),
  ('chi', <pyspark.resultiterable.ResultIterable at 0x7f7816abeeb0>),
  ('pet', <pyspark.resultiterable.ResultIterable at 0x7f7816abe9d0>),
  ('off', <pyspark.resultiterable.ResultIterable at 0x7f7816abe910>),
  ('joh', <pyspa

#### Q4: What if we want to calculate how many times each word is coming in corpus ?

#### Q5: How do I perform a task (say count the words ‘spark’ and ‘apache’ in rdd3) separatly on each partition and get the output of the task performed in these partition ?

#### Q6: What if I want to work with samples instead of full data ?

#### Q7: What if I want to create a RDD which contains all the elements (a.k.a. union) of two RDDs ?

#### Q8: If we want to join the two pair RDDs based on their key.

#### Q9: How to calculate distinct elements in a RDD ?

#### Q10: What if I want to reduce the number of partition of a RDD and get the result in a new RDD?

#### Q11: How do I find out number of parition in RDD ?

#### Q13: Count the number of elements in RDD.

#### Q14: Find the maximum, minimum, sum, variance and standard deviation of “num_rdd”.