In [None]:
# we need this for splitting a line into words using regular expressions
import re 

# Find the SPARK library, so that it can be imported into Jupyter Notebook
import findspark
findspark.init()

# Link in the Python version of the SPARK library
import pyspark

In [None]:
# First step - obtain the SPARK Context, the entry point for every RDD operation below.
# getOrCreate() hands back the context that is already running (if there is one)
# instead of raising "Cannot run multiple SparkContexts at once" when this cell is
# executed a second time in the same kernel.
# NOTE: if an existing context is reused, it keeps its own configuration; the
# application name set here only applies when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("RDDOperations"))

In [None]:
# Read a text file into the RDD 'rdd1' through the SparkContext function 'textFile',
# then show the Python type of the object that was returned.
# SparkContext API reference:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.SparkContext.html

input_path = "/home/hduser/data/pg20417.txt"
rdd1 = sc.textFile(input_path)
rdd1_type = type(rdd1)
print(rdd1_type)

In [None]:
# The RDD class offers many 'transformations' and 'actions'; count(), take() and
# first() are used here to inspect the data that was just loaded.
# RDD API reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.html
line_total = rdd1.count()
print("Number of lines in the file: ", line_total)

leading_lines = rdd1.take(2)
print("\nThe first 2 lines line: ", leading_lines)

opening_line = rdd1.first()
print("\nPrint only the first line: ", opening_line)

In [None]:
# We will create a 'word count' for the file by using the 'Map - Reduce' approach ...
# First step: split each line into its constituent words and flatten the per-line
# lists into one collection of words using the 'flatMap' function of RDD.
#
# The delimiter set is a raw string so that regex escapes such as '\-' and '\['
# reach the 're' module unchanged; in a normal string literal they are invalid
# Python escape sequences and trigger a warning on recent Python versions.
# It matches exactly the same characters as before: space . , ! ( ) ' " newline
# carriage-return ? : ; _ - | { } [ ] backslash and forward slash.
WORD_DELIMITERS = r"""[ .,!()'"\n\r?:;_\-|{}\[\]\\/]"""

# re.split() yields an empty string between two adjacent delimiters (e.g. ", ")
# and for blank lines; those are dropped so that '' is not counted as a word.
the_words = rdd1.flatMap(
    lambda line: [token for token in re.split(WORD_DELIMITERS, line) if token]
)

In [None]:
# Show a sample of five words, followed by the total number of words
# produced by the split in the previous step
sample_words = the_words.take(5)
print(sample_words)

word_total = the_words.count()
print(word_total)

In [None]:
# Start of the map-reduce process that counts each word.
# Map step: every word is turned into a 'key-value' pair (word, 1).
# The new RDD 'words_kv' is therefore an RDD of 2-element Python tuples.
words_kv = the_words.map(lambda word: (word, 1))

In [None]:
# Show a sample of five (k,v) pairs
sample_pairs = words_kv.take(5)
print(sample_pairs)

In [None]:
# Reduce step: the 'countByKey' function of RDD tallies the (k,v) pairs per key.
# The result 'word_count' is a dictionary-like Python object; its exact type is printed below.
word_count = words_kv.countByKey()
word_count_type = type(word_count)
print(word_count_type)

In [None]:
# Print a preview of the first PREVIEW_LIMIT key-value pairs of the dictionary.
# The limit is 20, which is the number of entries this loop prints.
PREVIEW_LIMIT = 20
for shown, (key, value) in enumerate(word_count.items(), start=1):
    print(key, ':', value)
    # Stop right after the last wanted entry has been printed.
    if shown >= PREVIEW_LIMIT:
        break

In [None]:
# Write out the entire dictionary into a csv file
import csv

# One "word,count" row is written per dictionary entry (no header row).
# newline='' lets the csv module control the line endings itself.
# The encoding is pinned to UTF-8 so that words containing non-ASCII characters
# are written the same way on every platform instead of depending on the
# locale's default encoding.
with open('word_count.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(word_count.items())

In [None]:
# Once we are done with using SPARK, we should NOT forget to 'stop' the SparkContext.
# This calls stop() on 'sc', the context created at the start of this notebook.
sc.stop()