In [None]:
from pyspark.sql import SparkSession

# New API: build (or reuse) the SparkSession for this notebook.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.35:7077")
    .appName("Lecture1_Example4_wordcount_examples")
    # Dynamic allocation with shuffle tracking; the external shuffle service is disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Old API (RDD): the SparkContext behind the session is the entry point for RDDs.
spark_context = spark_session.sparkContext
# Only log messages at ERROR level.
spark_context.setLogLevel("ERROR")

In [None]:
# Read the book from HDFS as an RDD of text lines.
book = spark_context.textFile(
    "hdfs://192.168.2.35:9000/data/books/book-1.txt"
)
# Preview the first five elements.
book.take(5)

In [None]:
# rdd.map(): Return a new RDD by applying a function to each element of this RDD.
## Here every line is split on single spaces into a list of separate words.
book_sp = book.map(lambda line: line.split(" "))
book_sp.take(5)

In [None]:
# rdd.filter(): Return a new RDD containing only the elements that satisfy a predicate.
## For instance, keep only the lines that were split into more than one token;
## single-token lines are too short to be useful for the analysis.
book_sp_1 = book_sp.filter(lambda tokens: len(tokens) > 1)
book_sp_1.take(5)

In [None]:
# rdd.flatMap(): Return a new RDD by first applying a function to all elements
# of this RDD, and then flattening the results.
## Here the per-line token lists are flattened into one RDD of single words.
book_sw = book_sp_1.flatMap(lambda tokens: tokens)
book_sw.take(20)

In [None]:
# rdd.groupBy(): Return an RDD of grouped items. Can be used to group the RDD
# elements by some condition.
## Here the words are grouped by their length, so each key is a word length.
book_sw_fl = book_sw.groupBy(len)
book_sw_fl.take(2)

In [None]:
# Turn each group's values into a plain list so the first group can be inspected.
(
    book_sw_fl
    .mapValues(list)
    .take(1)
)

In [None]:
# rdd.groupByKey(): Group the values for each key in the RDD into a single
# sequence; can be used to group an RDD by the key of its elements.
## NOTICE that the elements of the RDD must be (key, value) pairs.
## Here (word, 1) pairs are built first and then grouped by key, i.e. by word.
book_sw_p = book_sw.map(lambda word: (word, 1))
book_wk = book_sw_p.groupByKey()
book_wk.take(5)

In [None]:
# .mapValues() passes only the value of each (key, value) pair through a function;
# here the grouped values of each word are turned into a list.
(
    book_wk
    .mapValues(list)
    .take(2)
)

In [None]:
# rdd.reduceByKey(): Merge the values for each key using an associative and
# commutative reduce function.
## NOTICE that the elements of the RDD must be (key, value) pairs.
## Summing the 1s of the (word, 1) pairs per word gives the word count.
from operator import add

book_wordcount = book_sw_p.reduceByKey(
    add
)
book_wordcount.take(5)

In [None]:
# Frequency of the word "Discovery".
#
# NOTE: assigning PYTHONHASHSEED at runtime does NOT make string hashing
# deterministic. CPython reads the variable once, at interpreter start-up, so the
# hash seed of this already-running driver process stays whatever it was. The
# assignment only gets past the PYTHONHASHSEED check PySpark performs before it
# hashes a string key; it is kept so the process environment is unchanged for
# anything else that reads it.
import os
os.environ["PYTHONHASHSEED"]=str(123)
# rdd.lookup() on a hash-partitioned RDD (such as the output of reduceByKey)
# hashes the key on the driver to choose the single partition it scans. When the
# driver's hash seed differs from the one the executors used while partitioning,
# the wrong partition is scanned and [] comes back. Filtering all partitions does
# not rely on the two sides agreeing on a hash and returns the same list of
# values that a correct lookup() would.
book_wordcount.filter(lambda pair: pair[0] == "Discovery").values().collect()

In [None]:
# Show 20 of the distinct words (the keys of the word-count pairs).
(
    book_wordcount
    .keys()
    .take(20)
)

In [None]:
# rdd.distinct(): Return a new RDD containing the distinct elements in this RDD.
# Compare the number of words with duplicates against the number of unique words.
print("Before .distinct():", book_sw.count())
print("After  .distinct():", book_sw.distinct().count())

In [None]:
# rdd.keyBy(): Creates tuples of the elements in this RDD by applying f.
## For example we can realize FirstLetterCount with this operation.
## Splitting on " " produces empty strings wherever a line has leading or
## repeated spaces, and indexing "" raises IndexError, so empty words are
## dropped before the first letter is taken as the key.
(
    book_sw
    .filter(lambda word: len(word) > 0)
    .keyBy(lambda word: word[0])
    .take(5)
)

In [None]:
# Pipelined operation: count the words per (lower-cased) first letter.
# Notes on the shape of the pipeline:
#  * str.split(" ") always returns at least one element (an empty line becomes
#    [""]), so a "len(tokens) > 0" filter on the split lines can never drop
#    anything; empty lines are removed by the empty-word filter instead.
#  * flatMap with the split function replaces map + identity flatMap, and a
#    single map replaces keyBy followed by a map that discards the word.
# `add` is operator.add, imported in the reduceByKey cell.
sorted(                                            # sort the (letter, count) pairs alphabetically
    book.flatMap(lambda line: line.split(" "))     # split each line straight into single words
    .filter(lambda word: len(word) > 0)            # filter out empty words (and thereby empty lines)
    .map(lambda word: (word[0].lower(), 1))        # create (lower-cased first letter, 1) pairs
    .reduceByKey(add)                              # add up the ones per first letter
    .collect()                                     # collect the result on the driver
)

In [None]:
# define a function and use it in spark
def key_pair(x):
    """Build a (first_letter, word, 1) triple for a word.

    Parameters
    ----------
    x : str
        A single word. May be empty: splitting a line on " " yields empty
        strings for leading or repeated spaces.

    Returns
    -------
    tuple
        ``(x[0], x, 1)`` for a non-empty word; ``("", x, 1)`` for an empty one,
        instead of raising IndexError inside the Spark task.
    """
    first_letter = x[0] if x else ""
    return (first_letter, x, 1)
# Apply the named function to every word and preview five results.
(
    book_sw
    .map(key_pair)
    .take(5)
)

In [18]:
# Stop the Spark session now that the notebook is finished with it.
spark_session.stop()