Spark Intro
by Oleksandr (#126784)

### 1. Setup Spark environment

In [101]:
from pyspark.sql import SparkSession
import re

In [102]:
spark = SparkSession.builder \
    .appName("Spark-Intro-App") \
    .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow") \
    .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("WARN")


In [103]:
data = sc.textFile("lusiadas.txt")

### 2. Count of the occurrences of each word in the text

In [104]:
# words_rdd = data.flatMap(lambda line: line.lower().split()).filter(lambda word: word.strip() != "")
words_rdd = data.flatMap(lambda line: re.findall(r'\b\w+\b', line.lower()))
word_counts = words_rdd.countByValue()

for word, count in list(word_counts.items())[:10]:
    print(f"{word}: {count}")


luís: 2
vaz: 1
de: 1434
camões: 1
os: 744
lusíadas: 1
canto: 24
primeiro: 34
1: 10
as: 505


### 3. Adapt the code to find the count of the occurrences of each word in the text

In [109]:
long_words = words_rdd.filter(lambda word: len(word) >= 3)
print(long_words.take(10))

['luís', 'vaz', 'camões', 'lusíadas', 'canto', 'primeiro', 'armas', 'barões', 'assinalados', 'que']


In [None]:
# bigrams = long_words.flatMap(lambda x: [x,x]).zipWithIndex().map(lambda x: ((1 + x[1]) // 2, x[0])).groupByKey().mapValues(list)
# print(bigrams.take(10))

In [141]:
indexed_words = long_words.zipWithIndex()
bigrams = indexed_words.map(lambda x: (x[1], x[0])) \
                      .join(indexed_words.map(lambda x: (x[1]-1, x[0]))) \
                      .map(lambda x: (x[1][1], x[1][0]))

bigram_counts = bigrams.countByValue()
sc.parallelize(bigram_counts.items()).map(lambda x: (x[1], x[0])).sortByKey(ascending=False).take(10)

[(79, ('não', 'que')),
 (54, ('que', 'mais')),
 (53, ('que', 'com')),
 (51, ('que', 'por')),
 (49, ('com', 'que')),
 (42, ('por', 'que')),
 (36, ('lhe', 'que')),
 (30, ('que', 'rei')),
 (28, ('tão', 'que')),
 (26, ('tanto', 'que'))]

### 4. Counting number of uniq long words starting with each letter

In [146]:
long_words.distinct().map(lambda word: word[0]).countByValue()

defaultdict(int,
            {'l': 284,
             'p': 770,
             'b': 223,
             'a': 1053,
             'o': 194,
             'n': 221,
             'f': 436,
             'e': 666,
             'r': 407,
             't': 460,
             's': 581,
             'd': 651,
             'i': 366,
             'á': 22,
             'm': 506,
             'c': 973,
             'q': 91,
             'v': 369,
             'h': 141,
             'g': 230,
             'u': 39,
             'j': 70,
             'â': 4,
             'í': 17,
             'ó': 6,
             'x': 4,
             '1': 57,
             'ú': 9,
             'à': 1,
             'z': 9,
             'ô': 1,
             'é': 1})