# Fundamentals of Database Technologies - Assignment 3 (Part 2)

## Frequent Words

### Question 5: Download Text Files

In [2]:
# Get Alice in the Wonderland in text format
!wget -O /tmp/alice.txt https://www.gutenberg.org/files/11/11-0.txt

In [3]:
# Get The Prince in text format
!wget -O /tmp/thePrince.txt http://www.gutenberg.org/cache/epub/1232/pg1232.txt

In [4]:
# Load both downloaded books from the local filesystem; each RDD element is one
# line of text from the corresponding file.
rddAlice, rddPrince = (
    sc.textFile(uri)
    for uri in ("file:/tmp/alice.txt", "file:/tmp/thePrince.txt")
)

### Question 6a: Average length of non-empty lines

In [6]:
# Mean character length of the non-empty lines (Alice in the Wonderland).
# Zero-length lines are dropped first, then each remaining line is replaced by
# its length and the lengths are averaged.
(
    rddAlice
    .filter(lambda line: len(line) > 0)
    .map(len)
    .mean()
)

In [7]:
# Mean character length of the non-empty lines (The Prince).
# Zero-length lines are dropped first, then each remaining line is replaced by
# its length and the lengths are averaged.
(
    rddPrince
    .filter(lambda line: len(line) > 0)
    .map(len)
    .mean()
)

### Question 6b: Average number of words per line

The cells below calculate the average number of words per line. Note that empty lines are included in the average (each counts as zero words).

In [9]:
from operator import add
import re

# Word tokeniser shared by all word-count cells.
# A word starts and ends with a word character and may contain apostrophes in
# between (e.g. "don't", "Alice's"); leading/trailing apostrophes are excluded.
# The trailing part is optional so that one-letter words such as "a" and "I"
# are matched too (requiring two word characters would silently drop them).
# The group is non-capturing so findall() returns the full match.
# Note: \w also matches digits and underscores.
pattern = re.compile(r"\w(?:[\w']*\w)?")

In [10]:
# Average number of words per line (Alice in the Wonderland)

# Words are the matches of the shared `pattern` regex; a line with no matches
# (including an empty line) contributes 0 to the average.
# Alternative not used here: len(line.split()), which counts
# whitespace-separated tokens instead of regex matches.
(
    rddAlice
    .map(lambda line: pattern.findall(line))
    .map(len)
    .mean()
)

In [11]:
# Average number of words per line (The Prince)

# Words are the matches of the shared `pattern` regex; a line with no matches
# (including an empty line) contributes 0 to the average.
# Alternative not used here: len(line.split()), which counts
# whitespace-separated tokens instead of regex matches.
(
    rddPrince
    .map(lambda line: pattern.findall(line))
    .map(len)
    .mean()
)

### Question 7a: Word counts of 10 most popular words

In [13]:
# Perform word count, returning (a) the 10 most popular words

# Tokenise every line with the shared `pattern` regex and flatten the matches
# into one RDD of words per book. Both RDDs are cached because the cells that
# follow reuse them several times.
rddAliceWords = (
    rddAlice
    .flatMap(lambda line: pattern.findall(line))
    .cache()
)
rddPrinceWords = (
    rddPrince
    .flatMap(lambda line: pattern.findall(line))
    .cache()
)

In [14]:
# Show the word counts of the 10 most popular words (Alice in the Wonderland)
# Each word is paired with 1 and the pairs are summed per word, giving
# (word, count) pairs; top() then returns the 10 pairs with the largest count.
# The key function indexes the pair rather than unpacking it in the lambda
# signature: tuple parameters are Python-2-only syntax (removed by PEP 3113).
(
    rddAliceWords
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .top(10, key=lambda pair: pair[1])
)

In [15]:
# Show the word counts of the 10 most popular words (The Prince)
# Each word is paired with 1 and the pairs are summed per word, giving
# (word, count) pairs; top() then returns the 10 pairs with the largest count.
# The key function indexes the pair rather than unpacking it in the lambda
# signature: tuple parameters are Python-2-only syntax (removed by PEP 3113).
(
    rddPrinceWords
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .top(10, key=lambda pair: pair[1])
)

### Question 7b: Number of words starting with a capital letter that occur more than 10 times

In [17]:
# Number of distinct words starting with a capital letter that occur more than
# 10 times (Alice in the Wonderland)

# Keep only words whose first character is upper case, count occurrences per
# word, keep the words with count > 10 and count how many such words remain.
# Pairs are indexed (pair[1] is the count) instead of using a tuple-parameter
# lambda, which is Python-2-only syntax (removed by PEP 3113).
(
    rddAliceWords
    .filter(lambda word: word[0].isupper())
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .filter(lambda pair: pair[1] > 10)
    .count()
)

In [18]:
# Number of distinct words starting with a capital letter that occur more than
# 10 times (The Prince)

# Keep only words whose first character is upper case, count occurrences per
# word, keep the words with count > 10 and count how many such words remain.
# Pairs are indexed (pair[1] is the count) instead of using a tuple-parameter
# lambda, which is Python-2-only syntax (removed by PEP 3113).
(
    rddPrinceWords
    .filter(lambda word: word[0].isupper())
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .filter(lambda pair: pair[1] > 10)
    .count()
)

### Question 7c: Number of words that occur exactly n times (for n up to 10)

In [20]:
# The number of words that occur exactly k times, for each k up to n
# (Alice in the Wonderland)

# Upper bound on the occurrence count; the corresponding cell for The Prince
# reads this variable as well.
n = 10

# Count occurrences per word, keep only the counts that are <= n, then tally
# how many words share each count. The result is a list of
# (occurrence count, number of words) pairs, sorted by occurrence count so the
# output order is deterministic.
# Pairs are indexed instead of using tuple-parameter lambdas, which are
# Python-2-only syntax (removed by PEP 3113).
sorted(
    rddAliceWords
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .map(lambda pair: pair[1])
    .filter(lambda count: count <= n)
    .countByValue()
    .items()
)

In [21]:
# The number of words that occur exactly k times, for each k up to n
# (The Prince)

# NOTE: `n` is the upper bound defined in the Alice in the Wonderland cell for
# this question; that cell must be run first.
# Count occurrences per word, keep only the counts that are <= n, then tally
# how many words share each count. The result is a list of
# (occurrence count, number of words) pairs, sorted by occurrence count so the
# output order is deterministic.
# Pairs are indexed instead of using tuple-parameter lambdas, which are
# Python-2-only syntax (removed by PEP 3113).
sorted(
    rddPrinceWords
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .map(lambda pair: pair[1])
    .filter(lambda count: count <= n)
    .countByValue()
    .items()
)

### Question 10a: 10 most popular words in one book that do not appear at all in the other

In [23]:
# 10) Combine word counts from both books (in Spark, not outside of it!), to get: (a) the 10 most popular words in one book that does not show at all in the other

# Per-word counts of both books combined with a full outer join, so words found
# in only one book are kept. Each element is
# (word, (count in Alice, count in The Prince)), with None on the side where
# the word does not occur. Cached because two later cells query it.
rddBothWords = (
    rddAliceWords
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .fullOuterJoin(
        rddPrinceWords
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    .cache()
)

In [24]:
# Top 10 words in "Alice in the Wonderland" that do not appear at all in "The Prince"

# Elements are (word, (aliceCount, princeCount)); princeCount is None when the
# word is absent from The Prince. Keep those and rank by aliceCount.
# `is None` is the idiomatic None test, and the nested pair is indexed instead
# of unpacked in the lambda signature (tuple parameters are Python-2-only
# syntax, removed by PEP 3113).
(
    rddBothWords
    .filter(lambda kv: kv[1][1] is None)
    .top(10, key=lambda kv: kv[1][0])
)

In [25]:
# Top 10 words in "The Prince" that do not appear at all in "Alice in the Wonderland"

# Elements are (word, (aliceCount, princeCount)); aliceCount is None when the
# word is absent from Alice in the Wonderland. Keep those and rank by
# princeCount.
# `is None` is the idiomatic None test, and the nested pair is indexed instead
# of unpacked in the lambda signature (tuple parameters are Python-2-only
# syntax, removed by PEP 3113).
(
    rddBothWords
    .filter(lambda kv: kv[1][0] is None)
    .top(10, key=lambda kv: kv[1][1])
)

### Question 10b: Ratio of occurrence of words that appear in both books at least 5 times in each book

The section below calculates the ratio = occurrences in "Alice in the Wonderland" / occurrences in "The Prince".

In [27]:
#(b) for words that appear in both books (in each at least 5 times), make a ratio of occurrence in one book vs another. Show 10 words from the top and (a separate result) 10 from the bottom.

# Per-word counts of both books combined with an inner join, so only words
# present in both books survive. Each element is
# (word, (count in Alice, count in The Prince)); only words with at least 5
# occurrences in each book are kept. Cached because the ratio cells reuse it.
# The nested pair is indexed instead of unpacked in the lambda signature
# (tuple parameters are Python-2-only syntax, removed by PEP 3113).
rddBothWords5 = (
    rddAliceWords
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .join(
        rddPrinceWords
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    .filter(lambda kv: kv[1][0] >= 5 and kv[1][1] >= 5)
    .cache()
)

In [28]:
# 10 words with the highest ratio of occurrence (ratio = occurrence in "Alice in the wonderland" / occurrence in "The Prince")

# Elements of rddBothWords5 are (word, (aliceCount, princeCount)).
# float() forces true division on Python 2. Pairs are indexed instead of
# unpacked in the lambda signature (tuple parameters are Python-2-only syntax,
# removed by PEP 3113).
(
    rddBothWords5
    .map(lambda kv: (kv[0], float(kv[1][0]) / kv[1][1]))
    .top(10, key=lambda pair: pair[1])
)

In [29]:
# 10 words with the lowest ratio of occurrence (ratio = occurrence in "Alice in the wonderland" / occurrence in "The Prince")

# Elements of rddBothWords5 are (word, (aliceCount, princeCount)).
# float() forces true division on Python 2. Ranking by the negated ratio makes
# top() return the smallest ratios. Pairs are indexed instead of unpacked in
# the lambda signature (tuple parameters are Python-2-only syntax, removed by
# PEP 3113).
(
    rddBothWords5
    .map(lambda kv: (kv[0], float(kv[1][0]) / kv[1][1]))
    .top(10, key=lambda pair: -pair[1])
)

The section below calculates the ratio = occurrences in "The Prince" / occurrences in "Alice in the Wonderland".

In [31]:
# 10 words with the highest ratio of occurrence (ratio = occurrence in "The Prince" / occurrence in "Alice in the wonderland")

# Elements of rddBothWords5 are (word, (aliceCount, princeCount)).
# float() forces true division on Python 2. Pairs are indexed instead of
# unpacked in the lambda signature (tuple parameters are Python-2-only syntax,
# removed by PEP 3113).
(
    rddBothWords5
    .map(lambda kv: (kv[0], float(kv[1][1]) / kv[1][0]))
    .top(10, key=lambda pair: pair[1])
)

In [32]:
# 10 words with the lowest ratio of occurrence (ratio = occurrence in "The Prince" / occurrence in "Alice in the wonderland")

# Elements of rddBothWords5 are (word, (aliceCount, princeCount)).
# float() forces true division on Python 2. Ranking by the negated ratio makes
# top() return the smallest ratios. Pairs are indexed instead of unpacked in
# the lambda signature (tuple parameters are Python-2-only syntax, removed by
# PEP 3113).
(
    rddBothWords5
    .map(lambda kv: (kv[0], float(kv[1][1]) / kv[1][0]))
    .top(10, key=lambda pair: -pair[1])
)

In [33]:
# How to write map(f1) and filter(f2) as a flatMap?

#def f3(x):
#  if f2(f1(x)):
#    return [f1(x)]
#  else:
#    return [] 
#flatMap(f3)

# Example: f1 is len() function, f2 specifies that x >= 5
#def f3(x):
#  if (len(x) >= 5):
#    return [len(x)]
#  else:
#    return []  
#rdd.flatMap(f3)

