In [1]:
import re
import sys
from pyspark import SparkConf, SparkContext

conf = SparkConf()
sc = SparkContext(conf=conf)

fname = "data/pg100.txt"
lines = sc.textFile(fname)

#### Word Count

In [2]:
words = lines.flatMap(lambda l: re.split(r'[^\w]+', l))

pairs = words.map(lambda w: (w, 1))

counts = pairs.reduceByKey(lambda n1, n2: n1 + n2)

# uncomment to save to directory named word_count
# counts.saveAsTextFile("word_count")

# print top 10 words 
result = sorted(counts.collect(), key=lambda rdd : -rdd[1])
result[:10]

[('', 197060),
 ('the', 23455),
 ('I', 22225),
 ('and', 18715),
 ('to', 16433),
 ('of', 15830),
 ('a', 12851),
 ('you', 12236),
 ('my', 10840),
 ('in', 10074)]

#### Character Count

In [3]:
phrases = lines.flatMap(lambda l: re.split(r'[^\w]+', l))

# only consider words that begins with letters (not numbers)
words = phrases.filter(lambda word: re.match('^[A-Za-z]', word))

# map to lowercase, key contains the first letter only
letters = words.map(lambda w: (w[0].lower(), 1))

counts = letters.reduceByKey(lambda n1, n2: n1 + n2)

# sort in descending order
result = sorted(counts.collect(), key=lambda rdd : rdd[0])

# save results
with open('letter_count.txt', 'w') as file:
    for letter, count in result:
        file.write("(%s, %i)\n"%(letter, count))

result

[('a', 86000),
 ('b', 46001),
 ('c', 34983),
 ('d', 39173),
 ('e', 20409),
 ('f', 37186),
 ('g', 21167),
 ('h', 61028),
 ('i', 62420),
 ('j', 3372),
 ('k', 9535),
 ('l', 32389),
 ('m', 56252),
 ('n', 27313),
 ('o', 43712),
 ('p', 28059),
 ('q', 2388),
 ('r', 15234),
 ('s', 75226),
 ('t', 127781),
 ('u', 9230),
 ('v', 5801),
 ('w', 60097),
 ('x', 14),
 ('y', 25926),
 ('z', 79)]

In [4]:
sc.stop()