In [1]:
# Import what we need
import findspark
findspark.init()

import pyspark
from os import getlogin, path
import operator

In [2]:
# Get the current user
current_user = getlogin()

# Create a variable for the home directory to save all the data to
home_dir = path.join("/Users/robert.dempsey/Downloads", current_user)
data_dir = path.join(home_dir, "data")
word_count_dir = path.join(data_dir, "wordcount")

print("Home Directory: {}".format(home_dir))
print("Data Directory: {}".format(data_dir))
print("Word Count Directory: {}".format(word_count_dir))

Home Directory: /Users/robert.dempsey/Downloads/robert.dempsey
Data Directory: /Users/robert.dempsey/Downloads/robert.dempsey/data
Word Count Directory: /Users/robert.dempsey/Downloads/robert.dempsey/data/wordcount


In [3]:
# Create a Spark context to use
sc = pyspark.SparkContext(appName="WordCount")

In [5]:
# Load the text file into an RDD
text_file = sc.textFile("/Users/robert.dempsey/Dev/daamlobd/data/obama.txt")

In [6]:
# Run the word count
counts = text_file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)

In [7]:
type(counts)

pyspark.rdd.PipelinedRDD

In [8]:
# Show the count of words we've counted
counts.count()

2488

In [9]:
# View 10 of the results
counts.take(10)

[('Administration', 1),
 ('of', 207),
 ('Barack', 1),
 ('2011', 2),
 ('', 118),
 ('Address', 1),
 ('Joint', 1),
 ('Session', 1),
 ('Congress', 10),
 ('State', 3)]

In [10]:
# Save the results to the data directory
# Note - this will create more than one file containing all the results
if not path.exists(word_count_dir):
    counts.saveAsTextFile(word_count_dir)
    print("Results saved.")
else:
    print("Directory exists. Please delete it before proceeding.")

Results saved.


## Working with the results

In [11]:
# Convert the word count results into a Python list we can work with.
word_count = counts.collect()

In [12]:
# Print some stats
print("Counts: {}".format(len(word_count)))
print("Example count: {}".format(word_count[0]))
print("Data type: {}".format(type(word_count[0])))

Counts: 2488
Example count: ('Administration', 1)
Data type: <class 'tuple'>


In [13]:
# Convert the list of tuples to a dictionary
word_count_dict = dict(word_count)

# Sort the dictionary by count
sorted_word_count = sorted(word_count_dict.items(), key=operator.itemgetter(1))

# Show the word and the count
for word in sorted_word_count:
    print("{} - {}".format(word[0], word[1]))

Administration - 1
Barack - 1
Address - 1
Joint - 1
Session - 1
January - 1
President, - 1
congratulating - 1
112th - 1
Boehner. - 1
mark - 1
occasion, - 1
mindful - 1
chair - 1
pray - 1
Giffords. - 1
secret - 1
fiercely - 1
beliefs. - 1
thing. - 1
robust - 1
tragedy - 1
pause. - 1
Amid - 1
passion - 1
reminded - 1
from, - 1
consequential - 1
political - 1
preference. - 1
creed, - 1
fulfilled. - 1
simple - 1
usher - 1
can, - 1
must. - 1
votes, - 1
New - 1
politics. - 1
root - 1
else. - 1
rewarded. - 1
map, - 1
poised - 1
back, - 1
measure - 1
prospects - 1
enterprise, - 1
opportunities - 1
children. - 1
on—together. - 1
December. - 1
passed, - 1
today. - 1
full - 1
sector - 1
These - 1
steps - 1
decades - 1
making. - 1
watching - 1
probably - 1
nearby - 1
pretty - 1
limited - 1
hard, - 1
chances - 1
life, - 1
decent - 1
paycheck - 1
occasional - 1
Maybe - 1
pride - 1
many, - 1
booming - 1
storefronts - 1
Main - 1
dwindle - 1
disappear, - 1
changed - 1
middle - 1
right. - 1
transformed 

incentives, - 1
break - 1
dependence - 1
biofuels - 1
vehicles - 1
currently - 1
you've - 1
noticed, - 1
yesterday's - 1
breakthroughs - 1
setting - 1
goal: - 1
sources. - 1
wind - 1
solar. - 1
nuclear, - 1
natural - 1
urge - 1
happen. - 1
Maintaining - 1
crucial - 1
overseas, - 1
then - 1
Think - 1
school. - 1
fallen - 1
instills - 1
child. - 1
Only - 1
parents - 1
Super - 1
function - 1
fame - 1
performance. - 1
pouring - 1
launched - 1
States, - 1
innovative - 1
plans - 1
teacher - 1
achievement, - 1
meaningful - 1
generation. - 1
less - 1
spend - 1
way, - 1
Democratic - 1
Governors - 1
follow - 1
flexible - 1
see, - 1
mandate, - 1
local - 1
principals, - 1
Take - 1
located - 1
gangs. - 1
97 - 1
received - 1
Most - 1
school's - 1
principal - 1
wiped - 1
"Thank - 1
it." - 1
Let's - 1
impact - 1
child's - 1
front - 1
builders." - 1
Here - 1
respect. - 1
excuses - 1
baby - 1
retiring - 1
math. - 1
who's - 1
contemplating - 1
choice: - 1
child, - 1
teacher. - 1
Your - 1
unwarranted - 1


In [14]:
# Close the Spark context
sc.stop()