In [None]:
# Set up the SQLContext that the later cells use to build DataFrames.
# NOTE(review): `sc` is not defined in this notebook; presumably the kernel
# supplies it as the active SparkContext -- confirm.
from pyspark.sql import SQLContext

sqlContext = SQLContext(sparkContext=sc)

In [None]:
# Read the country CSV file into an RDD of text lines.
# A local-file URI needs an empty authority, i.e. three slashes after
# 'file:'. With only two ('file://path/...') the word 'path' is parsed as
# the host component rather than as a directory.
# NOTE(review): '/path' looks like a placeholder directory -- confirm the
# real location of country-list.csv before running.
country_lines = sc.textFile('file:///path/country-list.csv')

In [None]:
# Split every CSV line on commas, producing one list of fields per line.
country_words = country_lines.map(lambda row: row.split(","))

In [None]:
# Turn each list of fields into a (first field, second field) tuple.
def _name_and_code(fields):
    """Return (fields[0], fields[1] with surrounding whitespace stripped).

    Fields beyond the second are ignored; fewer than two fields raises
    IndexError, exactly as plain indexing would.
    """
    name, code = fields[0], fields[1]
    return (name, code.strip())

country_tuples = country_words.map(_name_and_code)

In [None]:
# Sanity check: show the first five (field 0, stripped field 1) tuples.
country_tuples.take(5)

In [None]:
# Build a two-column DataFrame from the country tuples, then print its
# schema and peek at the first three rows.
country_columns = ["country", "code"]
countryDF = sqlContext.createDataFrame(country_tuples, country_columns)
countryDF.printSchema()
countryDF.take(3)

In [None]:
# Read the tweets CSV file into an RDD of text lines.
# A local-file URI needs an empty authority, i.e. three slashes after
# 'file:'. With only two ('file://path/...') the word 'path' is parsed as
# the host component rather than as a directory.
# NOTE(review): '/path' looks like a placeholder directory -- confirm the
# real location of users_tweets.csv before running.
users_tweets = sc.textFile('file:///path/users_tweets.csv')

In [None]:
# Drop blank tweets: keep only lines that are at least two characters long
# (so empty lines and single-character lines are both removed).
clean_tweets = users_tweets.filter(lambda tweet: len(tweet) >= 2)

In [None]:
# Word count over the cleaned tweets, in three stages:
#   1. break every tweet into tokens on single spaces,
#   2. pair each token with an initial count of 1,
#   3. add up the counts per token.
tweet_words = clean_tweets.flatMap(lambda tweet: tweet.split(" "))
tweet_tuples = tweet_words.map(lambda token: (token, 1))
tweet_counts = tweet_tuples.reduceByKey(lambda left, right: left + right)

In [None]:
# Build a DataFrame of (word, occurrences) pairs from the word counts, then
# print its schema and peek at the first three rows.
word_count_columns = ["tweet", "count"]
tweetsDF = sqlContext.createDataFrame(tweet_counts, word_count_columns)
tweetsDF.printSchema()
tweetsDF.take(3)

In [None]:
# Join countries with tweet word counts wherever a country name is exactly
# equal to a tweet word, then inspect the joined schema and first five rows.
name_matches_word = countryDF["country"] == tweetsDF["tweet"]
merge = countryDF.join(tweetsDF, name_matches_word)
merge.printSchema()
merge.take(5)

In [None]:
# Number of rows in the joined frame, i.e. how many country names also occur
# as a tweet word. Tweet words are unique after reduceByKey, so this equals
# the number of distinct countries mentioned -- assuming each country name
# appears only once in the country list (TODO confirm).
merge.count()

In [None]:
# Total number of country mentions across all tweets: the sum of the
# per-country word counts in the joined frame.
# Import the functions module under an alias instead of importing `sum` by
# name; a bare `from pyspark.sql.functions import sum` would replace Python's
# builtin sum() for every cell executed afterwards in this kernel.
from pyspark.sql import functions as F

merge.select(F.sum('count')).show()

In [None]:
# The three most-mentioned countries: sort by mention count, highest first,
# and display the leading three rows.
from pyspark.sql.functions import desc

countries_by_mentions = merge.select("country", "count").orderBy(desc("count"))
countries_by_mentions.show(3)

In [None]:
# Mean mention count per country, taken over the rows of the joined frame.
from pyspark.sql.functions import avg

mean_mentions = merge.select(avg("count"))
mean_mentions.show()

In [None]:
# Show the joined rows (including the mention count) for three specific
# countries: Kenya, Wales and the Netherlands.
countries_of_interest = ['Kenya', 'Wales', 'Netherlands']
merge.filter(merge["country"].isin(countries_of_interest)).show()
