In [0]:
#dataset link
#https://github.com/Qian-Han/coursera-Big-Data-specialization/blob/Qian-Han/Big-Data-Integration-and-Processing/week5/SoccerTweetAnalysis.ipynb

# Import and create a new SQLContext 
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [0]:
# Read the country CSV file into an RDD.
country_lines = sc.textFile('dbfs:/FileStore/shared_uploads/AKSHATAKHEDEKAR01032001@rjcollege.edu.in/country_list.csv')

In [0]:
# Convert each line into a pair of words
country_lines.map(lambda a: a.split(",")).collect()

Out[8]: [['"Afghanistan', ' AFG"'],
 ['"Albania', ' ALB"'],
 ['"Algeria', ' ALG"'],
 ['"American Samoa', ' ASA"'],
 ['"Andorra', ' AND"'],
 ['"Angola', ' ANG"'],
 ['"Anguilla', ' AIA"'],
 ['"Antigua and Barbuda', ' ATG"'],
 ['"Argentina', ' ARG"'],
 ['"Armenia', ' ARM"'],
 ['"Aruba', ' ARU"'],
 ['"Australia', ' AUS"'],
 ['"Austria', ' AUT"'],
 ['"Azerbaijan', ' AZE"'],
 ['"Bahamas', ' BAH"'],
 ['"Bahrain', ' BHR"'],
 ['"Bangladesh', ' BAN"'],
 ['"Barbados', ' BRB"'],
 ['"Belarus', ' BLR"'],
 ['"Belgium', ' BEL"'],
 ['"Belize', ' BLZ"'],
 ['"Benin', ' BEN"'],
 ['"Bermuda', ' BER"'],
 ['"Bhutan', ' BHU"'],
 ['"Bolivia', ' BOL"'],
 ['"Bosnia and Herzegovina', ' BIH"'],
 ['"Botswana', ' BOT"'],
 ['"Brazil', ' BRA"'],
 ['"British Virgin Islands', ' VGB"'],
 ['"Brunei', ' BRU"'],
 ['"Bulgaria', ' BUL"'],
 ['"Burkina Faso', ' BFA"'],
 ['"Burundi', ' BDI"'],
 ['"Cambodia', ' CAM"'],
 ['"Cameroon', ' CMR"'],
 ['"Canada', ' CAN"'],
 ['"Cape Verde', ' CPV"'],
 ['"Cayman Islands', ' CAY"'],
 ['"Ce

In [0]:
# Convert each pair of words into a tuple
country_tuples = country_lines.map(lambda a: (a.split(",")[0].lower(), a.split(",")[1]))


In [0]:
# Create the DataFrame, look at schema and contents
countryDF = sqlContext.createDataFrame(country_tuples, ["country", "code"])
countryDF.printSchema()
countryDF.take(3)

root
 |-- country: string (nullable = true)
 |-- code: string (nullable = true)

Out[10]: [Row(country='"afghanistan', code=' AFG"'),
 Row(country='"albania', code=' ALB"'),
 Row(country='"algeria', code=' ALG"')]

In [0]:
# Read tweets CSV file into RDD of lines
tweets = sc.textFile('dbfs:/FileStore/shared_uploads/AKSHATAKHEDEKAR01032001@rjcollege.edu.in/tweets.csv')
tweets.count()

Out[12]: 13994

In [0]:
# Clean the data: some tweets are empty. Remove the empty tweets using filter() 
filtered_tweets = tweets.filter(lambda a: len(a) > 0)
filtered_tweets.count()

Out[13]: 13390

In [0]:
# Perform WordCount on the cleaned tweet texts. (note: this is several lines.)
word_counts = filtered_tweets.flatMap(lambda a: a.split(" ")) \
    .map(lambda word: (word.lower(), 1)) \
    .reduceByKey(lambda a, b: a + b)

In [0]:
from pyspark.sql import HiveContext
from pyspark.sql.types import *

# sc is an existing SparkContext.
sqlContext = HiveContext(sc)

schemaString = "word count"

fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)

# Create the DataFrame of tweet word counts
tweetsDF = sqlContext.createDataFrame(word_counts, schema)
tweetsDF.printSchema()
tweetsDF.count()

root
 |-- word: string (nullable = true)
 |-- count: string (nullable = true)

Out[15]: 25031

In [0]:
# Join the country and tweet DataFrames (on the appropriate column)
joined = countryDF.join(tweetsDF, countryDF.country == tweetsDF.word)
joined.take(5)
joined.show()

+--------+-----+--------+-----+
| country| code|    word|count|
+--------+-----+--------+-----+
|"germany| GER"|"germany|    2|
|  "wales| WAL"|  "wales|    1|
|  "spain| ESP"|  "spain|    3|
+--------+-----+--------+-----+



In [0]:
# Question 1: number of distinct countries mentioned
distinct_countries = joined.select("country").distinct()
distinct_countries.show(100)

+--------+
| country|
+--------+
|"germany|
|  "spain|
|  "wales|
+--------+



In [0]:
# Question 2: number of countries mentioned in tweets.
from pyspark.sql.functions import sum
from pyspark.sql import SparkSession
from pyspark.sql import Row

countries_count = joined.groupBy("country")
joined.createOrReplaceTempView("records")
spark.sql("SELECT country, count(*) count1 FROM records group by country order by count1 desc, country asc").show(100)

+--------+------+
| country|count1|
+--------+------+
|"germany|     1|
|  "spain|     1|
|  "wales|     1|
+--------+------+

