In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, lower
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, ArrayType

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
# Define date
year = "2020"
month = "05"
day = "29"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [27]:
dt = "{}-{}-{}".format(year, month, day)
# Input location
database = "covid_project"
table = "{}.twitter_data".format(database)

# Output location
my_bucket = "<s3-bucket-name>"
output_path = "s3://{}/notebook/covid_twitter_etl2/".format(my_bucket)
output_table = "{}.covid_twitter_etl2_ntbk".format(database)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Defining functions
def getTop10(map_collection):
    # Sort in order of highest word count 
    sorted_x = sorted(map_collection.items(), key=lambda kv: kv[1], reverse=True)
    top10words = []

    for i in sorted_x:
        if len(top10words) >= 10:
            break
        
        if i[0] != "":    # remove empty word
            top10words.append(i[0])
    return top10words

def calculateTop10Words(df2):
    # Convert tweet to an array
    df_transf_new= df2.withColumn("tweet", split(lower(col("tweet")), " "))

    # Count appearence of each word
    d = df_transf_new.rdd.flatMap(lambda a: a.tweet).countByValue()
    return getTop10(d)

def calculateTop10HashtagsPerCountry(df2):
    # Convert hashtags to an array
    df_transf_new= df2.withColumn(
        "hashtags",
        split(regexp_replace(lower(col("hashtags")), r"(^\[)|(\]$)|(')", ""), ", ")
    )
    d = df_transf_new.rdd.flatMap(lambda a: a.hashtags).countByValue()
    return getTop10(d)

def calculateNumberOfCovidTweetsRetrieved(df2):
    return df2.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
# Create Session
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .enableHiveSupport() \
    .config("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive","true") \
    .config("hive.mapred.supports.subdirectories","true") \
    .getOrCreate()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [30]:
# Let's display table DDL
spark.sql('show create table {}'.format(table)).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|      createtab_stmt|
+--------------------+
|CREATE EXTERNAL T...|
+--------------------+

In [31]:
# Read into DF -> drop any rows that have 'None' as hashtag
query = 'select country, tweet, lower(hashtags) as hashtags from {} where year="{}" AND month="{}" AND day="{}" AND hashtags NOT LIKE "%None%"'.format(table, year, month, day)
df = spark.sql(query)
df.persist()
df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- country: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- hashtags: string (nullable = true)

In [32]:
# Define schema for final dataframe
final_schema = StructType([
        StructField('country', StringType(), False),
        StructField('day', StringType(), False),
        StructField('numberOfCovidTweetsRetrieved', IntegerType(), False),
        StructField('top10WordsPerCountry', ArrayType(StringType()), False),
        StructField('top10HashtagsPerCountry', ArrayType(StringType()), False)
])

row_list = []

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [33]:
# Get unique countries
countries_code = df.select('country').distinct().collect()
countries_code[:3] # Get first 3 values

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(country='MM'), Row(country='LT'), Row(country='DZ')]

In [34]:
# Calculate top 10 hashtags and words for each country
for country in countries_code:
    country = country['country']
    c_df = df.filter(df.country == country)
    top10WordsPerCountry = calculateTop10Words(c_df)
    top10HashtagsPerCountry = calculateTop10HashtagsPerCountry(c_df)
    numberOfCovidTweetsRetrieved = calculateNumberOfCovidTweetsRetrieved(c_df)
    new_row = (country, dt, numberOfCovidTweetsRetrieved, top10WordsPerCountry, top10HashtagsPerCountry)
    row_list.append(new_row)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [35]:
# Create dataframe from above list
final_df = spark.createDataFrame(row_list, final_schema)
final_df.show(4)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+----------+----------------------------+--------------------+-----------------------+
|country|       day|numberOfCovidTweetsRetrieved|top10WordsPerCountry|top10HashtagsPerCountry|
+-------+----------+----------------------------+--------------------+-----------------------+
|     MM|2020-05-29|                           1|[., shwe, dagon, ...|   [throwbacktravel,...|
|     LT|2020-05-29|                           1|[yra, kaip, apie,...|              [covid19]|
|     DZ|2020-05-29|                           4|[#covid19, de, .,...|   [samsunggalaxynot...|
|     CI|2020-05-29|                           4|[de, la, le, au, ...|   [dssr, covid19, d...|
+-------+----------+----------------------------+--------------------+-----------------------+
only showing top 4 rows

In [36]:
# Write to Hive table and S3
final_df.repartition(1).write.mode("append").option("path", output_path).format("Parquet").saveAsTable(output_table)
print("Wrote the output!")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Wrote the output!