In [None]:

# This notebook is my solution to a college homework assignment.
# The goal is to extract analytics by formulating a series of queries
# over the data contained in this dataset.

# The project uses three different tools to answer the same analytical
# questions: MongoDB, Neo4j and Apache Spark.
# This notebook contains the Apache Spark implementation.


In [None]:
pip install pyspark

In [None]:
# Import required modules
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

# Start the Spark session used by every query below, or reuse one that is
# already running under the same application name
spark = (
    SparkSession.builder
    .appName("TikTokHomework")
    .getOrCreate()
)

In [None]:
# Column layout of the CSV as (name, Spark type) pairs, in file order.
# Every field is declared non-nullable.
column_types = [
    ("comment_id", StringType),
    ("text", StringType),
    ("video_id", StringType),
    ("create_time", StringType),
    ("like_count", IntegerType),
    ("status", BooleanType),
    ("author.unique_id", StringType),
    ("author.nickname", StringType),
    ("author.is_private", BooleanType),
    ("author.language", StringType),
    ("author.signature", StringType),
    ("author.custom_verify", BooleanType),
    ("author.uid", StringType),
    ("author.sec_uid", StringType),
    ("author.avatar_thumb", StringType),
    ("author.region", StringType),
    ("author.ins_id", StringType),
    ("author.youtube_channel_title", StringType),
    ("author.youtube_channel_id", StringType),
    ("author.twitter_id", StringType),
]

schema = [StructField(name, dtype(), False) for name, dtype in column_types]
final_schema = StructType(fields=schema)

# Load the TikTok video comments; multiLine lets a quoted field span several
# lines and a double quote is used as the escape character
csv_path = "../input/tiktok-video-comments-david-dobriks-top-videos/final_data.csv"
dat = spark.read.csv(
    csv_path,
    schema=final_schema,
    header=True,
    multiLine=True,
    escape='"',
)

# The SQL view is registered on the frame as loaded, i.e. before create_time
# is reformatted below
dat.createOrReplaceTempView("Comments")

# Rewrite create_time as a 'yyyy-MM-dd HH:mm:ss' formatted value
# (assumes the raw column holds Unix epoch seconds - TODO confirm)
dat = dat.withColumn(
    "create_time",
    from_unixtime("create_time", "yyyy-MM-dd HH:mm:ss"),
)

In [None]:
# 1. How much content per author?
# One row per author with the number of comments, most active authors first
comments_per_author = dat.groupBy("`author.unique_id`").count()
comments_per_author.orderBy("count", ascending=False).show()

In [None]:
# 2. How many likes per author?
# Total likes collected by each author, most liked authors first.
# `sum` here is the Spark SQL function brought in by the star import.
likes_per_author = dat.groupBy("`author.unique_id`").sum("`like_count`")
likes_per_author.orderBy(sum('`like_count`'), ascending=False).show()

In [None]:
# 3. TagCloud of terms

# Count how many times each space-separated token occurs across all comments.
# No sort is needed: the cloud is built from the (word, count) pairs directly.
tmpDF = (
    dat.withColumn("WORD", explode(split(col("text"), " ")))
    .groupBy("WORD")
    .count()
)

# Bring the aggregated counts to the driver as a pandas DataFrame
pandasDF = tmpDF.toPandas()

stopwords = set(STOPWORDS)

# Build the word -> frequency mapping for the cloud.
# Concatenating each distinct word once would discard the counts, so the
# frequencies are passed explicitly. generate_from_frequencies ignores the
# `stopwords` argument, hence stop words (matched case-insensitively) and
# empty tokens produced by repeated spaces are removed here.
frequencies = {
    word: int(count)
    for word, count in zip(pandasDF["WORD"], pandasDF["count"])
    if word.strip() and word.lower() not in stopwords
}

if frequencies:
    # Generate the TagCloud sized by actual term usage
    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
    ).generate_from_frequencies(frequencies)

    # Visualize the TagCloud
    plt.figure(figsize=(8,8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
else:
    # WordCloud cannot draw an empty vocabulary
    print("No terms available to build the TagCloud.")

In [None]:
# 4. What are the top 5 most adopted terms by each author?

# Occurrences of every term per author; empty tokens produced by repeated
# spaces are not terms and are dropped
tmpDF = (
    dat.withColumn("WORD", explode(split(col("text"), " ")))
    .where(col("WORD") != "")
    .groupBy("`author.unique_id`", "WORD")
    .count()
)

# collect_list does not guarantee that a sort applied before groupBy survives
# the aggregation, so the ranking is done inside it: (count, WORD) structs are
# collected and sorted in descending order, i.e. by count first and by word
# on ties, which makes the result deterministic
tmpDF = tmpDF.groupBy("`author.unique_id`").agg(
    sort_array(
        collect_list(struct(col("count"), col("WORD"))),
        asc=False,
    ).alias("RANKED_WORDS")
)

# Keep the words of the first five ranked entries
finDF = tmpDF.withColumn("TOP5WORDS", slice(col("RANKED_WORDS.WORD"), 1, 5))
finDF.select(col("`author.unique_id`").alias("USER"), col("TOP5WORDS")).show()

In [None]:
# 5. What are the 5 most frequently used terms in each of the daily time slots (i.e., Morning, Afternoon, Evening, Night)?

# Map the posting hour to a slot; the conditions are evaluated in order, so
# each bound only needs its upper limit:
#   [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, everything else Evening
# (rows whose hour cannot be computed also fall into "Evening")
comment_hour = hour("create_time")
time_slot = (
    when(comment_hour < 6, "Night")
    .when(comment_hour < 12, "Morning")
    .when(comment_hour < 18, "Afternoon")
    .otherwise("Evening")
)

# Occurrences of every term per slot; empty tokens produced by repeated
# spaces are not terms and are dropped
tmpDF = (
    dat.withColumn("Time_Slot", time_slot)
    .withColumn("WORD", explode(split(col("text"), " ")))
    .where(col("WORD") != "")
    .groupBy("Time_Slot", "WORD")
    .count()
)

# collect_list does not guarantee that a sort applied before groupBy survives
# the aggregation, so (count, WORD) structs are collected and sorted in
# descending order inside it (by count first, by word on ties)
tmpDF = tmpDF.groupBy("Time_Slot").agg(
    sort_array(
        collect_list(struct(col("count"), col("WORD"))),
        asc=False,
    ).alias("RANKED_WORDS")
)

# Keep the words of the first five ranked entries
finDF = tmpDF.withColumn("TOP5WORDS", slice(col("RANKED_WORDS.WORD"), 1, 5))
finDF.select(col("Time_Slot"), col("TOP5WORDS")).show()

In [None]:
# 6. Distribution of contents in the various languages

# Number of comments for each author language
language_distribution = dat.groupBy("`author.language`").count()
language_distribution.show()

In [None]:
# 7. Authors, in alphabetical order, who have received more than 100 likes

# Total likes per author, restricted to totals above 100 and listed by
# author id in ascending order
likes_by_author = dat.groupBy("`author.unique_id`").sum("`like_count`")
popular_authors = likes_by_author.where("sum(`like_count`) > 100")
popular_authors.orderBy("`author.unique_id`", ascending=True).show()

In [None]:
# 8. Given as input a term, show the graph of its use over time

# Input
# input_term = str(input("Enter a term: "))
# For this example we'll use: david
input_term = "david"

# Count the occurrences of the term per month.
# The term is compared through a Column expression instead of being pasted
# into a SQL string, so quotes in the input cannot break or alter the filter,
# and the filter runs before the aggregation so only matching rows are grouped.
# NOTE(review): MONTH is the calendar month number, so the same month of
# different years is added together - confirm the data covers a single year.
tmpDF = (
    dat.withColumn("MONTH", month("create_time"))
    .withColumn("WORD", explode(split(col("text"), " ")))
    .where(col("WORD") == input_term)
    .groupBy("WORD", "MONTH")
    .count()
    .sort("MONTH", ascending=True)
)

# Bring the (small) result to the driver as a pandas DataFrame
pandasDF = tmpDF.toPandas()

# x (month) and y (frequency) values for the plot
x = pandasDF["MONTH"].tolist()
y = pandasDF["count"].tolist()

# Checking if there is actually data (i.e. if the term exists)
if pandasDF.empty:
    print("Termine non presente.")
else:
    # Plot the graphs
    fig = plt.figure()
    plt.plot(x, y, marker = "o", color = "red")
    fig.suptitle("Term: {0}".format(input_term))
    plt.xlabel("Month")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# 9. Given a time slot as input, show the TagCloud of the terms used

# Time slot input
# input_hour = input("Enter a time slot (Night, Morning, Afternoon, Evening): ")
# For this example we'll use: Night
input_hour = "Night"

# Checking the validity of the input before any Spark work is started
valid_slots = ("Night", "Morning", "Afternoon", "Evening")
while input_hour not in valid_slots:
    input_hour = input("Error! Enter a time slot (Night, Morning, Afternoon, Evening):")

# Same slot definition as in the fifth query; the conditions are evaluated in
# order, so each bound only needs its upper limit:
#   [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, everything else Evening
comment_hour = hour("create_time")
time_slot = (
    when(comment_hour < 6, "Night")
    .when(comment_hour < 12, "Morning")
    .when(comment_hour < 18, "Afternoon")
    .otherwise("Evening")
)

# Count the terms used in the requested slot only
tmpDF = (
    dat.withColumn("Time_Slot", time_slot)
    .where(col("Time_Slot") == input_hour)
    .withColumn("WORD", explode(split(col("text"), " ")))
    .groupBy("WORD")
    .count()
)

# Bring the aggregated counts to the driver as a pandas DataFrame
pandasDF = tmpDF.toPandas()

stopwords = set(STOPWORDS)

# Build the word -> frequency mapping for the cloud.
# Adding each distinct word once would discard the counts, so the frequencies
# are passed explicitly. generate_from_frequencies ignores the `stopwords`
# argument, hence stop words (matched case-insensitively) and empty tokens
# produced by repeated spaces are removed here.
frequencies = {
    word: int(count)
    for word, count in zip(pandasDF["WORD"], pandasDF["count"])
    if word.strip() and word.lower() not in stopwords
}

if frequencies:
    # Generate the TagCloud sized by actual term usage
    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
    ).generate_from_frequencies(frequencies)

    # Visualize the TagCloud
    plt.figure(figsize=(8,8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
else:
    # Nothing to draw: avoids showing a cloud left over from an earlier cell
    print("No terms found for time slot: {0}".format(input_hour))