In [18]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Create (or reuse) the Spark session for this notebook.
# - The app name is "RedditData" because Reddit data is being processed.
# - The Spark master is reached at spark://spark-master:7077.
# - getOrCreate() returns the existing session if there is one, otherwise builds a new one.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("RedditData")
    .getOrCreate()
)

Next, the data is read from the BigQuery table `de2021labs.reddit_data.reddit_data` and stored in `reddit_data`.

In [19]:
# Load the raw Reddit table from BigQuery via the "bigquery" data source.
reddit_data = (
    spark.read
    .format("bigquery")
    .load("de2021labs.reddit_data.reddit_data")
)

In [20]:
# Preview the first five rows to check the columns and their content.
reddit_data.show(n=5)

+-------+-----+-------+----+---------+-------------+--------------------+-------------------+
|  title|score|     id| url|comms_num|      created|                body|          timestamp|
+-------+-----+-------+----+---------+-------------+--------------------+-------------------+
|Comment|    0|fofa0yy|null|        0| 1.58775959E9|Because Anti-Vaxx...|2020-04-24 23:19:50|
|Comment|    0|ej9xuaf|null|        0|1.553474721E9|What do you mean ...|2019-03-25 02:45:21|
|Comment|    0|ejc2imz|null|        0|1.553547167E9|Good thing I didn...|2019-03-25 22:52:47|
|Comment|    0|ejah7aa|null|        0| 1.55348913E9|What is thimerosa...|2019-03-25 06:45:30|
|Comment|    0|ejagfa0|null|        0|1.553488573E9|I never once said...|2019-03-25 06:36:13|
+-------+-----+-------+----+---------+-------------+--------------------+-------------------+
only showing top 5 rows



# Exploratory data analysis
Next, some exploratory data analysis will be performed to understand the data better.

In [21]:
# Total number of rows loaded from the table.
post_count = reddit_data.count()
print("Number of reddit posts: {}\n".format(post_count))

Number of reddit posts: 1597



Next, we will check whether there are posts without any text in the body. These cannot be used for sentiment analysis, and should be removed in the data cleaning step.

In [22]:
# Show a sample of the rows that do have body text (where() is an alias of filter()).
reddit_data.where("body is not null").show()

+-------+-----+-------+----+---------+-------------+--------------------+-------------------+
|  title|score|     id| url|comms_num|      created|                body|          timestamp|
+-------+-----+-------+----+---------+-------------+--------------------+-------------------+
|Comment|    0|fofa0yy|null|        0| 1.58775959E9|Because Anti-Vaxx...|2020-04-24 23:19:50|
|Comment|    0|ej9xuaf|null|        0|1.553474721E9|What do you mean ...|2019-03-25 02:45:21|
|Comment|    0|ejc2imz|null|        0|1.553547167E9|Good thing I didn...|2019-03-25 22:52:47|
|Comment|    0|ejah7aa|null|        0| 1.55348913E9|What is thimerosa...|2019-03-25 06:45:30|
|Comment|    0|ejagfa0|null|        0|1.553488573E9|I never once said...|2019-03-25 06:36:13|
|Comment|    0|ejaftc8|null|        0|1.553488143E9|Sodium and chloride.|2019-03-25 06:29:03|
|Comment|    0|ek76jw5|null|        0|1.554521745E9|Why are you givin...|2019-04-06 06:35:45|
|Comment|    0|ek49q2s|null|        0|1.554430197E9|The prob

In [23]:
# Count the rows whose body is missing; these are removed in the cleaning step.
null_body_count = reddit_data.where("body is null").count()
print("Number of reddit posts that have null in the body: {}\n".format(null_body_count))

Number of reddit posts that have null in the body: 374



# Cleaning
First, the rows with null values for the body will be removed. Next, unnecessary columns will be removed.

In [24]:
# Keep only the rows that have body text (where() is an alias of filter()).
reddit_data = reddit_data.where("body is not null")

In [25]:
# Remove the columns that are not needed for the analysis, in a single call.
reddit_data = reddit_data.drop('comms_num', 'url', 'created')

Create a separate dataframe with only the body text and its timestamp, to perform sentiment analysis on.

In [26]:
# Narrow the frame down to the text and the time it was posted.
reddit_data_text = reddit_data.select('body', 'timestamp')

Add an index to the body text.

In [27]:
# NOTE(review): wildcard import kept as-is; other cells may rely on names it brings into scope.
from pyspark.sql.functions import *

# Attach a generated row id and keep only (id, body); the timestamp column is dropped here.
# monotonically_increasing_id() produces unique, increasing ids, but they are not
# guaranteed to be consecutive (the partition id is encoded in the upper bits).
with_row_id = reddit_data_text.withColumn("id", monotonically_increasing_id())
reddit_data_text = with_row_id.select("id", "body")

In [28]:
# Preview the indexed body text.
reddit_data_text.show(n=5)

+---+--------------------+
| id|                body|
+---+--------------------+
|  0|Because Anti-Vaxx...|
|  1|What do you mean ...|
|  2|Good thing I didn...|
|  3|What is thimerosa...|
|  4|I never once said...|
+---+--------------------+
only showing top 5 rows



# Sentiment analysis

In [29]:
!pip3 install textblob



In [30]:
from textblob import TextBlob

In [31]:
# There was an issue with the textblob module such that it did not allow for application
# of a udf to an entire column. To work around this, the pyspark df is converted to a
# pandas df on the driver and the sentiment is computed there.
#
# The scores are attached to the very rows they were computed from. They are NOT joined
# back on a second monotonically_increasing_id() column: those ids depend on how each
# DataFrame is partitioned, so two independently generated id sequences are not
# guaranteed to line up and the join could silently pair a text with the wrong score.
pandas_df = reddit_data_text.toPandas()

# Polarity is the first field of TextBlob's sentiment result.
pandas_df["value"] = [TextBlob(text).sentiment.polarity for text in pandas_df["body"]]

# Back to Spark with an explicit schema. The score is kept numeric (double) rather than
# being stored as a string, so it can be aggregated without casting.
reddit_sentiments = spark.createDataFrame(
    pandas_df[["id", "body", "value"]],
    schema="id long, body string, value double",
)

# Peek
reddit_sentiments.show(10)

+---+--------------------+--------------------+
| id|                body|               value|
+---+--------------------+--------------------+
|  0|Because Anti-Vaxx...|                 0.0|
|  1|What do you mean ...|-0.08806818181818182|
|  2|Good thing I didn...| 0.13333333333333333|
|  3|What is thimerosa...|                 0.0|
|  4|I never once said...|-0.19166666666666662|
|  5|Sodium and chloride.|                 0.0|
|  6|Why are you givin...| 0.10570779220779221|
|  7|The problem is th...|-0.01515151515151...|
|  8|In this instance,...|                 0.0|
|  9|Anti-vax is [prom...| 0.19999999999999998|
+---+--------------------+--------------------+
only showing top 10 rows



The next step is to save the sentiment analysis df to BigQuery.

In [32]:
# GCS bucket handed to the BigQuery data source as its temporary bucket.
bucket = "dejadsdejads_group10assignment2"
spark.conf.set('temporaryGcsBucket', bucket)

# Register the GCS filesystem classes on the Hadoop configuration.
conf = spark.sparkContext._jsc.hadoopConfiguration()
gcs_settings = {
    "fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
}
for setting_name, setting_value in gcs_settings.items():
    conf.set(setting_name, setting_value)

# Overwrite the target table with the sentiment results.
sentiment_writer = reddit_sentiments.write.format('bigquery').mode("overwrite")
sentiment_writer.option('table', 'de2021labs.reddit_data.reddit_sentiments').save()

# Word count
Besides sentiment analysis, a word count is also performed for the reddit dataset.

In [None]:
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Tokenize
tokenizer = Tokenizer(inputCol='body', outputCol='body_tokenized')
reddit_data_text_tokenized = tokenizer.transform(reddit_data_text).select('id', 'body_tokenized')

# Remove unnecessary words
unncessary_words_list = ["like", "know", "get", 'one', 'think', 'cause', 'say', 'even', "don't", 'got', 'also', 'good', 'said',
                        'make', 'it.', 'first', 'many', 'still', 'actually', "don't", 'want', 'read', 'print', 'vaccine',
                        'vaccines', 'vaccinated', 'vaccination', 'may', 'saying', 'point', 'virus', 'never', 'much', 'see',
                        '1', 'way', 'wrong', 'really', 'used', 'well', 'getting', 'take', 'every', 'go', '>'] 
unncessary_words_list.extend(StopWordsRemover().getStopWords())
remover = StopWordsRemover(inputCol='body_tokenized', outputCol='body_clean', stopWords=unncessary_words_list)
reddit_data_text_no_stopwords = remover.transform(reddit_data_text_tokenized).select(['id', 'body_clean'])

# Return to regular strings
reddit_data_text_no_stopwords = reddit_data_text_no_stopwords.withColumn("body_clean", 
                                                                         concat_ws(",", "body_clean"))

# Count the words
reddit_data_text_no_stopwords.withColumn('body_clean', f.explode(f.split(f.col('body_clean'), ',')))\
    .groupBy('body_clean')\
    .count()\
    .sort('count', ascending=False)\
    .filter(f.col('body_clean') != "")\
    .show()

In [None]:
# Save the word count
bucket = "dejadsdejads_group10assignment2"
spark.conf.set('temporaryGcsBucket', bucket)

conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

reddit_sentiments.write.format('bigquery') \
  .option('table', 'de2021labs.reddit_data.reddit_wordcount') \
  .mode("overwrite") \
  .save()

Finally, the Spark session should be stopped.

In [33]:
# Stop the Spark session now that all results have been written.
spark.stop()