# Analysis of Reddit Comments on Climate Change
This notebook analyzes Reddit comments on climate change. Our team's goal is to: ...

SENG 550 Final Project
- Monmoy Maahdie
- Smitkumar Saraiya
- Farhan Ali
- Kai Ferrer

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list
# import pyspark
from collections import Counter
from pyspark.sql.functions import col, length, regexp_replace, udf, split, explode
from pyspark.sql.types import DoubleType, StringType, IntegerType
import spacy

In [2]:

from pyspark.ml.feature import Tokenizer, HashingTF, VectorAssembler, StopWordsRemover
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


## 2. Create Spark Session

In [3]:
# Initialize spark session
spark = SparkSession.builder \
    .appName("Reddit Climate Change Comments") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.sql.shuffle.partitions", "100") \
    .getOrCreate()

24/12/19 12:27:53 WARN Utils: Your hostname, Smits-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.86 instead (on interface en0)
24/12/19 12:27:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/19 12:27:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 3. Load Data

In [4]:
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words
len(stopwords)

326

In [5]:
nlp.Defaults.stop_words -= {"no", "not"}

In [6]:
# Create dataframe
df = spark.read.csv("the-reddit-climate-change-dataset-comments.csv", header=True, inferSchema=True)
df = df.repartition(100)  #  increase the number of partitions for large datasets - idk what to put 
# df.show(5, truncate=False) # checking the dataset by displaying first 5 rows
df_original = df # save original dataset

24/12/19 12:28:06 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [7]:
new_columns = [col_name.replace('.', '_') for col_name in df.columns]
df = df.toDF(*new_columns)
# df.show(5, truncate=False)

In [8]:
df_clean = df.dropna()
# df_clean.show(n=5, truncate=False)

In [9]:
df_clean = df_clean.drop("permalink")

# Separate records where sentiment and score can be cast to numbers
clean_df = df_clean.filter(
    col("sentiment").cast(DoubleType()).isNotNull() &
    col("score").cast(DoubleType()).isNotNull()
)

# Records where either sentiment or score contain non-numerical values
problematic_df = df_clean.filter(
    col("sentiment").cast(DoubleType()).isNull() |
    col("score").cast(DoubleType()).isNull()
)




In [10]:
# clean_df.filter((df_clean["type"] == "comment") & (df_clean["subreddit_name"] == "technology")).show(n=100, truncate=False) | YOU CAN MIDIFY THE SUBREDDIT NAME TO SEE CLIMATE CHANGE DISCUSSIONS ON DIFFERENT SUBREDDITS
# clean_df.show(n=10, truncate=False) # we want this data

In [11]:
# print(clean_df.count())

In [12]:
# problematic_df.filter(df_clean["type"] == "comment").show(n=5, truncate=False) # this we do not need

In [13]:
clean_df_1 = clean_df.filter(col("type") == "comment") # only comments exist in here
# clean_df_1.show(n=10, truncate=False)

In [14]:
# Load spacy model
nlp = spacy.load('en_core_web_sm')

# Define the preprocessing function
def preprocess(comment):
    doc = nlp(comment)
    processed_words_list = []
    for token in doc:
        if not token.is_punct and not token.like_url and not token.is_stop:
            processed_words_list.append(token.lemma_.strip().lower())
    return ' '.join(processed_words_list)

    
# Create a UDF from the function
preprocess_udf = udf(preprocess, StringType())


In [15]:

# Apply the UDF to create a new column
clean_df_2 = clean_df_1.withColumn('processed_body', preprocess_udf(col('body')))
clean_df_2.show(n=10, truncate=False)

[Stage 4:>                                                          (0 + 1) / 1]

+-------+-------+------------+--------------------+--------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [16]:
def create_label(sentiment):
    if float(sentiment) < -0.05:
        return -1
    elif float(sentiment) >= -0.05 and float(sentiment) <= 0.05:
        return 0
    else:
        return 1

create_label_udf = udf(create_label, IntegerType())



In [17]:
clean_df_3 = clean_df_2.withColumn('label', create_label_udf(col('sentiment')))
# Count occurrences of each subreddit_name and order by count in descending order
subreddit_counts = clean_df_3.groupBy('subreddit_name') \
    .count() \
    .orderBy('count', ascending=False)

# Show the results



In [18]:
# print("Most common subreddits:")
# subreddit_counts.show(10, truncate=False)

In [19]:
# # If you need the total number of unique subreddits
# unique_subreddits = subreddit_counts.count()
# print(f"\nTotal number of unique subreddits: {unique_subreddits}")

In [20]:
# subreddit_counts.filter(subreddit_counts['subreddit_name'] == 'climate').show()
clean_df_3 = clean_df_3.withColumn("sentiment", col("sentiment").cast(DoubleType()))


In [21]:

# print("At tokenizer")
# # Tokenize comment text
# tokenizer = Tokenizer(inputCol="processed_body", outputCol="words")

# # Transform words into numerical features
# print("At hashingTF")
# hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=10000)

# # Define the model
# print("At linear regression")
# lr = LinearRegression(featuresCol="features", labelCol="sentiment")

# # Create a pipeline
# print("At pipeline")
# pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# # Split the data
# print("Data split")
# (train_data, test_data) = clean_df_3.randomSplit([0.01, 0.01])

# # Train the model
# print("model traiing")
# model = pipeline.fit(train_data)

# # Make predictions
# print("Predictions")
# predictions = model.transform(test_data)

# # Evaluate the model
# print("Mordel eval")
# evaluator = RegressionEvaluator(labelCol="sentiment", predictionCol="prediction", metricName="rmse")
# rmse = evaluator.evaluate(predictions)
# print("Root Mean Squared Error (RMSE) on test data =", rmse)


## Spark Warehouse

just using whatever Kai put down for spark warehousing 

In [22]:
spark.sql("CREATE DATABASE reddit_db") 

DataFrame[]

In [23]:
spark.sql("SHOW DATABASES").show() # check that reddit_db is in here

+---------+
|namespace|
+---------+
|  default|
|reddit_db|
+---------+



In [24]:
spark.sql("SHOW TABLES").show() # should be empty tables

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [25]:
spark.sql("DROP TABLE IF EXISTS reddit_db.comments")

DataFrame[]

In [26]:
spark.sql("""
CREATE TABLE IF NOT EXISTS reddit_db.comments (
    `type` STRING,
    `id` STRING,
    `subreddit_id` STRING,
    `subreddit_name` STRING,
    `subreddit_nsfw` STRING,
    `created_utc` STRING,
    `body` STRING,
    `sentiment` STRING,
    `score` STRING,
    `processed_body` STRING,
    `label` STRING
)
USING PARQUET
""")

DataFrame[]

In [27]:
spark.sql("SHOW TABLES").show() # should be updated to have one table now

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [28]:
clean_df_3.show(5, truncate=False)

[Stage 7:>                                                          (0 + 1) / 1]

+-------+-------+------------+--------------------+--------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|type   |id     |sub

                                                                                

In [29]:
# align the columns - spark only accepts '_' but the dataset uses '.'
df_aligned = clean_df_3

# this is lowkey still transformation?

In [30]:
df_aligned.printSchema() # double check


root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name: string (nullable = true)
 |-- subreddit_nsfw: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- body: string (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- score: string (nullable = true)
 |-- processed_body: string (nullable = true)
 |-- label: integer (nullable = true)



In [31]:
spark.sql("SHOW TABLES").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [32]:
df_aligned.show(n=5, truncate=False)

[Stage 10:>                                                         (0 + 1) / 1]

+-------+-------+------------+--------------------+--------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|type   |id     |sub

                                                                                

In [None]:
df_aligned.write.insertInto("reddit_db.comments", overwrite=False) # insert data from csv/df into spark table

In [34]:
spark.sql("SELECT * FROM reddit_db.comments LIMIT 100").show() #validate the table

+----+---+------------+--------------+--------------+-----------+----+---------+-----+--------------+-----+
|type| id|subreddit_id|subreddit_name|subreddit_nsfw|created_utc|body|sentiment|score|processed_body|label|
+----+---+------------+--------------+--------------+-----------+----+---------+-----+--------------+-----+
+----+---+------------+--------------+--------------+-----------+----+---------+-----+--------------+-----+



In [None]:
df_tokens = df_aligned.withColumn("words", split(col("body"), r"\s+"))
df_tokens = df_tokens.filter(df_tokens["words"].isNotNull())
df_tokens.show(5) #check if words column created

[Stage 13:> (0 + 8) / 100][Stage 14:>  (0 + 0) / 31][Stage 15:>  (0 + 0) / 31]

In [None]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

In [None]:
df_aligned_words = remover.transform(df_tokens)
df_aligned_words.show(100) 

In [None]:
# explode - helps so that each word appears in a separate row so we can count frequency
df_exploded = df_aligned_words.withColumn("word", explode(col("filtered_words")))
df_exploded.show(100)

In [None]:
df_word_count = df_exploded.groupBy("word").count().orderBy("count", ascending=False)
df_word_count.show(10)

In [None]:
spark.stop()