In [None]:
import warnings
import pandas as pd
import pyspark
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import FloatType
from pyspark.sql.functions import when, col, to_date, lag
from pyspark.sql.functions import to_timestamp, count, isnan, isnull, mean, rand

warnings.filterwarnings("ignore")

In [None]:
mongo_uri = "mongodb://hadoop-vm.internal.cloudapp.net:27017/ca2"

# Spark version 3.2.3
# MongoDB version 6.0.5
# Java Version 11

# Jars dependencies available in maven repository
# https://mvnrepository.com/search?q=mongodb-driver-sync
# `spark.jars.packages` holds ONE comma-separated list of Maven coordinates.
# Calling .config("spark.jars.packages", ...) repeatedly overwrites the key, so
# only the last coordinate would be kept; join them into a single value instead.
mongo_packages = ",".join([
    "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1",
    "org.mongodb:mongodb-driver-core:4.9.1",
    "org.mongodb:mongodb-driver-sync:4.9.1",
    "org.mongodb:bson:4.9.1",
])

# create a spark session
spark = SparkSession.builder \
    .appName('Tweets') \
    .config("spark.mongodb.read.connection.uri", mongo_uri) \
    .config("spark.mongodb.write.connection.uri", mongo_uri) \
    .config("spark.jars.packages", mongo_packages) \
    .getOrCreate()


# Load vaccine tweets for analysis
> Note: the `vaccin_tweets_2_202004` collection includes the missing month of April 2020.

In [None]:
# Read two MongoDB collections from database "ca2" into DataFrames:
# the tweets collection -> `df_tweets`, its "_sentiment" companion -> `df_sentiment`.
mongo_read_options = {
    "connection.uri": mongo_uri,
    "database": "ca2",
}

df_tweets = (
    spark.read.format("mongodb")
    .options(collection="vaccin_tweets_2_202004", **mongo_read_options)
    .load()
)

df_sentiment = (
    spark.read.format("mongodb")
    .options(collection="vaccin_tweets_2_202004_sentiment", **mongo_read_options)
    .load()
)

In [None]:
# Add an integer column `is_retweet`: 1 when `text` begins with "RT", 0 otherwise
# (the LIKE pattern "RT%" only matches a leading "RT", not one inside the text).
starts_with_rt = col("text").like("RT%")
df_tweets = df_tweets.withColumn("is_retweet", starts_with_rt.cast("integer"))

In [None]:
# Preview the first rows of the tweets DataFrame (now including `is_retweet`).
df_tweets.show()

In [None]:
# Preview the first rows of the sentiment DataFrame.
df_sentiment.show()

In [None]:
# Combine tweets with their sentiment rows. The inner join keeps only the
# `_id` values that appear in both DataFrames.
df = df_tweets.join(other=df_sentiment, on=["_id"], how="inner")


In [None]:
# Normalise column types for the analysis: parse `timestamp`, derive a
# calendar `date` from it, and store `s_probability` as a float.
df = (
    df.withColumn("timestamp", to_timestamp(col("timestamp")))
      .withColumn("date", col("timestamp").cast("date"))
      .withColumn("s_probability", col("s_probability").cast("float"))
)

In [None]:
# Register the joined data as SQL view "tweets" and print the date range it covers.
df.createOrReplaceTempView("tweets")
date_range_query = "SELECT MAX(date) maxdate, MIN(date) mindate FROM tweets"
spark.sql(date_range_query).show()

## Initial dates

In [None]:
# Keep only rows dated on or after the analysis start date (lower bound only).
# `start_date` is the same day as a datetime, used for date arithmetic in later cells.
start_date_str = '2020-04-01'
start_date = datetime.strptime(start_date_str, "%Y-%m-%d")

date_lower_bound = f"date >= date'{start_date_str}'"
df = df.where(date_lower_bound)

In [None]:
# Re-register the filtered data as view "tweets" and print its date range again
# to confirm the effect of the start-date filter.
df.createOrReplaceTempView("tweets")
date_range_query = "SELECT MAX(date) maxdate, MIN(date) mindate FROM tweets"
spark.sql(date_range_query).show()

In [None]:
# Encode the sentiment label as an integer (positive -> 2, negative -> -1,
# anything else -> 1) and weight it by the classifier probability.
sentiment_code = (
    when(col("sentiment") == "positive", 2)
    .when(col("sentiment") == "negative", -1)
    .otherwise(1)
    .cast("int")
)
weighted_score = (col("sentiment_encoded") * col("s_probability")).cast("float")

df = (
    df.withColumn("sentiment_encoded", sentiment_code)
      .withColumn("sentiment_score", weighted_score)
)

In [None]:
# Print summary statistics for the DataFrame.
# describe() only returns a new (lazy) DataFrame; without .show() the cell
# would display just the schema repr and no statistics.
df.describe().show()

In [None]:
# Preview the date, retweet flag and sentiment columns.
preview_cols = ["date", "is_retweet", "sentiment_encoded", "s_probability", "sentiment_score"]
df.select(*preview_cols).show()

In [None]:
# Aggregate per date: the mean sentiment score and the retweet rate
# (mean of the 0/1 `is_retweet` flag). Both are computed in a single agg().
# The result is sorted by date.
df_grouped = (
    df.groupBy("date")
      .agg(
          mean("sentiment_score").alias("avg_sentiment_score"),
          mean("is_retweet").alias("retweet_rate"),
      )
      .orderBy("date")
)


In [None]:
# Preview the first rows of the per-date aggregates.
df_grouped.show()

## Daily seasonality

In [None]:
def build_vector(lag_periods, df_grouped, skip_to, handle_invalid="error"):
    """Add lagged sentiment features and assemble them into a `features` vector.

    For every ``i`` in ``lag_periods`` a column ``sentiment_score_lag_{i}`` is
    added, holding ``avg_sentiment_score`` from ``i`` rows earlier when rows are
    ordered by ``date``. Rows dated on or before ``skip_to`` are then dropped,
    and the lag columns plus ``retweet_rate`` are merged into one vector column.

    Args:
        lag_periods: Iterable of integer row offsets to lag by.
        df_grouped: DataFrame with ``date``, ``avg_sentiment_score`` and
            ``retweet_rate`` columns. Lags are counted in rows, so a lag of
            ``i`` equals ``i`` days only if there is exactly one row per
            calendar day -- TODO confirm there are no missing dates.
        skip_to: Cut-off date (``datetime``/``date`` or an ISO date string).
            Only rows with ``date`` strictly after it are kept, which lets the
            caller discard leading rows whose lag values are still null.
        handle_invalid: Passed to ``VectorAssembler(handleInvalid=...)``.
            The default ``"error"`` matches VectorAssembler's own default.

    Returns:
        The filtered DataFrame with the lag columns and a ``features`` column.
    """
    # Materialise once: the offsets are walked twice (column creation and
    # feature list), which would silently break for a generator argument.
    lags = list(lag_periods)

    # No partitionBy: the lag is computed over the whole frame ordered by date.
    date_window = Window.orderBy("date")

    lagged = df_grouped
    for i in lags:
        lagged = lagged.withColumn(
            f'sentiment_score_lag_{i}',
            lag(col('avg_sentiment_score'), i).over(date_window),
        )

    # Render the cut-off as a plain ISO date so the SQL date literal does not
    # carry a datetime's time-of-day part; strings are passed through as-is.
    cutoff = skip_to.strftime("%Y-%m-%d") if hasattr(skip_to, "strftime") else str(skip_to)
    lagged = lagged.filter(f"date > date'{cutoff}'")

    input_cols = [f'sentiment_score_lag_{i}' for i in lags] + ["retweet_rate"]

    # Assembler Vector - A feature transformer that merges multiple columns into a vector column.
    # https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html
    va = VectorAssembler(
        inputCols=input_cols,
        outputCol='features',
        handleInvalid=handle_invalid)

    return va.transform(lagged)


## Linear regression

# Daily lagging periods up to 7 Days

Generates features:
- retweet_rate
- sentiment_score_lag_1 day
- sentiment_score_lag_2 days
- sentiment_score_lag_3 days
- sentiment_score_lag_4 days
- sentiment_score_lag_5 days
- sentiment_score_lag_6 days
- sentiment_score_lag_7 days

In [None]:
# Daily model: lags of 1..7. Rows up to start_date + 7 days (the longest lag)
# are skipped.
daily_lags = list(range(1, 8))
skip_until = start_date + timedelta(days=max(daily_lags))
va_df = build_vector(daily_lags, df_grouped, skip_until)

In [None]:
# Peek at the date of one remaining row to check the skip filter.
va_df.select("date").show(1)

In [None]:
# Prepare train and test datasets: chronological split at `split_date`
# (train: date <= split_date, test: date > split_date).
# NOTE(review): the weekly and biweekly experiments split at 2021-01-01 and
# 2021-02-01, so their test metrics do not cover the same period as this one.
split_date = "2020-12-31"
train = va_df.filter(va_df.date <= split_date)
test = va_df.filter(va_df.date > split_date)

# Linear regression model
lr = LinearRegression(featuresCol='features', labelCol='avg_sentiment_score')

# Fit the model
lr_model_d = lr.fit(train)

# Make predictions on the held-out test rows
predictions = lr_model_d.transform(test)

# `lr_model_d.summary` describes the fit on the TRAINING data, so these values
# are labelled as such and not mixed with the test-set metrics below.
train_summary = lr_model_d.summary
print("Train MSE: ", train_summary.meanSquaredError)
print("Train MAE: ", train_summary.meanAbsoluteError)
print("Train R-squared: ", train_summary.r2)

# Evaluate the same metrics on the test predictions so that train and test
# can be compared like for like.
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='avg_sentiment_score')
for metric_name, label in (("mse", "MSE"), ("mae", "MAE"), ("r2", "R-squared"), ("rmse", "RMSE")):
    metric_value = evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    print(f"Test {label}: {metric_value}")


## Plot predictions

In [None]:
# Collect the test predictions once, ordered by date, so that actual and
# predicted values line up row by row and are plain floats for matplotlib.
mdata = lr_model_d.transform(test)
plot_rows = (
    mdata.select("date", "avg_sentiment_score", "prediction")
         .orderBy("date")
         .collect()
)
x_ax = [row["date"] for row in plot_rows]
y_orig = [row["avg_sentiment_score"] for row in plot_rows]
y_pred = [row["prediction"] for row in plot_rows]

fig, ax = plt.subplots()
ax.plot(x_ax, y_orig, label="original")
ax.plot(x_ax, y_pred, label="predicted")
ax.set_title("Tweets Sentiment test and predicted data")
ax.set_xlabel("Date")
ax.set_ylabel("Average sentiment score")
ax.legend(loc='best', fancybox=True, shadow=True)
ax.grid(True)
fig.autofmt_xdate()  # tilt the date tick labels so they do not overlap
plt.show()

# Weekly lagging periods 1, 7, 14, 21, 28, 35 days:

Generates features:
- retweet_rate
- sentiment_score_lag_1 day
- sentiment_score_lag_7 days
- sentiment_score_lag_14 days
- sentiment_score_lag_21 days
- sentiment_score_lag_28 days
- sentiment_score_lag_35 days


In [None]:
# Weekly model: a 1-day lag plus lags at multiples of 7 up to 35. Rows up to
# start_date + 35 days (the longest lag) are skipped.
weekly_lags = [1, 7, 14, 21, 28, 35]
skip_until = start_date + timedelta(days=max(weekly_lags))
va_df = build_vector(weekly_lags, df_grouped, skip_until)

In [None]:
# Peek at the date of one remaining row to check the skip filter.
va_df.select("date").show(1)

In [None]:
# Prepare train and test datasets: chronological split at `split_date`
# (train: date <= split_date, test: date > split_date).
# NOTE(review): the daily and biweekly experiments split at 2020-12-31 and
# 2021-02-01, so their test metrics do not cover the same period as this one.
split_date = "2021-01-01"
train = va_df.filter(va_df.date <= split_date)
test = va_df.filter(va_df.date > split_date)

# Linear regression model
lr = LinearRegression(featuresCol='features', labelCol='avg_sentiment_score')

# Fit the model
lr_model_w = lr.fit(train)

# Make predictions on the held-out test rows
predictions = lr_model_w.transform(test)

# `lr_model_w.summary` describes the fit on the TRAINING data, so these values
# are labelled as such and not mixed with the test-set metrics below.
train_summary = lr_model_w.summary
print("Train MSE: ", train_summary.meanSquaredError)
print("Train MAE: ", train_summary.meanAbsoluteError)
print("Train R-squared: ", train_summary.r2)

# Evaluate the same metrics on the test predictions so that train and test
# can be compared like for like.
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='avg_sentiment_score')
for metric_name, label in (("mse", "MSE"), ("mae", "MAE"), ("r2", "R-squared"), ("rmse", "RMSE")):
    metric_value = evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    print(f"Test {label}: {metric_value}")


In [None]:
# Number of rows in the train and test splits, displayed as a (train, test) tuple.
train.count(),test.count()

In [None]:
# Collect the test predictions once, ordered by date, so that actual and
# predicted values line up row by row and are plain floats for matplotlib.
mdata = lr_model_w.transform(test)
plot_rows = (
    mdata.select("date", "avg_sentiment_score", "prediction")
         .orderBy("date")
         .collect()
)
x_ax = [row["date"] for row in plot_rows]
y_orig = [row["avg_sentiment_score"] for row in plot_rows]
y_pred = [row["prediction"] for row in plot_rows]

fig, ax = plt.subplots()
ax.plot(x_ax, y_orig, label="original")
ax.plot(x_ax, y_pred, label="predicted")
ax.set_title("Tweets Sentiment test and predicted data")
ax.set_xlabel("Date")
ax.set_ylabel("Average sentiment score")
ax.legend(loc='best', fancybox=True, shadow=True)
ax.grid(True)
fig.autofmt_xdate()  # tilt the date tick labels so they do not overlap
plt.show()

## Biweekly lagging periods 1, 14, 28, 42, 56, 70, 84 days:

Generates features:
- retweet_rate
- sentiment_score_lag_1 day
- sentiment_score_lag_14 days
- sentiment_score_lag_28 days
- sentiment_score_lag_42 days
- sentiment_score_lag_56 days
- sentiment_score_lag_70 days
- sentiment_score_lag_84 days


In [None]:
# Biweekly model: a 1-day lag plus lags at multiples of 14 up to 84.
# NOTE(review): the cut-off is 3 days beyond the longest lag (84 + 3); the
# reason for the extra 3 days is not evident here -- confirm before changing it.
biweekly_lags = [1, 14, 28, 42, 56, 70, 84]
skip_until = start_date + timedelta(days=max(biweekly_lags) + 3)
va_df = build_vector(biweekly_lags, df_grouped, skip_until)

In [None]:
# Prepare train and test datasets: chronological split at `split_date`
# (train: date <= split_date, test: date > split_date).
# NOTE(review): the daily and weekly experiments split at 2020-12-31 and
# 2021-01-01, so their test metrics do not cover the same period as this one.
split_date = "2021-02-01"
train = va_df.filter(va_df.date <= split_date)
test = va_df.filter(va_df.date > split_date)

# Linear regression model
lr = LinearRegression(featuresCol='features', labelCol='avg_sentiment_score')

# Fit the model
lr_model_bw = lr.fit(train)

# Make predictions on the held-out test rows
predictions = lr_model_bw.transform(test)

# `lr_model_bw.summary` describes the fit on the TRAINING data, so these values
# are labelled as such and not mixed with the test-set metrics below.
train_summary = lr_model_bw.summary
print("Train MSE: ", train_summary.meanSquaredError)
print("Train MAE: ", train_summary.meanAbsoluteError)
print("Train R-squared: ", train_summary.r2)

# Evaluate the same metrics on the test predictions so that train and test
# can be compared like for like.
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='avg_sentiment_score')
for metric_name, label in (("mse", "MSE"), ("mae", "MAE"), ("r2", "R-squared"), ("rmse", "RMSE")):
    metric_value = evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    print(f"Test {label}: {metric_value}")


In [None]:
# Number of rows in the train and test splits, displayed as a (train, test) tuple.
train.count(), test.count()

In [None]:
# Collect the test predictions once, ordered by date, so that actual and
# predicted values line up row by row and are plain floats for matplotlib.
mdata = lr_model_bw.transform(test)
plot_rows = (
    mdata.select("date", "avg_sentiment_score", "prediction")
         .orderBy("date")
         .collect()
)
x_ax = [row["date"] for row in plot_rows]
y_orig = [row["avg_sentiment_score"] for row in plot_rows]
y_pred = [row["prediction"] for row in plot_rows]

fig, ax = plt.subplots()
ax.plot(x_ax, y_orig, label="original")
ax.plot(x_ax, y_pred, label="predicted")
ax.set_title("Tweets Sentiment test and predicted data")
ax.set_xlabel("Date")
ax.set_ylabel("Average sentiment score")
ax.legend(loc='best', fancybox=True, shadow=True)
ax.grid(True)
fig.autofmt_xdate()  # tilt the date tick labels so they do not overlap
plt.show()

# Best model: Biweekly lagging features

`va_df = build_vector([1, 14, 28, 42, 56, 70, 84], df_grouped, (start_date + timedelta(days=84+3)))`

Feature Selection:
```
 'retweet_rate'
 'sentiment_score_lag_1'
 'sentiment_score_lag_14'
 'sentiment_score_lag_28'
 'sentiment_score_lag_42'
 'sentiment_score_lag_56'
 'sentiment_score_lag_70'
 'sentiment_score_lag_84'
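```
> Test RMSE: 0.0895482549334093

Recorded cell output (MSE, MAE and R-squared are taken from the model's training summary; RMSE is evaluated on the test set):
```
MSE:  0.021009647134220523
MAE:  0.10486859980823686
R-squared:  0.1471076439767326
RMSE: 0.0895482549334093
```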


In [None]:
# Display the fitted biweekly-lag linear regression model object.
lr_model_bw