In [116]:
import numpy as np
import pandas as pd

In [117]:
# Load the raw thread and comment exports.
thread_df = pd.read_csv('data/uk_subreddit_threads.csv', encoding='latin-1')
comment_df = pd.read_csv('data/uk_subreddit_comments.csv')

# drop rows with missing values
thread_df = thread_df.dropna()

# Clean the comments in one pass:
#   1. drop the location column
#   2. rename submission_id to thread_id
#   3. parse created using the format %Y-%m-%d %H:%M:%S
#   4. drop rows with missing values
#   5. derive year and quarter from created
# Step 4 deliberately runs BEFORE step 5: a missing timestamp would otherwise
# turn the integer year/quarter columns into floats (NaN needs a float dtype),
# and the later "<year>Q<quarter>" string building would produce labels such
# as "2014.0Q3.0". The set of surviving rows is the same either way, because
# year/quarter are missing exactly when created is missing.
comment_df = (
    comment_df
    .drop(columns=['location'])
    .rename(columns={'submission_id': 'thread_id'})
    .assign(created=lambda d: pd.to_datetime(d['created'], format='%Y-%m-%d %H:%M:%S'))
    .dropna()
    .assign(
        year=lambda d: d['created'].dt.year,
        quarter=lambda d: d['created'].dt.quarter,
    )
)


In [118]:
# replace \n with space in comment_df body column (plain literal match)
comment_df['body'] = comment_df['body'].str.replace('\n', ' ', regex=False)
# replace all non-alphanumeric characters with space in comment_df body column.
# regex=True must be explicit: the pattern is a character class, and pandas
# versions where str.replace defaults to regex=False would otherwise search for
# the literal text "[^a-zA-Z0-9]" and leave every comment unchanged.
# NOTE(review): this also strips apostrophes and "!" before VADER scoring;
# VADER presumably uses such cues, so confirm the stripping is intended.
comment_df['body'] = comment_df['body'].str.replace('[^a-zA-Z0-9]', ' ', regex=True)
# keep only rows whose body has more than one whitespace-separated word
# (this removes single-word bodies and bodies left empty by the cleaning)
comment_df = comment_df[comment_df['body'].str.split().str.len() > 1]
comment_df

  comment_df['body'] = comment_df['body'].str.replace('[^a-zA-Z0-9]', ' ')


Unnamed: 0,body,score,created,id,thread_id,year,quarter
0,Checked supercharge info United Kingdom 118 s...,703,2023-07-15 21:19:43,js3upk8,150lepk,2023,3
1,I m in England I have an EV Today and tomo...,112,2023-07-15 22:03:13,js40lpa,150lepk,2023,3
2,Lol There s definitely a lot of people over ...,106,2023-07-15 21:13:16,js3ttv8,150lepk,2023,3
3,must have been a Brexiteer How s that going,166,2023-07-15 21:14:11,js3tyf4,150lepk,2023,3
4,A guy stopped me outside the store in Connecti...,25,2023-07-15 23:06:22,js48xjo,150lepk,2023,3
...,...,...,...,...,...,...,...
21163,Airbus is planning to launch their full hydrog...,3,2021-07-14 14:41:34,h55iuh5,ok1ej7,2021,3
21164,Might be banning domestic flights like France ...,1,2021-07-16 18:06:14,h5f1rhk,ok1ej7,2021,3
21165,Even worse is that they are building a new non...,1,2021-07-16 18:08:21,h5f222f,ok1ej7,2021,3
21167,Agreed Full hydrogen is today and especiall...,5,2021-07-14 15:00:45,h55l5dn,ok1ej7,2021,3


In [119]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Sanity check: score one hand-written sentence with VADER
analyzer = SentimentIntensityAnalyzer()
text = "I love this product! It's amazing."
scores = analyzer.polarity_scores(text)

# Show the raw score dictionary
print(scores)

# Label the sentence from its compound score:
# >= 0.05 is positive, <= -0.05 is negative, anything in between is neutral
print(
    "Positive sentiment" if scores['compound'] >= 0.05
    else "Negative sentiment" if scores['compound'] <= -0.05
    else "Neutral sentiment"
)


# define function to get sentiment scores
def get_sentiment_scores(
        df,
        text_col
):
    """Return a copy of df with VADER sentiment scores appended as columns.

    Each value of ``df[text_col]`` is passed to
    ``SentimentIntensityAnalyzer.polarity_scores`` and the resulting
    ``neg``, ``neu``, ``pos`` and ``compound`` values are added as four new
    columns, aligned on ``df``'s index. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to score.
    text_col : str
        Name of the column in ``df`` that contains the text.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the columns ``neg``, ``neu``, ``pos`` and ``compound``.
        The four columns are present even when ``df`` has no rows, so
        downstream column operations do not fail on an empty frame.
    """
    # Pinning the column names guarantees the output schema for empty input;
    # for non-empty input this is the same key order polarity_scores produces.
    score_columns = ['neg', 'neu', 'pos', 'compound']

    # Initialize the SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()

    # One score dict per row, in row order
    score_records = [analyzer.polarity_scores(text) for text in df[text_col]]

    # Reuse df's index so the scores line up with their rows in the concat,
    # even if df's index has gaps
    sentiment_scores_df = pd.DataFrame(
        score_records, index=df.index, columns=score_columns
    )

    # Add sentiment scores to df
    return pd.concat([df, sentiment_scores_df], axis=1)

# score every comment body; the result is comment_df plus the
# neg / neu / pos / compound columns
comment_df = get_sentiment_scores(df=comment_df, text_col='body')

comment_df

{'neg': 0.0, 'neu': 0.266, 'pos': 0.734, 'compound': 0.8516}
Positive sentiment


Unnamed: 0,body,score,created,id,thread_id,year,quarter,neg,neu,pos,compound
0,Checked supercharge info United Kingdom 118 s...,703,2023-07-15 21:19:43,js3upk8,150lepk,2023,3,0.117,0.835,0.049,-0.4404
1,I m in England I have an EV Today and tomo...,112,2023-07-15 22:03:13,js40lpa,150lepk,2023,3,0.019,0.911,0.071,0.6598
2,Lol There s definitely a lot of people over ...,106,2023-07-15 21:13:16,js3ttv8,150lepk,2023,3,0.149,0.749,0.102,-0.6652
3,must have been a Brexiteer How s that going,166,2023-07-15 21:14:11,js3tyf4,150lepk,2023,3,0.000,1.000,0.000,0.0000
4,A guy stopped me outside the store in Connecti...,25,2023-07-15 23:06:22,js48xjo,150lepk,2023,3,0.046,0.954,0.000,-0.2263
...,...,...,...,...,...,...,...,...,...,...,...
21163,Airbus is planning to launch their full hydrog...,3,2021-07-14 14:41:34,h55iuh5,ok1ej7,2021,3,0.000,0.940,0.060,0.2023
21164,Might be banning domestic flights like France ...,1,2021-07-16 18:06:14,h5f1rhk,ok1ej7,2021,3,0.000,0.749,0.251,0.7717
21165,Even worse is that they are building a new non...,1,2021-07-16 18:08:21,h5f222f,ok1ej7,2021,3,0.333,0.667,0.000,-0.7184
21167,Agreed Full hydrogen is today and especiall...,5,2021-07-14 15:00:45,h55l5dn,ok1ej7,2021,3,0.000,0.939,0.061,0.5647


In [120]:
# keep only the compound score, under the name "sentiment";
# the neg / neu / pos components are discarded
comment_df = (
    comment_df
    .rename(columns={'compound': 'sentiment'})
    .drop(columns=['neg', 'neu', 'pos'])
)
comment_df

Unnamed: 0,body,score,created,id,thread_id,year,quarter,sentiment
0,Checked supercharge info United Kingdom 118 s...,703,2023-07-15 21:19:43,js3upk8,150lepk,2023,3,-0.4404
1,I m in England I have an EV Today and tomo...,112,2023-07-15 22:03:13,js40lpa,150lepk,2023,3,0.6598
2,Lol There s definitely a lot of people over ...,106,2023-07-15 21:13:16,js3ttv8,150lepk,2023,3,-0.6652
3,must have been a Brexiteer How s that going,166,2023-07-15 21:14:11,js3tyf4,150lepk,2023,3,0.0000
4,A guy stopped me outside the store in Connecti...,25,2023-07-15 23:06:22,js48xjo,150lepk,2023,3,-0.2263
...,...,...,...,...,...,...,...,...
21163,Airbus is planning to launch their full hydrog...,3,2021-07-14 14:41:34,h55iuh5,ok1ej7,2021,3,0.2023
21164,Might be banning domestic flights like France ...,1,2021-07-16 18:06:14,h5f1rhk,ok1ej7,2021,3,0.7717
21165,Even worse is that they are building a new non...,1,2021-07-16 18:08:21,h5f222f,ok1ej7,2021,3,-0.7184
21167,Agreed Full hydrogen is today and especiall...,5,2021-07-14 15:00:45,h55l5dn,ok1ej7,2021,3,0.5647


In [132]:
# Mean sentiment per (year, quarter), in chronological order, with the two
# key columns collapsed into a single datetime column named Date.
sentiment_df = (
    comment_df
    .groupby(['year', 'quarter'])['sentiment']
    .mean()
    .reset_index()
    .sort_values(by=['year', 'quarter'])
    # "<year>Q<quarter>" strings are handed to pandas for parsing
    .assign(
        Date=lambda d: pd.to_datetime(
            d['year'].astype(str) + 'Q' + d['quarter'].astype(str)
        )
    )
    .drop(columns=['year', 'quarter'])
)

# persist the quarterly series without the index column
sentiment_df.to_csv('data/sentiment_df.csv', index=False)


In [135]:
# switch to the ds / yhat column names: Date becomes ds, sentiment becomes yhat
sentiment_df = sentiment_df.rename(columns=dict(Date='ds', sentiment='yhat'))
sentiment_df

Unnamed: 0,yhat,ds
0,0.505211,2014-07-01
1,0.0,2014-10-01
2,0.366067,2015-01-01
3,0.81548,2015-04-01
4,0.351689,2015-07-01
5,0.2906,2015-10-01
6,0.000433,2016-04-01
7,0.07145,2016-10-01
8,0.0,2017-01-01
9,0.144136,2017-04-01


In [153]:
# used google colab to run prophet
# load the forecast csv, keep only the ds and yhat columns,
# and parse ds into datetimes
forecast_df = (
    pd.read_csv('data/forecast.csv')
    .loc[:, ['ds', 'yhat']]
    .assign(ds=lambda d: pd.to_datetime(d['ds']))
)
forecast_df

Unnamed: 0,ds,yhat
0,2010-01-01,0.348019
1,2010-04-01,0.441002
2,2010-07-01,0.412506
3,2010-10-01,0.312627
4,2011-01-01,0.375605
5,2011-04-01,0.528448
6,2011-07-01,0.389983
7,2011-10-01,0.333853
8,2012-01-01,0.400207
9,2012-04-01,0.205088


In [154]:
# add rows from forecast_df to sentiment_df where ds does not exist in sentiment_df.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x. Like append, concat aligns columns by name and
# keeps each frame's original row labels.
sentiment_df = pd.concat(
    [sentiment_df, forecast_df[~forecast_df['ds'].isin(sentiment_df['ds'])]]
)
# sort sentiment_df by ds
sentiment_df = sentiment_df.sort_values(by=['ds'])
sentiment_df


  sentiment_df = sentiment_df.append(forecast_df[~forecast_df['ds'].isin(sentiment_df['ds'])])


Unnamed: 0,yhat,ds
0,0.348019,2010-01-01
1,0.441002,2010-04-01
2,0.412506,2010-07-01
3,0.312627,2010-10-01
4,0.375605,2011-01-01
5,0.528448,2011-04-01
6,0.389983,2011-07-01
7,0.333853,2011-10-01
8,0.400207,2012-01-01
9,0.205088,2012-04-01


In [155]:
# write the combined sentiment_df to data/ev_sentiment_df.csv, without the index
sentiment_df.to_csv(path_or_buf='data/ev_sentiment_df.csv', index=False)