In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
import datetime as dt
import os 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import altair as alt

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
#nltk.download('stopwords')

# Read the datasets into dataframes

In [None]:
# Load the Reddit news posts, then confirm the file is the snapshot this
# analysis expects (time span covered and overall shape).
reddit_df = pd.read_csv("/kaggle/input/reddit-news/rednewswscpt2.csv")

assert reddit_df["created"].min() == "2019-11-14 01:43:54"
assert reddit_df["created"].max() == "2021-04-29 23:58:58"
assert reddit_df.shape == (1059314, 6)

In [None]:
# Peek at two randomly chosen rows (no random_state is set, so the rows differ between runs).
reddit_df.sample(2)

In [None]:
# Flag consulted by the change-computation cell further down.
using_original_sp500 = True

# Load the raw S&P 500 prices; the first CSV column becomes the index and the
# "index" column is renamed to "date".
stock_df = pd.read_csv(
    "/kaggle/input/original-sp-500/sp500.csv",
    index_col=0,
).rename(columns={"index": "date"})

# Sanity checks: overall shape and the number of distinct values in `stock`.
assert stock_df.shape == (359073, 8)
assert stock_df["stock"].nunique() == 429

### Add change (field+"_chg") and percent-change (field+"_per_chg") fields to the Stock Dataset

The following takes quite a while (over 90 minutes on Kaggle) to complete, and can be skipped by loading the already-processed dataset instead. The next cell does exactly that by setting `using_original_sp500 = False`; comment that line out to run the full computation.

In [None]:
### Comment out the line below to recompute the change columns from the raw
### data instead of loading the preprocessed file.
using_original_sp500 = False


def _add_change_columns(df, fields, group_col="stock"):
    """Add day-over-day change columns to ``df`` (modified in place and returned).

    For every name in ``fields`` two columns are written:

    * ``field + "_chg"``     -- current value minus the previous row's value
    * ``field + "_per_chg"`` -- that change as a percentage of the previous value

    "Previous row" means the previous row *of the same ``group_col`` value*, in
    the frame's existing row order, so differences never cross from one stock
    into the next. The first row of each group has no predecessor and gets 0 in
    both columns; a previous value of exactly 0 also yields a percent change of
    0 rather than a division by zero.
    """
    # Previous row's values within each group, aligned to df's index.
    previous = df.groupby(group_col, sort=False)[fields].shift(1)
    # True on the first row seen for each group value.
    first_of_group = ~df[group_col].duplicated()

    for field in fields:
        prev = previous[field]
        change = df[field] - prev
        percent = (change / prev * 100).where(prev != 0, 0)
        df[field + "_chg"] = change.mask(first_of_group, 0)
        df[field + "_per_chg"] = percent.mask(first_of_group, 0)
    return df


if using_original_sp500:
    # the price/volume fields that get "_chg" and "_per_chg" companions
    fields = ['low', 'open', 'vol', 'high', 'close', 'adj_close']
    stock_df = _add_change_columns(stock_df, fields)
else:
    # Skip the processing and load the pre-processed file
    stock_df = pd.read_csv("../input/sp-500-v2-processed/sp500_v2.csv")
stock_df.head(10)

In [None]:
# assertions
# NOTE(review): the raw file is asserted earlier as (359073, 8) and the change
#   step adds 12 columns (6 fields x 2), so this shape looks like it can only
#   match the preprocessed sp500_v2.csv -- confirm before running with
#   using_original_sp500 = True.
assert stock_df.shape == (357357, 21)

## Clean the Reddit Dataframe

In [None]:
# Separate date and time and take only the necessary columns.
reddit_df['created'] = pd.to_datetime(reddit_df['created'])
# Vectorised .dt accessors instead of Python-level loops over ~1M timestamps.
reddit_df['date'] = reddit_df['created'].dt.date
reddit_df['time'] = reddit_df['created'].dt.time
reddit_df = reddit_df.loc[:, ["link", "title", "date", "time", "score"]]
# Lower-case the titles so the keyword search that follows can ignore case.
reddit_df['title'] = reddit_df['title'].str.lower()


In [None]:
# Extract website names from links: everything between "://" (plus an optional
# literal "www.") and the next "/".
# The pattern is a raw string so the regex contains no invalid Python escapes,
# and the dot after "www" is escaped so hosts such as "www2.example.com" are
# kept whole instead of being cut down to ".example.com".
reddit_df['link'] = reddit_df['link'].str.extract(r'.*://(?:www\.)?([^/]+)', expand=False)
reddit_df.sample(2)

In [None]:
def get_covid_posts(df):
    """Return the rows of ``df`` whose ``title`` mentions a COVID-related keyword.

    Matching ignores case. Ordinary keywords match as substrings (so "mask"
    also matches "masks"); all-caps acronyms such as "PPE" must appear as whole
    words, otherwise "ppe" would match unrelated words like "happened".
    Rows with a missing title are treated as non-matching.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``title``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, with their original index.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # set keywords that identify a post as covid related, borrowed from https://www.henryford.com/blog/2020/04/covid19-key-terms-to-know
    keywords = ["covid","virus","corona","flu","vaccine","mask","symptom","ventilator","PPE","social distancing","quarantine","super spreader",
                "flatten the curve","antibody","antibodies","epidemic","pandemic","outbreak","n95","herd immunity"]

    # One alternation pattern; acronyms get word boundaries, the rest stay substrings.
    patterns = [
        r"\b" + re.escape(word) + r"\b" if word.isupper() else re.escape(word)
        for word in keywords
    ]
    # set a mask to only return covid-related posts.
    mask = df.title.str.contains("|".join(patterns), case=False, regex=True, na=False)
    return df[mask]

# Keep only the posts whose title matches one of the COVID keywords, then display the result.
covid_df = get_covid_posts(reddit_df)
covid_df

In [None]:
# Keep only titles made up entirely of ASCII characters (used as the test for
# "English" entries).
ascii_only = covid_df['title'].map(lambda title: title.isascii())
covid_df_scores = covid_df[ascii_only]

# Only reliable news sources:
# count the posts per source link and keep the links that occur more than 200
# times, the idea being to get rid of "spam" entries that can harm the analysis.
# NOTE(review): `linkss` is built here, but nothing visible in this notebook
# uses it to filter `covid_df_scores` -- confirm whether that was intended.
links = covid_df_scores[['link']].copy()
links['count'] = 1
links_gr = links.groupby("link").sum().reset_index()
linkss = links_gr[links_gr["count"] > 200]


## Sentiment Analysis

In [None]:
# Sentiment analysis: run NLTK's VADER analyzer over every title and collect
# the per-title score dictionaries in a list.
vader = SentimentIntensityAnalyzer()
scores = [vader.polarity_scores(title) for title in covid_df_scores['title']]

In [None]:
# Turn the list of score dicts into a frame, give the posts a fresh 0..n-1
# index so both frames line up row for row, and join them on that index.
scores_df = pd.DataFrame(scores)
covid_df_scores = covid_df_scores.reset_index(drop=True)
reddit_df_scores = covid_df_scores.join(scores_df, rsuffix='_right')
# Constant 1 per post, so summing this column in a group-by gives a post count.
reddit_df_scores["count"] = 1
reddit_df_scores

In [None]:
# Per-day totals of the compound sentiment and of the post count.
# (Despite the variable name these are sums, not means.)
mean_scores = (
    reddit_df_scores
    .groupby('date')[["compound", "count"]]
    .sum()
    .reset_index()
)
mean_scores.head(5)

In [None]:
# Show the days ordered from most to fewest posts (display only; mean_scores itself is not reordered).
mean_scores.sort_values(by="count", ascending=False)

In [None]:
# Hover selection shared by the two charts below.
date_selection = alt.selection_single(on="mouseover", encodings=["color"])

# Store the dates as strings before handing the frame to Altair.
mean_scores['date'] = mean_scores['date'].map(str)

# Bar chart: summed compound sentiment per day.
compound_scores_chart = (
    alt.Chart(mean_scores)
    .mark_bar(size=1)
    .encode(
        x=alt.X("date:T"),
        y=alt.Y("compound:Q", title="sentiment score sum"),
        tooltip=['date:T'],
    )
    .properties(title="Sentiment Score by Day")
    .add_selection(date_selection)
)

# Line chart: number of posts per day.
count_chart = (
    alt.Chart(mean_scores)
    .mark_line(size=1)
    .encode(
        x=alt.X("date:T"),
        y=alt.Y("count:Q"),
        tooltip=['date:T'],
    )
    .properties(title="Number of News Stories per day")
    .add_selection(date_selection)
)

# Display the two charts side by side.
compound_scores_chart | count_chart

In [None]:
# Rebuild the daily post-count line in orange so it can share a plot with the
# sentiment bars.
count_chart = (
    alt.Chart(mean_scores)
    .mark_line(color="orange", size=1)
    .encode(
        x=alt.X("date:T"),
        y=alt.Y("count:Q"),
    )
)

# Overlay both charts, each with its own y axis; the right-hand axis is
# coloured orange to match the count line.
sentiment_and_volume = (
    alt.layer(compound_scores_chart, count_chart)
    .resolve_scale(y='independent')
    .configure_axisRight(labelColor='orange', titleColor='orange')
    .properties(title="Sentiment and Volume of News Stories")
)

sentiment_and_volume

March 11th and October 2nd have the highest sentiment scores. 

In [None]:
# Spot-check ten random posts dated 2 October 2020.
# `dt` is already imported in the notebook's import cell, so no import is needed here.
reddit_df[reddit_df['date'] == dt.date(2020, 10, 2)].sample(10)

October 2, 2020 was the date Trump tested positive for COVID-19.

In [None]:
# Min-max normalisation: rescale the post scores so the lowest becomes 0 and
# the highest becomes 1.
score_min_val = reddit_df_scores["score"].min()
score_max_val = reddit_df_scores["score"].max()

reddit_df_scores['score_norm'] = (
    (reddit_df_scores["score"] - score_min_val) / (score_max_val - score_min_val)
)

In [None]:
# Keep the 5000 stories with the highest normalised score.
top_scores = (
    reddit_df_scores
    .sort_values(by="score_norm", ascending=False)
    .iloc[:5000]
)
top_scores

In [None]:
# Plot score against date for the top stories, to see which received the most
# upvotes and whether any trend stands out.
selection = alt.selection_single(on="mouseover", encodings=["color"])

# Store the date and time values as strings before charting.
for column in ("date", "time"):
    top_scores[column] = top_scores[column].map(str)

(
    alt.Chart(top_scores)
    .mark_circle()
    .encode(
        x=alt.X("date:T"),
        y=alt.Y("score"),
        tooltip=['title:N', "score:Q", "date:T"],
    )
    .properties(width=1500)
    .add_selection(selection)
)

In [None]:
# Plot only the top story of each day.
# Keep the whole row of the highest-scoring post per date, so the title, link
# and score all belong to the same post. (A column-wise groupby(...).max()
# would pair each day's highest score with its alphabetically-last title.)
daily_top_news = (
    top_scores
    .sort_values("score", ascending=False, kind="mergesort")  # stable: ties keep their current order
    .drop_duplicates(subset="date", keep="first")
    .sort_values("date")
    .reset_index(drop=True)
)

alt.Chart(daily_top_news).mark_circle().encode(
    x = alt.X("date:T"),
    y = alt.Y("score_norm"),
    tooltip=['title:N', "score:Q", "date:T"],
).properties(
    width = 1500
).add_selection(selection)


# Stock Data Cleaning

In [None]:
# Display the stock frame as it stands before cleaning.
stock_df

In [None]:
# Parse the `date` column into pandas datetimes.
stock_df["date"] = pd.to_datetime(stock_df["date"])


In [None]:
# Summary statistics for the stock frame.
stock_df.describe()

In [None]:
# Let's combine the stock values to get an overall trend: one row per date
# holding the sum of every numeric column across all stocks.
# Eventually we'll want to look at individual stocks, but that's beyond the
# scope of this project.
# numeric_only=True keeps the string `stock` column out of the sum; without it
# newer pandas versions concatenate the ticker strings for every date.
stock_df_compiled = stock_df.groupby(by="date").sum(numeric_only=True).reset_index()
stock_df_compiled

# Combining the two data sets 

In [None]:
# Hover selection for the stock line.
selection = alt.selection_single(on="mouseover", encodings=["color"])

# Summed close price per day (axis title now has its closing parenthesis).
stock_compiled_chart = alt.Chart(stock_df_compiled).mark_line().encode(
    x = alt.X("date:T"),
    y = alt.Y("close:Q", title="Dollars (in thousands)"),
    tooltip=['vol:N'],
).properties(
    width = 500,
    title = "S&P 500 Close overlapped with Sentiment Scores of Reddit Data"
).add_selection(selection)

# Overlay the daily sentiment bars, giving each layer its own y axis.
alt.layer(stock_compiled_chart, compound_scores_chart).resolve_scale(
    y = 'independent'
)


In [None]:
# Start from the per-date totals computed earlier; they already contain the
# summed volume %change (`vol_per_chg`) alongside the other summed columns, so
# no separate group-by is needed here.
vol_chg_by_date = stock_df_compiled.copy()

# set the date field to datetime
vol_chg_by_date.date = pd.to_datetime(vol_chg_by_date.date)

# get the largest absolute volume %change over all days; the change can be
#   positive (more volume) or negative (less volume)
vol_max_val = vol_chg_by_date.vol_per_chg.abs().max()

# normalize the volume change between -1 and +1, with -1 being the most reduction in
#   volume, and +1 being the most increase in volume day to day
vol_chg_by_date["norm_chg"] = vol_chg_by_date.vol_per_chg / vol_max_val

vol_chg_by_date.sample(5)


In [None]:
# Peek at five random rows of the daily sentiment totals.
mean_scores.sample(5)


In [None]:
# `sns` is already imported in the notebook's import cell.
# Parse the string dates back into datetimes so they can be merged against
# `vol_chg_by_date.date`.
mean_scores.date = pd.to_datetime(mean_scores.date)

# Keep only the dates present in both the stock data and the Reddit data.
sentiment_stock = pd.merge(vol_chg_by_date, mean_scores, how='inner', on="date")

# Normalised volume change plotted against summed daily sentiment.
sns.lineplot(x=sentiment_stock["compound"],y=sentiment_stock["norm_chg"])

In [None]:
# Normalised volume change over time; tilt the date labels so they stay readable.
sns.lineplot(data=vol_chg_by_date, x="date", y="norm_chg")
plt.xticks(rotation=75);

In [None]:
# Parse the top-story dates into datetimes, then attach each day's top story
# to the merged sentiment/stock rows (only dates present in both are kept).
daily_top_news["date"] = pd.to_datetime(daily_top_news["date"])
sentiment_stock_final = sentiment_stock.merge(daily_top_news, how='inner', on="date")
sentiment_stock_final.head(5)


In [None]:
# Let's plot the scores to see what we're dealing with: normalised volume
# change against the normalised score of each day's top story.
sns.lineplot(data=sentiment_stock_final, x="score_norm", y="norm_chg")
plt.xticks(rotation=75);

We see a large spike (400k) around 2020-10 (this will be around 2020-10-02 or
2020-10-03, as determined later), as well as a spike (300k) just before
2020-12 (again, we'll later see this is 2020-11-09). So these were busy days
with lots of post activity and upvotes.

In [None]:
# let's do the same thing for the percent of volume change per day.

# get the %change of volume, summed for all stocks by date
# (select the column before summing so only `vol_per_chg` is aggregated)
vol_chg_by_date = stock_df.groupby('date')['vol_per_chg'].sum().reset_index()

# set the date field to datetime
vol_chg_by_date.date = pd.to_datetime(vol_chg_by_date.date)

# pick only stock values from 2020-01-01 onwards; .copy() makes the result an
#   independent frame so the column added below is not written to a slice
vol_chg_by_date = vol_chg_by_date[vol_chg_by_date.date>="2020-01-01"].copy()

# get the largest absolute volume %change over all days; the change can be
#   positive (more volume) or negative (less volume)
vol_max_val = vol_chg_by_date.vol_per_chg.abs().max()

# normalize the volume change between -1 and +1, with -1 being the most reduction in
#   volume, and +1 being the most increase in volume day to day
vol_chg_by_date["norm_chg"] = vol_chg_by_date.vol_per_chg / vol_max_val

vol_chg_by_date

In [None]:
# let's plot the volume %change day to day and see what we're working with

sns.lineplot(x=vol_chg_by_date.date ,y=vol_chg_by_date.vol_per_chg)
# trailing ";" hides the tick-object repr, as in the other plotting cells
plt.xticks(rotation=75);

Again we see spikes in volume change (increased stock trading) around 2020-11 
or 2020-12, with some additional hotspots around 2020-06/07, 2020-10, 
2020-12/2021-01, and 2021-04

In [None]:
# Make sure the compiled stock dates are datetimes, then attach each day's top
# story. The join key is named explicitly (as in the other merges in this
# notebook) rather than left to whichever columns the two frames share.
stock_df_compiled.date = pd.to_datetime(stock_df_compiled.date)
compiled_stock_news = pd.merge(stock_df_compiled, daily_top_news, how="inner", on="date")

In [None]:
# Selection that picks the nearest point on mouse-over, keyed on the date
# field, and selects nothing when empty.
nearest = alt.selection(
    type='single',
    nearest=True,
    on='mouseover',
    fields=['date'],
    empty='none',
)

# Summed close price over time, with the day's top story in the tooltip.
# (A color='stock:N' encoding would be used when looking at non-compiled data.)
daily_news_stock_chart = (
    alt.Chart(sentiment_stock_final)
    .mark_line()
    .encode(
        x='date:T',
        y='close:Q',
        tooltip=['title:N', 'date:T', 'link:N'],
    )
    .properties(width=1000)
    .add_selection(nearest)
)

# Overlay the daily sentiment bars, each layer with its own y axis.
alt.layer(daily_news_stock_chart, compound_scores_chart).resolve_scale(y='independent')

In [None]:
# The basic line: summed close price over time.
# (A color='stock:N' encoding would be used when looking at non-compiled data.)
daily_news_stock_chart = alt.Chart(sentiment_stock_final).mark_line().encode(
    x='date:T',
    y='close:Q',
).properties(
    width = 1000
)


def _annotate_day(day, **text_options):
    """Build the annotation layers for one date of ``sentiment_stock_final``.

    Parameters
    ----------
    day : str
        Date to annotate, e.g. "2020-01-30"; compared against the ``date`` column.
    **text_options
        Extra ``mark_text`` properties for the label (align, baseline, dx, dy,
        lineBreak, ...). ``fontWeight=500`` is always applied.

    Returns
    -------
    (dot, label)
        ``dot`` is a black circle at (date, close); ``label`` is a text mark at
        the same position showing the row's ``title``.
    """
    day_rows = sentiment_stock_final[sentiment_stock_final.date == day]
    # The circle carries no text encoding: circle marks cannot render text.
    dot = alt.Chart(day_rows).mark_circle(size=50, color="black").encode(
        x="date:T",
        y="close:Q",
    )
    label = alt.Chart(day_rows).mark_text(fontWeight=500, **text_options).encode(
        x="date:T",
        y="close:Q",
        text="title:N",
    )
    return dot, label


# One (dot, label) pair per highlighted date; the text options position each
# headline and choose the string on which it is broken into lines.
mark1, label1 = _annotate_day("2020-01-30", align='center', lineBreak="is", baseline='line-top', dy=-40)
mark2, label2 = _annotate_day("2020-02-24", align='left', baseline='line-top', dy=-15)
mark3, label3 = _annotate_day("2020-03-11", align='right', lineBreak="the", baseline='middle', dy=0, dx=-15)
mark4, label4 = _annotate_day("2020-03-23", align='left', lineBreak="'s", baseline='middle', dy=15, dx=0)
mark5, label5 = _annotate_day("2020-04-30", align='left', lineBreak="'s", baseline='middle', dy=30, dx=0)
mark6, label6 = _annotate_day("2020-10-02", align='left', lineBreak=",", baseline='middle', dy=15, dx=0)

# NOTE(review): the 2020-04-30 annotation (label5 / mark5) is built but not
# added to the chart -- confirm whether leaving it out is intentional.
final_chart = daily_news_stock_chart + label1 + mark1 + label2 + mark2 + label3 + mark3 + label4 + mark4 + label6 + mark6

# Overlay the daily sentiment bars, each layer with its own y axis.
final = alt.layer(final_chart, compound_scores_chart).resolve_scale(
    y = 'independent'
)

final

In [None]:
# Add the daily post-count line on top of the annotated chart, again giving
# the layers independent y axes.
(
    alt.layer(final, count_chart)
    .resolve_scale(y='independent')
)