# Project Outline

This project looks at the stock price of Roku. By doing a time series analysis and also a sentiment analysis on social media for the stock price of Roku we will try and predict future prices.

Project flow:
 * Gather stock price (Gathering_Data notebook)
 * Gather posts from twitter and reddit (Gathering_Data notebook)
 * Clean up and do EDA (Gathering_Data notebook)
 * Do a sentiment analysis for each day (This notebook)
 * Do a time series analysis on both stock price and sentiment individually (This notebook)
 * Combine the sentiment and stock price to try and predict future prices (This notebook)

# Importing packages and loading data

In [1]:
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import twint
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import datetime as dt
import time
import re
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [2]:
# Loading data
T_df = pd.read_csv('TwitterData.csv', header=None)
T_df.columns = ['id', 'date', 'timezone', 'tweet', 'language', 'cashtags', 
                'username', 'day', 'hour', 'nlikes', 'nreplies', 'nretweets', 'search']
R_df = pd.read_csv('redditdata.csv')
S_df = pd.read_csv('StockData.csv')

# Functions

In [3]:
# This function will remove emojis from the text

def RemoveEmojis(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [43]:
# This function is to run text through spacy/textblob pipeline
def Sentiment(text):
    
    #Initializing Sentiment analyzer
    SIA = SentimentIntensityAnalyzer()
    
    # Words to add to the sentiment lexicon
    new_words = {
        'bullish': 3.4,
        'bull':3.4,
        'long':3.4,
        'longs':3.4,
        'Holding':3.4,
        'holding':3.4,
        'Hodl':3.4,
        'hodl':3.4,
        'Call':3.4,
        'Calls':3.4,
        'call':3.4,
        'calls':3.4,
        'Put':-3.9,
        'Puts':-3.9,
        'put':-3.9,
        'puts':-3.9,
        'bear':-3.9,
        'bearish':-3.9,
        'short': -3.9,
        'shorts':-3.9
        }

    # Updating lexicon
    SIA.lexicon.update(new_words)
    
    # Running
    output = SIA.polarity_scores(text)
    
    return output['compound']

# Clean up and EDA

## Clean up Twitter EDA

In [5]:
print(T_df.shape)
T_df.head()

(666, 13)


Unnamed: 0,id,date,timezone,tweet,language,cashtags,username,day,hour,nlikes,nreplies,nretweets,search
0,913765506456473600,2017-09-29 09:00:48,-500,"$1900 in 10 minutes on $ROKU, so glad the mark...",en,['roku'],eatsleepprofit,5,9,69,8,2,$ROKU
1,913841066473869313,2017-09-29 14:01:03,-500,$ROKU is already up almost 100% since its IPO....,en,['roku'],Stocktwits,5,14,40,5,23,$ROKU
2,914147418316398592,2017-09-30 10:18:23,-500,Mystified by $ROKU's IPO? @JimCramer's breakin...,en,['roku'],MadMoneyOnCNBC,6,10,29,5,11,$ROKU
3,913892627845570560,2017-09-29 17:25:56,-500,"$ROKU Long. One Ticker, a few trades. Couldve ...",en,['roku'],SmashTheBid,5,17,28,4,0,$ROKU
4,913843670872444928,2017-09-29 14:11:24,-500,Love #Pennystocks Nice Friday for my 2 account...,en,"['vcel', 'izea', 'gluu', 'roku']",EbogeyEd,5,14,24,1,3,$ROKU


In [6]:
# Making date in dataframe just the date and no time
T_df['date'] = T_df.date.apply(lambda x: x.split(' ',1)[0])

# Dropping any scraped entries that aren't english
T_df.drop(T_df[T_df['language'] != 'en'].index, inplace = True)

# Only keeping date and tweet
T_df = T_df[['date','tweet']]

In [7]:
# Removing any url links
T_df['tweet_LinkRemoved'] = T_df.tweet.apply(
    lambda x: re.sub(r'https\S+', '', x))

# Removing any emojis
T_df['tweet_Link+Emoji_Removed'] = T_df.tweet_LinkRemoved.apply(
    lambda x: RemoveEmojis(x))

# Change things to lower case
T_df['tweet_cleaned_lowercase'] = T_df['tweet_Link+Emoji_Removed'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))

# Remove punctuations
T_df['tweet_cleaned_lower_NoPunct'] = T_df['tweet_cleaned_lowercase'].str.replace(
    '[^\w\s]', '')

# Remove stopwards
stop = stopwords.words('english')
T_df['tweet_FinalClean'] = T_df['tweet_cleaned_lower_NoPunct'].apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))

## Clean up Reddit

In [8]:
R_df.head()

Unnamed: 0,ID,Num_Comments,Score,Subreddit,Title,Upvote_Ratio,Created,Created_utc,Self_text,Date
0,730n8l,50,113,investing,ROKU up 28% shortly after IPO,0.91,1506640000.0,1506611000.0,http://www.hollywoodreporter.com/news/roku-sto...,2017-09-28
1,72ztnr,0,35,stocks,9/28 Thursday Stock Market Movers & News,0.89,1506631000.0,1506603000.0,#Good morning traders of the r/stocks sub! Hap...,2017-09-28
2,730zps,31,26,wallstreetbets,WHEN TO SHORT ROKU,0.85,1506643000.0,1506614000.0,"this company does nothing new, i think 24~ SHO...",2017-09-28
3,7314h8,37,20,wallstreetbets,ROKU now listex on NASDAQ. Up 35% so far,0.95,1506644000.0,1506615000.0,,2017-09-28
4,730jwz,33,14,wallstreetbets,"$ROKU IPO, to the moon...?",0.9,1506639000.0,1506610000.0,Why isn't this thing available on my POS broke...,2017-09-28


In [9]:
# The things we care about are date, subreddit, title, and self_text
R_df = R_df[['Date','Subreddit','Title','Self_text']]

# Combining the title and self text
R_df['Self_text'] = R_df['Self_text'].fillna('.')
R_df['Combined_text'] = R_df['Title'] + ' ' + R_df['Self_text']

In [10]:
# Removing any links
R_df['Combined_text_LinksRemoved'] = R_df.Combined_text.apply(
    lambda x: re.sub(r'https\S+', '', x))
R_df['Combined_text_LinksRemoved'] = R_df.Combined_text_LinksRemoved.apply(
    lambda x: re.sub(r'http\S+', '', x))

# Removing any emojis
R_df['Combined_text_Link+Emoji_Removed'] = R_df.Combined_text_LinksRemoved.apply(lambda x: RemoveEmojis(x))

# Change things to lower case
R_df['Combined_text_cleaned_lowercase'] = R_df['Combined_text_Link+Emoji_Removed'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))

# Remove punctuations
R_df['Combined_text_cleaned_lower_NoPunct'] = R_df['Combined_text_cleaned_lowercase'].str.replace(
    '[^\w\s]', '')

# Remove stopwards
stop = stopwords.words('english')
R_df['Combined_text_FinalClean'] = R_df['Combined_text_cleaned_lower_NoPunct'].apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))

In [11]:
R_df

Unnamed: 0,Date,Subreddit,Title,Self_text,Combined_text,Combined_text_LinksRemoved,Combined_text_Link+Emoji_Removed,Combined_text_cleaned_lowercase,Combined_text_cleaned_lower_NoPunct,Combined_text_FinalClean
0,2017-09-28,investing,ROKU up 28% shortly after IPO,http://www.hollywoodreporter.com/news/roku-sto...,ROKU up 28% shortly after IPO http://www.holly...,ROKU up 28% shortly after IPO \n\n>The video s...,ROKU up 28% shortly after IPO \n\n>The video s...,roku up 28% shortly after ipo >the video strea...,roku up 28 shortly after ipo the video streami...,roku 28 shortly ipo video streaming device mak...
1,2017-09-28,stocks,9/28 Thursday Stock Market Movers & News,#Good morning traders of the r/stocks sub! Hap...,9/28 Thursday Stock Market Movers & News #Good...,9/28 Thursday Stock Market Movers & News #Good...,9/28 Thursday Stock Market Movers & News #Good...,9/28 thursday stock market movers & news #good...,928 thursday stock market movers news good mo...,928 thursday stock market movers news good mor...
2,2017-09-28,wallstreetbets,WHEN TO SHORT ROKU,"this company does nothing new, i think 24~ SHO...",WHEN TO SHORT ROKU this company does nothing n...,WHEN TO SHORT ROKU this company does nothing n...,WHEN TO SHORT ROKU this company does nothing n...,when to short roku this company does nothing n...,when to short roku this company does nothing n...,short roku company nothing new think 24 short ...
3,2017-09-28,wallstreetbets,ROKU now listex on NASDAQ. Up 35% so far,.,ROKU now listex on NASDAQ. Up 35% so far .,ROKU now listex on NASDAQ. Up 35% so far .,ROKU now listex on NASDAQ. Up 35% so far .,roku now listex on nasdaq. up 35% so far .,roku now listex on nasdaq up 35 so far,roku listex nasdaq 35 far
4,2017-09-28,wallstreetbets,"$ROKU IPO, to the moon...?",Why isn't this thing available on my POS broke...,"$ROKU IPO, to the moon...? Why isn't this thin...","$ROKU IPO, to the moon...? Why isn't this thin...","$ROKU IPO, to the moon...? Why isn't this thin...","$roku ipo, to the moon...? why isn't this thin...",roku ipo to the moon why isnt this thing avail...,roku ipo moon isnt thing available pos brokers
...,...,...,...,...,...,...,...,...,...,...
613,2021-05-11,wallstreetbets,"$FUBO - Archegos, Misreported Earnings, and th...",Reporting AH ( 45 mins ) so i dont have much t...,"$FUBO - Archegos, Misreported Earnings, and th...","$FUBO - Archegos, Misreported Earnings, and th...","$FUBO - Archegos, Misreported Earnings, and th...","$fubo - archegos, misreported earnings, and th...",fubo archegos misreported earnings and the qu...,fubo archegos misreported earnings quiet perio...
614,2021-05-12,investing,Best Dip Stocks to buy right now,"Hey y'all,\n\nI have alot of cash lying around...","Best Dip Stocks to buy right now Hey y'all,\n\...","Best Dip Stocks to buy right now Hey y'all,\n\...","Best Dip Stocks to buy right now Hey y'all,\n\...","best dip stocks to buy right now hey y'all, i ...",best dip stocks to buy right now hey yall i ha...,best dip stocks buy right hey yall alot cash l...
615,2021-05-13,Daytrading,"The Only Watch List You Need May 13, 2021",*The original Gap Watch List Poster! The one t...,"The Only Watch List You Need May 13, 2021 *The...","The Only Watch List You Need May 13, 2021 *The...","The Only Watch List You Need May 13, 2021 *The...","the only watch list you need may 13, 2021 *the...",the only watch list you need may 13 2021 the o...,watch list need may 13 2021 original gap watch...
616,2021-05-20,wallstreetbets,PUBM DD #1,\n\nA little bit about myself for context:\n\...,PUBM DD #1 \n\nA little bit about myself for ...,PUBM DD #1 \n\nA little bit about myself for ...,PUBM DD #1 \n\nA little bit about myself for ...,pubm dd #1 a little bit about myself for conte...,pubm dd 1 a little bit about myself for contex...,pubm dd 1 little bit context 1 work enterprise...


In [12]:
R_df.iloc[0]['Combined_text_LinksRemoved']

'ROKU up 28% shortly after IPO \n\n>The video streaming device maker had late Wednesday set its initial public offering price at $14 per share, or a market value of $1.3 billion.\n\n>The stock of video streaming device maker Roku rose in its stock market debut on Thursday. It opened just after 10:35 a.m. ET at $15.78, up 12.7 percent.\n\n>About 10 minutes later, it was up 28 percent at $17.94.\n\n>The company had recently filed for an initial public offering and late Wednesday had set its IPO price at $14 per share, which raised about $219 million and made for a market value of $1.3 billion.\n\n>The  Los Gatos, Calif.-based company, which makes TV set-top boxes and other devices, listed its stock on the Nasdaq under the ticker symbol ROKU. Its filing have shown that Roku lost $42.8 million in 2016 on nearly $399 million in revenue, up 25 percent from 2015.\n\n>Roku, led by CEO Anthony Wood, has also disclosed that its users streamed more than 6.7 billion hours of programming on its pla

# Exploring sentiment analysis, Vader

In [13]:
# Figuring out the max and min of the scores in the lexicon
SIA = SentimentIntensityAnalyzer()

maxi = 0
mini = 0

for k,v in SIA.lexicon.items():

    if v > maxi:
        maxi = v
    if v < mini:
        mini = v
        
print('Maximum points for a word/emoticon', maxi)
print('Minimum points for a word/emoticon', mini)

Maximum points for a word/emoticon 3.4
Minimum points for a word/emoticon -3.9


Added custom words with scores as either the maximum or minimum in the Sentiment Function located earlier in the notebook. Vader can take in emoji's, going to try a tweet examples with an emoji and without the emojis to see which one is more accurate.

In [14]:
# Tweet with the links removed
T_df.iloc[0]['tweet_LinkRemoved']

"$1900 in 10 minutes on $ROKU, so glad the market is picking up with multiple runners literally everyday. I really hope this doesn't stop🙏🏼😁  "

In [15]:
print('Score with the emoji: ',Sentiment(T_df.iloc[0]['tweet_LinkRemoved']))
print('Score without the emoji: ',Sentiment(T_df.iloc[0]['tweet_Link+Emoji_Removed']))

Score with the emoji:  0.8989
Score without the emoji:  0.838


It looks like the tweet with the emoji is more accurate, for future sentiment analysis the emoji's will be used in the text.

# Combining Reddit and Twitter Data

In [16]:
T_df.head()

Unnamed: 0,date,tweet,tweet_LinkRemoved,tweet_Link+Emoji_Removed,tweet_cleaned_lowercase,tweet_cleaned_lower_NoPunct,tweet_FinalClean
0,2017-09-29,"$1900 in 10 minutes on $ROKU, so glad the mark...","$1900 in 10 minutes on $ROKU, so glad the mark...","$1900 in 10 minutes on $ROKU, so glad the mark...","$1900 in 10 minutes on $roku, so glad the mark...",1900 in 10 minutes on roku so glad the market ...,1900 10 minutes roku glad market picking multi...
1,2017-09-29,$ROKU is already up almost 100% since its IPO....,$ROKU is already up almost 100% since its IPO....,$ROKU is already up almost 100% since its IPO....,$roku is already up almost 100% since its ipo....,roku is already up almost 100 since its ipo he...,roku already almost 100 since ipo heres went p...
2,2017-09-30,Mystified by $ROKU's IPO? @JimCramer's breakin...,Mystified by $ROKU's IPO? @JimCramer's breakin...,Mystified by $ROKU's IPO? @JimCramer's breakin...,mystified by $roku's ipo? @jimcramer's breakin...,mystified by rokus ipo jimcramers breaking it ...,mystified rokus ipo jimcramers breaking
3,2017-09-29,"$ROKU Long. One Ticker, a few trades. Couldve ...","$ROKU Long. One Ticker, a few trades. Couldve ...","$ROKU Long. One Ticker, a few trades. Couldve ...","$roku long. one ticker, a few trades. couldve ...",roku long one ticker a few trades couldve done...,roku long one ticker trades couldve done lot b...
4,2017-09-29,Love #Pennystocks Nice Friday for my 2 account...,Love #Pennystocks Nice Friday for my 2 account...,Love #Pennystocks Nice Friday for my 2 account...,love #pennystocks nice friday for my 2 account...,love pennystocks nice friday for my 2 accounts...,love pennystocks nice friday 2 accounts 1200 v...


In [17]:
R_df.head()

Unnamed: 0,Date,Subreddit,Title,Self_text,Combined_text,Combined_text_LinksRemoved,Combined_text_Link+Emoji_Removed,Combined_text_cleaned_lowercase,Combined_text_cleaned_lower_NoPunct,Combined_text_FinalClean
0,2017-09-28,investing,ROKU up 28% shortly after IPO,http://www.hollywoodreporter.com/news/roku-sto...,ROKU up 28% shortly after IPO http://www.holly...,ROKU up 28% shortly after IPO \n\n>The video s...,ROKU up 28% shortly after IPO \n\n>The video s...,roku up 28% shortly after ipo >the video strea...,roku up 28 shortly after ipo the video streami...,roku 28 shortly ipo video streaming device mak...
1,2017-09-28,stocks,9/28 Thursday Stock Market Movers & News,#Good morning traders of the r/stocks sub! Hap...,9/28 Thursday Stock Market Movers & News #Good...,9/28 Thursday Stock Market Movers & News #Good...,9/28 Thursday Stock Market Movers & News #Good...,9/28 thursday stock market movers & news #good...,928 thursday stock market movers news good mo...,928 thursday stock market movers news good mor...
2,2017-09-28,wallstreetbets,WHEN TO SHORT ROKU,"this company does nothing new, i think 24~ SHO...",WHEN TO SHORT ROKU this company does nothing n...,WHEN TO SHORT ROKU this company does nothing n...,WHEN TO SHORT ROKU this company does nothing n...,when to short roku this company does nothing n...,when to short roku this company does nothing n...,short roku company nothing new think 24 short ...
3,2017-09-28,wallstreetbets,ROKU now listex on NASDAQ. Up 35% so far,.,ROKU now listex on NASDAQ. Up 35% so far .,ROKU now listex on NASDAQ. Up 35% so far .,ROKU now listex on NASDAQ. Up 35% so far .,roku now listex on nasdaq. up 35% so far .,roku now listex on nasdaq up 35 so far,roku listex nasdaq 35 far
4,2017-09-28,wallstreetbets,"$ROKU IPO, to the moon...?",Why isn't this thing available on my POS broke...,"$ROKU IPO, to the moon...? Why isn't this thin...","$ROKU IPO, to the moon...? Why isn't this thin...","$ROKU IPO, to the moon...? Why isn't this thin...","$roku ipo, to the moon...? why isn't this thin...",roku ipo to the moon why isnt this thing avail...,roku ipo moon isnt thing available pos brokers


In [18]:
# Combining the Twitter dataframe and the Reddit Dataframe
temp_T_df = T_df[['date','tweet_LinkRemoved']].copy()
temp_T_df.columns = ['Date','Text']

temp_R_df = R_df[['Date','Combined_text_LinksRemoved']].copy()
temp_R_df.columns = ['Date','Text']

Sentiment_df = pd.concat([temp_T_df,temp_R_df])

In [19]:
print('Check to see if combination worked.')
print('Twitter dataframe number of entries: ', len(T_df))
print('Reddit dataframe number of entries: ', len(R_df))
print('Combined dataframe number of entries: ', len(Sentiment_df))

Check to see if combination worked.
Twitter dataframe number of entries:  565
Reddit dataframe number of entries:  618
Combined dataframe number of entries:  1183


# Running the sentiment function to find sentiment scores

In [44]:
Sentiment_df['Score'] = Sentiment_df.Text.apply(lambda x: Sentiment(x))

In [45]:
Sentiment_df

Unnamed: 0,Date,Text,Score
0,2017-09-29,"$1900 in 10 minutes on $ROKU, so glad the mark...",0.8989
1,2017-09-29,$ROKU is already up almost 100% since its IPO....,0.0000
2,2017-09-30,Mystified by $ROKU's IPO? @JimCramer's breakin...,0.0000
3,2017-09-29,"$ROKU Long. One Ticker, a few trades. Couldve ...",0.5647
4,2017-09-29,Love #Pennystocks Nice Friday for my 2 account...,0.9222
...,...,...,...
613,2021-05-11,"$FUBO - Archegos, Misreported Earnings, and th...",0.8991
614,2021-05-12,"Best Dip Stocks to buy right now Hey y'all,\n\...",0.9050
615,2021-05-13,"The Only Watch List You Need May 13, 2021 *The...",0.9868
616,2021-05-20,PUBM DD #1 \n\nA little bit about myself for ...,0.9992


In [46]:
Text_to_check = Sentiment_df[Sentiment_df['Score'] == 0]
Text_to_check

Unnamed: 0,Date,Text,Score
1,2017-09-29,$ROKU is already up almost 100% since its IPO....,0.0
2,2017-09-30,Mystified by $ROKU's IPO? @JimCramer's breakin...,0.0
15,2017-12-08,$ROKU Next week there will be a big storm in S...,0.0
23,2017-12-22,First time since after $ROKU earnings in Nov n...,0.0
24,2017-12-30,"Oh, what a year. 📈 #BestOf2017 stocks from IB...",0.0
...,...,...,...
478,2020-12-01,$ROKU I can’t quit you .,0.0
502,2020-12-24,"Finally upgraded to the next leve(nio, Roku, e...",0.0
512,2021-01-05,Quibi Selling Catalog To Roku: Report .,0.0
531,2021-01-20,$NFLX $DIS $ROKU Streaming Services Comparison...,0.0


In [55]:
print('Texts from ',len(Text_to_check),' had a sentiment score of 0 (they analyzer did not understand or they were neutral).')
print('That is ', len(Text_to_check)/len(Sentiment_df)*100,'% of the entries we have.')

Texts from  173  had a sentiment score of 0 (they analyzer did not understand or they were neutral).
That is  14.623837700760777 % of the entries we have.


# Combining the scores for each day

2021-02-19    8
2020-12-24    8
2019-12-24    8
2020-12-18    8
2020-04-17    8
             ..
2020-10-13    1
2020-08-17    1
2019-03-26    1
2018-05-09    1
2019-09-28    1
Name: Date, Length: 519, dtype: int64