## Import Packages

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
nltk.download('vader_lexicon')
from datetime import datetime
import spacy as sp
nlps = sp.load('en')
from spacy.matcher import PhraseMatcher, Matcher
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input tree,
# then print the paths one per line in walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Preprocessing

In [None]:
# Load the r_wallstreetbets_posts.csv export from the Kaggle input directory into `data`.
data = pd.read_csv('/kaggle/input/reddit-rwallstreetbets/r_wallstreetbets_posts.csv')

In [None]:
# load tickers
import requests # allows for file downloading
import os

filename = 'ticker.txt'
url = 'https://www.sec.gov/include/ticker.txt'
if os.path.exists(filename):
    # Cached copy from an earlier run: skip the network round-trip.
    print(f'{filename} already downloaded')
else:
    # timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, allow_redirects=True, timeout=30)
    # Raise on 4xx/5xx BEFORE writing: otherwise an error page would be saved
    # as ticker.txt and every later run would report it as already downloaded.
    r.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(r.content)
    print(f'{filename} has been downloaded')


In [None]:
# Parse the tab-separated ticker file (no header row) into a two-column frame
tickers = pd.read_csv('ticker.txt', sep="\t", header=None)
tickers.columns = ['symbol', 'code']

# Upper-case every symbol. Beware: several symbols collide with ordinary
# words or single letters (e.g. NAN, K, AND).
tickers['symbol'] = tickers.symbol.str.upper()
# Show the row at position 4101 - presumably the 'NAN' entry discussed below;
# TODO confirm, the position depends on the downloaded file.
print(tickers.iloc[4101])

# Discard rows containing a missing value; for now this is what removes the
# 'NAN' (North American Nickel) ticker.
tickers = tickers.dropna()
# Sanity check: prints whether any missing value is left in the frame
print(tickers.isna().to_numpy().any())
tickers.head(5)

In [None]:
# Display the column labels of the loaded posts frame.
data.columns

In [None]:
# Remove metadata columns that the rest of the notebook does not use.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so a
# second execution would otherwise raise KeyError for the already-dropped columns.
data = data.drop(
    columns=['awarders', 'over_18', 'author_flair_text', 'removed_by', 'full_link', 'author'],
    errors='ignore',
)

In [None]:
print(data.columns)
print(data.shape)

# analysis of date and scores
# created_utc is converted as epoch seconds with pandas, so the printed range
# is in UTC and no longer depends on the machine's local timezone (which is
# what datetime.fromtimestamp would apply).
max_date = pd.to_datetime(data.created_utc.max(), unit='s')
min_date = pd.to_datetime(data.created_utc.min(), unit='s')
print('date ranges: ', min_date, max_date)
# Series.max()/min() are vectorised and skip NaN, unlike the Python builtins
print('score range ', data.score.max(), data.score.min())
print('num comments range ', data.num_comments.max(), data.num_comments.min())

# find score and comments distributions
no_score_data = data[data.score == 0]
# NOTE: the "low" bucket is score <= 100, so it also contains the zero-score posts
low_score_data = data[data.score <= 100]
mid_score_data = data[(data.score <= 1000) & (data.score > 100)]
high_score_data = data[data.score > 1000]
print('scores: ', len(no_score_data), len(low_score_data), len(mid_score_data), len(high_score_data))
print('avg score: ', np.mean(data.score.values), 'std score: ', np.std(data.score.values))

In [None]:
# keep posts with score >= 100 or num comments >= 50 or total awards >= 10
# determine best threshold for keeping posts
# .copy() makes df an independent frame: the notebook adds columns to df later,
# and assigning into a filtered slice of `data` triggers SettingWithCopyWarning.
df = data[(data.score >= 100) | (data.num_comments >= 50) | (data.total_awards_received >= 10)].copy()
print('comments avg and std: ', df.num_comments.mean(), df.num_comments.std())
print(df.shape)

## Vader Sentiment Scores

In [None]:
# Calculate polarity scores
# Single VADER SentimentIntensityAnalyzer instance; the scoring code below reads
# it as a module-level global.
sia = SIA()
def calculate_sentiment(text):
    '''
    Label a piece of text with the VADER compound polarity score.

    text: string input
    returns: 1 when compound >= 0.05 (positive), -1 when compound <= -0.05
             (negative), 0 otherwise (neutral)

    The +/-0.05 band is the baseline; change it to 0 for binary classification.
    '''
    compound = sia.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 1
    if compound <= -0.05:
        return -1
    return 0

In [None]:
# add sentiment and compound col to df
# Both columns are computed from the post title only:
#   sentiment -> the -1 / 0 / 1 label from calculate_sentiment
#   compound  -> the raw VADER compound score
df['sentiment'] = df['title'].map(calculate_sentiment)
df['compound'] = df['title'].map(lambda title: sia.polarity_scores(title)['compound'])
df.head(3)

In [None]:
# counting sentiments
sentiments = [-1, 0, 1]
# Number of posts per label, in the same order as `sentiments`
sentiments_count = [(df.sentiment == label).sum() for label in sentiments]
df.sentiment.value_counts()

## TextBlob Sentiment Scores

In [None]:
# Calculating TextBlob Scores

from textblob import TextBlob

def calculate_sentiment_tb(text,compound=True):
    '''
    Score a piece of text with TextBlob.

    text: string input
    compound: when true, return the TextBlob polarity value unchanged.
              Otherwise bucket it: 1 if polarity >= 0.05, -1 if
              polarity <= -0.05, else 0.
    '''
    polarity = TextBlob(text).polarity

    if compound:
        return polarity

    if polarity >= 0.05:
        return 1
    if polarity <= -0.05:
        return -1
    return 0
        
# TextBlob polarity of each post title (compound=True default, so the raw value is stored)
df['tb_sentiment'] = df['title'].map(calculate_sentiment_tb)

In [None]:
# Preview the first three rows of df.
df.head(3)

## Finding Stock Tickers in Posts

In [None]:
# stocks to find,
# Apple trades as AAPL. The previous 'APPL' entry meant correctly spelled
# mentions were never matched.
stocks = ['SPY', 'GME', 'AMC', 'TSLA', 'PLTR', 'AAPL', 'AMD', 'BB', 'AMZN', 'NIO', 'NVDA', 'MU', 'RH', 'SNAP', 'NOK', 'SPCE']
# Alias -> ticker. Keys are matched against upper-cased titles, so keep them upper case.
# 'APPL' is kept as an alias so the misspelling still resolves to AAPL.
company_names = {'S&P': 'SPY', 'GAMESTOP': 'GME', 'AMC': 'AMC', 'TESLA': 'TSLA', 'PALANTIR': 'PLTR', 'APPLE': 'AAPL', 'APPL': 'AAPL', 'AMD': 'AMD', 'BLACKBERRY': 'BB',
                 'AMAZON': 'AMZN', 'NIO': 'NIO', 'NVIDIA': 'NVDA', 'MICRON': 'MU', 'RESTORATION': 'RH', 'SNAPCHAT': 'SNAP', 'NOKIA': 'NOK', 'VIRGIN': "SPCE"}

In [None]:
# pattern matching with most frequent stocks
# A blank English pipeline is used to turn each phrase into a Doc.
nlp_freq = sp.blank('en')
matcher_freq = PhraseMatcher(nlp_freq.vocab, attr='TEXT')
# One pattern Doc per ticker symbol and per company-name key
freq_token_list = list(map(nlp_freq, [*stocks, *company_names]))
matcher_freq.add('Freq Stocks', freq_token_list)

In [None]:
# find tickers and company names only from specified list
def find_tickers_and_names(title):
    '''
    Return the tracked tickers mentioned in a post title as one '_'-joined string.

    The title is upper-cased and scanned with matcher_freq. A match that is a
    key of company_names is translated to its ticker; any other match is kept
    as-is. Every distinct ticker found also increments the global ticker_freq
    counter by one per call.

    title: string input. Anything that is not a string (e.g. NaN for a missing
           title) returns '' and leaves ticker_freq untouched.
    returns: the distinct tickers, sorted alphabetically and joined with '_',
             or '' when nothing matched. Sorting makes the result reproducible:
             the iteration order of a set of strings can change between
             Python sessions.
    '''
    if not isinstance(title, str):
        return ''
    doc = nlp_freq(title.upper())
    # Each match is (match_id, start, end); spaces are stripped from the matched span
    found_items = {str(doc[start:end]).replace(' ', '') for _, start, end in matcher_freq(doc)}
    tickers_list = sorted({company_names.get(item, item) for item in found_items})
    for ticker in tickers_list:
        ticker_freq[ticker] += 1
    return '_'.join(tickers_list)

In [None]:
# Start from an empty counter so re-running this cell does not add to earlier counts
ticker_freq = Counter()
# Series.apply calls the function exactly once per title. Row-wise
# DataFrame.apply is slower and, on pandas < 1.1, evaluates the first row
# twice - which double-counted that row's tickers in ticker_freq.
df['orgs'] = df['title'].apply(find_tickers_and_names)
print(ticker_freq, sum(ticker_freq.values()))

In [None]:
# create new dataframe for only posts with specific organizations mentioned
# (an empty orgs string means no tracked ticker or company name was found)
has_orgs = df['orgs'].ne('')
df_stocks = df[has_orgs]
print(df_stocks.shape)
df_stocks.head()

## Finding Posts by Stock Ticker

In [None]:
def get_daily_sentiment(dataframe, ticker, sentiment_type='compound'):
    '''
    get the daily mean sentiment associated with a stock

    dataframe: posts frame with an 'orgs' column ('_'-joined tickers), a
               'created_utc' column (epoch seconds) and the sentiment column.
               It is not modified.
    ticker: string input; matched exactly against the '_'-separated entries of
            'orgs' (no substring or regex matching, so 'AMC' cannot hit a
            longer symbol and special characters are taken literally)
    sentiment_type: 'sentiment', 'compound',or 'tb_sentiment'

    returns: DataFrame with one row per calendar date (UTC) and two columns,
             'Date' and <sentiment_type> holding that day's mean score.
    '''
    # Missing orgs count as "no tickers"; astype(bool) keeps the mask boolean
    # even when the frame is empty.
    mentions = (
        dataframe['orgs']
        .fillna('')
        .str.split('_')
        .map(lambda symbols: ticker in symbols)
        .astype(bool)
    )
    # Filter first, then copy only the matching rows so adding 'Date' below
    # never touches (or warns about) the caller's frame.
    df_ticker = dataframe.loc[mentions].copy()
    df_ticker['Date'] = pd.to_datetime(df_ticker['created_utc'], unit='s').dt.date
    df_ticker_scores = df_ticker.groupby('Date')[sentiment_type].mean().reset_index()

    return df_ticker_scores

In [None]:
# Daily mean of the 'tb_sentiment' column over the posts in df whose orgs mention TSLA.
df_tsla_scores = get_daily_sentiment(df,'TSLA',sentiment_type='tb_sentiment')

## Getting Stock Market Data

In [None]:
pip install yfinance

In [None]:
import yfinance as yf

In [None]:
def get_market_data(ticker, df_scores, min_date,drop_NaN=True): 
    '''
    get the daily price of a stock joined with its daily sentiment scores

    ticker: ticker symbol passed to yfinance
    df_scores: frame with a 'Date' column (calendar dates) and one score
               column, as returned by get_daily_sentiment. Not modified.
    min_date: earliest date to get price and scores for (inclusive). 'YYYY-MM-DD'
    drop_NaN: if true, keep only the dates that have both market data and a
              score. Otherwise the missing side is filled with NaN (e.g. no
              market data on weekends & holidays).

    returns: DataFrame indexed by date (timezone-naive DatetimeIndex) with the
             yfinance history columns followed by the score column.
    '''
    market = yf.Ticker(ticker)
    min_date = pd.to_datetime(min_date).date()
    max_date = max(df_scores.Date)
    # yfinance treats `end` as exclusive, so request one extra day; otherwise
    # the last day that has a sentiment score would have no price row.
    end_date = (pd.Timestamp(max_date) + pd.Timedelta(days=1)).date()
    hist = market.history(start=min_date, end=end_date).copy()

    # Bring both sides onto the same kind of index before joining. yfinance
    # may return a timezone-aware DatetimeIndex while the scores are keyed by
    # plain date objects; left as-is the two do not align and dropna() would
    # discard every row.
    hist.index = pd.to_datetime(hist.index)
    if hist.index.tz is not None:
        hist.index = hist.index.tz_localize(None)
    hist.index = hist.index.normalize()

    # >= so that min_date itself is kept, matching the inclusive price range
    df_scores_date = df_scores[df_scores.Date >= min_date].set_index('Date')
    df_scores_date.index = pd.to_datetime(df_scores_date.index)

    df_wsb = pd.concat([hist, df_scores_date], axis=1)

    if drop_NaN:
        df_wsb = df_wsb.dropna()

    return df_wsb

In [None]:
# TSLA price history from 2020-01-01 joined with the daily scores; rows with any NaN are dropped.
df_tsla_wsb = get_market_data('TSLA',df_tsla_scores,'2020-01-01',drop_NaN=True)

## Train Test Split

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit

dataset = df_tsla_wsb.values

# Column 0 (the open price) is the target; every remaining column is a feature.
features = dataset[:, 1:]
target = dataset[:, 0].reshape(-1, 1)

tscv = TimeSeriesSplit()
# Only the last fold is used (the one with the most training rows) - the same
# fold the previous loop ended on.
*_, (train_index, test_index) = tscv.split(features)

# Fit the scalers on the TRAINING rows only. Fitting on the whole dataset lets
# the min/max of the test period leak into training and makes the test score
# optimistic.
X_MinMax = MinMaxScaler()
y_MinMax = MinMaxScaler()
X_MinMax.fit(features[train_index])
y_MinMax.fit(target[train_index])

X = X_MinMax.transform(features)
y = y_MinMax.transform(target)

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Keras LSTM layers expect 3-D input: (samples, steps, 1)
X_train_scale = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test_scale = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

## LSTM

In [None]:
# Create LSTM trained on sentiment analysis data
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

# Four stacked 50-unit LSTM layers, each followed by 20% dropout, feeding a
# single-unit dense output. The input is (number of feature columns, 1).
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50),
    Dropout(0.2),
    Dense(units=1),
])

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train_scale, y_train, epochs=225, batch_size=32)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict the held-out fold. The RMSE below is computed on the min-max scaled
# values, not on prices.
predicted_stock_tsla = model.predict(X_test_scale)

testScore = np.sqrt(mean_squared_error(y_test, predicted_stock_tsla[:, 0]))

print('Root mean square error is {}'.format(testScore))

# Undo the target scaling so the curves are drawn in the original units
fig, ax = plt.subplots()
ax.plot(y_MinMax.inverse_transform(predicted_stock_tsla), label='testing predicted')
ax.plot(y_MinMax.inverse_transform(y_test), label='testing actual')
ax.set_title('TSLA Open Value Prediction with TextBlob Sentiment')
ax.legend()
plt.show()
plt.close(fig)


In [None]:
# Same evaluation on the training fold, to compare against the test error.
# The RMSE is again on the min-max scaled values.
predicted_stock_tsla_train = model.predict(X_train_scale)

trainScore = np.sqrt(mean_squared_error(y_train, predicted_stock_tsla_train[:, 0]))

print('Root mean square error is {}'.format(trainScore))

# Undo the target scaling so the curves are drawn in the original units
fig, ax = plt.subplots()
ax.plot(y_MinMax.inverse_transform(predicted_stock_tsla_train), label='training predicted')
ax.plot(y_MinMax.inverse_transform(y_train), label='training actual')
ax.set_title('TSLA Open Value Prediction with TextBlob Sentiment')
ax.legend()
plt.show()
plt.close(fig)

## LSTM Without Sentiment Input

In [None]:
# Test the model without sentiment

# Column 0 (the open price) is the target. Features are the remaining columns
# except the last one, which holds the sentiment score.
features = dataset[:, 1:-1]
target = dataset[:, 0].reshape(-1, 1)

tscv = TimeSeriesSplit()
# Only the last fold is used (the one with the most training rows) - the same
# fold the previous loop ended on.
*_, (train_index, test_index) = tscv.split(features)

# Fit the scalers on the TRAINING rows only. Fitting on the whole dataset lets
# the min/max of the test period leak into training and makes the test score
# optimistic.
X_MinMax = MinMaxScaler()
y_MinMax = MinMaxScaler()
X_MinMax.fit(features[train_index])
y_MinMax.fit(target[train_index])

X = X_MinMax.transform(features)
y = y_MinMax.transform(target)

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Keras LSTM layers expect 3-D input: (samples, steps, 1)
X_train_scale = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test_scale = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))


# Create the same LSTM architecture, trained on the features WITHOUT the sentiment column
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

# Four stacked 50-unit LSTM layers, each followed by 20% dropout, feeding a
# single-unit dense output. The input is (number of feature columns, 1).
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50, return_sequences=True),
    Dropout(0.2),
    LSTM(units=50),
    Dropout(0.2),
    Dense(units=1),
])

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train_scale, y_train, epochs=225, batch_size=32)

In [None]:
# Predict the held-out fold with the no-sentiment model. The RMSE below is
# computed on the min-max scaled values, not on prices.
predicted_stock_tsla = model.predict(X_test_scale)

testScore = np.sqrt(mean_squared_error(y_test, predicted_stock_tsla[:, 0]))

print('Root mean square error is {}'.format(testScore))

# Undo the target scaling so the curves are drawn in the original units
fig, ax = plt.subplots()
ax.plot(y_MinMax.inverse_transform(predicted_stock_tsla), label='predicted')
ax.plot(y_MinMax.inverse_transform(y_test), label='actual')
ax.legend()
ax.set_title('Without Sentiment Data tsla test')
plt.show()
plt.close(fig)


In [None]:
# Same evaluation on the training fold for the no-sentiment model.
# The RMSE is again on the min-max scaled values.
predicted_stock_tsla_train = model.predict(X_train_scale)

trainScore = np.sqrt(mean_squared_error(y_train, predicted_stock_tsla_train[:, 0]))

print('Root mean square error is {}'.format(trainScore))

# Undo the target scaling so the curves are drawn in the original units
fig, ax = plt.subplots()
ax.plot(y_MinMax.inverse_transform(predicted_stock_tsla_train), label='predicted')
ax.plot(y_MinMax.inverse_transform(y_train), label='actual')
ax.legend()
ax.set_title('Without Sentiment Data tsla train')
plt.show()
plt.close(fig)