Please note that some packages (Jupyter Notebook in particular) must be installed at specific versions.

To install Jupyter Dashboards and enable the interactive visualizations, run the following commands:

```
conda install notebook==5.5
conda install -c conda-forge jupyter_dashboards
jupyter nbextension enable jupyter_dashboards --py --sys-prefix
```

In [None]:
# Import packages
import pandas as pd
import numpy as np
import ipywidgets as widgets
from ipywidgets import interactive
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from IPython.display import Javascript
from plotnine import ggplot, aes, geom_line, scale_x_date, ylab, theme, geom_col, coord_flip, scale_x_discrete, geom_bar
from plotnine import labs, scale_fill_gradient, scale_fill_manual, element_text

#packages for LDA
import sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import smart_open
from gensim import matutils, models
import scipy.sparse
import re
import string
import os

#packages for log reg
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the Reddit comment/post exports and the S&P 500 price and ticker
# tables. All four CSV files are read relative to the working directory.
comments = pd.read_csv('all_comments_withSentimentTickers_newestdata.csv')
posts = pd.read_csv('all_posts_withSentimentTickers_newestdata.csv')
prices = pd.read_csv('sp500_prices.csv')
info = pd.read_csv('sp500_tickers.csv')

# Parse the price timestamps and keep a plain calendar date alongside them;
# the `date` column is what the dashboard joins and filters on.
prices['Ticker_Date'] = pd.to_datetime(prices['Ticker_Date'])
prices['date'] = prices['Ticker_Date'].dt.date

# `created_utc` is interpreted as seconds since the Unix epoch for both
# Reddit tables, and each gets the same derived `date` column.
for reddit_df in (posts, comments):
    reddit_df['created_utc'] = pd.to_datetime(reddit_df['created_utc'], unit='s', origin='unix')
    reddit_df['date'] = reddit_df['created_utc'].dt.date
comments['body'] = comments['body'].astype(str)

# Keep only comments that mention at least one ticker, then produce one row
# per (comment, ticker) pair by exploding the comma-separated ticker string.
ticker_comments = comments.loc[comments['tickers_mentioned'].notnull()].copy()
ticker_comments['ticker_list'] = [
    mentioned.replace(" ", "").split(",")
    for mentioned in ticker_comments['tickers_mentioned']
]
ticker_comments = ticker_comments.explode('ticker_list')

# Rows tagged with ticker 'A' are treated as bad data and removed
# (presumably the article "a" being matched as a symbol -- TODO confirm).
ticker_comments = ticker_comments.loc[ticker_comments['ticker_list'] != 'A'].copy()


In [None]:
#perform LDA

# Extra stop words removed in addition to NLTK's English list.
addsw = (
    'going', 'would', 'go', 'think', 'money', 'dont', 'let', 'could',
    'never', 'oh', 'oooh', 'got', 'like', 'youre', 'well', 'la', 'im',
    'ive', 'whats', 'theyve', 'ohohh', 'youve', 'cant', 'wanna', 'another',
    'theres', 'know', 'one', 'want', 'good', 'get', 'ill', 'market', 'time',
    'stocks', 'people', 'buy', 'stock', 'years', 'inflation', 'trading',
    'make',
)

# Combined stop-word list used by split_comment() and the CountVectorizer.
# The corpus reader is reached through the `nltk` package instead of the
# imported name `stopwords`, because this very assignment rebinds `stopwords`
# to a plain list: going through the rebound name would raise AttributeError
# as soon as this cell is executed a second time.
stopwords = nltk.corpus.stopwords.words('english') + list(addsw)
    
def split_comment(comment):
    """Split *comment* on whitespace and drop every stop word.

    Tokens are compared against the module-level ``stopwords`` collection
    exactly as written (no case folding happens here). The surviving tokens
    are returned as a list in their original order.
    """
    return [token for token in comment.split() if token not in stopwords]

def clean_comments(comment):
    """Normalise a raw comment before it is tokenised.

    The steps run in this order:

    1. coerce the value to ``str`` and lower-case it;
    2. delete every span that opens with ``(`` or ``[`` and closes with the
       nearest ``)`` or ``]`` on the same line;
    3. strip all ASCII punctuation characters;
    4. drop empty lines and re-join the remaining ones with ``os.linesep``.
    """
    text = str(comment).lower()
    text = re.sub(r'[\(\[].*?[\)\]]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # filter(None, ...) discards the empty strings left by blank lines.
    return os.linesep.join(filter(None, text.splitlines()))

#comments['words'] = comments['body'].apply(lambda x: split_comment(x))

# Keep only the comments from the three days leading up to the newest one.
mx_dt = comments['created_utc'].max().date()
recent_comments = comments[comments['created_utc'].dt.date >= (mx_dt - timedelta(days=3))].copy()
recent_comments['body'] = recent_comments['body'].apply(clean_comments)
recent_comments['words'] = recent_comments['body'].apply(split_comment)

# Build one document per subreddit by joining its cleaned comment bodies.
combined_comments = recent_comments.groupby('subreddit_id')['body'].apply(','.join).reset_index()

# Document-term counts over unigrams and bigrams, minus the stop words.
cv = CountVectorizer(stop_words=stopwords, ngram_range=(1,2), analyzer='word')
data_cv = cv.fit_transform(combined_comments.body)

# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out() and no longer provide get_feature_names();
# older releases only have the latter, so use whichever is available.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_stop.index = combined_comments.index

# Transposed to terms x documents: Sparse2Corpus is called with its default
# arguments below, which treats each column as one document.
tds = data_stop.transpose()

# Compatibility shim kept from the original notebook (presumably required by
# the gensim/smart_open versions it was written against). It is applied only
# when the legacy entry point exists, so that an installation without
# `smart_open.smart_open` does not fail with AttributeError here.
if hasattr(smart_open, 'smart_open'):
    smart_open.open = smart_open.smart_open

sparse_counts = scipy.sparse.csr_matrix(tds)
corpus = matutils.Sparse2Corpus(sparse_counts)
# gensim wants id -> term, CountVectorizer stores term -> id.
id2word = {term_id: term for term, term_id in cv.vocabulary_.items()}

lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
print("LDA: Relevant topics for last 3 days")
x = lda.print_topics()

# Turn the printed topics into a table with one row per topic.
# Each entry of `x` is indexed as (topic id, description); the description is
# a string such as '0.012*"term a" + 0.010*"term b"'. Splitting it on the
# double quote leaves the terms at the odd positions.
d = {}
for topic_pos, topic in enumerate(x):
    topic_terms = topic[1].split('"')[1::2]
    d['df_{}'.format(topic_pos)] = pd.DataFrame(topic_terms)

# Concatenate every topic column (not just the first two) so the table keeps
# working if the number of topics requested from the model changes.
df = pd.concat(list(d.values()), axis=1)
df = df.transpose().reset_index(drop=True)
df = df.rename(index={pos: 'Topic {}'.format(pos + 1) for pos in range(len(d))})
df

In [None]:
# Define our interactive GUI elements

# Dropdown: the 20 tickers with the most comment rows, listed alphabetically.
mention_counts = ticker_comments.groupby('ticker_list').count()['id'].sort_values(ascending=False)
tickers = sorted(set(mention_counts[:20].index))

ticker_dropdown = widgets.Dropdown(
    description='Stock Ticker:',
    options=tickers,
    value=tickers[0],
    disabled=False,
)

# Range slider: one selectable step per calendar day between the first and
# last comment date, initially spanning the whole period.
start_date = min(comments['date'])
end_date = max(comments['date'])
date_range = pd.date_range(start_date, end_date, freq='D')

slider_options = [(day.strftime('%m/%d/%y'), day) for day in date_range]
date_slider = widgets.SelectionRangeSlider(
    options=slider_options,
    index=(0, len(date_range) - 1),
    orientation='horizontal',
    layout={'width': '500px'},
)
# Hook up interactivity to output cells
def ticker_filter(ticker=ticker_dropdown.value, date_range = date_slider.value):
    """Ask the notebook front end to re-run the dashboard output cells.

    ``interactive`` calls this whenever the dropdown or the slider changes.
    Neither argument is read here: the function only emits JavaScript that
    re-executes the notebook cells at positions 5 through 10, one request per
    cell. Those positions are hard-coded, so inserting or removing cells
    above them requires updating the range below.
    """
    for cell_index in range(5, 11):
        display(Javascript("Jupyter.notebook.execute_cells([{}])".format(cell_index)))
interactive(ticker_filter, ticker=ticker_dropdown, date_range = date_slider)

In [None]:
# Define functions to generate dashboard elements
def show_ticker_info(ticker):
    """Display the rows of the global ``info`` table for *ticker*.

    Rows are selected where ``info['Symbol']`` equals *ticker* and rendered
    through the pandas Styler with the index column hidden.
    """
    ticker_info = info.loc[info['Symbol'] == ticker]
    styler = ticker_info.style
    # Styler.hide_index() was deprecated in favour of Styler.hide(axis='index')
    # and is gone from pandas 2.x; older pandas only has hide_index().
    if hasattr(styler, 'hide'):
        styled = styler.hide(axis='index')
    else:
        styled = styler.hide_index()
    display(styled)

def filter_data(ticker, start_date, end_date, full_df):
    """Return a copy of the rows of *full_df* for one ticker and date window.

    Args:
        ticker: value matched against ``full_df['ticker_list']``.
        start_date: inclusive lower bound; anything ``pd.to_datetime``
            accepts (``datetime.date``, ``pd.Timestamp``, string, ...).
        end_date: inclusive upper bound, same accepted types.
        full_df: frame with ``ticker_list`` and ``date`` columns.

    Returns:
        An independent copy of the matching rows. The ``date`` column is
        returned unchanged (it is not converted), so downstream joins on
        ``date`` keep working.
    """
    lower = pd.to_datetime(start_date)
    upper = pd.to_datetime(end_date)
    # The `date` column may hold plain datetime.date objects. Comparing those
    # directly with a Timestamp raises TypeError on current pandas, so the
    # comparison is done on a datetime64 view of the column instead.
    row_dates = pd.to_datetime(full_df['date'])
    keep = (
        (full_df['ticker_list'] == ticker)
        & (row_dates >= lower)
        & (row_dates <= upper)
    )
    return full_df[keep].copy()
    
def ticker_lines_new(ticker_df,start_date=date_slider.value[0], end_date=date_slider.value[1]):
    """Plot daily mean weighted sentiment against the stock's log return.

    *ticker_df* is averaged per ``date`` and inner-joined on ``date`` with the
    rows of the global ``prices`` table whose ``Symbol`` equals the
    module-level ``ticker`` (read from global scope, not passed in). The two
    series are drawn on twin y-axes of a single figure.

    ``start_date`` and ``end_date`` are accepted but not read by the body;
    their defaults are evaluated once, when the function is defined.
    """
    daily_sentiment = (
        ticker_df.groupby('date')
        .agg(total_weighted_sentiment=('weighted_sentiment', 'mean'))
        .reset_index()
    )
    price_rows = prices.loc[prices['Symbol'] == ticker]
    merged = daily_sentiment.merge(price_rows, on='date')

    fig, sentiment_ax = plt.subplots(dpi=100)
    sentiment_ax.plot(merged['date'], merged['total_weighted_sentiment'],
                      color='k', label='weighted sentiment')
    # Second y-axis sharing the same dates for the return series.
    return_ax = sentiment_ax.twinx()
    return_ax.plot(merged['date'], merged['log_return'], color='g', label='log return')
    for tick_label in sentiment_ax.get_xticklabels():
        tick_label.set_rotation(45)
    plt.grid(True)
    fig.legend(prop={'size': 7})
    plt.title(ticker + ' Weighted Sentiment vs Log Return')
    sentiment_ax.set_xlabel('Date')
    sentiment_ax.set_ylabel('Weighted Sentiment Score')
    return_ax.set_ylabel('log return')
    plt.tight_layout()
    plt.show()
    
def ticker_mentions(start_date=date_slider.value[0], end_date=date_slider.value[1]):
    """Plot the 30 most-mentioned tickers within a date window.

    Rows of the global ``ticker_comments`` whose ``date`` lies between
    *start_date* and *end_date* (both inclusive) are counted per ticker and
    shown as a horizontal bar chart with the most-mentioned ticker on top.

    Args:
        start_date: inclusive lower bound; anything ``pd.to_datetime``
            accepts. The default is evaluated once, when the function is
            defined, so pass the current slider value explicitly.
        end_date: inclusive upper bound, same remarks.

    Returns:
        None. Prints a short message instead of plotting when no rows fall
        inside the window.
    """
    # The `date` column may hold datetime.date objects while the slider
    # yields Timestamps; comparing the two directly raises TypeError on
    # current pandas, so both sides are brought to datetime64 first.
    row_dates = pd.to_datetime(ticker_comments['date'])
    in_window = (
        (row_dates >= pd.to_datetime(start_date))
        & (row_dates <= pd.to_datetime(end_date))
    )

    # value_counts() sorts descending; keep the top 30 and reverse them so
    # that barh (which draws bottom-up) puts the largest bar at the top.
    counts = ticker_comments.loc[in_window, 'ticker_list'].value_counts().head(30).iloc[::-1]
    if counts.empty:
        print('No ticker mentions found in the selected date range')
        return

    fig, ax = plt.subplots(figsize=(6,5), dpi=100)
    counts.plot.barh(color='g', ax=ax)
    plt.ylabel('Ticker')
    plt.xlabel('Count of Mentions')
    plt.title('Top 30 Ticker Mentions over Date Range')
    plt.tight_layout()
    plt.show()

def subreddit_sentiment(ticker_df,start_date=date_slider.value[0], end_date=date_slider.value[1]):
    """Draw a bar chart of the mean weighted sentiment per subreddit.

    *ticker_df* is grouped by its ``subreddit`` column and the mean of
    ``weighted_sentiment`` is plotted for each group. The chart title uses
    the module-level ``ticker`` (read from global scope, not passed in).

    ``start_date`` and ``end_date`` are accepted but not read by the body;
    their defaults are evaluated once, when the function is defined.
    """
    fig, ax = plt.subplots(dpi=100)
    mean_by_subreddit = ticker_df.groupby('subreddit').agg({'weighted_sentiment': 'mean'})
    mean_by_subreddit.plot.bar(ax=ax, color='g', legend=False)
    plt.title('Average Weighted Sentiment by Subreddit: Ticker ' + ticker)
    plt.ylabel('Average Weighted Sentiment')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
    
def train_logistic_regression(data: pd.DataFrame, ticker: str):
    """Fit a next-day up/down logistic regression for *ticker* and report it.

    Comments in *data* are aggregated per calendar day into three features
    (mean ``weighted_sentiment``, mean ``total_awards_received`` and the
    number of rows). Each day is paired with the following calendar day's row
    of the global ``prices`` table for *ticker*; the label is 1 when that
    row's ``return`` is >= 0 and 0 otherwise. Days whose following calendar
    day has no price row are dropped by the inner join.

    Args:
        data: comment rows for a single ticker; must provide a datetime
            ``created_utc`` column plus ``weighted_sentiment`` and
            ``total_awards_received``.
        ticker: symbol used to select rows from ``prices``.

    Returns:
        A DataFrame built from ``classification_report(..., output_dict=True)``,
        evaluated on the same rows the model was trained on. An empty
        DataFrame is returned (after printing a notice) when fewer than two
        label classes are available, because the model cannot be fitted then.
    """
    prices_local = prices.loc[prices['Symbol'] == ticker].copy()

    # Group by calendar day (since we are predicting next day). Grouping on
    # the raw `created_utc` timestamp would yield one group per distinct
    # posting time, so the timestamps are truncated to midnight first.
    comment_day = data['created_utc'].dt.normalize()
    X = data.groupby(comment_day).agg(
        total_awards_received=('total_awards_received', 'mean'),
        weighted_sentiment=('weighted_sentiment', 'mean'),
        n_rows=('weighted_sentiment', 'count'),
    ).reset_index()

    # Add next day column so we can use that to join with prices data
    X['next_day'] = (X['created_utc'] + pd.Timedelta(days=1)).dt.date

    # Label on the price side: did the price go up (or stay flat) that day?
    prices_local['increased_next_day'] = (prices_local['return'] >= 0).astype(int)

    # Join the two datasets to get features and label in a single frame.
    model_df = X.merge(prices_local, left_on=['next_day'], right_on=['date'])[
        ['created_utc', 'weighted_sentiment', 'total_awards_received',
         'n_rows', 'increased_next_day']
    ]

    # Split into features and label
    ticker_X = model_df[['weighted_sentiment', 'total_awards_received', 'n_rows']]
    ticker_y = model_df['increased_next_day']

    print("Binary Next Day Increase Logistic Regression Model")
    if ticker_y.nunique() < 2:
        # Covers both "no joined rows" and "every label identical".
        print("Not enough data to train a model for", ticker)
        return pd.DataFrame()

    clf = LogisticRegression()
    clf.fit(ticker_X, ticker_y)
    y_pred = clf.predict(ticker_X)
    report = classification_report(ticker_y, y_pred, output_dict=True)

    # NOTE(review): the last row of ticker_X is the most recent day that
    # still has a matching next-day price, i.e. a row the model was trained
    # on -- confirm this is the intended "next day" prediction.
    print("Next Day Prediction: ", clf.predict(ticker_X.iloc[-1:]))
    return pd.DataFrame(report)

In [None]:
# Read the current widget selections. The date bounds are taken from the
# slider right here: the cell that refreshes start_date/end_date comes after
# this one, so when the cells are re-run top to bottom this filter would
# otherwise still use the previously selected range.
ticker = ticker_dropdown.value
start_date, end_date = date_slider.value

ticker_dataframe = filter_data(ticker,start_date,end_date,ticker_comments)

show_ticker_info(ticker)

In [None]:
# Pick up the selected date window from the slider, then draw the
# sentiment vs. log-return chart for the current ticker.
start_date, end_date = date_slider.value
ticker_lines_new(ticker_dataframe, start_date=start_date, end_date=end_date)

In [None]:
# Bar chart of the most-mentioned tickers inside the selected date window.
ticker_mentions(start_date=start_date, end_date=end_date)

In [None]:
# Mean weighted sentiment per subreddit for the currently selected ticker.
subreddit_sentiment(ticker_dataframe, start_date=start_date, end_date=end_date)

In [None]:
# Fit the next-day model; the returned report frame is shown as cell output.
train_logistic_regression(data=ticker_dataframe, ticker=ticker)