In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import matplotlib as mpl
import matplotlib.pylab as plt # for plotting
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
import tensorflow as tf
from tensorflow import keras

In [194]:
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Input, Concatenate, GlobalMaxPooling1D
from tensorflow.keras.models import Model

In [195]:
# Load the three pre-built pickles: labelled Trump tweets, labelled
# politician tweets, and the stock frame.
# NOTE(review): unpickling can execute arbitrary code -- only load these
# files from a trusted source.
trump, pol, stocks = (
    pd.read_pickle(path)
    for path in (
        "data/trump_sentiment_labelled.pkl",
        "data/politician_sentiment_labelled.pkl",
        "data/df_stocks.pkl",
    )
)

In [196]:
def organize_by_user(politician_df):
    """Group the rows of ``politician_df`` by tracked Twitter handle.

    Rows whose ``'user'`` value is not one of the handles listed below are
    skipped.

    Parameters
    ----------
    politician_df : pandas.DataFrame
        Frame with a ``'user'`` column holding the handle of each row.

    Returns
    -------
    dict
        Maps every tracked handle that occurs in the frame to the list of
        its rows (as yielded by ``DataFrame.iterrows``), in frame order.
        Handles that never occur are absent from the dict.
    """
    handles = ['@SenTedCruz', '@SenWarren', '@RepAOC',
               '@KamalaHarris', '@SenatorLeahy', '@SenSanders',
               '@GOPLeader', '@SenSchumer', '@SenatorDurbin',
               '@CoryBooker', '@SenJohnThune', '@RepDanCrenshaw',
               '@Liz_Cheney', '@SenJohnBarrasso', '@IlhanMN']
    users = {}

    for _, row in politician_df.iterrows():
        handle = row['user']
        if handle not in handles:
            continue
        # First sighting creates the bucket; later ones append to it.
        users.setdefault(handle, []).append(row)

    return users

def get_avgd_array(df):
    """Collapse per-tweet rows into one row of per-day averages.

    Rows are addressed by position: column 1 must expose ``.date()``,
    columns 3 and 4 are averaged as retweets and favorites, and the last
    four columns are averaged as the neg / pos / neu / compound scores
    (in that positional order).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per tweet, laid out as described above.

    Returns
    -------
    numpy.ndarray
        One row per distinct day, in order of first appearance:
        ``[day, avg_neg, avg_pos, avg_neu, avg_comp, avg_retweets,
        avg_favorites]``.
    """
    # Bucket the raw rows by calendar day.
    rows_by_day = {}
    for row in np.array(df):
        rows_by_day.setdefault(row[1].date(), []).append(row)

    # Row positions of the averaged fields, listed in output order:
    # neg, pos, neu, compound, retweets, favorites.
    field_positions = (-4, -3, -2, -1, 3, 4)

    daily_rows = []
    for day, tweets in rows_by_day.items():
        count = len(tweets)
        means = [
            sum(tweet[pos] for tweet in tweets) / count
            for pos in field_positions
        ]
        daily_rows.append([day] + means)

    return np.array(daily_rows)

def moving_average(x, w):
    """Return the simple moving average of ``x`` over windows of ``w`` samples.

    Only fully-overlapping windows are produced (``mode='valid'``), so for
    ``len(x) >= w`` the result has ``len(x) - w + 1`` entries.
    """
    window = np.ones(w)
    window_sums = np.convolve(x, window, mode='valid')
    return window_sums / w


def add_moving_average(tweet_array, window=3):
    """Append a trailing moving-average column for every column after the first.

    Column 0 is skipped (it holds the day in arrays built by
    ``get_avgd_array``). For each remaining original column a new column is
    appended on the right, in the same order, holding that column's
    ``window``-row moving average. The first ``window - 1`` rows, which have
    no complete window behind them, are filled with 0.

    Parameters
    ----------
    tweet_array : numpy.ndarray
        2-D array whose columns 1.. are numeric.
    window : int, optional
        Number of rows per average. Defaults to 3, the original behaviour.

    Returns
    -------
    numpy.ndarray
        A new array with ``tweet_array.shape[1] - 1`` extra columns.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    n_rows = tweet_array.shape[0]
    init_cols = tweet_array.shape[1]
    pad = window - 1

    for i in range(1, init_cols):
        if n_rows >= window:
            sma = moving_average(tweet_array[:, i], window)
            if pad:
                # Left-pad so the averages line up with the last row of
                # each window and the column has exactly n_rows entries.
                sma = np.insert(sma, [0] * pad, values=0)
        else:
            # With fewer rows than the window, np.convolve(..., 'valid')
            # swaps its operands and returns the wrong number of values,
            # which made the insert below fail. No complete window exists,
            # so the whole column is padding.
            sma = np.zeros(n_rows)
        tweet_array = np.insert(tweet_array, tweet_array.shape[1], sma, axis=1)
    return tweet_array

In [None]:
# Bucket the labelled politician tweets by handle; only the handles listed
# inside organize_by_user are kept.
users = organize_by_user(pol)

In [None]:
# Turn each handle's list of rows into a per-day array: rebuild a frame with
# the original column order, average by day, then append the moving averages.
# NOTE: this overwrites the entries of `users`, so the cell is not safe to
# re-run without first re-running the cell that builds `users`.
cols = pol.columns.tolist()
for key in users:
    users[key] = add_moving_average(
        get_avgd_array(pd.DataFrame(users[key], columns=cols))
    )
    

In [None]:
# Column layout of the per-user arrays: the date, the six daily averages,
# then the six matching three-day moving averages.
column_names = [
    "Date",
    "avg_neg", "avg_pos", "avg_neu", "avg_comp",
    "avg_retweets", "avg_favorites",
    "three_day_neg", "three_day_pos", "three_day_neu", "three_day_comp",
    "three_day_retweets", "three_day_favorites",
]
# Every column is a float except the leading date.
data_types = {"Date": 'datetime64[ns]'}
data_types.update({name: 'float64' for name in column_names[1:]})

# Positional pick of stock columns 0 and 8.
# NOTE(review): presumably the date and `daily_gain` columns -- confirm
# against df_stocks.
target = stocks.take([0, 8], axis=1)
for key in users:
    # Left-merge on the column name(s) shared with `target`, then drop rows
    # with any missing value, i.e. stock rows without tweet data.
    users[key] = pd.merge(
        target,
        pd.DataFrame(users[key], columns=column_names).astype(data_types),
        how='left',
    ).dropna()

In [None]:
# Train and evaluate one SVM classifier per politician: grid-search an RBF
# SVC on that politician's tweet features to predict `daily_gain`, draw its
# confusion matrix on the held-out rows, and record the test accuracy.

# Render all figure text (titles, axis labels, tick labels) in white.
# NOTE(review): presumably chosen for a dark notebook theme -- confirm.
COLOR = 'white'
mpl.rcParams['text.color'] = COLOR
mpl.rcParams['axes.labelcolor'] = COLOR
mpl.rcParams['xtick.color'] = COLOR
mpl.rcParams['ytick.color'] = COLOR
scores = []  # [handle, test accuracy] pairs, one per politician
# Hyper-parameter grid searched for every politician.
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
# 5 x 3 grid = 15 axes; zip() stops at the shorter sequence, so at most
# 15 politicians are processed.
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(20,20))
for user, ax in zip(users, axes.flatten()):
    # shuffle=False keeps row order: the leading 75% of rows train and the
    # trailing 25% test. random_state has no effect while shuffle is False.
    xtrain, xtest, ytrain, ytest = train_test_split(
        users[user].drop(['Date','daily_gain'], axis=1),
        users[user]['daily_gain'],
        test_size=0.25,
        shuffle=False,
        random_state=0
    )
    # refit=True retrains the best parameter combination on all of xtrain,
    # so the fitted search object can be used directly for prediction.
    grid = GridSearchCV(SVC(), param_grid, refit = True, scoring='accuracy')
    # fit() returns the search object itself: `model` and `grid` are the same.
    model = grid.fit(xtrain, ytrain)
    print(user)
    print(grid.best_params_)
    print(grid.best_estimator_)
    # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0
    # and removed in 1.2 (replacement: ConfusionMatrixDisplay.from_estimator);
    # this cell needs an older scikit-learn as written.
    plot_confusion_matrix(model, xtest, ytest, ax=ax)
    ax.title.set_text(user)
    # Pull overall accuracy out of the report dict; zero_division=0 reports
    # 0 (without a warning) for metrics whose denominator is zero.
    scores.append([user,classification_report(
        ytest, model.predict(xtest), output_dict=True, zero_division=0)['accuracy']])
plt.tight_layout()
print(pd.DataFrame(scores, columns=['Politician','Accuracy']))
    