In [67]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import os
from sklearn.model_selection import train_test_split, StratifiedKFold
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, Pipeline
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [68]:
rs=121
stocks=['aapl','googl','nvda','dal','xom','cvx','vz']
best_spreads=[22,6,13,21,11,7,24]

In [69]:
init_dir_path = str(os.getcwd())
data=pd.read_csv(init_dir_path+'/stock_hmm_output.csv',engine='python').drop(['Unnamed: 0'],axis=1)
smooth_data=pd.read_csv(init_dir_path+'/stock_hmm_output_kalman.csv',engine='python').drop(['Unnamed: 0'],axis=1)
features=['Return','Volatility','HighR','LowR']
target='State'
data[target] = smooth_data[target] 

In [70]:
final_test=[]
stocks_data=[]
ft_size=400
for x in data.groupby(['Ticker']):
    final_test.append(x[1][-ft_size:])
    stocks_data.append(x[1][:-ft_size])

In [71]:
def create_sliding_window(train_df, features, target, window=7):
    splitter = SlidingWindowSplitter(window_length=window, step_length=1, fh=[1])
    train_x, train_y= [], []

    for idx in splitter.split(train_df):
        X = train_df.iloc[idx[0]][features].values.flatten()
        y = train_df.iloc[idx[1]][target].values.flatten()[0]
        train_x.append(X)
        train_y.append(y)

    return (train_x, train_y)

In [72]:
testing_lists = {}
for ticker_idx, ticker_df in enumerate(final_test):
    ticker = list(ticker_df["Ticker"])[0]
    (test_x, test_y) = create_sliding_window(ticker_df, features, target, window=best_spreads[ticker_idx])
    testing_lists[ticker] = [test_x,test_y] #spread->stock index->time series split->data/label

In [73]:
training_lists = {}
for ticker_idx, ticker_df in enumerate(stocks_data):
    ticker = list(ticker_df["Ticker"])[0]
    (train_x, train_y) = create_sliding_window(ticker_df, features, target, window=best_spreads[ticker_idx])
    training_lists[ticker] = [train_x,train_y] #spread->stock index->time series split->data/label

In [74]:
with open('best_logistic.pkl','rb') as f:
    best_logmodels=pickle.load(f)

In [75]:
best_models=[]
for i in range(len(stocks)):
    best_models.append(best_logmodels[best_spreads[i]][i])
best_models[stocks.index('aapl')]=SVC(random_state=rs)
best_models[stocks.index('vz')]=SVC(random_state=rs)

In [78]:
accuracies=pd.DataFrame(index=['f1_score','accuracy_score','baseline_f1','baseline_accuracy'])
for (i,stock) in enumerate(stocks):
    model=best_models[i]
    model.fit(X=training_lists[stock][0],y=training_lists[stock][1])
    y_pred=model.predict(testing_lists[stock][0])
    y_true=testing_lists[stock][1]
    y_naive=[0]*len(y_true)
    accuracies[stock]=[f1_score(y_pred,y_true,pos_label=0),accuracy_score(y_pred,y_true),f1_score(y_true,y_naive,pos_label=0),accuracy_score(y_true,y_naive)]

In [79]:
accuracies

Unnamed: 0,aapl,googl,nvda,dal,xom,cvx,vz
f1_score,0.961988,0.935385,0.828452,0.998693,0.968023,0.951841,0.949405
accuracy_score,0.931217,0.889182,0.789203,0.997416,0.941489,0.913706,0.913486
baseline_f1,0.962963,0.933896,0.812214,0.994805,0.957004,0.924966,0.918845
baseline_accuracy,0.928571,0.875989,0.683805,0.989664,0.917553,0.860406,0.849873
