In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import os
from sklearn.model_selection import train_test_split, StratifiedKFold
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, Pipeline
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [38]:
rs=121
stocks=['aapl','googl','nvda','dal','xom','cvx','vz']
best_spreads=[22,6,13,21,11,7,24]

In [39]:
init_dir_path = str(os.getcwd())
data=pd.read_csv(init_dir_path+'/stock_hmm_output.csv',engine='python').drop(['Unnamed: 0'],axis=1)
smooth_data=pd.read_csv(init_dir_path+'/stock_hmm_output_kalman.csv',engine='python').drop(['Unnamed: 0'],axis=1)
features=['Return','Volatility','HighR','LowR']
target='State'
data[target] = smooth_data[target] 

In [40]:
final_test=[]
stocks_data=[]
ft_size=400
for x in data.groupby(['Ticker']):
    final_test.append(x[1][-ft_size:])
    stocks_data.append(x[1][:-ft_size])

In [41]:
ft=[]
for x in final_test:
    ft.append(x[pd.to_datetime(x['Date'])<= pd.to_datetime('2024-12-31')])
final_test=ft

In [42]:
def create_sliding_window(train_df, features, target, window=7):
    splitter = SlidingWindowSplitter(window_length=window, step_length=1, fh=[1])
    train_x, train_y= [], []

    for idx in splitter.split(train_df):
        X = train_df.iloc[idx[0]][features].values.flatten()
        y = train_df.iloc[idx[1]][target].values.flatten()[0]
        train_x.append(X)
        train_y.append(y)

    return (train_x, train_y)

In [43]:
testing_lists = {}
for ticker_idx, ticker_df in enumerate(final_test):
    ticker = list(ticker_df["Ticker"])[0]
    (test_x, test_y) = create_sliding_window(ticker_df, features, target, window=best_spreads[ticker_idx])
    testing_lists[ticker] = [test_x,test_y] #spread->stock index->time series split->data/label

In [44]:
training_lists = {}
for ticker_idx, ticker_df in enumerate(stocks_data):
    ticker = list(ticker_df["Ticker"])[0]
    (train_x, train_y) = create_sliding_window(ticker_df, features, target, window=best_spreads[ticker_idx])
    training_lists[ticker] = [train_x,train_y] #spread->stock index->time series split->data/label

In [45]:
with open('best_logistic.pkl','rb') as f:
    best_logmodels=pickle.load(f)

In [46]:
best_models=[]
for i in range(len(stocks)):
    best_models.append(best_logmodels[best_spreads[i]][i])
best_models[stocks.index('aapl')]=SVC(random_state=rs)
best_models[stocks.index('vz')]=SVC(random_state=rs)

In [49]:
accuracies=pd.DataFrame(index=['f1_score','accuracy_score','baseline_f1','baseline_accuracy'])
for (i,stock) in enumerate(stocks):
    model=best_models[i]
    model.fit(X=training_lists[stock][0],y=training_lists[stock][1])
    y_pred=model.predict(testing_lists[stock][0])
    y_true=testing_lists[stock][1]
    y_naive=[1]*len(y_true)
    accuracies[stock]=[f1_score(y_pred,y_true,pos_label=0),accuracy_score(y_pred,y_true),f1_score(y_true,y_naive,pos_label=0),accuracy_score(y_true,y_naive)]

In [50]:
accuracies

Unnamed: 0,aapl,googl,nvda,dal,xom,cvx,vz
f1_score,0.980892,0.943396,0.852018,1.0,0.974194,0.955696,0.956667
accuracy_score,0.964072,0.901493,0.808696,1.0,0.951807,0.92,0.925501
baseline_f1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
baseline_accuracy,0.050898,0.107463,0.295652,0.0,0.063253,0.134286,0.146132
