In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import optuna
import os
import cupy as cp
from sklearn.model_selection import train_test_split, StratifiedKFold
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, Pipeline
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
# Integer constant, presumably a random seed -- TODO confirm intended use.
rs = 121

# Ticker symbols; later cells address stocks by their position in this list.
stocks = [
    'aapl', 'googl', 'nvda',
    'dal', 'xom', 'cvx', 'vz',
]

In [3]:
# Both CSV files are looked up in the current working directory.
init_dir_path = str(os.getcwd())
raw_csv_path = init_dir_path + '/stock_hmm_output.csv'
kalman_csv_path = init_dir_path + '/stock_hmm_output_kalman.csv'

# Each file carries a leftover 'Unnamed: 0' column that is discarded on load.
data = pd.read_csv(raw_csv_path, engine='python').drop(columns=['Unnamed: 0'])
smooth_data = pd.read_csv(kalman_csv_path, engine='python').drop(columns=['Unnamed: 0'])

features = ['Smooth_Return', 'Volatility', 'Smooth_HighR', 'Smooth_LowR']
target = 'State'

# Replace the target column of `data` with the one from the Kalman file.
# pandas aligns the assigned Series on the row index.
# NOTE(review): assumes both files list the same rows in the same order -- confirm.
data[target] = smooth_data[target]

In [4]:
# Number of trailing rows per ticker held back as the final test set.
ft_size = 400

# One frame per ticker, in the order groupby yields them.
# NOTE(review): groupby returns groups in sorted key order by default, which
# may differ from the order of `stocks` -- confirm that positional indexing
# into these lists matches the intended ticker.
per_ticker_frames = [frame for _, frame in data.groupby(['Ticker'])]

# Last `ft_size` rows of each ticker go to the hold-out set, the rest stay for modelling.
final_test = [frame[-ft_size:] for frame in per_ticker_frames]
stocks_data = [frame[:-ft_size] for frame in per_ticker_frames]

In [5]:
# Cross-validation layout: `ns` folds, each with `testing_size` test rows.
ns = 5
testing_size = 500
tss = TimeSeriesSplit(test_size=testing_size, n_splits=ns)

# Window ("spread") values iterated by the tuning loop: 2 through 24 inclusive.
spread_range = range(2, 25)

In [6]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # pickle can execute arbitrary code on load; only use files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Later cells index both objects as [window][stock_ind][fold][0] for the
# features and [window][stock_ind][fold][1] for the labels.
training_lists = _load_pickle('saved_training_lists_kalman_unsmoothened.pkl')
testing_lists = _load_pickle('saved_testing_lists_kalman_unsmoothened.pkl')

In [None]:
def accuracies_model(model_dict, window, stock_ind, map=True):
    """Refit a model on every fold and report mean accuracy plus a pooled F1.

    Parameters
    ----------
    model_dict : mapping or estimator
        With ``map=True`` a nested mapping indexed as
        ``model_dict[window][stock_ind]`` that yields the estimator; with
        ``map=False`` the estimator itself.
    window : hashable
        Key selecting the window in ``training_lists`` / ``testing_lists``.
    stock_ind : int
        Key selecting the stock inside the chosen window.
    map : bool, default True
        Whether ``model_dict`` must be indexed to obtain the estimator. The
        name shadows the builtin but is kept so keyword callers keep working.

    Returns
    -------
    tuple
        ``(mean_accuracy, f1)``. ``mean_accuracy`` is the average of the
        per-fold accuracies over the folds that were actually evaluated.
        ``f1`` is computed from the confusion matrix of the predictions pooled
        over those folds, treating the FIRST label in sorted order (row 0 of
        the matrix) as the positive class.

    Raises
    ------
    ValueError
        If every one of the ``ns`` folds has an empty test set.

    Notes
    -----
    NOTE(review): sklearn's ``f1_score`` (used by the tuning objective) scores
    ``pos_label=1`` by default, i.e. the other class of a 0/1 problem. Confirm
    which class is meant to be "positive" here. The formula only looks at the
    top-left 2x2 corner, so it presumes a binary problem.
    """
    model = model_dict[window][stock_ind] if map else model_dict
    train_folds = training_lists[window][stock_ind]
    test_folds = testing_lists[window][stock_ind]

    accuracy_sum = 0
    folds_scored = 0
    y_true_all = []
    y_pred_all = []
    for i in range(ns):
        X_test = test_folds[i][0]
        y_true = test_folds[i][1]
        n_test = X_test.shape[0] if hasattr(X_test, "shape") else len(X_test)
        if n_test == 0:
            # Same policy as the tuning objective: an empty test fold is skipped
            # instead of crashing inside predict().
            continue
        model.fit(X=train_folds[i][0], y=train_folds[i][1])
        y_pred = model.predict(X_test)
        accuracy_sum += accuracy_score(y_true, y_pred)
        folds_scored += 1
        # extend() keeps pooling linear; repeated list concatenation is quadratic.
        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    if folds_scored == 0:
        raise ValueError(
            f"no non-empty test fold for window={window!r}, stock_ind={stock_ind!r}"
        )

    cm = confusion_matrix(y_true=y_true_all, y_pred=y_pred_all)
    hits = cm[0][0]
    # When only one class occurs in both truth and predictions the matrix is
    # 1x1 and there are no off-diagonal cells to read.
    misses = cm[0][1] + cm[1][0] if len(cm) > 1 else 0
    denominator = 2 * hits + misses
    # Named f1_value so the imported sklearn f1_score function is not shadowed.
    f1_value = 2 * hits / denominator if denominator else 0.0
    return (accuracy_sum / folds_scored, f1_value)

In [8]:
def plot_accuracies(accuracies, model_type):
    """Show one figure per stock with accuracy and F1 plotted against window size.

    Parameters
    ----------
    accuracies : mapping
        Nested mapping ``window -> stock index -> (accuracy, f1)``; the keys of
        the outer mapping become the x axis.
    model_type : str
        Text placed at the front of every figure title.
    """
    # (legend label, line colour) for score position 0 and 1 of each entry.
    series_style = (("accuracy score", 'blue'), ("f1_score", 'orange'))

    for stock_ind in range(len(stocks)):
        # Collect each of the two scores across all windows for this stock.
        per_metric = [
            [accuracies[window][stock_ind][metric] for window in accuracies.keys()]
            for metric in range(2)
        ]

        plt.figure(figsize=(6, 4))
        for metric, (legend_text, line_color) in enumerate(series_style):
            plt.plot(
                list(accuracies.keys()),
                per_metric[metric],
                marker='o',
                c=line_color,
                label=legend_text,
            )
        plt.title(f"{model_type} scores for {stocks[stock_ind]}")
        plt.xlabel("Window size")
        plt.ylabel("Score")
        plt.legend()
        plt.tight_layout()
        plt.show()

In [22]:
def objective_with_args(spread, stock_ind):
    """Build an Optuna objective bound to one window size and one stock.

    Parameters
    ----------
    spread : hashable
        Window key used to index ``training_lists`` and ``testing_lists``.
    stock_ind : int
        Stock key inside the chosen window.

    Returns
    -------
    callable
        ``inner(trial)``: samples XGBoost hyper-parameters from ``trial``,
        refits one classifier on each of the ``ns`` folds, pools the test
        predictions and returns ``f1_score`` over the pooled labels. Folds
        whose test features are empty are skipped.
    """
    def inner(trial):
        # Hyper-parameters are sampled in a fixed order; 'device' is constant.
        param = {}
        param['max_depth'] = trial.suggest_int('max_depth', 3, 10)
        param['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.2)
        param['n_estimators'] = trial.suggest_int('n_estimators', 100, 300)
        param['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
        param['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1.0)
        param['reg_alpha'] = trial.suggest_categorical('reg_alpha', [0, 0.1, 0.5, 1.0])
        param['reg_lambda'] = trial.suggest_categorical('reg_lambda', [0.5, 1.0, 2.0, 5.0])
        param['min_split_loss'] = trial.suggest_categorical('min_split_loss', [0, 0.1, 0.3, 0.5])
        param['device'] = 'cuda'

        # A single estimator instance is refitted fold after fold.
        model = XGBClassifier(**param)
        train_folds = training_lists[spread][stock_ind]
        test_folds = testing_lists[spread][stock_ind]

        pooled_true = []
        pooled_pred = []
        for fold in range(ns):
            # Features (and training labels) are moved to cupy arrays before use.
            fold_train = train_folds[fold]
            X_fit = cp.asarray(fold_train[0])
            y_fit = cp.asarray(fold_train[1])
            fold_test = test_folds[fold]
            X_eval = cp.asarray(fold_test[0])
            y_eval = fold_test[1]

            # Only folds that actually contain test rows are fitted and scored.
            if X_eval.shape[0] != 0:
                model.fit(X_fit, y_fit)
                pooled_pred.extend(model.predict(X_eval))
                pooled_true.extend(y_eval)

        return f1_score(pooled_true, pooled_pred)

    return inner


from functools import partial  # not used in the visible cells; kept in case later cells rely on it

# Hyper-parameter search: one Optuna study per (window, stock) pair.
# best_XGB[spread][s_ind] ends up holding an UNFITTED XGBClassifier configured
# with the best parameters found for that pair.
best_XGB = {}
for spread in spread_range:
    best_XGB[spread] = {}
    for s_ind in range(len(stocks)):
        # create_study defaults to an unseeded TPESampler, so repeated runs
        # explore different trials. Seeding it with the notebook-level `rs`
        # makes the search repeatable without changing the sampler type.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=rs),
        )
        study.optimize(objective_with_args(spread, s_ind), n_trials=50)
        best_params = study.best_params
        best_XGB[spread][s_ind] = XGBClassifier(**best_params, tree_method="hist", device="cuda")


[I 2025-04-15 20:24:31,987] A new study created in memory with name: no-name-2a7da001-dfd9-493e-9266-92ea6bb61b7f
[I 2025-04-15 20:24:39,924] Trial 0 finished with value: 0.6936866718628215 and parameters: {'max_depth': 7, 'learning_rate': 0.1677174772140236, 'n_estimators': 300, 'subsample': 0.5974014002665347, 'colsample_bytree': 0.5765292849045573, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'min_split_loss': 0.3}. Best is trial 0 with value: 0.6936866718628215.
[I 2025-04-15 20:24:42,179] Trial 1 finished with value: 0.7085624509033779 and parameters: {'max_depth': 7, 'learning_rate': 0.07210132293908188, 'n_estimators': 176, 'subsample': 0.8239674282468097, 'colsample_bytree': 0.5590444492386522, 'reg_alpha': 0.5, 'reg_lambda': 1.0, 'min_split_loss': 0.5}. Best is trial 1 with value: 0.7085624509033779.
[I 2025-04-15 20:24:43,875] Trial 2 finished with value: 0.7088607594936709 and parameters: {'max_depth': 3, 'learning_rate': 0.18280856282648505, 'n_estimators': 277, 'subsample': 0.5172

In [23]:
# Save the window -> stock index -> tuned classifier mapping to disk.
with open('best_xgb.pkl', 'wb') as out_file:
    pickle.dump(best_XGB, out_file)