In [2]:
import numpy as np
import pandas as pd
import pickle as pkl
from purged_cv import *
import matplotlib.pyplot as plt

## Extract Trading Dates

In [3]:
with open("./Data/stocks.pkl", "rb") as file: 
    unlabeled = pkl.load(file)

In [4]:
trading_days = pd.to_datetime(unlabeled["DlyCalDt"]).unique()

## Read in labeled Data

In [5]:
with open("./Data/stocks_labeled.pkl", "rb") as file:
    data = pkl.load(file)

In [6]:
#encode unique identifier per stock using Ticker and Permno number. Later encoded as categorial variable
data["id"] = data["ticker"] + data["PERMNO"].astype(str)

## Hyperparameter Training Lightgbm

In [7]:
data_cv = data[(data["Date"]>= "2016-01-01") & (data["Date"]<= "2017-01-01")]

In [8]:
#drop forward looking columns from Features
X_cv = data_cv.drop(columns=["ticker","PERMNO", "Date", "End Time", "Return of Label", "Label", "Tomorrow_Return", "id"])
y_cv = data_cv["Label"]

# Lightgbm expects class labels from 0 to num_class
mapping = {-1: 0, 0: 1, 1: 2}
y_mv = y_cv.map(mapping).astype(int)

y_lgbm = y_mv.values

In [15]:
X_cv.columns

Index(['DlyVol', 'DlyClose', 'DlyLow', 'DlyHigh', 'DlyOpen', 'SMA_20',
       'SMA_50', 'EMA_20', 'EMA_50', 'ret_1d', 'ret_5d', 'ret_10d', 'ret_20d',
       'MACD', 'MACD_Signal', 'HL_range', 'Gap_OC', 'Gap_CC', 'RSI', 'Return',
       'Lag_Return_1', 'Lag_Return_2', 'Lag_Return_3'],
      dtype='object')

In [14]:
y_mv.value_counts()

Label
2    34366
0    29339
1      943
Name: count, dtype: int64

In [9]:
X_cv.describe()

Unnamed: 0,DlyVol,DlyClose,DlyLow,DlyHigh,DlyOpen,SMA_20,SMA_50,EMA_20,EMA_50,ret_1d,...,MACD,MACD_Signal,HL_range,Gap_OC,Gap_CC,RSI,Return,Lag_Return_1,Lag_Return_2,Lag_Return_3
count,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,...,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0,64648.0
mean,1931247.0,44.216548,43.550537,44.820024,44.171408,44.166211,44.084291,44.163086,44.049775,0.002104,...,0.040072,0.044038,0.036613,0.000523,0.002104,51.176179,0.000674,0.000478,0.000313,-0.000385
std,6891542.0,62.83745,62.049341,63.567695,62.826213,62.803247,62.657005,62.762918,62.506175,0.089494,...,1.36594,1.276629,0.033554,0.089704,0.089494,13.435076,0.047977,0.025971,0.022933,0.025939
min,342.0,0.0672,0.06,0.076,0.0715,0.09705,0.150952,0.110629,0.158293,-0.801945,...,-58.47094,-52.357484,0.0,-0.799941,-0.801945,5.857022,-1.619208,-1.619208,-0.691244,-1.619208
25%,126923.5,12.46,12.25,12.68725,12.4775,12.444375,12.474675,12.457452,12.498887,-0.021127,...,-0.231213,-0.213845,0.017348,-0.005159,-0.021127,41.668752,-0.021353,-0.008577,-0.008939,-0.009595
50%,420825.0,28.2,27.72,28.68,28.17,28.2105,28.290388,28.246786,28.375921,0.00491,...,0.031016,0.031592,0.027397,0.0,0.00491,51.365556,0.004898,0.00073,0.000377,0.0
75%,1489733.0,55.55,54.57,56.369925,55.4125,55.273625,55.05915,55.327284,55.201436,0.022236,...,0.367622,0.349191,0.044444,0.005457,0.022236,60.77075,0.021993,0.009852,0.009654,0.009155
max,375088600.0,838.09,832.4,841.29,841.02,829.754,805.5564,821.799709,804.503844,11.578354,...,27.196777,24.848194,1.39083,14.211868,11.578354,98.549751,2.531977,2.531977,0.46646,2.531977


In [28]:
# categorial_feats = ["ticker"]
# for c in categorial_feats:
#     X_cv[c] = X_cv[c].astype('category')

In [11]:
#function to create Hyperparameter combinations
from itertools import product
def param_grid_dicts(param_dict):
    keys = list(param_dict.keys())
    values_product = product(*[param_dict[k] for k in keys])
    for combo in values_product:
        yield dict(zip(keys, combo))

In [12]:
#iterable Hyperparameter Combinations
cv_params = {'max_depth': [3, 5, 7, 10, 15, -1], 
             'n_estimators': [100, 200, 300],
             'learning_rate': [0.01, 0.05, 0.1]
             }  

params_comb = param_grid_dicts(cv_params)

In [13]:
import lightgbm as lgb
from purged_cv import purged_kfold_indices
from sklearn.metrics import f1_score, log_loss

n_samples = X_cv.shape[0]
n_splits = 5

folds = purged_kfold_indices(n_samples = n_samples , n_splits= n_splits, purge_size=20)

results = []

for params in params_comb:

    weighted_f1_scores = []
    log_loss_cv = [] 

    for fold, (train_idx, test_idx) in enumerate(folds, 1):
        
        # Extract training and test data for this fold.
        X_t, y_t = X_cv.iloc[train_idx , : ], y_lgbm[train_idx]
        X_val, y_val = X_cv.iloc[test_idx, : ], y_lgbm[test_idx]

        # Fit the model using Lightgbm.
        lgbm = lgb.LGBMClassifier(
                            objective='multiclass',
                            num_class=3,
                            **params,
                            num_leaves=32,
                            random_state=42,
                            )

        #fit model using early stopping.
        lgbm.fit(X_t,
                 y_t,
                 eval_set=[(X_val, y_val)],
                 eval_metric='multi_logloss',
                 callbacks=[lgb.log_evaluation(period=1)])

        #compute 
        y_pred = lgbm.predict(X_val)
        weighted_f1 = f1_score(y_pred=y_pred, y_true=y_val, average="weighted")
        weighted_f1_scores.append(weighted_f1)

        preds = lgbm.predict_proba(X_val)
        log_loss_cv.append(log_loss(y_val, preds))


    mean_F1 = np.mean(weighted_f1_scores)
    mean_log_loss = np.mean(log_loss_cv)
    
    result_entry = {
        **params,
        "weighted F1" : mean_F1,
        "avg confidence" : mean_log_loss,
    }
    results.append(result_entry)

    print(f"mean F1 Score: {mean_F1},  mean Log Loss: {mean_log_loss}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001838 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5865
[LightGBM] [Info] Number of data points in the train set: 51698, number of used features: 23
[LightGBM] [Info] Start training from score -0.789696
[LightGBM] [Info] Start training from score -4.235771
[LightGBM] [Info] Start training from score -0.631961
[1]	valid_0's multi_logloss: 0.757708
[2]	valid_0's multi_logloss: 0.757562
[3]	valid_0's multi_logloss: 0.75744
[4]	valid_0's multi_logloss: 0.757301
[5]	valid_0's multi_logloss: 0.757167
[6]	valid_0's multi_logloss: 0.757036
[7]	valid_0's multi_logloss: 0.75691
[8]	valid_0's multi_logloss: 0.756808
[9]	valid_0's multi_logloss: 0.756689
[10]	valid_0's multi_logloss: 0.756578
[11]	valid_0's multi_logloss: 0.75647
[12]	valid_0's multi_logloss: 0.756372
[13]	valid_0's multi_logloss: 0.756268
[14]	valid_0's multi_logloss: 0.756163
[15]	valid_0's 

KeyboardInterrupt: 

In [15]:
df_results = pd.DataFrame(results)

In [16]:
df_results

Unnamed: 0,max_depth,n_estimators,learning_rate,weighted F1,avg confidence
0,7,300,0.1,"[0.5455566222298995, 0.5505277327353084, 0.539...","[0.7460877087909162, 0.7387722792961876, 0.746..."
1,10,100,0.01,"[0.4409400033161395, 0.4637879378879299, 0.455...","[0.746298263664628, 0.7413807515271472, 0.7435..."
2,10,100,0.05,"[0.5188123136520922, 0.5294563218836832, 0.519...","[0.7388425247699817, 0.7343976926891153, 0.737..."
3,10,100,0.1,"[0.5294861294779922, 0.545005413065035, 0.5298...","[0.739135298310351, 0.7337157197021171, 0.7377..."
4,10,200,0.01,"[0.4837223002422972, 0.5083877803772416, 0.490...","[0.7422596620449757, 0.7370815550034608, 0.740..."
5,10,200,0.05,"[0.5319322764073258, 0.5435046332628763, 0.531...","[0.737559802460464, 0.7329931358660217, 0.7375..."
6,10,200,0.1,"[0.5386805300765702, 0.5462766572724054, 0.540...","[0.7415736625757827, 0.7367933407227608, 0.740..."
7,10,300,0.01,"[0.5013833215011695, 0.5189201803922634, 0.504...","[0.7408492074097844, 0.7353762708236778, 0.738..."
8,10,300,0.05,"[0.5381738191720914, 0.5470698731764836, 0.536...","[0.7379062000382096, 0.7336060564062857, 0.738..."
9,10,300,0.1,"[0.5410220147539746, 0.550589973178757, 0.5424...","[0.7460286986185255, 0.740313740413049, 0.7443..."
