In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import TimeSeriesSplit

#from matplotlib import pyplot
import matplotlib.pyplot as plt

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv(file):
    """Read a CSV file into a DataFrame indexed by its first column.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. The first column becomes the index and, because
        ``parse_dates=True``, pandas tries to parse that index as dates.
    """
    return pd.read_csv(file, index_col=0, parse_dates=True)

In [3]:
def f1_eval(y_pred, dtrain):
    """Custom evaluation metric: F1 of rounded predictions against ``dtrain``'s labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores; they are rounded with ``np.round`` before scoring.
    dtrain : object exposing ``get_label()``
        Presumably an ``xgboost.DMatrix`` -- confirm against the caller.

    Returns
    -------
    tuple
        ``('f1_err', value)`` where ``value`` is the result of ``f1_score``.
    """
    labels = dtrain.get_label()
    hard_predictions = np.round(y_pred)
    # NOTE(review): the metric is named 'f1_err' but the value is the F1 score
    # itself (higher is better), not an error such as 1 - F1. Confirm that any
    # caller using this for early stopping maximises it.
    return 'f1_err', f1_score(labels, hard_predictions)

In [4]:
def print_confusion_matrix(tn, fp, fn, tp):
    """Print the four confusion-matrix counts as a small text table.

    The row labelled "True" shows ``tp`` then ``fp``; the row labelled "False"
    shows ``fn`` then ``tn``. Two horizontal rule lines are printed afterwards.

    Parameters
    ----------
    tn, fp, fn, tp : printable
        True-negative, false-positive, false-negative and true-positive counts.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    table_rows = (
        ("    ", "True", "False"),
        ("True ", " ", tp, "  ", fp),
        ("False", " ", fn, "  ", tn),
    )
    for cells in table_rows:
        # print() joins the cells with single spaces, which produces the layout.
        print(*cells)

    for rule in ("_______________________________________",
                 "---------------------------------------"):
        print(rule)

## XGBoost


In [5]:
# Single train/evaluate run of the XGBoost classifier on Kalman dataset 0.

# Paths use forward slashes only so they resolve on every OS: a backslash in
# these literals is an invalid escape sequence and becomes a literal filename
# character on POSIX systems.
X_train = read_csv('output/CompleteIndexesWeeklyTrainKalman_0.csv')
X_test = read_csv('output/CompleteIndexesWeeklyTestKalman_0.csv').loc['2015-01-02':'2020-03-13']

# Binary target: True where the 'regime' column equals 'BEAR'.
y = pd.read_csv('input/sp500_target_regimes.csv', parse_dates=True)
y.index = y['date'].values
y = (y['regime'] == 'BEAR').to_frame()

y_train = y.loc['2000-01-01':'2015-01-01']
# Bound the test labels to the same window as X_test; an open-ended slice would
# silently pick up any later rows in the target file and break row alignment.
y_test = y.loc['2015-01-02':'2020-03-13']

# Features and labels are matched by position, so fail early with a clear
# message if the row counts ever diverge.
assert len(X_train) == len(y_train), "X_train and y_train must have the same number of rows"
assert len(X_test) == len(y_test), "X_test and y_test must have the same number of rows"

best_params = {'colsample_bytree': 0.7,
               'gamma': 0,
               'learning_rate': 0.01,
               'max_delta_step': 1,
               'max_depth': 1,
               'min_child_weight': 3,
               'n_estimators': 100,
               'reg_alpha': 0.1,
               'scale_pos_weight': 10,
               'subsample': 0.5}

# Every key in best_params is an XGBClassifier keyword argument, so the dict is
# unpacked directly instead of repeating each key by hand.
xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                              eval_metric='auc',
                              **best_params)
model = xgb_model.fit(X_train, y_train)


def _binary_scores(y_true, y_hat):
    """Return ``(tn, fp, fn, tp, precision, recall, f1)`` for binary predictions."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*precision*recall/(precision+recall)
    return tn, fp, fn, tp, precision, recall, f1


y_train_pred = model.predict(X_train)
print("_______________________________________")
print("______________Training_________________")
tn, fp, fn, tp, precision, recall, f_score = _binary_scores(y_train, y_train_pred)
print("F1 score", f_score)
print_confusion_matrix(tn, fp, fn, tp)
print("_______________Testing_________________")
y_pred = model.predict(X_test)
tn, fp, fn, tp, precision, recall, f_score = _binary_scores(y_test, y_pred)
print("F1 score", f_score)
print_confusion_matrix(tn, fp, fn, tp)
print("---------------------------------------")

_______________________________________
______________Training_________________
F1 score 0.9346733668341709
     True False
True    372    52
False   0    358
_______________________________________
---------------------------------------
_______________Testing_________________
F1 score 0.8288288288288288
     True False
True    46    0
False   19    207
_______________________________________
---------------------------------------
---------------------------------------


In [None]:
def run_experiment_multiple_times(best_params, n=30):
    """Fit and score one XGBoost classifier per Kalman dataset and aggregate the results.

    For each ``i`` in ``range(n)`` the function reads
    ``output/CompleteIndexesWeeklyTrainKalman_<i>.csv`` and
    ``output/CompleteIndexesWeeklyTestKalman_<i>.csv``, keeps a fixed list of
    feature columns, fits an ``xgb.XGBClassifier`` and scores its predictions on
    the test rows between 2015-01-02 and 2020-03-06. The target is
    ``regime == 'BEAR'`` read from ``input/sp500_target_regimes.csv``.
    The training features are used as read from the file; presumably they
    already cover 2000-01-01..2015-01-01 like the training labels -- TODO confirm.

    The per-round F1 and the standard deviation of all F1 values are printed.

    Parameters
    ----------
    best_params : dict
        Must contain ``learning_rate``, ``n_estimators``, ``min_child_weight``,
        ``gamma``, ``max_delta_step``, ``max_depth``, ``subsample``,
        ``scale_pos_weight``, ``colsample_bytree`` and ``reg_alpha``. Other keys
        are ignored.
    n : int, default 30
        Number of datasets to run (file suffixes ``0 .. n-1``).

    Returns
    -------
    dict
        ``f1``, ``tn``, ``fp``, ``fn``, ``tp``: means over the ``n`` runs.
        ``best_f1``, ``best_model``, ``best_tn``, ``best_fp``, ``best_fn``,
        ``best_tp``: values from the run with the highest F1.
        ``auc``: always 0; it is never computed here and is kept so the
        returned keys stay unchanged.

    Raises
    ------
    ValueError
        If ``n`` is less than 1 (the averages would be undefined).
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    feature_columns = ['trade_Trade Policy EMV Fraction', 'vix_Volume', 'infectious_daily_infect_emv_index', 'GPR',
                       'sp500_Volume', 'Overall EMV Tracker', 'sp500_Close', 'vix_Close', 'dji_Close', 'dji_Volume',
                       'ndx_Close', 'ndx_Volume', 'n225_Close', 'n225_Volume', 'ftse_Close', 'ftse_Volume',
                       'hsi_Close', 'hsi_Volume', 'n100_Close', 'n100_Volume',
                       'trade_US Trade Policy Uncertainty', 'trade_Japanese Trade Policy Uncertainty']
    param_keys = ('learning_rate', 'n_estimators', 'min_child_weight', 'gamma', 'max_delta_step',
                  'max_depth', 'subsample', 'scale_pos_weight', 'colsample_bytree', 'reg_alpha')
    # Indexing (rather than .get) keeps the KeyError for a missing hyper-parameter.
    model_kwargs = {key: best_params[key] for key in param_keys}

    # The target does not depend on the round, so it is read once.
    # Forward slashes only: a backslash in the literal is an invalid escape
    # sequence and a literal filename character on POSIX systems.
    y = pd.read_csv('input/sp500_target_regimes.csv', parse_dates=True)
    y.index = y['date'].values
    y = (y['regime'] == 'BEAR').to_frame()
    y_train = y.loc['2000-01-01':'2015-01-01']
    y_test = y.loc['2015-01-02':'2020-03-06']

    result_dict = {'f1': 0, 'tn': 0, 'fp': 0, 'fn': 0, 'tp': 0, 'auc': 0,
                   'best_f1': 0, 'best_model': None,
                   'best_tn': 0, 'best_fp': 0, 'best_fn': 0, 'best_tp': 0}
    list_f1 = []
    for i in range(n):
        X_train = read_csv('output/CompleteIndexesWeeklyTrainKalman_' + str(i) + '.csv')[feature_columns]
        X_test = read_csv('output/CompleteIndexesWeeklyTestKalman_' + str(i) + '.csv').loc[
            '2015-01-02':'2020-03-06', feature_columns]

        xgb_model = xgb.XGBClassifier(objective="binary:logistic", **model_kwargs)
        model = xgb_model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        # Guard the denominators: a run with no predicted (or no actual)
        # positives scores 0 instead of NaN, so one degenerate run cannot turn
        # the mean F1 into NaN or break the best-model comparison.
        predicted_pos = tp + fp
        actual_pos = tp + fn
        precision = tp / predicted_pos if predicted_pos else 0.0
        recall = tp / actual_pos if actual_pos else 0.0
        pr_sum = precision + recall
        f_score = 2 * precision * recall / pr_sum if pr_sum else 0.0

        print('Round', i, "f1", f_score)
        list_f1.append(f_score)
        result_dict['f1'] += f_score
        result_dict['tn'] += tn
        result_dict['fp'] += fp
        result_dict['fn'] += fn
        result_dict['tp'] += tp

        # Always record the first run so best_model is never None on return,
        # even when every run scores 0.
        if result_dict['best_model'] is None or f_score > result_dict['best_f1']:
            result_dict['best_f1'] = f_score
            result_dict['best_model'] = model
            result_dict['best_tn'] = tn
            result_dict['best_fp'] = fp
            result_dict['best_fn'] = fn
            result_dict['best_tp'] = tp

    print("STD f1", np.std(list_f1, axis=0))
    # Turn the accumulated sums into per-run means.
    for key in ('f1', 'tn', 'fp', 'fn', 'tp'):
        result_dict[key] = result_dict[key] / n

    return result_dict