In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import TimeSeriesSplit

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv(file):
    """Load a CSV file into a DataFrame.

    The first column becomes the index, and ``parse_dates=True`` asks pandas
    to try parsing that index as dates.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(file, index_col=0, parse_dates=True)

In [3]:
def f1_eval(y_pred, dtrain):
    """Custom evaluation metric returning the F1 error, ``1 - F1``.

    Parameters
    ----------
    y_pred : predicted scores; they are rounded to the nearest integer
        before being compared with the labels.
    dtrain : object exposing ``get_label()``, which supplies the true labels.

    Returns
    -------
    tuple
        ``('f1_err', value)`` where ``value`` is ``1 - f1_score(...)``, so
        lower is better.
    """
    labels = dtrain.get_label()
    hard_predictions = np.round(y_pred)
    f1_error = 1 - f1_score(labels, hard_predictions)
    return 'f1_err', f1_error

## XGBoost With PCA and With Feature Engineering

In [4]:
files = ['MeanWeeklyImputed', 'KNNWeeklyImputed', 'MovingAverageWeeklyImputed', 'RegressionWeeklyImputed', 'MIIWeeklyImputed']


def _load_bear_labels(path='sp500_target_regimes.csv'):
    """Return a date-indexed 0/1 Series: 1 where ``regime == 'BEAR'``, else 0.

    The ``date`` column is used as the index and parsed as dates, so the
    string slices applied to the result select by date rather than by plain
    string comparison.
    """
    regimes = pd.read_csv(path, index_col='date', parse_dates=True)
    return (regimes['regime'] == 'BEAR').astype(int)


def _print_report(title, y_true, y_hat):
    """Print a confusion table and the F1 error (``1 - F1``) for one split.

    Table layout: rows are the predicted class, columns the actual class,
    with the positive (BEAR) class listed first.
    """
    # labels=[0, 1] keeps the matrix 2x2 even when a split holds one class only.
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
    # f1_score returns 0 (not NaN) when precision/recall are undefined.
    f1_error = 1 - f1_score(y_true, y_hat)
    print(title)
    print("    ","True", "False")
    print("True ", " ", tp, "  ", fp)
    print("False", " ",fn,"  ", tn)
    print("_______________________________________")
    # The value is 1 - F1 (0.0 means a perfect fit), so label it as an error.
    print("F1 error (1 - F1)", f1_error)
    print("---------------------------------------")


# The targets, the search grid and the CV splitter do not depend on the
# imputation variant, so they are built once instead of once per file.
y = _load_bear_labels()
y_train = y.loc['2000-01-01':'2015-01-01']
y_test = y.loc['2015-01-02':]

param_test1 = {
    'learning_rate':[0.01, 0.1],
    'n_estimators':[500, 1000],
    'max_depth':range(2,6,1),
    'subsample':np.arange(0.8,1,0.1),
    'min_child_weight':range(1,6,2),
    'colsample_bytree':np.arange(0.3,1,0.2),
}

# Expanding-window splits: every validation fold lies after its training fold.
tscv = TimeSeriesSplit(n_splits=3)

# NOTE(review): the previous version computed sqrt(#negatives / #positives)
# into an unused `balance` variable (presumably meant for `scale_pos_weight`)
# and built an unused xgb.DMatrix. Neither influenced the model, so both are
# dropped here; confirm whether class weighting was actually intended.

for file in files:
    df = read_csv(file+'_PCA_training_FE.csv')
    print(file)
    X_test = read_csv(file + '_PCA_test_FE.csv')
    X_train = df

    # Features and labels are paired by position, so the row counts must agree.
    assert len(X_train) == len(y_train), "training features/labels differ in length"
    assert len(X_test) == len(y_test), "test features/labels differ in length"

    # `iid` is no longer passed: GridSearchCV dropped that argument in
    # scikit-learn 0.24, and TimeSeriesSplit test folds are equal-sized, so the
    # fold weighting it controlled does not change the mean score here.
    # `eval_metric` is no longer passed to fit(): without an `eval_set` nothing
    # is evaluated during training, and newer xgboost rejects the argument.
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(objective="binary:logistic",
                                                          disable_default_eval_metric = 1,
                                                          learning_rate=0.01,
                                                          n_estimators=100,
                                                          random_state=27,
                                                          min_child_weight=1,
                                                          gamma=1,
                                                          max_depth=2,
                                                          subsample=0.8,
                                                          colsample_bytree=0.3,
                                                          ),
                            param_grid = param_test1,
                            scoring='f1',
                            n_jobs=4,
                            cv=tscv)

    gsearch1.fit(X_train, y_train)
    print(gsearch1.best_params_, gsearch1.best_score_)

    # With the default refit=True, GridSearchCV has already retrained the
    # best configuration on the full training set; reuse it instead of
    # rebuilding the same estimator by hand. This also keeps the test set out
    # of the training call entirely.
    xgb_model = gsearch1.best_estimator_

    y_train_pred = xgb_model.predict(X_train)
    print("_______________________________________")
    _print_report("______________Training_________________", y_train, y_train_pred)

    y_pred = xgb_model.predict(X_test)
    _print_report("_______________Testing_________________", y_test, y_pred)


MeanWeeklyImputed
{'colsample_bytree': 0.90000000000000013, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 0.80000000000000004} 0.374907952872
_______________________________________
______________Training_________________
     True False
True    372    0
False   0    410
_______________________________________
F1 score 0.0
---------------------------------------
_______________Testing_________________
     True False
True    12    109
False   53    98
_______________________________________
F1 score 0.870967741935
---------------------------------------
KNNWeeklyImputed
{'colsample_bytree': 0.69999999999999996, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 500, 'subsample': 0.90000000000000002} 0.450394893433
_______________________________________
______________Training_________________
     True False
True    372    0
False   0    410
_______________________________________
F1 score 0.0
-----------------