In [34]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [35]:
def read_csv(file):
    df = pd.read_csv(file, parse_dates=True, index_col=0)
    return df

In [36]:
def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1-f1_score(y_true, np.round(y_pred))
    return 'f1_err', err

In [37]:
def f2_eval(y_pred, dtrain):
    y_test = dtrain.get_label()
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f2_score = 5*precision*recall/(4*precision+recall)
    err = 1 - f2_score
    return 'f2_err', err

In [40]:
files = ['CompleteWeeklyIndexes.csv','MeanWeeklyImputed.csv', 'KNNWeeklyImputed.csv', 'MovingAverageWeeklyImputed.csv',  'RegressionWeeklyImputed.csv', 'MIIWeeklyImputed.csv']

for file in files:
    df = read_csv(file)

    print(file)
    if "MII" not in file:
        X_test = read_csv('CompleteWeeklyIndexes.csv')
    else:
        test_data = file.split('.')
        X_test = read_csv(test_data[0] + '_test.' + test_data[1])
    y = pd.read_csv('sp500_target_regimes.csv', parse_dates=True)
    y.index = y['date'].values
    y = y[['regime']]
    y = y['regime']=='BEAR'
    y = pd.DataFrame (y, columns = ['regime'])
       

    X_train = df.loc['2000-01-01':'2015-01-01']
    X_test = X_test.loc['2015-01-02':]
    y_train = y.loc['2000-01-01':'2015-01-01']
    y_test = y.loc['2015-01-02':]
    
    print(X_train.shape,y_train.shape, X_test.shape, y_test.shape )
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    
    param_test1 = {
        'max_depth':range(2,6,1),
        'min_child_weight':range(1,6,1),
        'gamma':np.arange(0.1,0.5,0.1),
        'max_delta_step':range(1,10,1),
        'scale_pos_weight':[75, 99, 100, 1000, 10000,np.sqrt(len(y_train[y_train['regime']==False])/len(y_train[y_train['regime']==True])),len(y_train[y_train['regime']==False])/len(y_train[y_train['regime']==True])]
    }
        
    #balance = np.sqrt(len(y_train[y_train['is_transition']==False])/len(y_train[y_train['is_transition']==True]))
    #balance = len(y_train[y_train['is_transition']==False])/len(y_train[y_train['is_transition']==True])
    balance = 1000
    '''
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(objective="binary:logistic",
                                                          learning_rate=0.1, 
                                                          n_estimators=200, 
                                                          random_state=27, 
                                                          min_child_weight=2, 
                                                          gamma=2, 
                                                          max_depth=2,
                                                          max_delta_step=1,
                                                          eval_metric=['auc'], 
                                                          scale_pos_weight=balance), 
                            param_grid = param_test1, 
                            scoring='f1',
                            n_jobs=4,
                            iid=False, 
                            cv=5)

    gsearch1.fit(X_train, y_train)
    print(gsearch1.best_params_, gsearch1.best_score_)
    '''

    xgb_model = xgb.XGBClassifier(objective="binary:logistic", 
                                  disable_default_eval_metric = 1,
                                  #learning_rate=0.1, 
                                  n_estimators=1000,
                                  #random_state=27, 
                                  #min_child_weight=2,#gsearch1.best_params_['min_child_weight'], 
                                  max_delta_step=2,#gsearch1.best_params_['max_delta_step'],
                                  #gamma=0.1,#gsearch1.best_params_['gamma'], 
                                  max_depth=2,#gsearch1.best_params_['max_depth'], 
                                  #eval_metric=['auc'], 
                                  scale_pos_weight=balance)#gsearch1.best_params_['scale_pos_weight'])

    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=f1_eval, verbose=False, early_stopping_rounds=100)
    '''
    y_train_pred = xgb_model.predict(X_train, )
    print("_______________________________________")
    tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f_score = 2*precision*recall/(precision+recall)
    f2_score = 5*precision*recall/(4*precision+recall)
    print("    ","True", "False")
    print("True ", " ", tp, "  ", fp)
    print("False", " ",fn,"  ", tn)
    print("_______________________________________")
    print("F1", f_score)
    print("F2", f2_score)
    print("---------------------------------------")
    '''
    print("_______________________________________")
    y_pred = xgb_model.predict(X_test, ntree_limit=xgb_model.best_ntree_limit)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f_score = 2*precision*recall/(precision+recall)  
    f2_score = 5*precision*recall/(4*precision+recall)
    print("    ","True", "False")
    print("True ", " ", tp, "  ", fp)
    print("False", " ",fn,"  ", tn)
    print("_______________________________________")
    print("F1", f_score)
    print("F2", f2_score)
    print("---------------------------------------")

CompleteWeeklyIndexes.csv
(782, 22) (782, 1) (272, 22) (272, 1)
_______________________________________
     True False
True    46    17
False   19    190
_______________________________________
F1 0.7187500000000001
F2 0.7120743034055728
---------------------------------------
MeanWeeklyImputed.csv
(782, 22) (782, 1) (272, 22) (272, 1)
_______________________________________
     True False
True    36    6
False   29    201
_______________________________________
F1 0.6728971962616823
F2 0.5960264900662252
---------------------------------------
KNNWeeklyImputed.csv
(782, 22) (782, 1) (272, 22) (272, 1)
_______________________________________
     True False
True    36    6
False   29    201
_______________________________________
F1 0.6728971962616823
F2 0.5960264900662252
---------------------------------------
MovingAverageWeeklyImputed.csv
(782, 22) (782, 1) (272, 22) (272, 1)
_______________________________________
     True False
True    3    0
False   62    207
________________

## PCA

In [15]:
files = ['MeanWeeklyImputedPCA.csv', 'KNNWeeklyImputedPCA.csv', 'MovingAverageWeeklyImputedPCA.csv', 'RegressionWeeklyImputedPCA.csv', 'MIIWeeklyImputedPCA.csv']

for file in files:
    df = read_csv(file)
    if 'Weekly' in file:
        print(file)

        test_data = file.split('.')
        X_test = read_csv(test_data[0] + '_test.' + test_data[1])

        y = pd.read_csv('sp500_target_regimes.csv', parse_dates=True)
        y.index = y['date'].values
        y = y[['regime']]
        y = y['regime']=='BEAR'
        
       
    else:
        continue
        #X_test = read_csv('CompleteIndexes.csv')
        #X_test = X_test.loc['2015-01-02':]
    
    X_train = df
    y_train = y.loc['2000-01-01':'2015-01-01']
    y_test = y.loc['2015-01-02':]
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    
    param_test1 = {
        'max_depth':range(2,6,1),
        'min_child_weight':range(1,6,1),
        'gamma':np.arange(0.1,0.5,0.1),
        'max_delta_step':range(1,10,1),
        'scale_pos_weight':[75, 99, 100, np.sqrt(len(y_train[y_train['regime']==False])/len(y_train[y_train['regime']==True])),len(y_train[y_train['is_transition']==False])/len(y_train[y_train['is_transition']==True]),1000]
        }
    
    balance = len(y_train[y_train['regime']==False])/len(y_train[y_train['regime']==True])
    '''
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(objective="binary:logistic",
                                                          disable_default_eval_metric = 1,
                                                          learning_rate=0.1,
                                                          n_estimators=200, 
                                                          random_state=27, 
                                                          min_child_weight=2, 
                                                          gamma=3, 
                                                          max_depth=2, 
                                                          max_delta_step=1,
                                                          #eval_metric=['auc'], 
                                                          scale_pos_weight=balance), 
                            param_grid = param_test1, 
                            scoring='f1',
                            n_jobs=4,
                            iid=False, 
                            cv=5)

    gsearch1.fit(X_train, y_train, eval_metric=f1_eval)
    print(gsearch1.best_params_, gsearch1.best_score_)'''
    
    xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                                  disable_default_eval_metric = 1,
                                  learning_rate=0.1, 
                                  n_estimators=500, 
                                  random_state=27,
                                  #max_delta_step=gsearch1.best_params_['max_delta_step'],
                                  #min_child_weight=gsearch1.best_params_['min_child_weight'],
                                  #gamma=gsearch1.best_params_['gamma'], 
                                  #max_depth=gsearch1.best_params_['max_depth'],  
                                  scale_pos_weight=balance)#gsearch1.best_params_['scale_pos_weight'])

    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False, eval_metric=f1_eval, early_stopping_rounds=100)
    
    '''
    y_train_pred = xgb_model.predict(X_train, ntree_limit=xgb_model.best_ntree_limit)
    print("_______________________________________")
    tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f_score = 2*precision*recall/(precision+recall)  
    print("    ","True", "False")
    print("True ", " ", tp, "  ", fp)
    print("False", " ",fn,"  ", tn)
    print("_______________________________________")
    print("Fscore", f_score)
    print("---------------------------------------")
    '''
    print("_______________________________________")
    y_pred = xgb_model.predict(X_test, ntree_limit=xgb_model.best_ntree_limit)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f_score = 2*precision*recall/(precision+recall)  
    print("    ","True", "False")
    print("True ", " ", tp, "  ", fp)
    print("False", " ",fn,"  ", tn)
    print("_______________________________________")
    print("Fscore", f_score)
    print("---------------------------------------")


MeanWeeklyImputedPCA.csv


KeyError: 'regime'

In [8]:
y = pd.read_csv('sp500_target_regimes.csv', parse_dates=True)
y.index = y['date'].values
y = y[['regime']]

In [9]:
y

Unnamed: 0,regime
2000-01-07,BEAR
2000-01-14,BEAR
2000-01-21,BEAR
2000-01-28,BEAR
2000-02-04,BEAR
2000-02-11,BEAR
2000-02-18,BEAR
2000-02-25,BEAR
2000-03-03,BEAR
2000-03-10,BEAR


In [10]:
teste = y['regime']=='BEAR'

In [28]:
df = pd.DataFrame (teste, columns = ['regime'])

In [29]:
df

Unnamed: 0,regime
2000-01-07,True
2000-01-14,True
2000-01-21,True
2000-01-28,True
2000-02-04,True
2000-02-11,True
2000-02-18,True
2000-02-25,True
2000-03-03,True
2000-03-10,True
