# Predictive Modelling: XGBoost

# Imports

In [118]:
%load_ext autoreload
%autoreload 2

# Pandas and numpy
import pandas as pd
import numpy as np

#
from IPython.display import display, clear_output
import sys
import time

# Libraries for Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from src.visualization.visualize import plot_corr_matrix, plot_multi, plot_norm_dist, plot_feature_importances

# Some custom tools
from src.data.tools import check_for_missing_vals

# Alpaca API
import alpaca_trade_api as tradeapi

# Pickle
import pickle
import os
from pathlib import Path

# To load variables from .env file into system environment
from dotenv import find_dotenv, load_dotenv

from atomm.Indicators import MomentumIndicators
from atomm.DataManager.main import MSDataManager
from atomm.Tools import calc_open_position
from src.visualization.visualize import plot_confusion_matrix
from atomm.Methods import BlockingTimeSeriesSplit, PurgedKFold           


import time, os

# scikit-learn
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# For BayesianHyperparameter Optimization
from src.models.hyperparameter_optimization import search_space, BayesianSearch

# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import matplotlib.gridspec as gridspec
#import matplotlib.style as style
from scipy import stats

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [95]:
# Load environment variables
load_dotenv(find_dotenv())

True

## Defining functions

In [21]:
def run_combinations(
        symbol,
        forecast_horizon,
        input_window_size,
        X_train,
        X_test,
        y_train,
        y_test,
        prices,
        model,
        hyper_optimize=False,
        n_eval=10,
        silent = False,
    
    ):
    """Fit and score one model per (forecast horizon, input window) pair.

    For every pair an estimator is fitted on the ``{indicator}_{window}``
    feature columns and the ``signal_{horizon}`` target of ``symbol`` and then
    evaluated on the test split.

    Parameters
    ----------
    symbol : hashable
        Key selecting the symbol from the feature, target and price frames
        (``frame[symbol][column]``).
    forecast_horizon : sequence of int
        Horizons; they become the rows of the result frames.
    input_window_size : sequence of int
        Lookback windows; they become the columns of the result frames.
    X_train, X_test, y_train, y_test : DataFrame
        Feature and target frames for the train and test split.
    prices : DataFrame
        ``prices[symbol]`` is passed to ``calc_returns`` together with the
        test predictions.
    model : estimator class, factory callable or estimator instance
        A callable is invoked as ``model(**params)``. A non-callable estimator
        instance is cloned and updated with ``set_params(**params)``.
    hyper_optimize : bool, default False
        If true, ``BayesianSearch`` is run for every pair and its third return
        value is used as ``params``.
    n_eval : int, default 10
        Forwarded to ``BayesianSearch`` as ``num_eval``.
    silent : bool, default False
        Suppress the per-pair progress display.

    Returns
    -------
    (df_score, df_returns) : tuple of DataFrame
        Test accuracy and last cumulative strategy return for every pair.

    Notes
    -----
    The feature names are built from the notebook-level ``ti_list``.
    ``KeyboardInterrupt`` / ``SystemExit`` raised inside the loop are turned
    into ``sys.exit(0)``.
    """
    start = time.time()

    # Classes and factories are called; configured estimator instances are not
    # callable and are cloned instead.
    model_is_factory = callable(model)
    search_model = model if model_is_factory else type(model)
    if not model_is_factory:
        from sklearn.base import clone

    df_score = pd.DataFrame(index=forecast_horizon, columns=input_window_size, dtype=float)
    # rename_axis returns a new frame, so the result has to be assigned.
    df_score = df_score.rename_axis('Forecast horizon (days)').rename_axis('Input window (days)', axis=1)
    df_returns = df_score.copy()
    total = len(forecast_horizon)*len(input_window_size)
    cnt = 1
    try:
        # Loop over forecast horizons
        for i, fh in enumerate(forecast_horizon):
            target = f'signal_{fh}'
            y_fit, y_eval = y_train[symbol][target], y_test[symbol][target]
            # Loop over input / lookback windows
            for j, iw in enumerate(input_window_size):
                features = [f'{x}_{iw}' for x in ti_list]
                X_fit, X_eval = X_train[symbol][features], X_test[symbol][features]

                params = {}
                if hyper_optimize:
                    _, _, params = BayesianSearch(
                        search_space(search_model),
                        search_model,
                        X_fit,
                        y_fit,
                        X_eval,
                        y_eval,
                        num_eval=n_eval,
                        silent=True,
                    )
                if model_is_factory:
                    clf = model(**params)
                else:
                    clf = clone(model).set_params(**params)
                clf.fit(X_fit, y_fit)
                y_pred = clf.predict(X_eval)
                # Accuracy on the test split
                a_score = accuracy_score(y_eval, y_pred)
                # Returns generated by the strategy
                returns = calc_returns(y_pred, prices[symbol])
                # .iloc: explicit positional access to the last row
                cum_returns = returns['Cum_Returns_Strat'].iloc[-1]
                df_score.iloc[i, j] = a_score
                df_returns.iloc[i, j] = cum_returns
                if not silent:
                    clear_output(wait=True)
                    display(f'Leg [{cnt} / {total}]; Test Score {a_score}; Test Returns {cum_returns}')
                cnt += 1
    except (KeyboardInterrupt, SystemExit):
        sys.exit(0)
    print(f'Elapsed time {round(time.time()-start, 0)}s.')
    return df_score, df_returns

In [5]:
def avg_model(  
    symbol_list,
    forecast_horizon,                                 
    input_window_size,                                  
    X_train,    
    X_test,    
    y_train,    
    y_test,    
    prices_test,
    model,
    silent = False
):
    """Run ``run_combinations`` for every symbol and average the scores.

    Parameters
    ----------
    symbol_list : iterable
        Symbols to evaluate; must not be empty.
    forecast_horizon, input_window_size, X_train, X_test, y_train, y_test,
    prices_test, model
        Forwarded unchanged to ``run_combinations``.
    silent : bool, default False
        Suppress the running-average progress display.

    Returns
    -------
    (avg_results, results_dict, returns_dict)
        ``avg_results`` is a pandas ``Styler`` (``background_gradient``) of the
        element-wise mean score over all symbols; the dictionaries map each
        symbol to the score / return frame from ``run_combinations``.

    Raises
    ------
    ValueError
        If ``symbol_list`` is empty (there is nothing to average).
    """
    symbols = list(symbol_list)
    if not symbols:
        raise ValueError('symbol_list must contain at least one symbol')

    results_dict, returns_dict = {}, {}
    start = time.time()

    def _styled_average(template):
        # Element-wise mean over the per-symbol score frames collected so far.
        stacked = np.array([frame.values for frame in results_dict.values()])
        return pd.DataFrame(
            stacked.mean(axis=0),
            columns=template.columns, index=template.index).style.background_gradient(cmap='BuGn')

    for cnt, symb in enumerate(symbols, start=1):
        results, returns = run_combinations(
            symb,
            forecast_horizon,
            input_window_size,
            X_train,
            X_test,
            y_train,
            y_test,
            prices_test,
            model=model,
            silent=True
        )
        results_dict[symb] = results
        returns_dict[symb] = returns

        # The running average is only needed for the progress display.
        if not silent:
            clear_output(wait=True)
            display(f'Leg [{cnt}/{len(symbols)}]; Elapsed Time {round(time.time()-start , 0)}s\n',
                    _styled_average(results))

    avg_results = _styled_average(results)
    return avg_results, results_dict, returns_dict

In [51]:
def create_two_class_signal(df, symbol_list, horizons=None):
    """Create binary up/down labels for every symbol and forecast horizon.

    For horizon ``n`` the label is 1 where the relative change of ``Close``
    over the next ``n`` rows is strictly positive and 0 otherwise. Rows whose
    future close is unavailable (the last ``n`` rows) compare as not positive
    and are therefore labelled 0.

    Parameters
    ----------
    df : DataFrame
        ``df[symbol]`` must provide a ``'Close'`` column.
    symbol_list : iterable
        Symbols to label.
    horizons : iterable of int, optional
        Forecast horizons in rows. Defaults to the notebook-level
        ``forecast_horizon``.

    Returns
    -------
    DataFrame
        Columns are ``(symbol, 'signal_{n}')``. The labels are plain arrays,
        so the frame carries a default positional index rather than
        ``df.index``.
    """
    if horizons is None:
        horizons = forecast_horizon

    signals_by_symbol = {}
    for symbol in symbol_list:
        close = df[symbol]['Close']
        labels = {}
        for n in horizons:
            rel_change = (close.shift(-n) - close) / close
            labels[f'signal_{n}'] = np.where(rel_change > 0, 1, 0)
        signals_by_symbol[symbol] = pd.DataFrame(labels)
    # Passing the dict makes the symbols the outer column level.
    return pd.concat(signals_by_symbol, axis=1)

In [57]:
def create_three_class_signal(df, symbol_list, threshold=None, multi=1, horizons=None):
    """Create three-class (-1 / 0 / 1) labels and the derived open positions.

    For horizon ``n`` with threshold ``t`` the label is 1 where the relative
    change of ``Close`` over the next ``n`` rows exceeds ``multi * t / 100``,
    -1 where it is below ``-multi * t / 100`` and 0 otherwise.

    Parameters
    ----------
    df : DataFrame
        ``df[symbol]`` must provide a ``'Close'`` column.
    symbol_list : iterable
        Symbols to label.
    threshold : sequence of float
        One threshold per horizon, divided by 100 before the comparison.
        Required despite the ``None`` default.
    multi : float, default 1
        Multiplier applied to every threshold.
    horizons : iterable of int, optional
        Forecast horizons in rows. Defaults to the notebook-level
        ``forecast_horizon``.

    Returns
    -------
    (df_signals, df_op) : tuple of DataFrame
        Columns ``(symbol, 'signal_{n}')`` and ``(symbol, 'open_position_{n}')``;
        the latter holds ``calc_open_position(signal)``.

    Raises
    ------
    TypeError
        If ``threshold`` is ``None``.
    ValueError
        If the numbers of thresholds and horizons differ.
    """
    if horizons is None:
        horizons = forecast_horizon
    if threshold is None:
        raise TypeError('threshold must be a sequence with one value per forecast horizon')
    horizons, thresholds = list(horizons), list(threshold)
    if len(horizons) != len(thresholds):
        # zip() would silently drop the unmatched horizons.
        raise ValueError(
            f'got {len(thresholds)} threshold(s) for {len(horizons)} forecast horizon(s)')

    sign_dic, op_dic = {}, {}
    for symbol in symbol_list:
        close = df[symbol]['Close']
        signals, positions = {}, {}
        for n, thresh in zip(horizons, thresholds):
            rel_change = (close.shift(-n) - close) / close
            cutoff = multi * thresh / 100
            signal = np.where(rel_change > cutoff, 1, 0) + np.where(rel_change < -cutoff, -1, 0)
            signals[f'signal_{n}'] = signal
            positions[f'open_position_{n}'] = calc_open_position(signal)
        sign_dic[symbol] = pd.DataFrame(signals)
        op_dic[symbol] = pd.DataFrame(positions)
    df_signals = pd.concat(sign_dic, axis=1)
    df_op = pd.concat(op_dic, axis=1)
    return df_signals, df_op

In [6]:
def calc_returns(signals, prices):
    """Return per-row and cumulative returns of a signal-driven strategy.

    The result starts from the percentage change of ``prices['Close']`` and
    adds three columns: ``Signal_Strat`` (``calc_open_position(signals)``),
    ``Cum_Returns_Strat`` (running sum of position times close return) and
    ``Cum_Returns_BH`` (running sum of the close return alone).
    """
    daily = prices[['Close']].pct_change()
    daily['Signal_Strat'] = calc_open_position(signals)
    return daily.assign(
        Cum_Returns_Strat=lambda frame: (frame['Signal_Strat'] * frame['Close']).cumsum(),
        Cum_Returns_BH=lambda frame: frame['Close'].cumsum(),
    )

## Loading the data

In [None]:
from src.data.tools import check_for_missing_vals

In [195]:
import datetime

# Forecast horizons (label look-ahead) and indicator lookback windows, in rows.
forecast_horizon = [1, 3, 5, 7, 10, 15, 20, 25, 30]
input_window_size = [3, 5, 7, 10, 15, 20, 25, 30]
# One threshold per forecast horizon; create_three_class_signal divides them by 100.
threshold_list = [0.63, 1.15, 1.49, 1.79, 2.14, 2.65, 3.08, 3.48, 3.94]
assert len(threshold_list) == len(forecast_horizon), 'need one threshold per forecast horizon'
# Technical indicators used as features. An earlier variant of this list
# additionally contained 'macd', 'bbu' and 'bbl' and had no 'sma'.
ti_list = ['sma', 'rsi', 'stoc', 'roc', 'ema', 'atr', 'adx', 'cci', 'williamsr', 'stocd']

# Symbols to process. (Deriving them from df_moments is not possible here:
# that frame is only built further down.)
symbol_list = ['GPS']
startd = datetime.datetime(2002, 1, 29)
endd = datetime.datetime(2012, 7, 30)

In [39]:
# Fetch the price history of every symbol between startd and endd and combine
# the per-symbol frames column-wise, keyed by symbol on the outer column level.
# NOTE(review): `dm` is not defined in any cell shown here; presumably an
# MSDataManager instance (imported above) created elsewhere -- confirm.
start = time.time()
# NOTE(review): `df` is not referenced again in the visible cells.
df = pd.DataFrame(pd.MultiIndex.from_product([[], []]))
dic = {} 
#symb_list = dm.ReturnIndexConstituents('DJIA')+['DJIA']
for symb in symbol_list:
    #print(symb)
    symb_df = dm.ReturnData(
        symb,
        start_date=startd,
        end_date=endd,
        limit=None)
    dic[symb] = symb_df
    
prices = pd.concat(dic.values(), keys=dic.keys(), axis=1)
print(f'Elapsed time: {int(time.time()-start)}s')

Elapsed time: 0s


In [115]:
check_for_missing_vals(prices)

No missing values found in dataframe


In [218]:

def calcIndicators(data, symbol, lookback_windows, ti_list):
    """Compute the requested technical indicators for one symbol.

    Parameters
    ----------
    data : DataFrame
        Price frame; ``data[symbol]`` is handed to ``MomentumIndicators``.
    symbol : hashable
        Symbol to select from ``data``.
    lookback_windows : iterable of int
        Window lengths; every indicator is computed once per window.
    ti_list : iterable of str
        Indicator keys. ``'stocd'`` is the 3-period EMA of the ``'stoc'``
        column of the same window, so ``'stoc'`` must precede it.

    Returns
    -------
    DataFrame
        Indexed like ``data`` with one ``{indicator}_{window}`` column per
        combination.

    Raises
    ------
    ValueError
        If an indicator key is unknown or ``'stocd'`` is requested without a
        preceding ``'stoc'``.
    """
    # Shynkevich et al 2017 also has SMA, but not Bollinger Bands, MACD
    # NOTE(review): 'bbu' and 'bbl' both map to calcBB -- confirm how the upper
    # and lower band are meant to be told apart.
    method_names = {
        'sma': 'calcSMA',
        'macd': 'calcMACD',
        'rsi': 'calcRSI',
        'stoc': 'calcSTOC',
        'roc': 'calcROC',
        'bbu': 'calcBB',
        'bbl': 'calcBB',
        'ema': 'calcEMA',
        'atr': 'calcATR',
        'adx': 'calcADX',
        'cci': 'calcCCI',
        'williamsr': 'calcWR',
    }

    indicators = list(ti_list)
    unknown = [ti for ti in indicators if ti != 'stocd' and ti not in method_names]
    if unknown:
        raise ValueError(f'Unknown technical indicator(s): {unknown}')
    if 'stocd' in indicators and 'stoc' not in indicators[:indicators.index('stocd')]:
        raise ValueError("'stocd' is derived from 'stoc'; list 'stoc' before 'stocd'")

    mi = MomentumIndicators(data[symbol])
    df = pd.DataFrame(index=data.index)
    # As in the original cell, every indicator key gets its own
    # MomentumIndicators instance.
    ti_dict = {
        key: getattr(MomentumIndicators(data[symbol]), name)
        for key, name in method_names.items()
    }
    # Use the windows passed in by the caller, not the notebook-level global.
    for n in lookback_windows:
        for ti in indicators:
            if ti == 'stocd':
                df[f'{ti}_{n}'] = mi.calcEMA(3, df[f'stoc_{n}'])
            else:
                df[f'{ti}_{n}'] = ti_dict[ti](n)
    return df

In [219]:
# Compute the indicator features symbol by symbol, showing a progress line
# after each one, then combine the per-symbol frames column-wise with the
# symbol as the outer column level.
cnt = 1
start = time.time()
total = len(symbol_list)
mom_dict = {} 
for symbol in symbol_list:
    res = calcIndicators(prices, symbol, input_window_size, ti_list)
    mom_dict[symbol] = res
    clear_output(wait=True)
    display(f'Leg [{cnt} / {total}]; Last symbol: {symbol}; Elapsed time: {round(time.time()-start, 0)}s')
    cnt += 1
df_moments = pd.concat(mom_dict.values(), keys=mom_dict.keys(), axis=1)

'Leg [1 / 1]; Last symbol: GPS; Elapsed time: 19.0s'

In [220]:
# Drop the first max(input_window_size) rows and the last max(forecast_horizon)
# rows (presumably indicator warm-up and rows without a complete look-ahead).
# NOTE(review): the cell overwrites df_moments with a slice of itself, so
# re-running it trims further rows each time.
df_moments = df_moments.iloc[max(input_window_size):-max(forecast_horizon)]

In [221]:
df_moments.head()

Unnamed: 0_level_0,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS,GPS
Unnamed: 0_level_1,sma_3,rsi_3,stoc_3,roc_3,ema_3,atr_3,adx_3,cci_3,williamsr_3,stocd_3,sma_5,rsi_5,stoc_5,roc_5,ema_5,atr_5,adx_5,cci_5,williamsr_5,stocd_5,sma_7,rsi_7,stoc_7,roc_7,ema_7,atr_7,adx_7,cci_7,williamsr_7,stocd_7,sma_10,rsi_10,stoc_10,roc_10,ema_10,atr_10,adx_10,cci_10,williamsr_10,stocd_10,sma_15,rsi_15,stoc_15,roc_15,ema_15,atr_15,adx_15,cci_15,williamsr_15,stocd_15,sma_20,rsi_20,stoc_20,roc_20,ema_20,atr_20,adx_20,cci_20,williamsr_20,stocd_20,sma_25,rsi_25,stoc_25,roc_25,ema_25,atr_25,adx_25,cci_25,williamsr_25,stocd_25,sma_30,rsi_30,stoc_30,roc_30,ema_30,atr_30,adx_30,cci_30,williamsr_30,stocd_30
Epoch,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2
2002-03-13 00:00:00+00:00,14.053333,54.545455,0.0,0.002876,13.952834,0.463333,39.632587,-88.460636,43.055598,40.318194,13.916,82.905986,68.75,0.058422,13.813967,0.504,37.814499,23.676842,24.800034,81.424892,13.66,89.847722,84.496138,0.126817,13.66768,0.547143,35.36705,49.536699,21.985848,89.644508,13.204,77.580089,90.825694,0.125,13.501579,0.51,32.777065,73.43619,12.863089,92.969109,13.093333,55.149501,90.825694,0.046512,13.351301,0.605333,31.761456,97.890445,12.863089,92.965951,13.0235,55.841122,90.825694,0.07722,13.293404,0.6895,33.365919,115.203268,12.863089,92.948018,13.0108,52.211539,90.825694,0.034099,13.280151,0.6684,36.170173,137.310635,12.863089,90.715986,13.152333,50.372209,81.481484,0.006494,13.288051,0.655,39.399338,107.657642,17.647057,81.589264
2002-03-14 00:00:00+00:00,14.136667,69.230781,100.0,0.017668,14.176417,0.466667,43.686597,97.202193,10.526316,70.159097,14.094,84.496127,100.0,0.065877,14.009311,0.534,41.123202,129.85878,8.510634,90.712446,13.88,89.690729,100.0,0.119751,13.85076,0.521429,38.46044,91.224755,5.095538,94.822254,13.447,92.932866,100.0,0.203007,13.664928,0.537,35.497709,89.103113,3.124998,96.484555,13.182667,60.806445,100.0,0.102603,13.482388,0.63,33.75885,111.630149,3.041823,96.482976,13.0765,56.14849,100.0,0.07946,13.398794,0.699,34.734937,138.701218,3.041823,96.474009,13.0708,57.309941,100.0,0.116279,13.366293,0.6644,37.060206,162.299301,3.041823,95.357993,13.167667,51.845907,100.0,0.032999,13.35979,0.659,39.937484,144.123687,3.041823,90.794632
2002-03-15 00:00:00+00:00,14.176667,57.692303,51.111243,0.008535,14.178208,0.51,46.490518,57.576305,43.902413,60.63517,14.148,62.162197,51.111243,0.019411,14.066208,0.518,43.846947,92.47142,35.999966,70.911845,14.022857,77.173939,75.280959,0.075873,13.93307,0.531428,41.168939,99.304568,23.529393,85.051607,13.658,85.762736,89.108942,0.174814,13.758578,0.565,38.006205,81.005231,14.00777,92.796748,13.300667,65.337963,90.946528,0.142627,13.56959,0.618667,35.700798,99.259007,13.382889,93.714752,13.146,58.383599,90.946528,0.108679,13.473195,0.656,36.121556,122.981663,13.382889,93.710268,13.114,55.252919,90.946528,0.082443,13.428886,0.6616,38.001301,147.143435,13.382889,93.15226,13.160333,49.099839,90.946528,-0.015278,13.412706,0.656,40.541159,147.853361,13.382889,90.87058
2002-03-18 00:00:00+00:00,14.28,70.666721,36.363715,0.022222,14.219104,0.536667,47.892478,79.996567,36.842059,48.499443,14.17,55.789513,68.889011,0.007774,14.130805,0.462,45.662778,67.451847,34.146322,69.900428,14.13,73.584932,71.428683,0.055514,14.014803,0.511428,43.200314,88.22778,27.999973,78.240145,13.846,84.558846,90.90913,0.151858,13.849745,0.544,40.058611,79.867037,16.568036,91.852939,13.388667,62.406024,94.238707,0.102009,13.655891,0.581333,37.400003,88.586932,10.408914,93.976729,13.2415,62.042877,94.238707,0.154656,13.548129,0.6345,37.376116,109.368575,10.408914,93.974488,13.1764,57.831329,94.238707,0.122835,13.492818,0.6548,38.870004,131.208658,10.408914,93.695484,13.165667,50.666666,94.238707,0.011348,13.467371,0.651667,41.105887,147.212588,10.408914,92.554643
2002-03-19 00:00:00+00:00,14.38,70.270316,100.0,0.020833,14.459552,0.516666,54.094341,100.0,23.255801,74.249721,14.298,74.615376,100.0,0.045519,14.320537,0.516,50.540586,162.797165,16.949146,84.950214,14.242857,74.233154,100.0,0.056794,14.186102,0.52,47.474287,171.500983,14.705872,89.120072,14.03,84.328382,100.0,0.143079,14.004337,0.52,43.738174,141.131876,10.050243,95.92647,13.465333,61.16505,100.0,0.084871,13.786405,0.566667,40.262028,106.534238,6.557372,96.988365,13.3635,64.734301,100.0,0.199021,13.657831,0.6345,39.558982,131.486532,6.557372,96.987244,13.2384,57.788947,100.0,0.117871,13.585678,0.656,40.518173,152.592623,6.557372,96.847742,13.21,55.678907,100.0,0.099476,13.546895,0.648667,42.343972,176.14752,6.557372,96.277322


In [222]:
# Report missing values in the trimmed feature frame.
# NOTE(review): the recorded output appears to list one missing value in the
# 8th column (cci_3 in the column order displayed above) -- confirm and handle
# it before fitting models that cannot deal with NaN.
check_for_missing_vals(df_moments)

GPS
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Two class labels

In [86]:
# Build the binary labels and trim them with the same positional slice that is
# applied to df_moments, so both have the same number of rows.
y_2c = create_two_class_signal(prices, symbol_list)
y_2c = y_2c[max(input_window_size):-max(forecast_horizon)]
y_2c.shape

(2586, 9)

In [117]:
# Share of each class per forecast horizon, averaged over all symbols.
# reindex() keeps a class that never occurs at 0 instead of raising KeyError.
class_labels = [0, 1]
class_shares = []
for sym in y_2c.columns.get_level_values(0).unique():
    class_shares.append([
        y_2c[sym][col].value_counts(normalize=True).reindex(class_labels, fill_value=0.0).tolist()
        for col in y_2c[sym].columns
    ])
pd.DataFrame(
    np.mean(np.array(class_shares), axis=0).T,
    index=pd.Index(['0', '1']),
    columns=y_2c.columns.get_level_values(1).unique(),
)

Unnamed: 0,signal_1,signal_3,signal_5,signal_7,signal_10,signal_15,signal_20,signal_25,signal_30
0,0.496906,0.481052,0.469451,0.465197,0.477572,0.476025,0.472931,0.464037,0.465197
1,0.503094,0.518948,0.530549,0.534803,0.522428,0.523975,0.527069,0.535963,0.534803


# Three class labels

In [85]:
#%%timeit

# NOTE(review): sym_list is not used in the visible cells.
sym_list = symbol_list[:]

#dicti = {}
y_3c, y_3op = create_three_class_signal(prices, symbol_list, threshold=threshold_list)
# NOTE(review): same call with the same arguments as the line above;
# y_3c_test / y_3op_test are not referenced in the visible cells.
y_3c_test, y_3op_test = create_three_class_signal(prices, symbol_list, threshold=threshold_list)

# Remove the first max(input_window_size) and last max(forecast_horizon) entries
# (both are 30 with the settings above), matching the trim of df_moments.
y_3c = y_3c[max(input_window_size):-max(forecast_horizon)]
y_3op = y_3op[max(input_window_size):-max(forecast_horizon)]
y_3c.shape

(2586, 9)

In [97]:
# Share of each class per forecast horizon, averaged over all symbols.
# reindex() keeps a class that never occurs at 0 instead of raising KeyError.
class_labels = [-1, 0, 1]
class_shares = []
for sym in y_3c.columns.get_level_values(0).unique():
    class_shares.append([
        y_3c[sym][col].value_counts(normalize=True).reindex(class_labels, fill_value=0.0).tolist()
        for col in y_3c[sym].columns
    ])
pd.DataFrame(
    np.mean(np.array(class_shares), axis=0).T,
    index=pd.Index(['-1', '0', '1']),
    columns=y_3c.columns.get_level_values(1).unique(),
)

Unnamed: 0,signal_1,signal_3,signal_5,signal_7,signal_10,signal_15,signal_20,signal_25,signal_30
-1,0.347641,0.324053,0.323279,0.318252,0.323279,0.315159,0.312838,0.311678,0.312452
0,0.289637,0.305878,0.295437,0.298144,0.29505,0.304718,0.312838,0.311678,0.303558
1,0.362722,0.37007,0.381284,0.383604,0.381671,0.380124,0.374323,0.376643,0.383991


# Scaling the features

In [171]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [223]:
# Feature scaler; MinMaxScaler is kept as a commented-out alternative.
#scale = MinMaxScaler()
scale = StandardScaler()

In [224]:
# NOTE(review): the scaler is fitted on every row of df_moments, i.e. before
# the train/test split further down, so statistics of the test period enter the
# scaling of the training features. Consider fitting on the training rows only.
scaled = scale.fit_transform(df_moments)

In [225]:
scaled.shape

(2586, 80)

In [226]:
# Rebuild a labelled frame from the scaled array. Pass the row index as well,
# otherwise the timestamps of df_moments are replaced by 0..n-1.
X_scaled = pd.DataFrame(data=scaled, index=df_moments.index, columns=df_moments.columns)
#X_scaled = X

# Train-Test Split

In [89]:
# NOTE(review): run top-to-bottom, this replaces the standardized features
# built in the previous section with the raw indicators, so every split below
# that reads X_scaled receives unscaled data -- confirm this is intended.
X_scaled = df_moments

In [227]:
# Fraction of the rows, taken from the end of the sample, held out for testing
# (0.13 gives an 87/13 train/test split).
test_p = .13

In [228]:
# Scaled, three-class
# Chronological split. Despite its name, `test_size` is the number of leading
# rows used for training.
test_size = int((1 - test_p) * X_scaled.shape[0])
X_train, X_test, y_train, y_test = X_scaled[:test_size], X_scaled[test_size:], y_3c[:test_size], y_3c[test_size:]
# Features and labels were trimmed at both ends, prices were not: select the
# price rows of the feature frame first so the split covers the same dates.
prices_aligned = prices.loc[df_moments.index]
assert len(prices_aligned) == X_scaled.shape[0], 'prices and features are misaligned'
prices_train, prices_test = prices_aligned[:test_size], prices_aligned[test_size:]

In [141]:
# Unscaled, two-class
# (X_scaled only holds unscaled features if it was set to df_moments above.)
# Chronological split. Despite its name, `test_size` is the number of leading
# rows used for training.
test_size = int((1 - test_p) * X_scaled.shape[0])
X_train, X_test, y_train, y_test = X_scaled[:test_size], X_scaled[test_size:], y_2c[:test_size], y_2c[test_size:]
# `X` and `djia` are not defined in this notebook; use the feature frame and
# `prices`, restricted to the rows of the (trimmed) feature frame.
prices_aligned = prices.loc[df_moments.index]
assert len(prices_aligned) == X_scaled.shape[0], 'prices and features are misaligned'
prices_train, prices_test = prices_aligned[:test_size], prices_aligned[test_size:]

In [142]:
X_test.index.min()

Timestamp('2008-12-09 00:00:00+0000', tz='UTC')

In [231]:
# Scaled, two-class
# Chronological split. Despite its name, `test_size` is the number of leading
# rows used for training.
test_size = int((1 - test_p) * X_scaled.shape[0])
X_train, X_test, y_train, y_test = X_scaled[:test_size], X_scaled[test_size:], y_2c[:test_size], y_2c[test_size:]
# `X` and `djia` are not defined in this notebook; use the feature frame and
# `prices`, restricted to the rows of the (trimmed) feature frame.
prices_aligned = prices.loc[df_moments.index]
assert len(prices_aligned) == X_scaled.shape[0], 'prices and features are misaligned'
prices_train, prices_test = prices_aligned[:test_size], prices_aligned[test_size:]

In [None]:
#test_size = test_p
#X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_3c, test_size=test_size, random_state=101)

# Model

In [229]:
# Symbol and window/horizon used by the single-combination cells below:
# `n` selects both the `{indicator}_{n}` features and the `signal_{n}` target.
symbol = 'GPS'
n = 15
# set up cross validation splits
# NOTE(review): tscv / btscv are not referenced again in the visible cells;
# presumably picked up by the hyper-parameter search -- confirm.
tscv = TimeSeriesSplit(n_splits=5)
btscv = BlockingTimeSeriesSplit(n_splits=5)
#ppcv = PurgedKFold(n_splits=5)

## Single lookback/lookahead combination

In [232]:
# Baseline SVC on the `n`-day features against the `n`-day signal.
svc_features = [f'{x}_{n}' for x in ti_list]
svc_target = f'signal_{n}'

clf = SVC(C=1000, gamma=1E-5)
clf.fit(X_train[symbol][svc_features], y_train[symbol][svc_target])

y_pred_svc = clf.predict(X_test[symbol][svc_features])
y_true_svc = y_test[symbol][svc_target]
print(classification_report(y_true_svc, y_pred_svc))
print(confusion_matrix(y_true_svc, y_pred_svc))
#plot_feature_importances(svc.coef_,
#                         X_scaled[symbol][[f'{x}_{n}' for x in ti_list]].columns, 
#                         model='SVC1', top_count=100)

              precision    recall  f1-score   support

           0       0.48      0.71      0.57       383
           1       0.65      0.42      0.51       504

    accuracy                           0.54       887
   macro avg       0.57      0.56      0.54       887
weighted avg       0.58      0.54      0.54       887

[[271 112]
 [294 210]]


In [143]:
# XGBoost with default hyper-parameters on the same features/target as the SVC
# above; `clf` is rebound to this model. The fitted estimator is the cell output.
clf = XGBClassifier(n_jobs=-1)
clf.fit(X_train[symbol][[f'{x}_{n}' for x in ti_list]], y_train[symbol][f'signal_{n}'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [144]:
# Evaluate the fitted XGBoost model on the test split.
xgb_features = [f'{x}_{n}' for x in ti_list]
y_true_xgb = y_test[symbol][f'signal_{n}']
y_pred_xgb = clf.predict(X_test[symbol][xgb_features])
print(classification_report(y_true_xgb, y_pred_xgb))
print(confusion_matrix(y_true_xgb, y_pred_xgb))
#plot_feature_importances(svc.coef_,
#                         X_scaled[symbol][[f'{x}_{n}' for x in ti_list]].columns, 
#                         model='SVC1', top_count=100)

              precision    recall  f1-score   support

           0       0.48      0.59      0.53       383
           1       0.62      0.51      0.56       504

    accuracy                           0.54       887
   macro avg       0.55      0.55      0.54       887
weighted avg       0.56      0.54      0.54       887

[[227 156]
 [249 255]]


## All combinations

In [62]:
# Evaluate every (forecast horizon, input window) pair for `symbol`, tuning the
# model for each pair with 5 search evaluations, then plot the accuracy grid.
# NOTE: `clf` is rebound here to the XGBClassifier class itself, not to a
# fitted model.
# The recorded output ends in `SystemExit: 0`, which run_combinations raises
# when it catches KeyboardInterrupt/SystemExit (presumably a manual interrupt).
clf = XGBClassifier
score_xgb, returns_xgb = run_combinations(
    symbol=symbol,
    forecast_horizon=forecast_horizon,
    input_window_size=input_window_size,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    prices=prices_test,
    model=clf,
    hyper_optimize=True,
    n_eval=5
)

plot_corr_matrix(score_xgb, mask_upper=False, show_annot=True, figsize=(8, 8))

 60%|██████    | 3/5 [00:02<00:01,  1.25it/s, best loss: -0.5381818181818182]


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Hyperparameter Optimization: Bayesian Optimization, SVC

In [243]:
# Bayesian search for an SVC: n1-day indicator features, n2-day signal target.
n1, n2 = 15, 10

model = SVC
svc_search_features = [f'{x}_{n1}' for x in ti_list]
svc_search_target = f'signal_{n2}'
bsearch_svc = BayesianSearch(
    search_space(model),
    model,
    X_train[symbol][svc_search_features],
    y_train[symbol][svc_search_target],
    X_test[symbol][svc_search_features],
    y_test[symbol][svc_search_target],
    num_eval=50,
)

100%|██████████| 50/50 [05:59<00:00,  7.19s/it, best loss: -0.5611307420494699]
##### Results #####
Score best parameters:  -0.5611307420494699
Best parameters:  {'C': 1411.7804441517403, 'cv': 'tscv', 'gamma': 0.04319429344053992, 'model': <class 'sklearn.svm._classes.SVC'>, 'normalize': 0, 'scale': 0}
Test Score:  0.4791431792559188
Parameter combinations evaluated:  50
Time elapsed:  361.3564758300781


## Hyperparameter Optimization: Bayesian Optimization

In [235]:
# Bayesian search for XGBoost on the n-day features and n-day signal.
n = 15

model = XGBClassifier
xgb_search_features = [f'{x}_{n}' for x in ti_list]
xgb_search_target = f'signal_{n}'
bsearch_xgb = BayesianSearch(
    search_space(XGBClassifier),
    model,
    X_train[symbol][xgb_search_features],
    y_train[symbol][xgb_search_target],
    X_test[symbol][xgb_search_features],
    y_test[symbol][xgb_search_target],
    num_eval=100,
)

100%|██████████| 100/100 [01:00<00:00,  1.65it/s, best loss: -0.6029411764705882]
##### Results #####
Score best parameters:  -0.6029411764705882
Best parameters:  {'booster': 'gbtree', 'colsample_bytree': 0.15420999612919842, 'cv': 'btscv', 'gamma': 10, 'learning_rate': 19.555489854868977, 'max_depth': 151, 'model': <class 'xgboost.sklearn.XGBClassifier'>, 'n_estimators': 239, 'n_jobs': -1, 'subsample': 1}
Test Score:  0.5682074408117249
Parameter combinations evaluated:  100
Time elapsed:  60.92389416694641


In [241]:
# Bayesian search for a random forest on the n-day features and n-day signal.
# NOTE(review): the result is stored under the name `bsearch_svm` although the
# model is a RandomForestClassifier, and the recorded run stops with
# "TypeError: cross_val_score() got an unexpected keyword argument 'average'".
n, num_eval = 15, 15

model = RandomForestClassifier
rf_search_features = [f'{x}_{n}' for x in ti_list]
rf_search_target = f'signal_{n}'
bsearch_svm = BayesianSearch(
    search_space(model),
    model,
    X_train[symbol][rf_search_features],
    y_train[symbol][rf_search_target],
    X_test[symbol][rf_search_features],
    y_test[symbol][rf_search_target],
    num_eval=num_eval,
)

  0%|          | 0/15 [00:00<?, ?it/s, best loss: ?]


TypeError: cross_val_score() got an unexpected keyword argument 'average'

In [240]:
# Bayesian search for k-nearest neighbours on the n-day features and n-day signal.
# NOTE(review): the result is stored under the name `bsearch_svm` although the
# model is a KNeighborsClassifier.
n, num_eval = 15, 500

model = KNeighborsClassifier
knn_search_features = [f'{x}_{n}' for x in ti_list]
knn_search_target = f'signal_{n}'
bsearch_svm = BayesianSearch(
    search_space(model),
    model,
    X_train[symbol][knn_search_features],
    y_train[symbol][knn_search_target],
    X_test[symbol][knn_search_features],
    y_test[symbol][knn_search_target],
    num_eval=num_eval,
)

100%|██████████| 500/500 [01:35<00:00,  5.23it/s, best loss: -0.5856324830817282]
##### Results #####
Score best parameters:  -0.5856324830817282
Best parameters:  {'cv': 'cv', 'model': <class 'sklearn.neighbors._classification.KNeighborsClassifier'>, 'n_jobs': -1, 'n_neighbors': 20, 'normalize': 0, 'p': 9, 'scale': 0}
Test Score:  0.4633596392333709
Parameter combinations evaluated:  500
Time elapsed:  95.87441897392273


## Running on all 50 stocks

In [158]:
# Average the accuracy grid of a random forest with fixed hyper-parameters over
# every symbol in symbol_list.
# NOTE(review): model_2a is a configured estimator instance, whereas the
# XGBoost run above passes the estimator class -- confirm that run_combinations
# (called by avg_model) accepts an instance.
# NOTE(review): symbol_list is set to ['GPS'] in the configuration cell, while
# the recorded output reports 50 legs.
best_params = {'bootstrap': False, 'criterion': 'gini', 'max_depth': 218, 'max_features': 1, 'min_samples_leaf': 19, 'n_estimators': 423}
model_2a = RandomForestClassifier(n_jobs=-1, **best_params)
avg, _, _ = avg_model(  
    symbol_list,
    forecast_horizon,                                 
    input_window_size,                                  
    X_train,    
    X_test,    
    y_train,    
    y_test,    
    prices_test,
    model=model_2a,
    silent = False
)

'Leg [50/50]; Elapsed Time 2475.0s\n'

Unnamed: 0,3,5,7,10,15,20,25,30
1,0.507666,0.504838,0.504157,0.505792,0.504872,0.507462,0.506099,0.504055
3,0.511073,0.508245,0.510903,0.506133,0.507632,0.507802,0.508177,0.504974
5,0.514378,0.511652,0.511141,0.513629,0.516082,0.516695,0.51891,0.516082
7,0.516082,0.516491,0.516593,0.515639,0.519625,0.520307,0.519216,0.521193
10,0.517172,0.515945,0.515264,0.513254,0.518228,0.519932,0.519387,0.517581
15,0.524497,0.520852,0.522044,0.522215,0.524259,0.531687,0.525894,0.522964
20,0.52678,0.527325,0.529063,0.527802,0.529744,0.525213,0.521942,0.521772
25,0.528382,0.527121,0.530801,0.535128,0.532164,0.524872,0.523952,0.526269
30,0.535945,0.537922,0.537206,0.537956,0.538024,0.531857,0.532913,0.535196


In [159]:
avg

Unnamed: 0,3,5,7,10,15,20,25,30
1,0.413629,0.41339,0.411721,0.412368,0.409506,0.413663,0.414037,0.412129
3,0.406065,0.406814,0.409165,0.407836,0.406951,0.410971,0.411959,0.414514
5,0.402078,0.405486,0.406917,0.405077,0.408893,0.407939,0.406337,0.408382
7,0.40644,0.406746,0.40586,0.402351,0.406678,0.40954,0.406303,0.413799
10,0.407836,0.407359,0.407019,0.40385,0.408041,0.409915,0.408825,0.405451
15,0.402589,0.408484,0.400818,0.402249,0.405554,0.406269,0.39816,0.397683
20,0.396491,0.398705,0.398637,0.394719,0.397922,0.397002,0.39925,0.396559
25,0.397615,0.394617,0.398126,0.396934,0.400443,0.401193,0.399761,0.396899
30,0.396048,0.394991,0.395707,0.392811,0.397683,0.406269,0.401704,0.398535
