In [1]:
import numpy as np
import pandas as pd
import ta
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from ta import add_all_ta_features
from ta import momentum
from ta.utils import dropna

In [2]:
#pip install xgboost

In [3]:
pip install autofeat

Note: you may need to restart the kernel to use updated packages.


In [4]:
def get_data(company_symbol):
    return pd.read_csv("dataset/"+company_symbol+".csv")

In [5]:
data = get_data("AMZN")
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,5/15/1997,2.437500,2.500000,1.927083,1.958333,1.958333,72156000
1,5/16/1997,1.968750,1.979167,1.708333,1.729167,1.729167,14700000
2,5/19/1997,1.760417,1.770833,1.625000,1.708333,1.708333,6106800
3,5/20/1997,1.729167,1.750000,1.635417,1.635417,1.635417,5467200
4,5/21/1997,1.635417,1.645833,1.375000,1.427083,1.427083,18853200
...,...,...,...,...,...,...,...
6152,10/25/2021,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000
6153,10/26/2021,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300
6154,10/27/2021,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200
6155,10/28/2021,3402.100098,3479.000000,3386.000000,3446.570068,3446.570068,5708700


In [6]:
data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [7]:
data.isna().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [8]:
data[data.columns[data.isna().any()]]

0
1
2
3
4
...
6152
6153
6154
6155
6156


In [9]:
cond1 = data.Date.isna() # aapl['Date'].isna()
cond2 = data.Date.str.contains(r'^\s*$', na=False)
cond3 = data.Date == ''

data_checked = data.assign(cond1= cond1, cond2= cond2, cond3= cond3)
print (data_checked)

            Date         Open         High          Low        Close  \
0      5/15/1997     2.437500     2.500000     1.927083     1.958333   
1      5/16/1997     1.968750     1.979167     1.708333     1.729167   
2      5/19/1997     1.760417     1.770833     1.625000     1.708333   
3      5/20/1997     1.729167     1.750000     1.635417     1.635417   
4      5/21/1997     1.635417     1.645833     1.375000     1.427083   
...          ...          ...          ...          ...          ...   
6152  10/25/2021  3335.000000  3347.800049  3297.699951  3320.370117   
6153  10/26/2021  3349.510010  3416.120117  3343.979980  3376.070068   
6154  10/27/2021  3388.000000  3437.000000  3371.449951  3392.489990   
6155  10/28/2021  3402.100098  3479.000000  3386.000000  3446.570068   
6156  10/29/2021  3300.020020  3374.820068  3273.320068  3372.429932   

        Adj Close    Volume  cond1  cond2  cond3  
0        1.958333  72156000  False  False  False  
1        1.729167  14700000  Fals

In [10]:
def simple_moving_average_5(close):
    return close.rolling(5, min_periods=1).mean()

In [11]:
def simple_moving_average_10(close):
    return close.rolling(10, min_periods=1).mean()

In [12]:
def stochastic_k(high, low, close):
    return (((close-low.rolling(14).min())/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [13]:
def stochastic_d(stochastic_k):
    return stochastic_k.rolling(3,min_periods=1).mean()

In [14]:
def larry_williams_r(high, low, close):
    return (((high.rolling(14).max()-close)/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [15]:
def rate_of_change(close):
    return ((close-close.shift(12))/close.shift(12))

In [16]:
def price_volume_trend(close, volume):
    return (((close-close.shift(1))/close.shift(1))*volume)

In [17]:
def accumulation_distribution_oscillator(high, low, close):
    return((high-close.shift(1))/(high-low))

In [18]:
def weighted_moving_average_10(close):
    return ((10*close+9*close.shift(1)+8*close.shift(2)+7*close.shift(3)
             +6*close.shift(4)+5*close.shift(5)+4*close.shift(6)+3*close.shift(7)
             +2*close.shift(8)+close.shift(9))/(10+9+8+7+6+5+4+3+2+1))

In [19]:
def exponential_moving_average_12(close):
    return (close.ewm(span=12, adjust=False).mean())

In [20]:
def exponential_moving_average_26(close):
    return (close.ewm(span=26, adjust=False).mean())

In [21]:
def moving_average_convergence_divergence(ema_12, ema_26):
    return (ema_12 - ema_26)

In [22]:
def momentum(close):
    return ((close/close.shift(14))*100)

In [23]:
def change(close):
    return (close.shift(-1) - close)

In [24]:
def direction(change):
    return (np.where(change>0, 1, 0))

In [25]:
def relative_strength_index(direction):
    return (100-(100/(1+(direction.rolling(14).sum()/14)/(14-direction.rolling(14).sum())/14)))

In [26]:
def channel_commodity_index(df, ndays): 
    df['TP'] = (df['High'] + df['Low'] + df['Close']) / 3 
    df['sma'] = df['TP'].rolling(ndays).mean()
    df['mad'] = df['TP'].rolling(ndays).apply(lambda x: pd.Series(x).mad())
    df['CCI'] = (df['TP'] - df['sma']) / (0.015 * df['mad']) 
    return df['CCI']

In [27]:
def disparity_index(close):
    return (((close - close.rolling(14).mean())/(close.rolling(14).mean()))*100)

In [28]:
def get_adx(high, low, close, lookback):
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    tr1 = pd.DataFrame(high - low)
    tr2 = pd.DataFrame(abs(high - close.shift(1)))
    tr3 = pd.DataFrame(abs(low - close.shift(1)))
    frames = [tr1, tr2, tr3]
    tr = pd.concat(frames, axis = 1, join = 'inner').max(axis = 1)
    atr = tr.rolling(lookback).mean()
    
    plus_di = 100 * (plus_dm.ewm(alpha = 1/lookback).mean() / atr)
    minus_di = abs(100 * (minus_dm.ewm(alpha = 1/lookback).mean() / atr))
    dx = (abs(plus_di - minus_di) / abs(plus_di + minus_di)) * 100
    adx = ((dx.shift(1) * (lookback - 1)) + dx) / lookback
    adx_smooth = adx.ewm(alpha = 1/lookback).mean()
    return adx_smooth

In [29]:
def aroon(close):
    return (ta.trend.AroonIndicator(close, 25, True).aroon_indicator())

In [30]:
def compute_all_indicators(data):
    data['SMA5'] = simple_moving_average_5(data['Close'])
    data['SMA10'] = simple_moving_average_10(data['Close'])
    data['StochasticK'] = stochastic_k(data['High'], data['Low'], data['Close'])
    data['StochasticD'] = stochastic_d(data['StochasticK'])
    data['LarryWilliamsR'] = larry_williams_r(data['High'], data['Low'], data['Close'])
    data['ROC'] = rate_of_change(data['Close'])
    data['PVT'] = price_volume_trend(data['Close'], data['Volume'])
    data['ADO'] = accumulation_distribution_oscillator(data['High'], data['Low'], data['Close'])
    data['WMA10'] = weighted_moving_average_10(data['Close'])
    data['EMA12'] = exponential_moving_average_12(data['Close'])
    data['EMA26'] = exponential_moving_average_26(data['Close'])
    data['MACD'] = moving_average_convergence_divergence(data['EMA12'], data['EMA26'])
    data['Momentum'] = momentum(data['Close'])
    data['Change'] = change(data['Close'])
    data['Direction'] = direction(data['Change'])
    data['RSI'] = relative_strength_index(data['Direction'])
    data['CCI'] = channel_commodity_index(data, 14)
    data['DI'] = disparity_index(data['Close'])
    data['ADX'] = get_adx(data['High'], data['Low'], data['Close'], 14)
    data['Aroon'] = aroon(data['Close'])    
    return data

In [31]:
complete_data = compute_all_indicators(data)
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
0,5/15/1997,2.437500,2.500000,1.927083,1.958333,1.958333,72156000,1.958333,1.958333,,...,-0.229166,0,,2.128472,,,,,,0.0
1,5/16/1997,1.968750,1.979167,1.708333,1.729167,1.729167,14700000,1.843750,1.843750,,...,-0.020834,0,,1.805556,,,,,,-4.0
2,5/19/1997,1.760417,1.770833,1.625000,1.708333,1.708333,6106800,1.798611,1.798611,,...,-0.072916,0,,1.701389,,,,,,-8.0
3,5/20/1997,1.729167,1.750000,1.635417,1.635417,1.635417,5467200,1.757812,1.757812,,...,-0.208334,0,,1.673611,,,,,,-12.0
4,5/21/1997,1.635417,1.645833,1.375000,1.427083,1.427083,18853200,1.691667,1.691667,,...,-0.031250,0,,1.482639,,,,,,-16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6152,10/25/2021,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,...,55.699951,1,0.675676,3321.956706,3336.762172,62.706724,-15.740435,-0.538225,22.462941,40.0
6153,10/26/2021,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,...,16.419922,1,0.675676,3378.723388,3346.552647,58.910708,36.406218,0.884063,21.835137,40.0
6154,10/27/2021,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,...,54.080078,1,0.910010,3400.313314,3353.450265,58.707812,53.216142,1.180228,20.421165,40.0
6155,10/28/2021,3402.100098,3479.000000,3386.000000,3446.570068,3446.570068,5708700,3374.210058,3402.103027,86.638952,...,-74.140136,0,0.910010,3437.190023,3363.291219,58.017979,84.914831,2.448428,19.442353,40.0


In [32]:
complete_data = complete_data.dropna()

In [33]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
14,6/5/1997,1.416667,1.541667,1.375000,1.541667,1.541667,5672400,1.489584,1.496354,34.375033,...,0.114583,1,0.282646,1.486111,1.540303,0.086788,-41.627398,0.558137,50.226281,-20.0
15,6/6/1997,1.515625,1.708333,1.510417,1.656250,1.656250,7807200,1.520834,1.522396,75.000055,...,0.031250,1,0.381194,1.625000,1.527406,0.070206,92.673942,8.400292,43.640597,-20.0
16,6/9/1997,1.656250,1.708333,1.656250,1.687500,1.687500,2352000,1.556250,1.541146,85.714286,...,-0.104167,0,0.381194,1.684028,1.526166,0.068612,153.386632,10.553252,30.868253,-20.0
17,6/10/1997,1.708333,1.708333,1.531250,1.583333,1.583333,5458800,1.577083,1.541146,68.421026,...,-0.041666,0,0.381194,1.607639,1.521453,0.062553,91.852991,3.982389,24.801838,-20.0
18,6/11/1997,1.593750,1.604167,1.531250,1.541667,1.541667,1188000,1.602083,1.542188,57.894870,...,0.062500,1,0.507614,1.559028,1.526910,0.063244,33.856459,0.704759,24.430750,-20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,10/22/2021,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800,3415.302002,3356.330029,51.820291,...,-15.179932,0,0.675676,3365.563395,3330.055501,68.455299,34.580127,0.129382,23.568887,40.0
6152,10/25/2021,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,...,55.699951,1,0.675676,3321.956706,3336.762172,62.706724,-15.740435,-0.538225,22.462941,40.0
6153,10/26/2021,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,...,16.419922,1,0.675676,3378.723388,3346.552647,58.910708,36.406218,0.884063,21.835137,40.0
6154,10/27/2021,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,...,54.080078,1,0.910010,3400.313314,3353.450265,58.707812,53.216142,1.180228,20.421165,40.0


In [34]:
complete_data = complete_data.drop(['TP', 'sma', 'mad', 'Change'], axis=1)

In [35]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,EMA12,EMA26,MACD,Momentum,Direction,RSI,CCI,DI,ADX,Aroon
14,6/5/1997,1.416667,1.541667,1.375000,1.541667,1.541667,5672400,1.489584,1.496354,34.375033,...,1.550516,1.668338,-0.117822,78.723435,1,0.282646,-41.627398,0.558137,50.226281,-20.0
15,6/6/1997,1.515625,1.708333,1.510417,1.656250,1.656250,7807200,1.520834,1.522396,75.000055,...,1.566783,1.667443,-0.100660,95.783114,1,0.381194,92.673942,8.400292,43.640597,-20.0
16,6/9/1997,1.656250,1.708333,1.656250,1.687500,1.687500,2352000,1.556250,1.541146,85.714286,...,1.585355,1.668929,-0.083574,98.780507,0,0.381194,153.386632,10.553252,30.868253,-20.0
17,6/10/1997,1.708333,1.708333,1.531250,1.583333,1.583333,5458800,1.577083,1.541146,68.421026,...,1.585044,1.662588,-0.077545,96.815247,0,0.381194,91.852991,3.982389,24.801838,-20.0
18,6/11/1997,1.593750,1.604167,1.531250,1.541667,1.541667,1188000,1.602083,1.542188,57.894870,...,1.578370,1.653631,-0.075261,108.029246,1,0.507614,33.856459,0.704759,24.430750,-20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,10/22/2021,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800,3415.302002,3356.330029,51.820291,...,3363.883172,3356.629660,7.253513,104.569908,0,0.675676,34.580127,0.129382,23.568887,40.0
6152,10/25/2021,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,...,3357.188856,3353.943768,3.245089,103.085070,1,0.675676,-15.740435,-0.538225,22.462941,40.0
6153,10/26/2021,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,...,3360.093658,3355.582753,4.510905,103.496619,1,0.675676,36.406218,0.884063,21.835137,40.0
6154,10/27/2021,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,...,3365.077709,3358.316622,6.761087,102.727085,1,0.910010,53.216142,1.180228,20.421165,40.0


In [36]:
#corrMatrix = complete_data.corr()
#print(corrMatrix)

In [37]:
#sn.heatmap(corrMatrix, annot=False)
#plt.show()

In [38]:
#corr_pairs = corrMatrix.unstack()
#corr_pairs

In [39]:
#sorted_pairs = corr_pairs.sort_values(kind="quicksort")
#sorted_pairs

In [40]:
#strong_pairs = sorted_pairs[abs(sorted_pairs) > 0.5]

#print(strong_pairs)

In [41]:
## strong_pairs[strong_pairs.index[0][0] == strong_pairs.index[0][1]]

#removed_diagonal = [(i, j) for (i, j) in strong_pairs.index if i!=j]
#len(removed_diagonal)

In [42]:
## # Create correlation matrix
## corr_matrix = complete_data.corr().abs()

## # Select upper triangle of correlation matrix
## upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool8))

## # Find index of feature columns with correlation greater than 0.95
## to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
## to_drop

In [43]:
## Find index of feature columns with correlation greater than 0.8

#correlated_features = set()
#for i in range(len(corrMatrix.columns)):
 #   for j in range(i):
  #      if abs(corrMatrix.iloc[i, j]) > 0.8:
   #         colname = corrMatrix.columns[i]
    #        correlated_features.add(colname)

In [44]:
#correlated_features

In [45]:
#complete_data = complete_data.drop(labels=correlated_features, axis=1)

In [46]:
#complete_data

In [47]:
target = complete_data['Direction']
complete_data = complete_data.drop(['Date', 'Direction'], axis=1)
complete_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,WMA10,EMA12,EMA26,MACD,Momentum,RSI,CCI,DI,ADX,Aroon
14,1.416667,1.541667,1.375000,1.541667,1.541667,5672400,1.489584,1.496354,34.375033,21.573495,...,1.497633,1.550516,1.668338,-0.117822,78.723435,0.282646,-41.627398,0.558137,50.226281,-20.0
15,1.515625,1.708333,1.510417,1.656250,1.656250,7807200,1.520834,1.522396,75.000055,39.382348,...,1.526705,1.566783,1.667443,-0.100660,95.783114,0.381194,92.673942,8.400292,43.640597,-20.0
16,1.656250,1.708333,1.656250,1.687500,1.687500,2352000,1.556250,1.541146,85.714286,65.029791,...,1.556724,1.585355,1.668929,-0.083574,98.780507,0.381194,153.386632,10.553252,30.868253,-20.0
17,1.708333,1.708333,1.531250,1.583333,1.583333,5458800,1.577083,1.541146,68.421026,76.378455,...,1.564394,1.585044,1.662588,-0.077545,96.815247,0.381194,91.852991,3.982389,24.801838,-20.0
18,1.593750,1.604167,1.531250,1.541667,1.541667,1188000,1.602083,1.542188,57.894870,70.676727,...,1.564489,1.578370,1.653631,-0.075261,108.029246,0.507614,33.856459,0.704759,24.430750,-20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800,3415.302002,3356.330029,51.820291,75.141833,...,3385.798202,3363.883172,3356.629660,7.253513,104.569908,0.675676,34.580127,0.129382,23.568887,40.0
6152,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,62.726254,...,3379.260036,3357.188856,3353.943768,3.245089,103.085070,0.675676,-15.740435,-0.538225,22.462941,40.0
6153,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,53.197158,...,3381.502406,3360.093658,3355.582753,4.510905,103.496619,0.675676,36.406218,0.884063,21.835137,40.0
6154,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,58.904560,...,3384.389489,3365.077709,3358.316622,6.761087,102.727085,0.910010,53.216142,1.180228,20.421165,40.0


### autofeat Classification

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
from autofeat import AutoFeatClassifier
X_train, X_test, y_train, y_test = train_test_split(complete_data,target,test_size=0.3)
model = AutoFeatClassifier()
df = model.fit_transform(X_train, y_train)
y_pred = model.predict(X_test)

  x = um.multiply(x, x, out=x)


In [50]:
df_test = model.transform(X_test)
model.score(df_test,y_test)

0.5084102007596311

In [51]:
df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,DI**3*log(RSI),sqrt(EMA26)*ROC,PVT/StochasticD,RSI/ADX,SMA5/SMA10,Abs(CCI)/DI,MACD*ROC**3,exp(ROC)/Momentum,RSI*StochasticD**3,LarryWilliamsR**3/DI
0,39.599998,39.619999,37.709999,38.939999,38.939999,9874100.0,38.628000,39.292000,46.666635,48.758603,...,0.009406,0.138432,-777.096138,0.050069,0.983101,-59.463225,0.000003,0.009800,105487.252619,-3.271172e+05
1,314.000000,316.500000,309.500000,311.730011,311.730011,7287500.0,321.096002,328.341998,4.507805,14.673513,...,400.367044,-1.706928,-8425.869319,0.006046,0.977932,-21.930971,0.009542,0.010292,643.458931,-1.379323e+05
2,64.160004,64.910004,61.520000,61.709999,61.709999,7336000.0,62.307999,63.312999,23.233679,43.951783,...,12.700205,0.082060,-5619.436498,0.019490,0.984126,-13.753787,0.000003,0.010015,43098.610325,-1.703435e+05
3,35.590000,35.770000,35.009998,35.549999,35.549999,12790500.0,36.120000,36.096000,14.099235,19.827287,...,15.237495,-0.464297,-4325.908926,0.023759,1.000665,-46.572552,0.000144,0.009578,2971.231041,-2.526073e+05
4,961.010010,965.609985,954.419983,955.099976,955.099976,2641800.0,967.402002,977.615002,1.491866,14.897200,...,8.603541,-0.409584,-1755.622249,0.025939,0.989553,-65.397976,0.000005,0.010109,1260.260797,-4.609104e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4294,41.500000,43.099998,41.500000,42.599998,42.599998,13556700.0,40.439999,39.356000,91.974317,96.129503,...,-76.116185,0.711929,3703.148056,0.023440,1.027543,24.245421,0.003276,0.010103,808381.382364,5.552096e+01
4295,1852.689941,1867.780029,1835.540039,1858.969971,1858.969971,3798200.0,1869.361987,1879.810998,29.074984,42.222197,...,10.525177,-0.953380,-482.758458,0.010070,0.994441,-44.573436,-0.000071,0.010138,15329.958696,-1.900656e+05
4296,32.639999,32.750000,32.200001,32.630001,32.630001,5192300.0,32.168000,31.226000,83.895118,87.335245,...,-124.963846,0.698937,54.716378,0.020311,1.030167,17.148756,0.003784,0.010105,450097.817546,6.114891e+02
4297,1926.310059,1957.000000,1820.729980,1846.089966,1846.089966,9817900.0,1810.802002,1802.890991,59.427486,61.115318,...,-0.047910,-2.852554,-2975.603949,0.017046,1.004388,87.235924,0.013216,0.009912,115873.466558,1.615488e+05


In [52]:
df_test

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,DI**3*log(RSI),sqrt(EMA26)*ROC,PVT/StochasticD,RSI/ADX,SMA5/SMA10,Abs(CCI)/DI,MACD*ROC**3,exp(ROC)/Momentum,RSI*StochasticD**3,LarryWilliamsR**3/DI
0,1.593750,1.604167,1.531250,1.541667,1.541667,1188000.0,1.602083,1.542188,57.894870,70.676727,...,-0.237341,0.035721,-442.333219,0.020778,1.038838,48.039771,-1.613140e-06,0.009517,1.792103e+05,105916.697897
1,14.310000,14.650000,13.720000,14.150000,14.150000,4489000.0,13.722000,12.981000,74.683544,67.562615,...,-281.338941,0.007974,-971.645866,0.023685,1.057083,10.576545,-4.257324e-09,0.010524,1.565500e+05,2175.450255
2,1.598958,1.916667,1.593750,1.911458,1.911458,12577200.0,1.609375,1.560417,98.809371,65.228143,...,-8578.997278,0.340760,39195.825164,0.025270,1.031375,16.603942,-3.688998e-05,0.010850,1.408766e+05,0.072433
3,65.000000,66.437500,60.000000,61.687500,61.687500,13777900.0,66.562500,65.481250,12.616822,33.489097,...,294.376803,-0.641821,-32267.804104,0.016714,1.016512,-20.468103,2.101693e-03,0.010453,1.061579e+04,-108437.903042
4,69.349998,69.629997,68.300003,68.660004,68.660004,7593500.0,69.220000,70.323000,5.607494,13.721014,...,37.565242,-0.388022,-1607.356660,0.006165,0.984315,-36.381773,-1.625359e-04,0.010235,7.301310e+02,-271487.157864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,383.500000,389.690002,381.489990,385.959991,385.959991,2355300.0,388.649994,380.506000,69.368083,76.636070,...,-5.969624,0.870758,103.866378,0.015408,1.021403,27.407646,1.187047e-03,0.009659,3.041151e+05,11596.254719
1839,362.619995,365.000000,358.649994,361.079987,361.079987,4502200.0,355.508002,340.298001,88.661699,88.586146,...,-277.522876,2.932695,-227.003797,0.019605,1.044696,13.321827,5.748979e-02,0.009939,4.697164e+05,163.551777
1840,51.299999,52.119999,51.099998,51.939999,51.939999,5994700.0,50.880000,48.905999,98.444252,97.108089,...,830.467812,1.825613,214.680557,0.067107,1.040363,8.384467,3.715026e-02,0.010563,1.681636e+06,0.339337
1841,3134.000000,3140.000000,3105.100098,3105.459961,3105.459961,2916800.0,3122.254004,3156.079004,37.280376,41.744237,...,0.330323,1.890330,-672.957137,0.059646,0.989283,-21.070020,-8.453815e-04,0.010693,4.915049e+04,-261221.244464


### Generate New Features Based on autofeat Results

In [53]:
import math

In [54]:
df_test.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'SMA5', 'SMA10',
       'StochasticK', 'StochasticD', 'LarryWilliamsR', 'ROC', 'PVT', 'ADO',
       'WMA10', 'EMA12', 'EMA26', 'MACD', 'Momentum', 'RSI', 'CCI', 'DI',
       'ADX', 'Aroon', 'sqrt(StochasticK)/RSI', 'sqrt(RSI)/Momentum',
       'sqrt(LarryWilliamsR)*RSI', 'ROC/MACD', 'log(RSI)/ADX', 'CCI**3*RSI**2',
       'DI**3*log(RSI)', 'sqrt(EMA26)*ROC', 'PVT/StochasticD', 'RSI/ADX',
       'SMA5/SMA10', 'Abs(CCI)/DI', 'MACD*ROC**3', 'exp(ROC)/Momentum',
       'RSI*StochasticD**3', 'LarryWilliamsR**3/DI'],
      dtype='object')

In [55]:
def sqrt_stochastic_k_rsi(stochastic_k, rsi):
    return(np.sqrt(stochastic_k)/rsi)

In [56]:
def sqrt_rsi_momentum(rsi, momentum):
    return (np.sqrt(rsi)/momentum)

In [57]:
def sqrt_lwr_rsi (larry_williams_r, rsi):
    return (np.sqrt(larry_williams_r)*rsi)

In [58]:
def roc_macd (roc, macd):
    return (roc / macd)

In [59]:
def log_rsi_adx (rsi, adx): 
    return (np.log(rsi)/adx)

In [60]:
def cci_3_rsi_2 (cci, rsi):
    return ((cci**3)*(rsi**2))

In [61]:
def di_3_log_rsi (di, rsi):
    return ((di**3)*(np.log(rsi)))

In [62]:
def sqrt_ema26_roc (ema26, roc):
    return ((np.sqrt(ema26))*roc)

In [63]:
def pvt_stochastic_d (pvt, stochastic_d):
    return (pvt / stochastic_d)

In [64]:
def rsi_adx (rsi, adx):
    return (rsi / adx)

In [65]:
def sma5_sma10 (sma5, sma10):
    return (sma5 / sma10)

In [66]:
def abs_cci_di (cci, di):
    return ((np.abs(cci))/di)

In [67]:
def macd_roc_3 (macd, roc):
    return (macd * (roc**3))

In [68]:
def exp_roc_momentum (roc, momentum):
    return ((np.exp(roc))/momentum)

In [69]:
def rsi_stochastic_d_3 (rsi, stochastic_d):
    return (rsi*(stochastic_d**3))

In [70]:
def lwr_3_di (larrywilliamsr, di):
    return ((larrywilliamsr**3)/di)

In [71]:
def compute_adv_indicators(complete_data):
    complete_data['SqrtStochasticK/RSI'] = sqrt_stochastic_k_rsi(complete_data['StochasticK'], complete_data['RSI'])
    complete_data['SqrtRSI/Momentum'] = sqrt_rsi_momentum(complete_data['RSI'], complete_data['Momentum'])
    complete_data['SqrtLarryWilliamsR*RSI'] = sqrt_lwr_rsi(complete_data['LarryWilliamsR'], complete_data['RSI'])
    complete_data['ROC/MACD'] = roc_macd(complete_data['ROC'], complete_data['MACD'])
    complete_data['log(RSI)/ADX'] = log_rsi_adx(complete_data['RSI'], complete_data['ADX'])
    complete_data['CCI**3*RSI**2'] = cci_3_rsi_2(complete_data['CCI'], complete_data['RSI'])
    complete_data['DI**3*log(RSI)'] = di_3_log_rsi(complete_data['DI'], complete_data['RSI'])
    complete_data['sqrt(EMA26)*ROC'] = sqrt_ema26_roc(complete_data['EMA26'], complete_data['ROC'])
    complete_data['PVT/StochasticD'] = pvt_stochastic_d(complete_data['PVT'], complete_data['StochasticD'])
    complete_data['RSI/ADX'] = rsi_adx(complete_data['RSI'], complete_data['ADX'])
    complete_data['SMA5/SMA10'] = sma5_sma10(complete_data['SMA5'], complete_data['SMA10'])
    complete_data['Abs(CCI)/DI'] = abs_cci_di(complete_data['CCI'], complete_data['DI'])
    complete_data['MACD*ROC**3'] = macd_roc_3(complete_data['MACD'], complete_data['ROC'])
    complete_data['exp(ROC)/Momentum'] = exp_roc_momentum(complete_data['ROC'], complete_data['Momentum'])
    complete_data['RSI*StochasticD**3'] = rsi_stochastic_d_3(complete_data['RSI'], complete_data['StochasticD'])
    complete_data['LarryWilliamsR**3/DI'] = lwr_3_di(complete_data['LarryWilliamsR'], complete_data['DI'])
    return complete_data

In [72]:
enhanced_data = compute_adv_indicators(complete_data)
enhanced_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,DI**3*log(RSI),sqrt(EMA26)*ROC,PVT/StochasticD,RSI/ADX,SMA5/SMA10,Abs(CCI)/DI,MACD*ROC**3,exp(ROC)/Momentum,RSI*StochasticD**3,LarryWilliamsR**3/DI
14,1.416667,1.541667,1.375000,1.541667,1.541667,5672400,1.489584,1.496354,34.375033,21.573495,...,-0.219695,-0.126013,23200.031167,0.005627,0.995475,74.582740,1.094086e-04,0.011522,2837.943324,506368.140652
15,1.515625,1.708333,1.510417,1.656250,1.656250,7807200,1.520834,1.522396,75.000055,39.382348,...,-571.690520,0.016449,14734.089628,0.008735,0.998974,11.032228,-2.080788e-07,0.010574,23283.665562,1860.042173
16,1.656250,1.708333,1.656250,1.687500,1.687500,2352000,1.556250,1.541146,85.714286,65.029791,...,-1133.539712,0.235743,682.415825,0.012349,1.009801,14.534537,-5.078456e-04,0.012150,104829.520071,276.260995
17,1.708333,1.708333,1.531250,1.583333,1.583333,5458800,1.577083,1.541146,68.421026,76.378455,...,-60.912828,0.173205,-4411.768205,0.015370,1.023319,23.064798,-1.879556e-04,0.011814,169847.491505,7907.703839
18,1.593750,1.604167,1.531250,1.541667,1.541667,1188000,1.602083,1.542188,57.894870,70.676727,...,-0.237341,0.035721,-442.333219,0.020778,1.038838,48.039771,-1.613140e-06,0.009517,179210.340654,105916.697897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800,3415.302002,3356.330029,51.820291,75.141833,...,-0.000849,1.306142,-1207.562926,0.028668,1.017570,267.271614,8.311207e-05,0.009781,286670.915387,864407.979133
6152,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,62.726254,...,0.061126,0.314609,-161.502099,0.030080,1.007816,-29.245076,5.202433e-07,0.009754,166757.873350,-291335.534547
6153,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,53.197158,...,-0.270883,1.540387,850.883530,0.030944,0.999941,41.180577,8.482059e-05,0.009923,101719.351473,63571.338116
6154,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,58.904560,...,-0.155027,2.609694,223.114715,0.044562,0.995414,45.089696,6.174517e-04,0.010183,185991.442155,25382.451951


### autofeat Feature Selection

In [73]:
from autofeat import FeatureSelector
fsel = FeatureSelector(verbose=1)
selected_data = fsel.fit_transform(pd.DataFrame(enhanced_data), pd.DataFrame(target))

  return f(**kwargs)


[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 19 features after 5 feature selection runs
[featsel] 19 features after correlation filtering
[featsel] 16 features after noise filtering


In [74]:
selected_data

Unnamed: 0,Aroon,RSI/ADX,SMA5/SMA10,log(RSI)/ADX,CCI**3*RSI**2,sqrt(EMA26)*ROC,SqrtRSI/Momentum,exp(ROC)/Momentum,RSI*StochasticD**3,SqrtStochasticK/RSI,SqrtLarryWilliamsR*RSI,DI,ROC/MACD,DI**3*log(RSI),LarryWilliamsR**3/DI,StochasticD
0,-20.0,0.005627,0.995475,-0.025157,-5762.648442,-0.126013,0.006753,0.011522,2837.943324,20.743374,2.289690,0.558137,0.828031,-0.219695,506368.140652,21.573495
1,-20.0,0.008735,0.998974,-0.022100,115655.410942,0.016449,0.006446,0.010574,23283.665562,22.718741,1.905970,8.400292,-0.126551,-571.690520,1860.042173,39.382348
2,-20.0,0.012349,1.009801,-0.031244,524391.418850,0.235743,0.006250,0.012150,104829.520071,24.287347,1.440779,10.553252,-2.183477,-1133.539712,276.260995,65.029791
3,-20.0,0.015370,1.023319,-0.038886,112608.959634,0.173205,0.006377,0.011814,169847.491505,21.699427,2.142129,3.982389,-1.732272,-60.912828,7907.703839,76.378455
4,-20.0,0.020778,1.038838,-0.027753,9999.819625,0.035721,0.006595,0.009517,179210.340654,14.989470,3.293831,0.704759,-0.369090,-0.237341,105916.697897,70.676727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6137,40.0,0.028668,1.017570,-0.016634,18878.014771,1.306142,0.007861,0.009781,286670.915387,10.653974,4.689973,0.129382,0.003108,-0.000849,864407.979133,75.141833
6138,40.0,0.030080,1.007816,-0.017453,-1780.437540,0.314609,0.007974,0.009754,166757.873350,10.046087,4.961708,-0.538225,0.001674,0.061126,-291335.534547,62.726254
6139,40.0,0.030944,0.999941,-0.017955,22029.429739,1.540387,0.007942,0.009923,101719.351473,11.624893,4.181791,0.884063,0.005895,-0.270883,63571.338116,53.197158
6140,40.0,0.044562,0.995414,-0.004618,124802.300702,2.609694,0.009286,0.010183,185991.442155,9.124252,5.071419,1.180228,0.006661,-0.155027,25382.451951,58.904560


In [75]:
from sklearn.preprocessing import StandardScaler

In [76]:
scaler = StandardScaler()  
scaler.fit(selected_data)

StandardScaler()

In [77]:

selected_scaled_data = scaler.transform(selected_data)

In [78]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_scaled_data,target,test_size=0.3)

In [79]:
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 

In [80]:
selected_scaled_data_df = pd.DataFrame(selected_scaled_data)
selected_scaled_data_1250 = selected_scaled_data_df.tail(1250)
selected_scaled_data_250 = selected_scaled_data_df.tail(250)
selected_scaled_data_125 = selected_scaled_data_df.tail(125)
target_1250 = target.tail(1250)
target_250 = target.tail(250)
target_125 = target.tail(125)

In [81]:
X_train_1250, X_test_1250, y_train_1250, y_test_1250 = train_test_split(selected_scaled_data_1250,target_1250,test_size=0.3)

In [82]:
X_train_250, X_test_250, y_train_250, y_test_250 = train_test_split(selected_scaled_data_250,target_250,test_size=0.3)

In [83]:
X_train_125, X_test_125, y_train_125, y_test_125 = train_test_split(selected_scaled_data_125,target_125,test_size=0.3)

### Logistic Regression

#### Max Duration

In [84]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [85]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
trained_model_lr = lr.fit(X_train, y_train)
predictions_lr = trained_model_lr.predict(X_test)

Train_accuracy_lr = accuracy_score(y_train,trained_model_lr.predict(X_train))
Test_accuracy_lr = accuracy_score(y_test, predictions_lr)
Confusion_matrix_lr = confusion_matrix(y_test,predictions_lr)

In [86]:
Train_accuracy_lr

0.6541056059548732

In [87]:
Test_accuracy_lr

0.6489419424850786

#### 5 Years

In [88]:
trained_model_lr_1250 = lr.fit(X_train_1250, y_train_1250)
predictions_lr_1250 = trained_model_lr_1250.predict(X_test_1250)

Train_accuracy_lr_1250 = accuracy_score(y_train_1250,trained_model_lr_1250.predict(X_train_1250))
Test_accuracy_lr_1250 = accuracy_score(y_test_1250, predictions_lr_1250)
Confusion_matrix_lr_1250 = confusion_matrix(y_test_1250,predictions_lr_1250)

In [89]:
Train_accuracy_lr_1250

0.6491428571428571

In [90]:
Test_accuracy_lr_1250

0.6586666666666666

#### 1 Year

In [91]:
trained_model_lr_250 = lr.fit(X_train_250, y_train_250)
predictions_lr_250 = trained_model_lr_250.predict(X_test_250)

Train_accuracy_lr_250 = accuracy_score(y_train_250,trained_model_lr_250.predict(X_train_250))
Test_accuracy_lr_250 = accuracy_score(y_test_250, predictions_lr_250)
Confusion_matrix_lr_250 = confusion_matrix(y_test_250,predictions_lr_250)

In [92]:
Train_accuracy_lr_250

0.6857142857142857

In [93]:
Test_accuracy_lr_250

0.56

#### 6 Months

In [94]:
trained_model_lr_125 = lr.fit(X_train_125, y_train_125)
predictions_lr_125 = trained_model_lr_125.predict(X_test_125)

Train_accuracy_lr_125 = accuracy_score(y_train_125,trained_model_lr_125.predict(X_train_125))
Test_accuracy_lr_125 = accuracy_score(y_test_125, predictions_lr_125)
Confusion_matrix_lr_125 = confusion_matrix(y_test_125,predictions_lr_125)

In [95]:
Train_accuracy_lr_125

0.6781609195402298

In [96]:
Test_accuracy_lr_125

0.39473684210526316

#### Last 100 Days based on Best Model - Max Duration

In [97]:
last100 = selected_scaled_data_df.tail(100)

In [151]:
predictions_lr_100 = trained_model_lr_1250.predict(last100)
predictions_lr_100

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1])

### SVM

In [99]:
svm = SVC(gamma='auto')
trained_model_svm = svm.fit(X_train, y_train)
predictions_svm = trained_model_svm.predict(X_test)

Train_accuracy_svm = accuracy_score(y_train,trained_model_svm.predict(X_train))
Test_accuracy_svm = accuracy_score(y_test, predictions_svm)
Confusion_matrix_svm = confusion_matrix(y_test,predictions_svm)

In [100]:
Train_accuracy_svm

1.0

In [101]:
Test_accuracy_svm

0.46500271296798695

#### 5 Years

In [102]:
trained_model_svm_1250 = svm.fit(X_train_1250, y_train_1250)
predictions_svm_1250 = trained_model_svm_1250.predict(X_test_1250)

Train_accuracy_svm_1250 = accuracy_score(y_train_1250,trained_model_svm_1250.predict(X_train_1250))
Test_accuracy_svm_1250 = accuracy_score(y_test_1250, predictions_svm_1250)
Confusion_matrix_svm_1250 = confusion_matrix(y_test_1250,predictions_svm_1250)

In [103]:
Train_accuracy_svm_1250

0.6777142857142857

In [104]:
Test_accuracy_svm_1250

0.6426666666666667

#### 1 Year

In [105]:
trained_model_svm_250 = svm.fit(X_train_250, y_train_250)
predictions_svm_250 = trained_model_svm_1250.predict(X_test_250)

Train_accuracy_svm_250 = accuracy_score(y_train_250,trained_model_svm_250.predict(X_train_250))
Test_accuracy_svm_250 = accuracy_score(y_test_250, predictions_svm_250)
Confusion_matrix_svm_250 = confusion_matrix(y_test_250,predictions_svm_250)

In [106]:
Train_accuracy_svm_250

0.7314285714285714

In [107]:
Test_accuracy_svm_250

0.56

#### 6 Months

In [108]:
trained_model_svm_125 = svm.fit(X_train_125, y_train_125)
predictions_svm_125 = trained_model_svm_1250.predict(X_test_125)

Train_accuracy_svm_125 = accuracy_score(y_train_125,trained_model_svm_125.predict(X_train_125))
Test_accuracy_svm_125 = accuracy_score(y_test_125, predictions_svm_125)
Confusion_matrix_svm_125 = confusion_matrix(y_test_125,predictions_svm_125)

In [109]:
Train_accuracy_svm_125

0.6551724137931034

In [110]:
Test_accuracy_svm_125

0.4473684210526316

#### Last 100 Days based on Best Model - Max Duration

In [152]:
predictions_svm_100 = trained_model_svm_1250.predict(last100)
predictions_svm_100

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

### RandomForest

In [112]:
rf = RandomForestClassifier(n_estimators=10)
trained_model_rf = rf.fit(X_train, y_train)
predictions_rf = trained_model_rf.predict(X_test)

Train_accuracy_rf = accuracy_score(y_train, trained_model_rf.predict(X_train))
Test_accuracy_rf = accuracy_score(y_test, predictions_rf)
Confusion_matrix_rf = confusion_matrix(y_test, predictions_rf)

In [113]:
Train_accuracy_rf

0.9860432658757851

In [114]:
Test_accuracy_rf

0.6093326098752034

#### 5 Years

In [115]:
trained_model_rf_1250 = rf.fit(X_train_1250, y_train_1250)
predictions_rf_1250 = trained_model_rf_1250.predict(X_test_1250)

Train_accuracy_rf_1250 = accuracy_score(y_train_1250,trained_model_rf_1250.predict(X_train_1250))
Test_accuracy_rf_1250 = accuracy_score(y_test_1250, predictions_rf_1250)
Confusion_matrix_rf_1250 = confusion_matrix(y_test_1250,predictions_rf_1250)

In [116]:
Train_accuracy_rf_1250

0.984

In [117]:
Test_accuracy_rf_1250

0.6613333333333333

#### 1 Year

In [118]:
trained_model_rf_250 = rf.fit(X_train_250, y_train_250)
predictions_rf_250 = trained_model_rf_250.predict(X_test_250)

Train_accuracy_rf_250 = accuracy_score(y_train_250,trained_model_rf_250.predict(X_train_250))
Test_accuracy_rf_250 = accuracy_score(y_test_250, predictions_rf_250)
Confusion_matrix_rf_250 = confusion_matrix(y_test_250,predictions_rf_250)

In [119]:
Train_accuracy_rf_250

0.9714285714285714

In [120]:
Test_accuracy_rf_250

0.52

#### 6 Months

In [121]:
trained_model_rf_125 = rf.fit(X_train_125, y_train_125)
predictions_rf_125 = trained_model_rf_125.predict(X_test_125)

Train_accuracy_rf_125 = accuracy_score(y_train_125,trained_model_rf_125.predict(X_train_125))
Test_accuracy_rf_125 = accuracy_score(y_test_125, predictions_rf_125)
Confusion_matrix_rf_125 = confusion_matrix(y_test_125,predictions_rf_125)

In [122]:
Train_accuracy_rf_125

0.9885057471264368

In [123]:
Test_accuracy_rf_125

0.39473684210526316

#### Last 100 Days based on Best Model

In [153]:
predictions_rf_100 = trained_model_rf_1250.predict(last100)
predictions_rf_100

array([1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1])

### KNN

In [125]:
knn = KNeighborsClassifier(n_neighbors=5)
trained_model_knn = knn.fit(X_train, y_train)
predictions_knn = trained_model_knn.predict(X_test)

Train_accuracy_knn = accuracy_score(y_train, trained_model_knn.predict(X_train))
Test_accuracy_knn = accuracy_score(y_test, predictions_knn)
Confusion_matrix_knn = confusion_matrix(y_test, predictions_knn)

In [126]:
Train_accuracy_knn

0.7385438474063736

In [127]:
Test_accuracy_knn

0.5756918068366793

#### 5 Years

In [128]:
trained_model_knn_1250 = knn.fit(X_train_1250, y_train_1250)
predictions_knn_1250 = trained_model_knn_1250.predict(X_test_1250)

Train_accuracy_knn_1250 = accuracy_score(y_train_1250,trained_model_knn_1250.predict(X_train_1250))
Test_accuracy_knn_1250 = accuracy_score(y_test_1250, predictions_knn_1250)
Confusion_matrix_knn_1250 = confusion_matrix(y_test_1250,predictions_knn_1250)

In [129]:
Train_accuracy_knn_1250

0.7508571428571429

In [130]:
Test_accuracy_knn_1250

0.56

#### 1 Year

In [131]:
trained_model_knn_250 = knn.fit(X_train_250, y_train_250)
predictions_knn_250 = trained_model_knn_250.predict(X_test_250)

Train_accuracy_knn_250 = accuracy_score(y_train_250,trained_model_knn_250.predict(X_train_250))
Test_accuracy_knn_250 = accuracy_score(y_test_250, predictions_knn_250)
Confusion_matrix_knn_250 = confusion_matrix(y_test_250,predictions_knn_250)

In [132]:
Train_accuracy_knn_250

0.72

In [133]:
Test_accuracy_knn_250

0.6

#### 6 Months

In [134]:
trained_model_knn_125 = knn.fit(X_train_125, y_train_125)
predictions_knn_125 = trained_model_knn_125.predict(X_test_125)

Train_accuracy_knn_125 = accuracy_score(y_train_125,trained_model_knn_125.predict(X_train_125))
Test_accuracy_knn_125 = accuracy_score(y_test_125, predictions_knn_125)
Confusion_matrix_knn_125 = confusion_matrix(y_test_125,predictions_knn_125)

In [135]:
Train_accuracy_knn_125

0.5862068965517241

In [136]:
Test_accuracy_knn_125

0.5

#### Last 100 Days based on Best Model

In [154]:
predictions_knn_100 = trained_model_knn_250.predict(last100)
predictions_knn_100

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

### XGBoost

In [138]:
xgb = XGBClassifier(use_label_encoder=False)
trained_model_xgb = xgb.fit(X_train, y_train)
predictions_xgb = trained_model_xgb.predict(X_test)

Train_accuracy_xgb = accuracy_score(y_train, trained_model_xgb.predict(X_train))
Test_accuracy_xgb = accuracy_score(y_test, predictions_xgb)
Confusion_matrix_xgb = confusion_matrix(y_test, predictions_xgb)



In [139]:
Train_accuracy_xgb

0.9813910211677134

In [140]:
Test_accuracy_xgb

0.626695604991861

#### 5 Years

In [141]:
trained_model_xgb_1250 = xgb.fit(X_train_1250, y_train_1250)
predictions_xgb_1250 = trained_model_xgb_1250.predict(X_test_1250)

Train_accuracy_xgb_1250 = accuracy_score(y_train_1250,trained_model_xgb_1250.predict(X_train_1250))
Test_accuracy_xgb_1250 = accuracy_score(y_test_1250, predictions_xgb_1250)
Confusion_matrix_xgb_1250 = confusion_matrix(y_test_1250,predictions_xgb_1250)



In [142]:
Train_accuracy_xgb_1250

1.0

In [143]:
Test_accuracy_xgb_1250

0.632

#### 1 Year

In [144]:
trained_model_xgb_250 = xgb.fit(X_train_250, y_train_250)
predictions_xgb_250 = trained_model_xgb_250.predict(X_test_250)

Train_accuracy_xgb_250 = accuracy_score(y_train_250,trained_model_xgb_250.predict(X_train_250))
Test_accuracy_xgb_250 = accuracy_score(y_test_250, predictions_xgb_250)
Confusion_matrix_xgb_250 = confusion_matrix(y_test_250,predictions_xgb_250)



In [145]:
Train_accuracy_xgb_250

1.0

In [146]:
Test_accuracy_xgb_250

0.5466666666666666

#### 6 Months

In [147]:
trained_model_xgb_125 = xgb.fit(X_train_125, y_train_125)
predictions_xgb_125 = trained_model_xgb_125.predict(X_test_125)

Train_accuracy_xgb_125 = accuracy_score(y_train_125,trained_model_xgb_125.predict(X_train_125))
Test_accuracy_xgb_125 = accuracy_score(y_test_125, predictions_xgb_125)
Confusion_matrix_xgb_125 = confusion_matrix(y_test_125,predictions_xgb_125)



In [148]:
Train_accuracy_xgb_125

1.0

In [149]:
Test_accuracy_xgb_125

0.5

#### Last 100 Days based on Best Model

In [155]:
predictions_xgb_100 = trained_model_xgb_1250.predict(last100)
predictions_xgb_100

array([1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1])

## KFold cross validation
### Basic example

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), complete_data, target, cv=10)

In [None]:
cross_val_score(SVC(gamma='auto'), complete_data, target, cv=10)

In [None]:
cross_val_score(RandomForestClassifier(n_estimators=5), complete_data, target, cv=10)

In [None]:
cross_val_score(KNeighborsClassifier(n_neighbors=5), complete_data, target, cv=10)

In [None]:
cross_val_score(XGBClassifier(use_label_encoder=False), complete_data, target, cv=10)