In [1]:
import numpy as np
import pandas as pd
import ta
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from ta import add_all_ta_features
from ta import momentum
from ta.utils import dropna

In [2]:
#pip install xgboost

In [3]:
pip install autofeat

Note: you may need to restart the kernel to use updated packages.


In [4]:
def get_data(company_symbol):
    return pd.read_csv("dataset/"+company_symbol+".csv")

In [5]:
data = get_data("FB")
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,5/18/2012,42.049999,45.000000,38.000000,38.230000,38.230000,573576400
1,5/21/2012,36.529999,36.660000,33.000000,34.029999,34.029999,168192700
2,5/22/2012,32.610001,33.590000,30.940001,31.000000,31.000000,101786600
3,5/23/2012,31.370001,32.500000,31.360001,32.000000,32.000000,73600000
4,5/24/2012,32.950001,33.209999,31.770000,33.029999,33.029999,50237200
...,...,...,...,...,...,...,...
2374,10/25/2021,320.299988,329.559998,319.720001,328.690002,328.690002,38409000
2375,10/26/2021,328.260010,330.209991,309.600006,315.809998,315.809998,65654000
2376,10/27/2021,314.190002,319.250000,312.059998,312.220001,312.220001,29971800
2377,10/28/2021,312.989990,325.519989,308.109985,316.920013,316.920013,50806800


In [6]:
data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [7]:
data.isna().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [8]:
data[data.columns[data.isna().any()]]

0
1
2
3
4
...
2374
2375
2376
2377
2378


In [9]:
cond1 = data.Date.isna() # aapl['Date'].isna()
cond2 = data.Date.str.contains(r'^\s*$', na=False)
cond3 = data.Date == ''

data_checked = data.assign(cond1= cond1, cond2= cond2, cond3= cond3)
print (data_checked)

            Date        Open        High         Low       Close   Adj Close  \
0      5/18/2012   42.049999   45.000000   38.000000   38.230000   38.230000   
1      5/21/2012   36.529999   36.660000   33.000000   34.029999   34.029999   
2      5/22/2012   32.610001   33.590000   30.940001   31.000000   31.000000   
3      5/23/2012   31.370001   32.500000   31.360001   32.000000   32.000000   
4      5/24/2012   32.950001   33.209999   31.770000   33.029999   33.029999   
...          ...         ...         ...         ...         ...         ...   
2374  10/25/2021  320.299988  329.559998  319.720001  328.690002  328.690002   
2375  10/26/2021  328.260010  330.209991  309.600006  315.809998  315.809998   
2376  10/27/2021  314.190002  319.250000  312.059998  312.220001  312.220001   
2377  10/28/2021  312.989990  325.519989  308.109985  316.920013  316.920013   
2378  10/29/2021  320.190002  326.000000  319.600006  323.570007  323.570007   

         Volume  cond1  cond2  cond3  


In [10]:
def simple_moving_average_5(close):
    return close.rolling(5, min_periods=1).mean()

In [11]:
def simple_moving_average_10(close):
    return close.rolling(10, min_periods=1).mean()

In [12]:
def stochastic_k(high, low, close):
    return (((close-low.rolling(14).min())/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [13]:
def stochastic_d(stochastic_k):
    return stochastic_k.rolling(3,min_periods=1).mean()

In [14]:
def larry_williams_r(high, low, close):
    return (((high.rolling(14).max()-close)/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [15]:
def rate_of_change(close):
    return ((close-close.shift(12))/close.shift(12))

In [16]:
def price_volume_trend(close, volume):
    return (((close-close.shift(1))/close.shift(1))*volume)

In [17]:
def accumulation_distribution_oscillator(high, low, close):
    return((high-close.shift(1))/(high-low))

In [18]:
def weighted_moving_average_10(close):
    return ((10*close+9*close.shift(1)+8*close.shift(2)+7*close.shift(3)
             +6*close.shift(4)+5*close.shift(5)+4*close.shift(6)+3*close.shift(7)
             +2*close.shift(8)+close.shift(9))/(10+9+8+7+6+5+4+3+2+1))

In [19]:
def exponential_moving_average_12(close):
    return (close.ewm(span=12, adjust=False).mean())

In [20]:
def exponential_moving_average_26(close):
    return (close.ewm(span=26, adjust=False).mean())

In [21]:
def moving_average_convergence_divergence(ema_12, ema_26):
    return (ema_12 - ema_26)

In [22]:
def momentum(close):
    return ((close/close.shift(14))*100)

In [23]:
def change(close):
    return (close.shift(-1) - close)

In [24]:
def direction(change):
    return (np.where(change>0, 1, 0))

In [25]:
def relative_strength_index(direction):
    return (100-(100/(1+(direction.rolling(14).sum()/14)/(14-direction.rolling(14).sum())/14)))

In [26]:
def channel_commodity_index(df, ndays): 
    df['TP'] = (df['High'] + df['Low'] + df['Close']) / 3 
    df['sma'] = df['TP'].rolling(ndays).mean()
    df['mad'] = df['TP'].rolling(ndays).apply(lambda x: pd.Series(x).mad())
    df['CCI'] = (df['TP'] - df['sma']) / (0.015 * df['mad']) 
    return df['CCI']

In [27]:
def disparity_index(close):
    return (((close - close.rolling(14).mean())/(close.rolling(14).mean()))*100)

In [28]:
def get_adx(high, low, close, lookback):
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    tr1 = pd.DataFrame(high - low)
    tr2 = pd.DataFrame(abs(high - close.shift(1)))
    tr3 = pd.DataFrame(abs(low - close.shift(1)))
    frames = [tr1, tr2, tr3]
    tr = pd.concat(frames, axis = 1, join = 'inner').max(axis = 1)
    atr = tr.rolling(lookback).mean()
    
    plus_di = 100 * (plus_dm.ewm(alpha = 1/lookback).mean() / atr)
    minus_di = abs(100 * (minus_dm.ewm(alpha = 1/lookback).mean() / atr))
    dx = (abs(plus_di - minus_di) / abs(plus_di + minus_di)) * 100
    adx = ((dx.shift(1) * (lookback - 1)) + dx) / lookback
    adx_smooth = adx.ewm(alpha = 1/lookback).mean()
    return adx_smooth

In [29]:
def aroon(close):
    return (ta.trend.AroonIndicator(close, 25, True).aroon_indicator())

In [30]:
def compute_all_indicators(data):
    data['SMA5'] = simple_moving_average_5(data['Close'])
    data['SMA10'] = simple_moving_average_10(data['Close'])
    data['StochasticK'] = stochastic_k(data['High'], data['Low'], data['Close'])
    data['StochasticD'] = stochastic_d(data['StochasticK'])
    data['LarryWilliamsR'] = larry_williams_r(data['High'], data['Low'], data['Close'])
    data['ROC'] = rate_of_change(data['Close'])
    data['PVT'] = price_volume_trend(data['Close'], data['Volume'])
    data['ADO'] = accumulation_distribution_oscillator(data['High'], data['Low'], data['Close'])
    data['WMA10'] = weighted_moving_average_10(data['Close'])
    data['EMA12'] = exponential_moving_average_12(data['Close'])
    data['EMA26'] = exponential_moving_average_26(data['Close'])
    data['MACD'] = moving_average_convergence_divergence(data['EMA12'], data['EMA26'])
    data['Momentum'] = momentum(data['Close'])
    data['Change'] = change(data['Close'])
    data['Direction'] = direction(data['Change'])
    data['RSI'] = relative_strength_index(data['Direction'])
    data['CCI'] = channel_commodity_index(data, 14)
    data['DI'] = disparity_index(data['Close'])
    data['ADX'] = get_adx(data['High'], data['Low'], data['Close'], 14)
    data['Aroon'] = aroon(data['Close'])    
    return data

In [31]:
complete_data = compute_all_indicators(data)
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
0,5/18/2012,42.049999,45.000000,38.000000,38.230000,38.230000,573576400,38.230000,38.230000,,...,-4.200001,0,,40.410000,,,,,,0.0
1,5/21/2012,36.529999,36.660000,33.000000,34.029999,34.029999,168192700,36.129999,36.129999,,...,-3.029999,0,,34.563333,,,,,,-4.0
2,5/22/2012,32.610001,33.590000,30.940001,31.000000,31.000000,101786600,34.420000,34.420000,,...,1.000000,1,,31.843334,,,,,,-8.0
3,5/23/2012,31.370001,32.500000,31.360001,32.000000,32.000000,73600000,33.815000,33.815000,,...,1.029999,1,,31.953334,,,,,,-8.0
4,5/24/2012,32.950001,33.209999,31.770000,33.029999,33.029999,50237200,33.658000,33.658000,,...,-1.119999,0,,32.669999,,,,,,-8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2374,10/25/2021,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,...,-12.880004,0,0.675676,325.990000,330.581667,5.022143,-60.952286,-0.638920,32.451445,-60.0
2375,10/26/2021,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,...,-3.589997,0,0.675676,318.539998,329.672142,5.702925,-130.133739,-4.163506,32.635555,-84.0
2376,10/27/2021,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,...,4.700012,1,0.675676,314.510000,328.398095,6.276394,-147.516715,-4.902510,33.349941,-88.0
2377,10/28/2021,312.989990,325.519989,308.109985,316.920013,316.920013,50806800,319.650000,328.100000,24.560975,...,6.649994,1,0.910010,316.849996,327.407381,6.652347,-105.801103,-3.194426,33.965208,-88.0


In [32]:
complete_data = complete_data.dropna()

In [33]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
14,6/8/2012,26.549999,27.760000,26.440001,27.100000,27.100000,38034000,26.598000,27.925000,14.183124,...,-0.090000,0,0.282646,27.100000,29.409048,2.327483,-66.138608,-7.307419,81.950354,-44.0
15,6/11/2012,27.180000,28.070000,26.840000,27.010000,27.010000,28219600,26.620000,27.435000,18.463445,...,0.390000,1,0.381194,27.306667,28.890714,1.961395,-53.840862,-6.003132,77.257952,-44.0
16,6/12/2012,27.480000,27.770000,26.959999,27.400000,27.400000,15816800,26.926000,27.291000,24.447337,...,-0.130000,0,0.282646,27.376666,28.571667,1.740238,-45.779188,-3.784895,73.418742,-44.0
17,6/13/2012,27.660000,28.100000,27.100000,27.270000,27.270000,17102800,27.018000,27.199000,22.756830,...,1.020001,1,0.282646,27.490000,28.252857,1.479388,-34.377153,-3.091684,71.350434,-44.0
18,6/14/2012,27.650000,28.320000,27.379999,28.290001,28.290001,16855000,27.414000,27.068000,37.281301,...,1.719999,1,0.381194,27.996667,27.919048,1.074150,4.817392,1.757364,68.371248,-44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,10/22/2021,326.350006,329.630005,321.109985,324.609985,324.609985,35152700,336.519995,330.964999,27.207763,...,4.080017,1,0.910010,325.116658,330.970476,4.735239,-82.414952,-1.962675,32.401953,-60.0
2374,10/25/2021,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,...,-12.880004,0,0.675676,325.990000,330.581667,5.022143,-60.952286,-0.638920,32.451445,-60.0
2375,10/26/2021,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,...,-3.589997,0,0.675676,318.539998,329.672142,5.702925,-130.133739,-4.163506,32.635555,-84.0
2376,10/27/2021,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,...,4.700012,1,0.675676,314.510000,328.398095,6.276394,-147.516715,-4.902510,33.349941,-88.0


In [34]:
complete_data = complete_data.drop(['TP', 'sma', 'mad', 'Change'], axis=1)

In [35]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,EMA12,EMA26,MACD,Momentum,Direction,RSI,CCI,DI,ADX,Aroon
14,6/8/2012,26.549999,27.760000,26.440001,27.100000,27.100000,38034000,26.598000,27.925000,14.183124,...,28.871771,31.839056,-2.967285,70.886738,0,0.282646,-66.138608,-7.307419,81.950354,-44.0
15,6/11/2012,27.180000,28.070000,26.840000,27.010000,27.010000,28219600,26.620000,27.435000,18.463445,...,28.585345,31.481348,-2.896003,79.371145,1,0.381194,-53.840862,-6.003132,77.257952,-44.0
16,6/12/2012,27.480000,27.770000,26.959999,27.400000,27.400000,15816800,26.926000,27.291000,24.447337,...,28.402984,31.179026,-2.776042,88.387097,0,0.282646,-45.779188,-3.784895,73.418742,-44.0
17,6/13/2012,27.660000,28.100000,27.100000,27.270000,27.270000,17102800,27.018000,27.199000,22.756830,...,28.228679,30.889468,-2.660790,85.218750,1,0.282646,-34.377153,-3.091684,71.350434,-44.0
18,6/14/2012,27.650000,28.320000,27.379999,28.290001,28.290001,16855000,27.414000,27.068000,37.281301,...,28.238113,30.696915,-2.458802,85.649415,1,0.381194,4.817392,1.757364,68.371248,-44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,10/22/2021,326.350006,329.630005,321.109985,324.609985,324.609985,35152700,336.519995,330.964999,27.207763,...,333.872032,339.701384,-5.829352,99.503410,1,0.910010,-82.414952,-1.962675,32.401953,-60.0
2374,10/25/2021,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,...,333.074797,338.885726,-5.810929,98.717567,0,0.675676,-60.952286,-0.638920,32.451445,-60.0
2375,10/26/2021,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,...,330.418674,337.176413,-6.757739,94.655912,0,0.675676,-130.133739,-4.163506,32.635555,-84.0
2376,10/27/2021,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,...,327.618878,335.327790,-7.708912,94.836280,1,0.675676,-147.516715,-4.902510,33.349941,-88.0


In [36]:
#corrMatrix = complete_data.corr()
#print(corrMatrix)

In [37]:
#sn.heatmap(corrMatrix, annot=False)
#plt.show()

In [38]:
#corr_pairs = corrMatrix.unstack()
#corr_pairs

In [39]:
#sorted_pairs = corr_pairs.sort_values(kind="quicksort")
#sorted_pairs

In [40]:
#strong_pairs = sorted_pairs[abs(sorted_pairs) > 0.5]

#print(strong_pairs)

In [41]:
## strong_pairs[strong_pairs.index[0][0] == strong_pairs.index[0][1]]

#removed_diagonal = [(i, j) for (i, j) in strong_pairs.index if i!=j]
#len(removed_diagonal)

In [42]:
## # Create correlation matrix
## corr_matrix = complete_data.corr().abs()

## # Select upper triangle of correlation matrix
## upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool8))

## # Find index of feature columns with correlation greater than 0.95
## to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
## to_drop

In [43]:
## Find index of feature columns with correlation greater than 0.8

#correlated_features = set()
#for i in range(len(corrMatrix.columns)):
 #   for j in range(i):
  #      if abs(corrMatrix.iloc[i, j]) > 0.8:
   #         colname = corrMatrix.columns[i]
    #        correlated_features.add(colname)

In [44]:
#correlated_features

In [45]:
#complete_data = complete_data.drop(labels=correlated_features, axis=1)

In [46]:
#complete_data

In [47]:
target = complete_data['Direction']
complete_data = complete_data.drop(['Date', 'Direction'], axis=1)
complete_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,WMA10,EMA12,EMA26,MACD,Momentum,RSI,CCI,DI,ADX,Aroon
14,26.549999,27.760000,26.440001,27.100000,27.100000,38034000,26.598000,27.925000,14.183124,9.119280,...,27.198545,28.871771,31.839056,-2.967285,70.886738,0.282646,-66.138608,-7.307419,81.950354,-44.0
15,27.180000,28.070000,26.840000,27.010000,27.010000,28219600,26.620000,27.435000,18.463445,12.234002,...,27.032182,28.585345,31.481348,-2.896003,79.371145,0.381194,-53.840862,-6.003132,77.257952,-44.0
16,27.480000,27.770000,26.959999,27.400000,27.400000,15816800,26.926000,27.291000,24.447337,19.031302,...,27.025818,28.402984,31.179026,-2.776042,88.387097,0.282646,-45.779188,-3.784895,73.418742,-44.0
17,27.660000,28.100000,27.100000,27.270000,27.270000,17102800,27.018000,27.199000,22.756830,21.889204,...,27.022000,28.228679,30.889468,-2.660790,85.218750,0.282646,-34.377153,-3.091684,71.350434,-44.0
18,27.650000,28.320000,27.379999,28.290001,28.290001,16855000,27.414000,27.068000,37.281301,28.161823,...,27.220364,28.238113,30.696915,-2.458802,85.649415,0.381194,4.817392,1.757364,68.371248,-44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,326.350006,329.630005,321.109985,324.609985,324.609985,35152700,336.519995,330.964999,27.207763,67.993823,...,333.195634,333.872032,339.701384,-5.829352,99.503410,0.910010,-82.414952,-1.962675,32.401953,-60.0
2374,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,53.952125,...,332.781998,333.074797,338.885726,-5.810929,98.717567,0.675676,-60.952286,-0.638920,32.451445,-60.0
2375,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,29.270321,...,329.967634,330.418674,337.176413,-6.757739,94.655912,0.675676,-130.133739,-4.163506,32.635555,-84.0
2376,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,22.741298,...,326.645271,327.618878,335.327790,-7.708912,94.836280,0.675676,-147.516715,-4.902510,33.349941,-88.0


### autofeat Classification

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
from autofeat import AutoFeatClassifier
X_train, X_test, y_train, y_test = train_test_split(complete_data,target,test_size=0.3)
model = AutoFeatClassifier()
df = model.fit_transform(X_train, y_train)
y_pred = model.predict(X_test)

In [50]:
df_test = model.transform(X_test)
model.score(df_test,y_test)

0.6056338028169014

In [51]:
df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,Momentum,RSI,CCI,DI,ADX,Aroon,sqrt(StochasticK)/RSI,Momentum**3/RSI,RSI/StochasticD,sqrt(LarryWilliamsR)*RSI
0,161.500000,166.119995,161.240005,165.440002,165.440002,20211500.0,164.726001,167.972002,41.011993,21.393720,...,102.471357,0.910010,-65.307025,-1.571516,18.750457,56.0,7.037351,1.182391e+06,0.042536,6.989210
1,186.940002,187.889999,185.630005,187.839996,187.839996,10529900.0,187.034000,183.325999,92.550961,93.651926,...,104.640411,0.381194,91.476834,3.460945,17.774823,88.0,25.237361,3.005743e+06,0.004070,1.040391
2,184.850006,188.320007,184.179993,186.990005,186.990005,21207800.0,182.414001,178.463002,95.860561,97.517190,...,112.753262,0.910010,148.552991,6.774727,20.775024,96.0,10.759046,1.575220e+06,0.009332,1.851471
3,49.770000,51.160000,49.570000,51.040001,51.040001,74447000.0,50.230000,49.788000,93.893161,84.369339,...,120.065875,0.910010,70.574462,4.827993,50.240747,76.0,10.648067,1.902009e+06,0.010786,2.248819
4,178.350006,179.100006,177.960007,178.460007,178.460007,11070200.0,179.548001,179.817000,68.308433,72.924154,...,104.198052,0.675676,21.694999,0.376046,13.458131,20.0,12.232040,1.674328e+06,0.009265,3.803734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1649,59.290001,59.680000,55.880001,59.090000,59.090000,108623000.0,59.616000,59.218000,43.093277,36.038355,...,97.847327,0.910010,-72.011152,-1.047822,30.456252,-68.0,7.213708,1.029439e+06,0.025251,6.864802
1650,102.199997,103.809998,101.550003,103.769997,103.769997,25961200.0,99.948000,97.990000,99.700142,97.906997,...,111.821114,1.259446,164.749880,7.432684,16.613042,76.0,7.928087,1.110176e+06,0.012864,0.689663
1651,25.809999,26.360001,25.700001,25.969999,25.969999,25687600.0,26.188000,26.704000,27.796576,21.920904,...,102.163647,0.675676,-100.602860,-2.941808,20.908000,48.0,7.802924,1.578159e+06,0.030823,5.741392
1652,118.989998,120.849998,118.660004,120.610001,120.610001,21541300.0,118.182001,117.650000,96.954352,92.191397,...,107.018637,1.259446,146.606639,3.380212,16.662954,60.0,7.818153,9.731925e+05,0.013661,2.197958


In [52]:
df_test

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,Momentum,RSI,CCI,DI,ADX,Aroon,sqrt(StochasticK)/RSI,Momentum**3/RSI,RSI/StochasticD,sqrt(LarryWilliamsR)*RSI
0,351.230011,354.190002,349.640015,353.160004,353.160004,10018600.0,350.500000,351.683005,78.503229,69.809361,...,106.280657,0.507614,57.872836,1.295637,27.126977,24.0,17.454603,2.364988e+06,0.007271,2.353534
1,173.080002,176.979996,173.059998,176.610001,176.610001,17677800.0,174.512003,170.553001,93.284600,88.518358,...,107.146757,0.381194,85.449174,4.110015,17.454155,88.0,25.337190,3.226938e+06,0.004306,0.987831
2,183.770004,186.080002,183.470001,185.570007,185.570007,10128700.0,181.348001,182.601000,60.529908,42.232771,...,98.786266,0.910010,57.198164,1.399231,22.935268,-80.0,8.549462,1.059360e+06,0.021547,5.717159
3,19.879999,19.959999,19.610001,19.750000,19.750000,21817300.0,20.186000,21.063000,9.554111,9.022426,...,86.395447,0.381194,-108.281634,-5.515310,19.273140,28.0,8.108651,1.691710e+06,0.042250,3.625275
4,192.039993,194.210007,189.979996,190.279999,190.279999,26677500.0,188.671997,188.210999,74.180292,79.128635,...,106.082402,0.675676,105.824922,2.490391,16.397169,88.0,12.746941,1.766818e+06,0.008539,3.433317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,116.730003,118.489998,115.830002,117.199997,117.199997,34681400.0,119.056000,120.957000,18.795028,15.747995,...,89.438342,0.282646,-96.789829,-5.213054,29.133201,-60.0,15.338374,2.531215e+06,0.017948,2.547027
706,358.450012,363.000000,357.109985,362.649994,362.649994,7170700.0,361.771997,359.023999,50.083732,46.794244,...,98.069170,0.381194,-8.932888,0.296120,20.584551,28.0,18.565293,2.474292e+06,0.008146,2.693194
707,68.180000,68.949997,66.620003,68.739998,68.739998,52197000.0,69.253999,69.931000,36.677602,31.578951,...,98.410880,0.381194,-89.391754,-1.132141,26.350170,76.0,15.887441,2.500247e+06,0.012071,3.033369
708,176.300003,176.789993,174.750000,175.899994,175.899994,18494100.0,175.433997,174.442998,42.649498,39.301505,...,96.072968,0.381194,-10.993983,-0.112764,28.943345,20.0,17.132094,2.326254e+06,0.009699,2.886790


### Generate New Features Based on autofeat Results

In [53]:
import math

In [54]:
df_test.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'SMA5', 'SMA10',
       'StochasticK', 'StochasticD', 'LarryWilliamsR', 'ROC', 'PVT', 'ADO',
       'WMA10', 'EMA12', 'EMA26', 'MACD', 'Momentum', 'RSI', 'CCI', 'DI',
       'ADX', 'Aroon', 'sqrt(StochasticK)/RSI', 'Momentum**3/RSI',
       'RSI/StochasticD', 'sqrt(LarryWilliamsR)*RSI'],
      dtype='object')

In [55]:
def sqrt_stochastic_k_rsi(stochastic_k, rsi):
    return(np.sqrt(stochastic_k)/rsi)

In [56]:
def momentum_3_rsi(momentum, rsi):
    return ((momentum**3)/rsi)

In [57]:
def rsi_stochastic_d (rsi, stochastic_d):
    return (rsi / stochastic_d)

In [58]:
def sqrt_lwr_rsi (larry_williams_r, rsi):
    return (np.sqrt(larry_williams_r)*rsi)

In [59]:
def compute_adv_indicators(complete_data):
    complete_data['SqrtStochasticK/RSI'] = sqrt_stochastic_k_rsi(complete_data['StochasticK'], complete_data['RSI'])
    complete_data['Momentum**3/RSI'] = momentum_3_rsi(complete_data['Momentum'], complete_data['RSI'])
    complete_data['RSI/StochasticD'] = rsi_stochastic_d(complete_data['RSI'], complete_data['StochasticD'])
    complete_data['SqrtLarryWilliamsR*RSI'] = sqrt_lwr_rsi(complete_data['LarryWilliamsR'], complete_data['RSI'])
    return complete_data

In [60]:
enhanced_data = compute_adv_indicators(complete_data)
enhanced_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,Momentum,RSI,CCI,DI,ADX,Aroon,SqrtStochasticK/RSI,Momentum**3/RSI,RSI/StochasticD,SqrtLarryWilliamsR*RSI
14,26.549999,27.760000,26.440001,27.100000,27.100000,38034000,26.598000,27.925000,14.183124,9.119280,...,70.886738,0.282646,-66.138608,-7.307419,81.950354,-44.0,13.324281,1.260239e+06,0.030994,2.618355
15,27.180000,28.070000,26.840000,27.010000,27.010000,28219600,26.620000,27.435000,18.463445,12.234002,...,79.371145,0.381194,-53.840862,-6.003132,77.257952,-44.0,11.272230,1.311721e+06,0.031159,3.442094
16,27.480000,27.770000,26.959999,27.400000,27.400000,15816800,26.926000,27.291000,24.447337,19.031302,...,88.387097,0.282646,-45.779188,-3.784895,73.418742,-44.0,17.493375,2.443005e+06,0.014852,2.456784
17,27.660000,28.100000,27.100000,27.270000,27.270000,17102800,27.018000,27.199000,22.756830,21.889204,...,85.218750,0.282646,-34.377153,-3.091684,71.350434,-44.0,16.877717,2.189593e+06,0.012913,2.484118
18,27.650000,28.320000,27.379999,28.290001,28.290001,16855000,27.414000,27.068000,37.281301,28.161823,...,85.649415,0.381194,4.817392,1.757364,68.371248,-44.0,16.017658,1.648264e+06,0.013536,3.018874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,326.350006,329.630005,321.109985,324.609985,324.609985,35152700,336.519995,330.964999,27.207763,67.993823,...,99.503410,0.910010,-82.414952,-1.962675,32.401953,-60.0,5.731921,1.082599e+06,0.013384,7.764058
2374,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,53.952125,...,98.717567,0.675676,-60.952286,-0.638920,32.451445,-60.0,9.653004,1.423787e+06,0.012524,5.121764
2375,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,29.270321,...,94.655912,0.675676,-130.133739,-4.163506,32.635555,-84.0,6.290052,1.255177e+06,0.023084,6.116160
2376,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,22.741298,...,94.836280,0.675676,-147.516715,-4.902510,33.349941,-88.0,4.085629,1.262366e+06,0.029711,6.494200


### autofeat Feature Selection

In [61]:
from autofeat import FeatureSelector
fsel = FeatureSelector(verbose=1)
selected_data = fsel.fit_transform(pd.DataFrame(enhanced_data), pd.DataFrame(target))

  return f(**kwargs)


[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 7 features after 5 feature selection runs
[featsel] 7 features after correlation filtering
[featsel] 4 features after noise filtering


In [62]:
selected_data

Unnamed: 0,Momentum**3/RSI,RSI/StochasticD,SqrtStochasticK/RSI,SqrtLarryWilliamsR*RSI
0,1.260239e+06,0.030994,13.324281,2.618355
1,1.311721e+06,0.031159,11.272230,3.442094
2,2.443005e+06,0.014852,17.493375,2.456784
3,2.189593e+06,0.012913,16.877717,2.484118
4,1.648264e+06,0.013536,16.017658,3.018874
...,...,...,...,...
2359,1.082599e+06,0.013384,5.731921,7.764058
2360,1.423787e+06,0.012524,9.653004,5.121764
2361,1.255177e+06,0.023084,6.290052,6.116160
2362,1.262366e+06,0.029711,4.085629,6.494200


In [63]:
from sklearn.preprocessing import StandardScaler

In [64]:
scaler = StandardScaler()  
scaler.fit(selected_data)

StandardScaler()

In [65]:

selected_scaled_data = scaler.transform(selected_data)

In [66]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_scaled_data,target,test_size=0.3)

In [67]:
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 

In [68]:
selected_scaled_data_df = pd.DataFrame(selected_scaled_data)
selected_scaled_data_1250 = selected_scaled_data_df.tail(1250)
selected_scaled_data_250 = selected_scaled_data_df.tail(250)
selected_scaled_data_125 = selected_scaled_data_df.tail(125)
target_1250 = target.tail(1250)
target_250 = target.tail(250)
target_125 = target.tail(125)

In [69]:
X_train_1250, X_test_1250, y_train_1250, y_test_1250 = train_test_split(selected_scaled_data_1250,target_1250,test_size=0.3)

In [70]:
X_train_250, X_test_250, y_train_250, y_test_250 = train_test_split(selected_scaled_data_250,target_250,test_size=0.3)

In [71]:
X_train_125, X_test_125, y_train_125, y_test_125 = train_test_split(selected_scaled_data_125,target_125,test_size=0.3)

### Logistic Regression

#### Max Duration

In [72]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [73]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
trained_model_lr = lr.fit(X_train, y_train)
predictions_lr = trained_model_lr.predict(X_test)

Train_accuracy_lr = accuracy_score(y_train,trained_model_lr.predict(X_train))
Test_accuracy_lr = accuracy_score(y_test, predictions_lr)
Confusion_matrix_lr = confusion_matrix(y_test,predictions_lr)

In [140]:
Train_accuracy_lr

0.6475211608222491

In [141]:
Test_accuracy_lr

0.6563380281690141

#### 5 Years

In [76]:
trained_model_lr_1250 = lr.fit(X_train_1250, y_train_1250)
predictions_lr_1250 = trained_model_lr_1250.predict(X_test_1250)

Train_accuracy_lr_1250 = accuracy_score(y_train_1250,trained_model_lr_1250.predict(X_train_1250))
Test_accuracy_lr_1250 = accuracy_score(y_test_1250, predictions_lr_1250)
Confusion_matrix_lr_1250 = confusion_matrix(y_test_1250,predictions_lr_1250)

In [77]:
Train_accuracy_lr_1250

0.6754285714285714

In [78]:
Test_accuracy_lr_1250

0.656

#### 1 Year

In [79]:
trained_model_lr_250 = lr.fit(X_train_250, y_train_250)
predictions_lr_250 = trained_model_lr_250.predict(X_test_250)

Train_accuracy_lr_250 = accuracy_score(y_train_250,trained_model_lr_250.predict(X_train_250))
Test_accuracy_lr_250 = accuracy_score(y_test_250, predictions_lr_250)
Confusion_matrix_lr_250 = confusion_matrix(y_test_250,predictions_lr_250)

In [80]:
Train_accuracy_lr_250

0.72

In [81]:
Test_accuracy_lr_250

0.72

#### 6 Months

In [82]:
trained_model_lr_125 = lr.fit(X_train_125, y_train_125)
predictions_lr_125 = trained_model_lr_125.predict(X_test_125)

Train_accuracy_lr_125 = accuracy_score(y_train_125,trained_model_lr_125.predict(X_train_125))
Test_accuracy_lr_125 = accuracy_score(y_test_125, predictions_lr_125)
Confusion_matrix_lr_125 = confusion_matrix(y_test_125,predictions_lr_125)

In [83]:
Train_accuracy_lr_125

0.7241379310344828

In [84]:
Test_accuracy_lr_125

0.7368421052631579

#### Last 100 Days based on Best Model - Max Duration

In [85]:
last100 = selected_scaled_data_df.tail(100)

In [146]:
predictions_lr_100 = trained_model_lr.predict(last100)
predictions_lr_100

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### SVM

In [87]:
svm = SVC(gamma='auto')
trained_model_svm = svm.fit(X_train, y_train)
predictions_svm = trained_model_svm.predict(X_test)

Train_accuracy_svm = accuracy_score(y_train,trained_model_svm.predict(X_train))
Test_accuracy_svm = accuracy_score(y_test, predictions_svm)
Confusion_matrix_svm = confusion_matrix(y_test,predictions_svm)

In [88]:
Train_accuracy_svm

0.656590084643289

In [89]:
Test_accuracy_svm

0.6295774647887324

#### 5 Years

In [90]:
trained_model_svm_1250 = svm.fit(X_train_1250, y_train_1250)
predictions_svm_1250 = trained_model_svm_1250.predict(X_test_1250)

Train_accuracy_svm_1250 = accuracy_score(y_train_1250,trained_model_svm_1250.predict(X_train_1250))
Test_accuracy_svm_1250 = accuracy_score(y_test_1250, predictions_svm_1250)
Confusion_matrix_svm_1250 = confusion_matrix(y_test_1250,predictions_svm_1250)

In [91]:
Train_accuracy_svm_1250

0.6891428571428572

In [92]:
Test_accuracy_svm_1250

0.6586666666666666

#### 1 Year

In [93]:
trained_model_svm_250 = svm.fit(X_train_250, y_train_250)
predictions_svm_250 = trained_model_svm_1250.predict(X_test_250)

Train_accuracy_svm_250 = accuracy_score(y_train_250,trained_model_svm_250.predict(X_train_250))
Test_accuracy_svm_250 = accuracy_score(y_test_250, predictions_svm_250)
Confusion_matrix_svm_250 = confusion_matrix(y_test_250,predictions_svm_250)

In [94]:
Train_accuracy_svm_250

0.7428571428571429

In [95]:
Test_accuracy_svm_250

0.6933333333333334

#### 6 Months

In [96]:
trained_model_svm_125 = svm.fit(X_train_125, y_train_125)
predictions_svm_125 = trained_model_svm_1250.predict(X_test_125)

Train_accuracy_svm_125 = accuracy_score(y_train_125,trained_model_svm_125.predict(X_train_125))
Test_accuracy_svm_125 = accuracy_score(y_test_125, predictions_svm_125)
Confusion_matrix_svm_125 = confusion_matrix(y_test_125,predictions_svm_125)

In [97]:
Train_accuracy_svm_125

0.7241379310344828

In [98]:
Test_accuracy_svm_125

0.6842105263157895

#### Last 100 Days based on Best Model - Max Duration

In [143]:
predictions_svm_100 = trained_model_svm_250.predict(last100)
predictions_svm_100

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### RandomForest

In [100]:
rf = RandomForestClassifier(n_estimators=10)
trained_model_rf = rf.fit(X_train, y_train)
predictions_rf = trained_model_rf.predict(X_test)

Train_accuracy_rf = accuracy_score(y_train, trained_model_rf.predict(X_train))
Test_accuracy_rf = accuracy_score(y_test, predictions_rf)
Confusion_matrix_rf = confusion_matrix(y_test, predictions_rf)

In [101]:
Train_accuracy_rf

0.9782345828295043

In [102]:
Test_accuracy_rf

0.6140845070422535

#### 5 Years

In [103]:
trained_model_rf_1250 = rf.fit(X_train_1250, y_train_1250)
predictions_rf_1250 = trained_model_rf_1250.predict(X_test_1250)

Train_accuracy_rf_1250 = accuracy_score(y_train_1250,trained_model_rf_1250.predict(X_train_1250))
Test_accuracy_rf_1250 = accuracy_score(y_test_1250, predictions_rf_1250)
Confusion_matrix_rf_1250 = confusion_matrix(y_test_1250,predictions_rf_1250)

In [104]:
Train_accuracy_rf_1250

0.9645714285714285

In [105]:
Test_accuracy_rf_1250

0.632

#### 1 Year

In [106]:
trained_model_rf_250 = rf.fit(X_train_250, y_train_250)
predictions_rf_250 = trained_model_rf_250.predict(X_test_250)

Train_accuracy_rf_250 = accuracy_score(y_train_250,trained_model_rf_250.predict(X_train_250))
Test_accuracy_rf_250 = accuracy_score(y_test_250, predictions_rf_250)
Confusion_matrix_rf_250 = confusion_matrix(y_test_250,predictions_rf_250)

In [107]:
Train_accuracy_rf_250

0.9828571428571429

In [108]:
Test_accuracy_rf_250

0.56

#### 6 Months

In [109]:
trained_model_rf_125 = rf.fit(X_train_125, y_train_125)
predictions_rf_125 = trained_model_rf_125.predict(X_test_125)

Train_accuracy_rf_125 = accuracy_score(y_train_125,trained_model_rf_125.predict(X_train_125))
Test_accuracy_rf_125 = accuracy_score(y_test_125, predictions_rf_125)
Confusion_matrix_rf_125 = confusion_matrix(y_test_125,predictions_rf_125)

In [110]:
Train_accuracy_rf_125

1.0

In [111]:
Test_accuracy_rf_125

0.6842105263157895

#### Last 100 Days based on Best Model

In [112]:
predictions_rf_100 = trained_model_rf_125.predict(last100)
predictions_rf_100

array([1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1])

### KNN

In [113]:
knn = KNeighborsClassifier(n_neighbors=5)
trained_model_knn = knn.fit(X_train, y_train)
predictions_knn = trained_model_knn.predict(X_test)

Train_accuracy_knn = accuracy_score(y_train, trained_model_knn.predict(X_train))
Test_accuracy_knn = accuracy_score(y_test, predictions_knn)
Confusion_matrix_knn = confusion_matrix(y_test, predictions_knn)

In [114]:
Train_accuracy_knn

0.7200725513905684

In [115]:
Test_accuracy_knn

0.604225352112676

#### 5 Years

In [116]:
trained_model_knn_1250 = knn.fit(X_train_1250, y_train_1250)
predictions_knn_1250 = trained_model_knn_1250.predict(X_test_1250)

Train_accuracy_knn_1250 = accuracy_score(y_train_1250,trained_model_knn_1250.predict(X_train_1250))
Test_accuracy_knn_1250 = accuracy_score(y_test_1250, predictions_knn_1250)
Confusion_matrix_knn_1250 = confusion_matrix(y_test_1250,predictions_knn_1250)

In [117]:
Train_accuracy_knn_1250

0.7611428571428571

In [118]:
Test_accuracy_knn_1250

0.648

#### 1 Year

In [119]:
trained_model_knn_250 = knn.fit(X_train_250, y_train_250)
predictions_knn_250 = trained_model_knn_250.predict(X_test_250)

Train_accuracy_knn_250 = accuracy_score(y_train_250,trained_model_knn_250.predict(X_train_250))
Test_accuracy_knn_250 = accuracy_score(y_test_250, predictions_knn_250)
Confusion_matrix_knn_250 = confusion_matrix(y_test_250,predictions_knn_250)

In [120]:
Train_accuracy_knn_250

0.76

In [121]:
Test_accuracy_knn_250

0.68

#### 6 Months

In [122]:
trained_model_knn_125 = knn.fit(X_train_125, y_train_125)
predictions_knn_125 = trained_model_knn_125.predict(X_test_125)

Train_accuracy_knn_125 = accuracy_score(y_train_125,trained_model_knn_125.predict(X_train_125))
Test_accuracy_knn_125 = accuracy_score(y_test_125, predictions_knn_125)
Confusion_matrix_knn_125 = confusion_matrix(y_test_125,predictions_knn_125)

In [123]:
Train_accuracy_knn_125

0.7471264367816092

In [124]:
Test_accuracy_knn_125

0.7368421052631579

#### Last 100 Days based on Best Model

In [144]:
predictions_knn_100 = trained_model_knn_125.predict(last100)
predictions_knn_100

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### XGBoost

In [126]:
xgb = XGBClassifier(use_label_encoder=False)
trained_model_xgb = xgb.fit(X_train, y_train)
predictions_xgb = trained_model_xgb.predict(X_test)

Train_accuracy_xgb = accuracy_score(y_train, trained_model_xgb.predict(X_train))
Test_accuracy_xgb = accuracy_score(y_test, predictions_xgb)
Confusion_matrix_xgb = confusion_matrix(y_test, predictions_xgb)



In [127]:
Train_accuracy_xgb

0.9564691656590084

In [128]:
Test_accuracy_xgb

0.6309859154929578

#### 5 Years

In [129]:
trained_model_xgb_1250 = xgb.fit(X_train_1250, y_train_1250)
predictions_xgb_1250 = trained_model_xgb_1250.predict(X_test_1250)

Train_accuracy_xgb_1250 = accuracy_score(y_train_1250,trained_model_xgb_1250.predict(X_train_1250))
Test_accuracy_xgb_1250 = accuracy_score(y_test_1250, predictions_xgb_1250)
Confusion_matrix_xgb_1250 = confusion_matrix(y_test_1250,predictions_xgb_1250)



In [130]:
Train_accuracy_xgb_1250

1.0

In [131]:
Test_accuracy_xgb_1250

0.656

#### 1 Year

In [132]:
trained_model_xgb_250 = xgb.fit(X_train_250, y_train_250)
predictions_xgb_250 = trained_model_xgb_250.predict(X_test_250)

Train_accuracy_xgb_250 = accuracy_score(y_train_250,trained_model_xgb_250.predict(X_train_250))
Test_accuracy_xgb_250 = accuracy_score(y_test_250, predictions_xgb_250)
Confusion_matrix_xgb_250 = confusion_matrix(y_test_250,predictions_xgb_250)



In [133]:
Train_accuracy_xgb_250

1.0

In [134]:
Test_accuracy_xgb_250

0.5466666666666666

#### 6 Months

In [135]:
trained_model_xgb_125 = xgb.fit(X_train_125, y_train_125)
predictions_xgb_125 = trained_model_xgb_125.predict(X_test_125)

Train_accuracy_xgb_125 = accuracy_score(y_train_125,trained_model_xgb_125.predict(X_train_125))
Test_accuracy_xgb_125 = accuracy_score(y_test_125, predictions_xgb_125)
Confusion_matrix_xgb_125 = confusion_matrix(y_test_125,predictions_xgb_125)



In [136]:
Train_accuracy_xgb_125

1.0

In [137]:
Test_accuracy_xgb_125

0.5

#### Last 100 Days based on Best Model

In [145]:
predictions_xgb_100 = trained_model_xgb_1250.predict(last100)
predictions_xgb_100

array([1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1])

## KFold cross validation
### Basic example

In [139]:
from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), complete_data, target, cv=10)

array([0.59493671, 0.68776371, 0.55696203, 0.66666667, 0.63983051,
       0.58898305, 0.59322034, 0.55508475, 0.58050847, 0.63135593])

In [None]:
cross_val_score(SVC(gamma='auto'), complete_data, target, cv=10)

In [None]:
cross_val_score(RandomForestClassifier(n_estimators=5), complete_data, target, cv=10)

In [None]:
cross_val_score(KNeighborsClassifier(n_neighbors=5), complete_data, target, cv=10)

In [None]:
cross_val_score(XGBClassifier(use_label_encoder=False), complete_data, target, cv=10)