In [1]:
import numpy as np
import pandas as pd
import ta
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from ta import add_all_ta_features
from ta import momentum
from ta.utils import dropna

In [2]:
#pip install xgboost

In [3]:
pip install autofeat

Note: you may need to restart the kernel to use updated packages.


In [4]:
def get_data(company_symbol):
    return pd.read_csv("dataset/"+company_symbol+".csv")

In [5]:
data = get_data("FB")
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,5/18/2012,42.049999,45.000000,38.000000,38.230000,38.230000,573576400
1,5/21/2012,36.529999,36.660000,33.000000,34.029999,34.029999,168192700
2,5/22/2012,32.610001,33.590000,30.940001,31.000000,31.000000,101786600
3,5/23/2012,31.370001,32.500000,31.360001,32.000000,32.000000,73600000
4,5/24/2012,32.950001,33.209999,31.770000,33.029999,33.029999,50237200
...,...,...,...,...,...,...,...
2374,10/25/2021,320.299988,329.559998,319.720001,328.690002,328.690002,38409000
2375,10/26/2021,328.260010,330.209991,309.600006,315.809998,315.809998,65654000
2376,10/27/2021,314.190002,319.250000,312.059998,312.220001,312.220001,29971800
2377,10/28/2021,312.989990,325.519989,308.109985,316.920013,316.920013,50806800


In [6]:
data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [7]:
data.isna().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [8]:
data[data.columns[data.isna().any()]]

0
1
2
3
4
...
2374
2375
2376
2377
2378


In [9]:
cond1 = data.Date.isna() # aapl['Date'].isna()
cond2 = data.Date.str.contains(r'^\s*$', na=False)
cond3 = data.Date == ''

data_checked = data.assign(cond1= cond1, cond2= cond2, cond3= cond3)
print (data_checked)

            Date        Open        High         Low       Close   Adj Close  \
0      5/18/2012   42.049999   45.000000   38.000000   38.230000   38.230000   
1      5/21/2012   36.529999   36.660000   33.000000   34.029999   34.029999   
2      5/22/2012   32.610001   33.590000   30.940001   31.000000   31.000000   
3      5/23/2012   31.370001   32.500000   31.360001   32.000000   32.000000   
4      5/24/2012   32.950001   33.209999   31.770000   33.029999   33.029999   
...          ...         ...         ...         ...         ...         ...   
2374  10/25/2021  320.299988  329.559998  319.720001  328.690002  328.690002   
2375  10/26/2021  328.260010  330.209991  309.600006  315.809998  315.809998   
2376  10/27/2021  314.190002  319.250000  312.059998  312.220001  312.220001   
2377  10/28/2021  312.989990  325.519989  308.109985  316.920013  316.920013   
2378  10/29/2021  320.190002  326.000000  319.600006  323.570007  323.570007   

         Volume  cond1  cond2  cond3  


In [10]:
def simple_moving_average_5(close):
    return close.rolling(5, min_periods=1).mean()

In [11]:
def simple_moving_average_10(close):
    return close.rolling(10, min_periods=1).mean()

In [12]:
def stochastic_k(high, low, close):
    return (((close-low.rolling(14).min())/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [13]:
def stochastic_d(stochastic_k):
    return stochastic_k.rolling(3,min_periods=1).mean()

In [14]:
def larry_williams_r(high, low, close):
    return (((high.rolling(14).max()-close)/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [15]:
def rate_of_change(close):
    return ((close-close.shift(12))/close.shift(12))

In [16]:
def price_volume_trend(close, volume):
    return (((close-close.shift(1))/close.shift(1))*volume)

In [17]:
def accumulation_distribution_oscillator(high, low, close):
    return((high-close.shift(1))/(high-low))

In [18]:
def weighted_moving_average_10(close):
    return ((10*close+9*close.shift(1)+8*close.shift(2)+7*close.shift(3)
             +6*close.shift(4)+5*close.shift(5)+4*close.shift(6)+3*close.shift(7)
             +2*close.shift(8)+close.shift(9))/(10+9+8+7+6+5+4+3+2+1))

In [19]:
def exponential_moving_average_12(close):
    return (close.ewm(span=12, adjust=False).mean())

In [20]:
def exponential_moving_average_26(close):
    return (close.ewm(span=26, adjust=False).mean())

In [21]:
def moving_average_convergence_divergence(ema_12, ema_26):
    return (ema_12 - ema_26)

In [22]:
def momentum(close):
    return ((close/close.shift(14))*100)

In [23]:
def change(close):
    return (close.shift(-1) - close)

In [24]:
def direction(change):
    return (np.where(change>0, 1, 0))

In [25]:
def relative_strength_index(direction):
    return (100-(100/(1+(direction.rolling(14).sum()/14)/(14-direction.rolling(14).sum())/14)))

In [26]:
def channel_commodity_index(df, ndays): 
    df['TP'] = (df['High'] + df['Low'] + df['Close']) / 3 
    df['sma'] = df['TP'].rolling(ndays).mean()
    df['mad'] = df['TP'].rolling(ndays).apply(lambda x: pd.Series(x).mad())
    df['CCI'] = (df['TP'] - df['sma']) / (0.015 * df['mad']) 
    return df['CCI']

In [27]:
def disparity_index(close):
    return (((close - close.rolling(14).mean())/(close.rolling(14).mean()))*100)

In [28]:
def get_adx(high, low, close, lookback):
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    tr1 = pd.DataFrame(high - low)
    tr2 = pd.DataFrame(abs(high - close.shift(1)))
    tr3 = pd.DataFrame(abs(low - close.shift(1)))
    frames = [tr1, tr2, tr3]
    tr = pd.concat(frames, axis = 1, join = 'inner').max(axis = 1)
    atr = tr.rolling(lookback).mean()
    
    plus_di = 100 * (plus_dm.ewm(alpha = 1/lookback).mean() / atr)
    minus_di = abs(100 * (minus_dm.ewm(alpha = 1/lookback).mean() / atr))
    dx = (abs(plus_di - minus_di) / abs(plus_di + minus_di)) * 100
    adx = ((dx.shift(1) * (lookback - 1)) + dx) / lookback
    adx_smooth = adx.ewm(alpha = 1/lookback).mean()
    return adx_smooth

In [29]:
def aroon(close):
    return (ta.trend.AroonIndicator(close, 25, True).aroon_indicator())

In [30]:
def compute_all_indicators(data):
    data['SMA5'] = simple_moving_average_5(data['Close'])
    data['SMA10'] = simple_moving_average_10(data['Close'])
    data['StochasticK'] = stochastic_k(data['High'], data['Low'], data['Close'])
    data['StochasticD'] = stochastic_d(data['StochasticK'])
    data['LarryWilliamsR'] = larry_williams_r(data['High'], data['Low'], data['Close'])
    data['ROC'] = rate_of_change(data['Close'])
    data['PVT'] = price_volume_trend(data['Close'], data['Volume'])
    data['ADO'] = accumulation_distribution_oscillator(data['High'], data['Low'], data['Close'])
    data['WMA10'] = weighted_moving_average_10(data['Close'])
    data['EMA12'] = exponential_moving_average_12(data['Close'])
    data['EMA26'] = exponential_moving_average_26(data['Close'])
    data['MACD'] = moving_average_convergence_divergence(data['EMA12'], data['EMA26'])
    data['Momentum'] = momentum(data['Close'])
    data['Change'] = change(data['Close'])
    data['Direction'] = direction(data['Change'])
    data['RSI'] = relative_strength_index(data['Direction'])
    data['CCI'] = channel_commodity_index(data, 14)
    data['DI'] = disparity_index(data['Close'])
    data['ADX'] = get_adx(data['High'], data['Low'], data['Close'], 14)
    data['Aroon'] = aroon(data['Close'])    
    return data

In [31]:
complete_data = compute_all_indicators(data)
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
0,5/18/2012,42.049999,45.000000,38.000000,38.230000,38.230000,573576400,38.230000,38.230000,,...,-4.200001,0,,40.410000,,,,,,0.0
1,5/21/2012,36.529999,36.660000,33.000000,34.029999,34.029999,168192700,36.129999,36.129999,,...,-3.029999,0,,34.563333,,,,,,-4.0
2,5/22/2012,32.610001,33.590000,30.940001,31.000000,31.000000,101786600,34.420000,34.420000,,...,1.000000,1,,31.843334,,,,,,-8.0
3,5/23/2012,31.370001,32.500000,31.360001,32.000000,32.000000,73600000,33.815000,33.815000,,...,1.029999,1,,31.953334,,,,,,-8.0
4,5/24/2012,32.950001,33.209999,31.770000,33.029999,33.029999,50237200,33.658000,33.658000,,...,-1.119999,0,,32.669999,,,,,,-8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2374,10/25/2021,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,...,-12.880004,0,0.675676,325.990000,330.581667,5.022143,-60.952286,-0.638920,32.451445,-60.0
2375,10/26/2021,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,...,-3.589997,0,0.675676,318.539998,329.672142,5.702925,-130.133739,-4.163506,32.635555,-84.0
2376,10/27/2021,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,...,4.700012,1,0.675676,314.510000,328.398095,6.276394,-147.516715,-4.902510,33.349941,-88.0
2377,10/28/2021,312.989990,325.519989,308.109985,316.920013,316.920013,50806800,319.650000,328.100000,24.560975,...,6.649994,1,0.910010,316.849996,327.407381,6.652347,-105.801103,-3.194426,33.965208,-88.0


In [32]:
complete_data = complete_data.dropna()

In [33]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
14,6/8/2012,26.549999,27.760000,26.440001,27.100000,27.100000,38034000,26.598000,27.925000,14.183124,...,-0.090000,0,0.282646,27.100000,29.409048,2.327483,-66.138608,-7.307419,81.950354,-44.0
15,6/11/2012,27.180000,28.070000,26.840000,27.010000,27.010000,28219600,26.620000,27.435000,18.463445,...,0.390000,1,0.381194,27.306667,28.890714,1.961395,-53.840862,-6.003132,77.257952,-44.0
16,6/12/2012,27.480000,27.770000,26.959999,27.400000,27.400000,15816800,26.926000,27.291000,24.447337,...,-0.130000,0,0.282646,27.376666,28.571667,1.740238,-45.779188,-3.784895,73.418742,-44.0
17,6/13/2012,27.660000,28.100000,27.100000,27.270000,27.270000,17102800,27.018000,27.199000,22.756830,...,1.020001,1,0.282646,27.490000,28.252857,1.479388,-34.377153,-3.091684,71.350434,-44.0
18,6/14/2012,27.650000,28.320000,27.379999,28.290001,28.290001,16855000,27.414000,27.068000,37.281301,...,1.719999,1,0.381194,27.996667,27.919048,1.074150,4.817392,1.757364,68.371248,-44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,10/22/2021,326.350006,329.630005,321.109985,324.609985,324.609985,35152700,336.519995,330.964999,27.207763,...,4.080017,1,0.910010,325.116658,330.970476,4.735239,-82.414952,-1.962675,32.401953,-60.0
2374,10/25/2021,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,...,-12.880004,0,0.675676,325.990000,330.581667,5.022143,-60.952286,-0.638920,32.451445,-60.0
2375,10/26/2021,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,...,-3.589997,0,0.675676,318.539998,329.672142,5.702925,-130.133739,-4.163506,32.635555,-84.0
2376,10/27/2021,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,...,4.700012,1,0.675676,314.510000,328.398095,6.276394,-147.516715,-4.902510,33.349941,-88.0


In [34]:
complete_data = complete_data.drop(['TP', 'sma', 'mad', 'Change'], axis=1)

In [35]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,EMA12,EMA26,MACD,Momentum,Direction,RSI,CCI,DI,ADX,Aroon
14,6/8/2012,26.549999,27.760000,26.440001,27.100000,27.100000,38034000,26.598000,27.925000,14.183124,...,28.871771,31.839056,-2.967285,70.886738,0,0.282646,-66.138608,-7.307419,81.950354,-44.0
15,6/11/2012,27.180000,28.070000,26.840000,27.010000,27.010000,28219600,26.620000,27.435000,18.463445,...,28.585345,31.481348,-2.896003,79.371145,1,0.381194,-53.840862,-6.003132,77.257952,-44.0
16,6/12/2012,27.480000,27.770000,26.959999,27.400000,27.400000,15816800,26.926000,27.291000,24.447337,...,28.402984,31.179026,-2.776042,88.387097,0,0.282646,-45.779188,-3.784895,73.418742,-44.0
17,6/13/2012,27.660000,28.100000,27.100000,27.270000,27.270000,17102800,27.018000,27.199000,22.756830,...,28.228679,30.889468,-2.660790,85.218750,1,0.282646,-34.377153,-3.091684,71.350434,-44.0
18,6/14/2012,27.650000,28.320000,27.379999,28.290001,28.290001,16855000,27.414000,27.068000,37.281301,...,28.238113,30.696915,-2.458802,85.649415,1,0.381194,4.817392,1.757364,68.371248,-44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,10/22/2021,326.350006,329.630005,321.109985,324.609985,324.609985,35152700,336.519995,330.964999,27.207763,...,333.872032,339.701384,-5.829352,99.503410,1,0.910010,-82.414952,-1.962675,32.401953,-60.0
2374,10/25/2021,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,...,333.074797,338.885726,-5.810929,98.717567,0,0.675676,-60.952286,-0.638920,32.451445,-60.0
2375,10/26/2021,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,...,330.418674,337.176413,-6.757739,94.655912,0,0.675676,-130.133739,-4.163506,32.635555,-84.0
2376,10/27/2021,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,...,327.618878,335.327790,-7.708912,94.836280,1,0.675676,-147.516715,-4.902510,33.349941,-88.0


In [36]:
#corrMatrix = complete_data.corr()
#print(corrMatrix)

In [37]:
#sn.heatmap(corrMatrix, annot=False)
#plt.show()

In [38]:
#corr_pairs = corrMatrix.unstack()
#corr_pairs

In [39]:
#sorted_pairs = corr_pairs.sort_values(kind="quicksort")
#sorted_pairs

In [40]:
#strong_pairs = sorted_pairs[abs(sorted_pairs) > 0.5]

#print(strong_pairs)

In [41]:
## strong_pairs[strong_pairs.index[0][0] == strong_pairs.index[0][1]]

#removed_diagonal = [(i, j) for (i, j) in strong_pairs.index if i!=j]
#len(removed_diagonal)

In [42]:
## # Create correlation matrix
## corr_matrix = complete_data.corr().abs()

## # Select upper triangle of correlation matrix
## upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool8))

## # Find index of feature columns with correlation greater than 0.95
## to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
## to_drop

In [43]:
## Find index of feature columns with correlation greater than 0.8

#correlated_features = set()
#for i in range(len(corrMatrix.columns)):
 #   for j in range(i):
  #      if abs(corrMatrix.iloc[i, j]) > 0.8:
   #         colname = corrMatrix.columns[i]
    #        correlated_features.add(colname)

In [44]:
#correlated_features

In [45]:
#complete_data = complete_data.drop(labels=correlated_features, axis=1)

In [46]:
#complete_data

In [47]:
target = complete_data['Direction']
complete_data = complete_data.drop(['Date', 'Direction'], axis=1)
complete_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,WMA10,EMA12,EMA26,MACD,Momentum,RSI,CCI,DI,ADX,Aroon
14,26.549999,27.760000,26.440001,27.100000,27.100000,38034000,26.598000,27.925000,14.183124,9.119280,...,27.198545,28.871771,31.839056,-2.967285,70.886738,0.282646,-66.138608,-7.307419,81.950354,-44.0
15,27.180000,28.070000,26.840000,27.010000,27.010000,28219600,26.620000,27.435000,18.463445,12.234002,...,27.032182,28.585345,31.481348,-2.896003,79.371145,0.381194,-53.840862,-6.003132,77.257952,-44.0
16,27.480000,27.770000,26.959999,27.400000,27.400000,15816800,26.926000,27.291000,24.447337,19.031302,...,27.025818,28.402984,31.179026,-2.776042,88.387097,0.282646,-45.779188,-3.784895,73.418742,-44.0
17,27.660000,28.100000,27.100000,27.270000,27.270000,17102800,27.018000,27.199000,22.756830,21.889204,...,27.022000,28.228679,30.889468,-2.660790,85.218750,0.282646,-34.377153,-3.091684,71.350434,-44.0
18,27.650000,28.320000,27.379999,28.290001,28.290001,16855000,27.414000,27.068000,37.281301,28.161823,...,27.220364,28.238113,30.696915,-2.458802,85.649415,0.381194,4.817392,1.757364,68.371248,-44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,326.350006,329.630005,321.109985,324.609985,324.609985,35152700,336.519995,330.964999,27.207763,67.993823,...,333.195634,333.872032,339.701384,-5.829352,99.503410,0.910010,-82.414952,-1.962675,32.401953,-60.0
2374,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,53.952125,...,332.781998,333.074797,338.885726,-5.810929,98.717567,0.675676,-60.952286,-0.638920,32.451445,-60.0
2375,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,29.270321,...,329.967634,330.418674,337.176413,-6.757739,94.655912,0.675676,-130.133739,-4.163506,32.635555,-84.0
2376,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,22.741298,...,326.645271,327.618878,335.327790,-7.708912,94.836280,0.675676,-147.516715,-4.902510,33.349941,-88.0


### autofeat Classification

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
from autofeat import AutoFeatClassifier
X_train, X_test, y_train, y_test = train_test_split(complete_data,target,test_size=0.3)
model = AutoFeatClassifier()
df = model.fit_transform(X_train, y_train)
y_pred = model.predict(X_test)

In [50]:
df_test = model.transform(X_test)
model.score(df_test,y_test)

0.6140845070422535

In [51]:
df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,ROC**2/ADX,Low/AdjClose,DI**3*ROC**2,MACD*log(RSI),ROC**3*SMA5**3,log(EMA12)/RSI,Abs(Aroon)/RSI,log(RSI)*Abs(MACD),StochasticD*Abs(ADO),sqrt(StochasticK)*Abs(ROC)
0,87.519997,89.180000,85.610001,86.059998,86.059998,62778700.0,92.205998,93.084000,3.427244,19.734825,...,0.000610,0.994771,-6.112873,0.046835,-977.458786,16.024942,325.496000,-0.046835,7.628579,0.199257
1,24.540001,24.540001,23.920000,24.100000,24.100000,50079700.0,24.858000,25.610000,4.958679,6.306167,...,0.000553,0.992531,-4.498554,1.175635,-19.743350,23.321485,489.373333,-1.175635,2.339401,0.242116
2,202.839996,204.240005,200.960007,202.360001,202.360001,14583700.0,201.124002,202.220000,73.710909,65.504459,...,0.000029,0.993082,0.000458,-4.159315,227.356621,13.902913,167.893333,-4.159315,38.344057,0.260538
3,165.649994,167.419998,164.089996,164.339996,164.339996,16389200.0,163.580002,167.141000,33.688387,39.991119,...,0.000129,0.998479,-0.025820,-0.433808,-464.958131,7.565983,82.880000,-0.433808,16.092465,0.274883
4,328.950012,335.890015,327.500000,335.339996,335.339996,21585000.0,327.388001,328.826001,64.501043,38.989827,...,0.000004,0.976621,0.000283,3.481823,-59.629652,8.591695,118.400000,-3.481823,51.723026,0.095839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1649,151.710007,153.570007,151.710007,152.869995,152.869995,19026500.0,150.847998,151.292000,75.339968,59.323078,...,0.000004,0.992412,0.000040,-0.263797,2.373741,7.427154,88.800000,-0.263797,93.450050,0.076757
1650,126.000000,134.240005,125.889999,134.179993,134.179993,39723400.0,129.965997,136.724999,46.172916,18.970818,...,0.000021,0.938217,-0.009525,1.294557,-28.772425,7.259251,41.440000,-1.294557,23.128493,0.160210
1651,229.029999,231.550003,227.410004,231.399994,231.399994,15466500.0,230.267999,229.514000,66.156029,64.207208,...,0.000001,0.982757,0.000036,-3.494088,2.935546,8.032079,41.440000,-3.494088,12.096998,0.050576
1652,206.750000,209.789993,206.270004,209.779999,209.779999,12077100.0,207.066000,206.148999,99.939868,82.636503,...,0.000282,0.983268,0.193826,-0.278189,4670.780651,5.849870,52.746667,-0.278189,106.582476,0.807031


In [52]:
df_test

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,ROC**2/ADX,Low/AdjClose,DI**3*ROC**2,MACD*log(RSI),ROC**3*SMA5**3,log(EMA12)/RSI,Abs(Aroon)/RSI,log(RSI)*Abs(MACD),StochasticD*Abs(ADO),sqrt(StochasticK)*Abs(ROC)
0,78.019997,78.300003,77.099998,77.480003,77.480003,26676800.0,77.478000,76.793000,80.993556,85.240257,...,0.000017,0.995095,0.001711,-0.541176,3.726615,6.421241,124.320000,-0.541176,26.992991,0.180088
1,40.889999,41.779999,40.799999,41.279999,41.279999,58303400.0,40.671999,39.353000,88.851349,75.506753,...,0.000268,0.988372,3.861269,-0.218788,102.519342,4.034736,92.306667,-0.218788,94.768679,1.084689
2,160.820007,161.100006,149.020004,160.059998,160.059998,126116600.0,164.375998,172.945999,29.773445,14.437888,...,0.000718,0.931026,-11.296105,5.702054,-8102.421388,18.195115,141.520000,-5.702054,2.043782,0.666727
3,28.010000,28.639999,27.830000,28.139999,28.139999,35642100.0,27.930000,27.666000,66.420652,69.098843,...,0.000036,0.988984,0.006304,0.245062,0.682609,8.730286,146.906667,-0.245062,58.008976,0.256924
4,124.010002,124.260002,123.639999,123.910004,123.910004,12400800.0,124.072002,124.561001,45.771289,40.877985,...,0.000002,0.997821,-0.000001,-1.778543,0.564686,17.053412,226.432000,-1.778543,7.252577,0.045070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,332.690002,339.920013,332.600006,339.029999,339.029999,15345300.0,333.714002,333.606000,94.586290,61.922091,...,0.000035,0.981034,0.003995,-1.887031,674.152974,8.595943,142.080000,-1.887031,64.544447,0.255540
706,159.179993,166.750000,158.059998,165.949997,165.949997,22515200.0,160.653998,155.061000,74.393996,61.967208,...,0.000141,0.952456,1.198294,6.723752,1702.030639,10.005892,102.440000,-6.723752,71.023438,0.641013
707,26.389999,26.750000,26.010000,26.320000,26.320000,30817600.0,26.024000,25.361000,84.420290,91.612559,...,0.000298,0.988222,0.454473,0.243195,5.376098,1.762256,45.741818,0.243195,58.186231,0.618498
708,69.739998,71.330002,69.610001,71.290001,71.290001,78435000.0,68.958000,67.873000,99.561393,94.477458,...,0.000574,0.976434,2.212245,-0.532401,254.488232,6.245402,65.120000,-0.532401,113.153443,0.916959


### Generate New Features Based on autofeat Results

In [53]:
import math

In [54]:
def sqrt_rsi_momentum(rsi, momentum):
    return (np.sqrt(rsi)/momentum)

In [55]:
def sqrt_stochastic_d_rsi(stochastic_d, rsi):
    return(np.sqrt(stochastic_d)/rsi)

In [56]:
def sqrt_lwr_rsi (larry_williams_r, rsi):
    return (np.sqrt(larry_williams_r)*rsi)

In [57]:
def momentum_3_rsi(momentum, rsi):
    return ((momentum**3)/rsi)

In [58]:
def ado_macd_2(ado, macd):
    return (ado*(macd**2))

In [59]:
def adx_abs_roc(adx, roc):
    return (adx*np.abs(roc))

In [60]:
def sqrt_stochastic_k_log_rsi (stochastic_k, rsi):
    return ((np.sqrt(stochastic_k))*np.log(rsi))

In [61]:
def compute_adv_indicators(complete_data):
    complete_data['SqrtRSI/Momentum'] = sqrt_rsi_momentum(complete_data['RSI'], complete_data['Momentum'])
    complete_data['SqrtStochasticD/RSI'] = sqrt_stochastic_d_rsi(complete_data['StochasticD'], complete_data['RSI'])
    complete_data['SqrtLarryWilliamsR*RSI'] = sqrt_lwr_rsi(complete_data['LarryWilliamsR'], complete_data['RSI'])
    complete_data['Momentum**3/RSI'] = momentum_3_rsi(complete_data['Momentum'], complete_data['RSI'])
    complete_data['ADO*MACD**2'] = ado_macd_2(complete_data['ADO'], complete_data['MACD'])
    complete_data['ADX*AbsROC'] = adx_abs_roc(complete_data['ADX'], complete_data['ROC'])
    complete_data['SqrtStochasticK*LogRSI'] = sqrt_stochastic_k_log_rsi(complete_data['StochasticK'], complete_data['RSI'])   
    return complete_data

In [62]:
enhanced_data = compute_adv_indicators(complete_data)
enhanced_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,DI,ADX,Aroon,SqrtRSI/Momentum,SqrtStochasticD/RSI,SqrtLarryWilliamsR*RSI,Momentum**3/RSI,ADO*MACD**2,ADX*AbsROC,SqrtStochasticK*LogRSI
14,26.549999,27.760000,26.440001,27.100000,27.100000,38034000,26.598000,27.925000,14.183124,9.119280,...,-7.307419,81.950354,-44.0,0.007500,10.684104,2.618355,1.260239e+06,9.671931,10.309883,-4.758635
15,27.180000,28.070000,26.840000,27.010000,27.010000,28219600,26.620000,27.435000,18.463445,12.234002,...,-6.003132,77.257952,-44.0,0.007779,9.175669,3.442094,1.311721e+06,6.614008,12.047412,-4.144138
16,27.480000,27.770000,26.959999,27.400000,27.400000,15816800,26.926000,27.291000,24.447337,19.031302,...,-3.784895,73.418742,-44.0,0.006015,15.434483,2.456784,2.443005e+06,7.230696,12.514304,-6.247585
17,27.660000,28.100000,27.100000,27.270000,27.270000,17102800,27.018000,27.199000,22.756830,21.889204,...,-3.091684,71.350434,-44.0,0.006239,16.552851,2.484118,2.189593e+06,4.955861,10.374993,-6.027709
18,27.650000,28.320000,27.379999,28.290001,28.290001,16855000,27.414000,27.068000,37.281301,28.161823,...,1.757364,68.371248,-44.0,0.007209,13.921430,3.018874,1.648264e+06,6.753178,1.303888,-5.888753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,326.350006,329.630005,321.109985,324.609985,324.609985,35152700,336.519995,330.964999,27.207763,67.993823,...,-1.962675,32.401953,-60.0,0.009587,9.061258,7.764058,1.082599e+06,-48.858035,0.876965,-0.491877
2374,320.299988,329.559998,319.720001,328.690002,328.690002,38409000,335.189996,331.288998,42.540399,53.952125,...,-0.638920,32.451445,-60.0,0.008327,10.870912,5.121764,1.423787e+06,16.986447,0.052242,-2.557016
2375,328.260010,330.209991,309.600006,315.809998,315.809998,65654000,330.353998,330.492999,18.062801,29.270321,...,-4.163506,32.635555,-84.0,0.008684,8.007104,6.116160,1.255177e+06,3.367950,1.408059,-1.666193
2376,314.190002,319.250000,312.059998,312.220001,312.220001,29971800,324.641998,329.260999,7.620694,22.741298,...,-4.902510,33.349941,-88.0,0.008668,7.057800,6.494200,1.262366e+06,28.432551,1.355723,-1.082256


### autofeat Feature Selection

In [63]:
from autofeat import FeatureSelector
fsel = FeatureSelector(verbose=1)
selected_data = fsel.fit_transform(pd.DataFrame(enhanced_data), pd.DataFrame(target))

  return f(**kwargs)


[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 9 features after 5 feature selection runs
[featsel] 8 features after correlation filtering
[featsel] 7 features after noise filtering


In [64]:
selected_data

Unnamed: 0,Momentum**3/RSI,SqrtStochasticD/RSI,SqrtLarryWilliamsR*RSI,SqrtStochasticK*LogRSI,ROC,ADO*MACD**2,LarryWilliamsR
0,1.260239e+06,10.684104,2.618355,-4.758635,-0.125806,9.671931,85.816876
1,1.311721e+06,9.175669,3.442094,-4.144138,-0.155937,6.614008,81.536555
2,2.443005e+06,15.434483,2.456784,-6.247585,-0.170451,7.230696,75.552663
3,2.189593e+06,16.552851,2.484118,-6.027709,-0.145409,4.955861,77.243170
4,1.648264e+06,13.921430,3.018874,-5.888753,-0.019071,6.753178,62.718699
...,...,...,...,...,...,...,...
2359,1.082599e+06,9.061258,7.764058,-0.491877,-0.027065,-48.858035,72.792237
2360,1.423787e+06,10.870912,5.121764,-2.557016,-0.001610,16.986447,57.459601
2361,1.255177e+06,8.007104,6.116160,-1.666193,-0.043145,3.367950,81.937199
2362,1.262366e+06,7.057800,6.494200,-1.082256,-0.040651,28.432551,92.379306


In [65]:
from sklearn.preprocessing import StandardScaler

In [66]:
scaler = StandardScaler()  
scaler.fit(selected_data)

StandardScaler()

In [67]:

selected_scaled_data = scaler.transform(selected_data)

In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_scaled_data,target,test_size=0.3)

In [69]:
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 

In [70]:
selected_scaled_data_df = pd.DataFrame(selected_scaled_data)
selected_scaled_data_1250 = selected_scaled_data_df.tail(1250)
selected_scaled_data_250 = selected_scaled_data_df.tail(250)
selected_scaled_data_125 = selected_scaled_data_df.tail(125)
target_1250 = target.tail(1250)
target_250 = target.tail(250)
target_125 = target.tail(125)

In [71]:
X_train_1250, X_test_1250, y_train_1250, y_test_1250 = train_test_split(selected_scaled_data_1250,target_1250,test_size=0.3)

In [72]:
X_train_250, X_test_250, y_train_250, y_test_250 = train_test_split(selected_scaled_data_250,target_250,test_size=0.3)

In [73]:
X_train_125, X_test_125, y_train_125, y_test_125 = train_test_split(selected_scaled_data_125,target_125,test_size=0.3)

### Logistic Regression

#### Max Duration

In [74]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [75]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
trained_model_lr = lr.fit(X_train, y_train)
predictions_lr = trained_model_lr.predict(X_test)

Train_accuracy_lr = accuracy_score(y_train,trained_model_lr.predict(X_train))
Test_accuracy_lr = accuracy_score(y_test, predictions_lr)
Confusion_matrix_lr = confusion_matrix(y_test,predictions_lr)

In [76]:
Train_accuracy_lr

0.660217654171705

In [77]:
Test_accuracy_lr

0.6380281690140845

#### 5 Years

In [78]:
trained_model_lr_1250 = lr.fit(X_train_1250, y_train_1250)
predictions_lr_1250 = trained_model_lr_1250.predict(X_test_1250)

Train_accuracy_lr_1250 = accuracy_score(y_train_1250,trained_model_lr_1250.predict(X_train_1250))
Test_accuracy_lr_1250 = accuracy_score(y_test_1250, predictions_lr_1250)
Confusion_matrix_lr_1250 = confusion_matrix(y_test_1250,predictions_lr_1250)

In [79]:
Train_accuracy_lr_1250

0.6617142857142857

In [80]:
Test_accuracy_lr_1250

0.656

#### 1 Year

In [81]:
trained_model_lr_250 = lr.fit(X_train_250, y_train_250)
predictions_lr_250 = trained_model_lr_250.predict(X_test_250)

Train_accuracy_lr_250 = accuracy_score(y_train_250,trained_model_lr_250.predict(X_train_250))
Test_accuracy_lr_250 = accuracy_score(y_test_250, predictions_lr_250)
Confusion_matrix_lr_250 = confusion_matrix(y_test_250,predictions_lr_250)

In [82]:
Train_accuracy_lr_250

0.6742857142857143

In [83]:
Test_accuracy_lr_250

0.7066666666666667

#### 6 Months

In [84]:
trained_model_lr_125 = lr.fit(X_train_125, y_train_125)
predictions_lr_125 = trained_model_lr_125.predict(X_test_125)

Train_accuracy_lr_125 = accuracy_score(y_train_125,trained_model_lr_125.predict(X_train_125))
Test_accuracy_lr_125 = accuracy_score(y_test_125, predictions_lr_125)
Confusion_matrix_lr_125 = confusion_matrix(y_test_125,predictions_lr_125)

In [85]:
Train_accuracy_lr_125

0.735632183908046

In [86]:
Test_accuracy_lr_125

0.7894736842105263

#### Last 100 Days based on Best Model - Max Duration

In [87]:
last100 = selected_scaled_data_df.tail(100)

In [88]:
predictions_lr_100 = trained_model_lr_1250.predict(last100)
predictions_lr_100

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])

### SVM

In [89]:
svm = SVC(gamma='auto')
trained_model_svm = svm.fit(X_train, y_train)
predictions_svm = trained_model_svm.predict(X_test)

Train_accuracy_svm = accuracy_score(y_train,trained_model_svm.predict(X_train))
Test_accuracy_svm = accuracy_score(y_test, predictions_svm)
Confusion_matrix_svm = confusion_matrix(y_test,predictions_svm)

In [90]:
Train_accuracy_svm

0.6801692865779927

In [91]:
Test_accuracy_svm

0.5887323943661972

#### 5 Years

In [92]:
trained_model_svm_1250 = svm.fit(X_train_1250, y_train_1250)
predictions_svm_1250 = trained_model_svm_1250.predict(X_test_1250)

Train_accuracy_svm_1250 = accuracy_score(y_train_1250,trained_model_svm_1250.predict(X_train_1250))
Test_accuracy_svm_1250 = accuracy_score(y_test_1250, predictions_svm_1250)
Confusion_matrix_svm_1250 = confusion_matrix(y_test_1250,predictions_svm_1250)

In [93]:
Train_accuracy_svm_1250

0.6822857142857143

In [94]:
Test_accuracy_svm_1250

0.6426666666666667

#### 1 Year

In [95]:
trained_model_svm_250 = svm.fit(X_train_250, y_train_250)
predictions_svm_250 = trained_model_svm_1250.predict(X_test_250)

Train_accuracy_svm_250 = accuracy_score(y_train_250,trained_model_svm_250.predict(X_train_250))
Test_accuracy_svm_250 = accuracy_score(y_test_250, predictions_svm_250)
Confusion_matrix_svm_250 = confusion_matrix(y_test_250,predictions_svm_250)

In [96]:
Train_accuracy_svm_250

0.7257142857142858

In [97]:
Test_accuracy_svm_250

0.76

#### 6 Months

In [98]:
trained_model_svm_125 = svm.fit(X_train_125, y_train_125)
predictions_svm_125 = trained_model_svm_1250.predict(X_test_125)

Train_accuracy_svm_125 = accuracy_score(y_train_125,trained_model_svm_125.predict(X_train_125))
Test_accuracy_svm_125 = accuracy_score(y_test_125, predictions_svm_125)
Confusion_matrix_svm_125 = confusion_matrix(y_test_125,predictions_svm_125)

In [99]:
Train_accuracy_svm_125

0.735632183908046

In [100]:
Test_accuracy_svm_125

0.7631578947368421

#### Last 100 Days based on Best Model - Max Duration

In [101]:
predictions_svm_100 = trained_model_svm.predict(last100)
predictions_svm_100

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### RandomForest

In [102]:
rf = RandomForestClassifier(n_estimators=10)
trained_model_rf = rf.fit(X_train, y_train)
predictions_rf = trained_model_rf.predict(X_test)

Train_accuracy_rf = accuracy_score(y_train, trained_model_rf.predict(X_train))
Test_accuracy_rf = accuracy_score(y_test, predictions_rf)
Confusion_matrix_rf = confusion_matrix(y_test, predictions_rf)

In [103]:
Train_accuracy_rf

0.9788391777509069

In [104]:
Test_accuracy_rf

0.5830985915492958

#### 5 Years

In [105]:
trained_model_rf_1250 = rf.fit(X_train_1250, y_train_1250)
predictions_rf_1250 = trained_model_rf_1250.predict(X_test_1250)

Train_accuracy_rf_1250 = accuracy_score(y_train_1250,trained_model_rf_1250.predict(X_train_1250))
Test_accuracy_rf_1250 = accuracy_score(y_test_1250, predictions_rf_1250)
Confusion_matrix_rf_1250 = confusion_matrix(y_test_1250,predictions_rf_1250)

In [106]:
Train_accuracy_rf_1250

0.9817142857142858

In [107]:
Test_accuracy_rf_1250

0.5893333333333334

#### 1 Year

In [108]:
trained_model_rf_250 = rf.fit(X_train_250, y_train_250)
predictions_rf_250 = trained_model_rf_250.predict(X_test_250)

Train_accuracy_rf_250 = accuracy_score(y_train_250,trained_model_rf_250.predict(X_train_250))
Test_accuracy_rf_250 = accuracy_score(y_test_250, predictions_rf_250)
Confusion_matrix_rf_250 = confusion_matrix(y_test_250,predictions_rf_250)

In [109]:
Train_accuracy_rf_250

0.9657142857142857

In [110]:
Test_accuracy_rf_250

0.6133333333333333

#### 6 Months

In [111]:
trained_model_rf_125 = rf.fit(X_train_125, y_train_125)
predictions_rf_125 = trained_model_rf_125.predict(X_test_125)

Train_accuracy_rf_125 = accuracy_score(y_train_125,trained_model_rf_125.predict(X_train_125))
Test_accuracy_rf_125 = accuracy_score(y_test_125, predictions_rf_125)
Confusion_matrix_rf_125 = confusion_matrix(y_test_125,predictions_rf_125)

In [112]:
Train_accuracy_rf_125

0.9770114942528736

In [113]:
Test_accuracy_rf_125

0.6842105263157895

#### Last 100 Days based on Best Model

In [114]:
predictions_rf_100 = trained_model_rf_125.predict(last100)
predictions_rf_100

array([0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1])

### KNN

In [115]:
knn = KNeighborsClassifier(n_neighbors=5)
trained_model_knn = knn.fit(X_train, y_train)
predictions_knn = trained_model_knn.predict(X_test)

Train_accuracy_knn = accuracy_score(y_train, trained_model_knn.predict(X_train))
Test_accuracy_knn = accuracy_score(y_test, predictions_knn)
Confusion_matrix_knn = confusion_matrix(y_test, predictions_knn)

In [116]:
Train_accuracy_knn

0.718863361547763

In [117]:
Test_accuracy_knn

0.5830985915492958

#### 5 Years

In [118]:
trained_model_knn_1250 = knn.fit(X_train_1250, y_train_1250)
predictions_knn_1250 = trained_model_knn_1250.predict(X_test_1250)

Train_accuracy_knn_1250 = accuracy_score(y_train_1250,trained_model_knn_1250.predict(X_train_1250))
Test_accuracy_knn_1250 = accuracy_score(y_test_1250, predictions_knn_1250)
Confusion_matrix_knn_1250 = confusion_matrix(y_test_1250,predictions_knn_1250)

In [119]:
Train_accuracy_knn_1250

0.7417142857142857

In [120]:
Test_accuracy_knn_1250

0.6346666666666667

#### 1 Year

In [121]:
trained_model_knn_250 = knn.fit(X_train_250, y_train_250)
predictions_knn_250 = trained_model_knn_250.predict(X_test_250)

Train_accuracy_knn_250 = accuracy_score(y_train_250,trained_model_knn_250.predict(X_train_250))
Test_accuracy_knn_250 = accuracy_score(y_test_250, predictions_knn_250)
Confusion_matrix_knn_250 = confusion_matrix(y_test_250,predictions_knn_250)

In [122]:
Train_accuracy_knn_250

0.7714285714285715

In [123]:
Test_accuracy_knn_250

0.7066666666666667

#### 6 Months

In [124]:
trained_model_knn_125 = knn.fit(X_train_125, y_train_125)
predictions_knn_125 = trained_model_knn_125.predict(X_test_125)

Train_accuracy_knn_125 = accuracy_score(y_train_125,trained_model_knn_125.predict(X_train_125))
Test_accuracy_knn_125 = accuracy_score(y_test_125, predictions_knn_125)
Confusion_matrix_knn_125 = confusion_matrix(y_test_125,predictions_knn_125)

In [125]:
Train_accuracy_knn_125

0.7931034482758621

In [126]:
Test_accuracy_knn_125

0.631578947368421

#### Last 100 Days based on Best Model

In [127]:
predictions_knn_100 = trained_model_knn.predict(last100)
predictions_knn_100

array([0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1])

### XGBoost

In [128]:
xgb = XGBClassifier(use_label_encoder=False)
trained_model_xgb = xgb.fit(X_train, y_train)
predictions_xgb = trained_model_xgb.predict(X_test)

Train_accuracy_xgb = accuracy_score(y_train, trained_model_xgb.predict(X_train))
Test_accuracy_xgb = accuracy_score(y_test, predictions_xgb)
Confusion_matrix_xgb = confusion_matrix(y_test, predictions_xgb)



In [129]:
Train_accuracy_xgb

0.9969770253929867

In [130]:
Test_accuracy_xgb

0.5985915492957746

#### 5 Years

In [131]:
trained_model_xgb_1250 = xgb.fit(X_train_1250, y_train_1250)
predictions_xgb_1250 = trained_model_xgb_1250.predict(X_test_1250)

Train_accuracy_xgb_1250 = accuracy_score(y_train_1250,trained_model_xgb_1250.predict(X_train_1250))
Test_accuracy_xgb_1250 = accuracy_score(y_test_1250, predictions_xgb_1250)
Confusion_matrix_xgb_1250 = confusion_matrix(y_test_1250,predictions_xgb_1250)



In [132]:
Train_accuracy_xgb_1250

1.0

In [133]:
Test_accuracy_xgb_1250

0.6026666666666667

#### 1 Year

In [134]:
trained_model_xgb_250 = xgb.fit(X_train_250, y_train_250)
predictions_xgb_250 = trained_model_xgb_250.predict(X_test_250)

Train_accuracy_xgb_250 = accuracy_score(y_train_250,trained_model_xgb_250.predict(X_train_250))
Test_accuracy_xgb_250 = accuracy_score(y_test_250, predictions_xgb_250)
Confusion_matrix_xgb_250 = confusion_matrix(y_test_250,predictions_xgb_250)



In [135]:
Train_accuracy_xgb_250

1.0

In [136]:
Test_accuracy_xgb_250

0.6133333333333333

#### 6 Months

In [137]:
trained_model_xgb_125 = xgb.fit(X_train_125, y_train_125)
predictions_xgb_125 = trained_model_xgb_125.predict(X_test_125)

Train_accuracy_xgb_125 = accuracy_score(y_train_125,trained_model_xgb_125.predict(X_train_125))
Test_accuracy_xgb_125 = accuracy_score(y_test_125, predictions_xgb_125)
Confusion_matrix_xgb_125 = confusion_matrix(y_test_125,predictions_xgb_125)



In [138]:
Train_accuracy_xgb_125

1.0

In [139]:
Test_accuracy_xgb_125

0.6842105263157895

#### Last 100 Days based on Best Model

In [140]:
predictions_xgb_100 = trained_model_xgb.predict(last100)
predictions_xgb_100

array([1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1])

## KFold cross validation
### Basic example

In [141]:
from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), complete_data, target, cv=10)

array([0.59493671, 0.68776371, 0.55696203, 0.66244726, 0.63983051,
       0.58898305, 0.59322034, 0.55508475, 0.58474576, 0.63135593])

In [None]:
cross_val_score(SVC(gamma='auto'), complete_data, target, cv=10)

In [None]:
cross_val_score(RandomForestClassifier(n_estimators=5), complete_data, target, cv=10)

In [None]:
cross_val_score(KNeighborsClassifier(n_neighbors=5), complete_data, target, cv=10)

In [None]:
cross_val_score(XGBClassifier(use_label_encoder=False), complete_data, target, cv=10)