In [1]:
import numpy as np
import pandas as pd
import ta
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from ta import add_all_ta_features
from ta import momentum
from ta.utils import dropna

In [2]:
#pip install xgboost

In [3]:
pip install autofeat

Note: you may need to restart the kernel to use updated packages.


In [4]:
def get_data(company_symbol):
    return pd.read_csv("dataset/"+company_symbol+".csv")

In [5]:
data = get_data("AMZN")
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,5/15/1997,2.437500,2.500000,1.927083,1.958333,1.958333,72156000
1,5/16/1997,1.968750,1.979167,1.708333,1.729167,1.729167,14700000
2,5/19/1997,1.760417,1.770833,1.625000,1.708333,1.708333,6106800
3,5/20/1997,1.729167,1.750000,1.635417,1.635417,1.635417,5467200
4,5/21/1997,1.635417,1.645833,1.375000,1.427083,1.427083,18853200
...,...,...,...,...,...,...,...
6152,10/25/2021,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000
6153,10/26/2021,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300
6154,10/27/2021,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200
6155,10/28/2021,3402.100098,3479.000000,3386.000000,3446.570068,3446.570068,5708700


In [6]:
data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [7]:
data.isna().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [8]:
data[data.columns[data.isna().any()]]

0
1
2
3
4
...
6152
6153
6154
6155
6156


In [9]:
cond1 = data.Date.isna() # aapl['Date'].isna()
cond2 = data.Date.str.contains(r'^\s*$', na=False)
cond3 = data.Date == ''

data_checked = data.assign(cond1= cond1, cond2= cond2, cond3= cond3)
print (data_checked)

            Date         Open         High          Low        Close  \
0      5/15/1997     2.437500     2.500000     1.927083     1.958333   
1      5/16/1997     1.968750     1.979167     1.708333     1.729167   
2      5/19/1997     1.760417     1.770833     1.625000     1.708333   
3      5/20/1997     1.729167     1.750000     1.635417     1.635417   
4      5/21/1997     1.635417     1.645833     1.375000     1.427083   
...          ...          ...          ...          ...          ...   
6152  10/25/2021  3335.000000  3347.800049  3297.699951  3320.370117   
6153  10/26/2021  3349.510010  3416.120117  3343.979980  3376.070068   
6154  10/27/2021  3388.000000  3437.000000  3371.449951  3392.489990   
6155  10/28/2021  3402.100098  3479.000000  3386.000000  3446.570068   
6156  10/29/2021  3300.020020  3374.820068  3273.320068  3372.429932   

        Adj Close    Volume  cond1  cond2  cond3  
0        1.958333  72156000  False  False  False  
1        1.729167  14700000  Fals

In [10]:
def simple_moving_average_5(close):
    return close.rolling(5, min_periods=1).mean()

In [11]:
def simple_moving_average_10(close):
    return close.rolling(10, min_periods=1).mean()

In [12]:
def stochastic_k(high, low, close):
    return (((close-low.rolling(14).min())/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [13]:
def stochastic_d(stochastic_k):
    return stochastic_k.rolling(3,min_periods=1).mean()

In [14]:
def larry_williams_r(high, low, close):
    return (((high.rolling(14).max()-close)/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [15]:
def rate_of_change(close):
    return ((close-close.shift(12))/close.shift(12))

In [16]:
def price_volume_trend(close, volume):
    return (((close-close.shift(1))/close.shift(1))*volume)

In [17]:
def accumulation_distribution_oscillator(high, low, close):
    return((high-close.shift(1))/(high-low))

In [18]:
def weighted_moving_average_10(close):
    return ((10*close+9*close.shift(1)+8*close.shift(2)+7*close.shift(3)
             +6*close.shift(4)+5*close.shift(5)+4*close.shift(6)+3*close.shift(7)
             +2*close.shift(8)+close.shift(9))/(10+9+8+7+6+5+4+3+2+1))

In [19]:
def exponential_moving_average_12(close):
    return (close.ewm(span=12, adjust=False).mean())

In [20]:
def exponential_moving_average_26(close):
    return (close.ewm(span=26, adjust=False).mean())

In [21]:
def moving_average_convergence_divergence(ema_12, ema_26):
    return (ema_12 - ema_26)

In [22]:
def momentum(close):
    return ((close/close.shift(14))*100)

In [23]:
def change(close):
    return (close.shift(-1) - close)

In [24]:
def direction(change):
    return (np.where(change>0, 1, 0))

In [25]:
def relative_strength_index(direction):
    return (100-(100/(1+(direction.rolling(14).sum()/14)/(14-direction.rolling(14).sum())/14)))

In [26]:
def channel_commodity_index(df, ndays): 
    df['TP'] = (df['High'] + df['Low'] + df['Close']) / 3 
    df['sma'] = df['TP'].rolling(ndays).mean()
    df['mad'] = df['TP'].rolling(ndays).apply(lambda x: pd.Series(x).mad())
    df['CCI'] = (df['TP'] - df['sma']) / (0.015 * df['mad']) 
    return df['CCI']

In [27]:
def disparity_index(close):
    return (((close - close.rolling(14).mean())/(close.rolling(14).mean()))*100)

In [28]:
def get_adx(high, low, close, lookback):
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    tr1 = pd.DataFrame(high - low)
    tr2 = pd.DataFrame(abs(high - close.shift(1)))
    tr3 = pd.DataFrame(abs(low - close.shift(1)))
    frames = [tr1, tr2, tr3]
    tr = pd.concat(frames, axis = 1, join = 'inner').max(axis = 1)
    atr = tr.rolling(lookback).mean()
    
    plus_di = 100 * (plus_dm.ewm(alpha = 1/lookback).mean() / atr)
    minus_di = abs(100 * (minus_dm.ewm(alpha = 1/lookback).mean() / atr))
    dx = (abs(plus_di - minus_di) / abs(plus_di + minus_di)) * 100
    adx = ((dx.shift(1) * (lookback - 1)) + dx) / lookback
    adx_smooth = adx.ewm(alpha = 1/lookback).mean()
    return adx_smooth

In [29]:
def aroon(close):
    return (ta.trend.AroonIndicator(close, 25, True).aroon_indicator())

In [30]:
def compute_all_indicators(data):
    data['SMA5'] = simple_moving_average_5(data['Close'])
    data['SMA10'] = simple_moving_average_10(data['Close'])
    data['StochasticK'] = stochastic_k(data['High'], data['Low'], data['Close'])
    data['StochasticD'] = stochastic_d(data['StochasticK'])
    data['LarryWilliamsR'] = larry_williams_r(data['High'], data['Low'], data['Close'])
    data['ROC'] = rate_of_change(data['Close'])
    data['PVT'] = price_volume_trend(data['Close'], data['Volume'])
    data['ADO'] = accumulation_distribution_oscillator(data['High'], data['Low'], data['Close'])
    data['WMA10'] = weighted_moving_average_10(data['Close'])
    data['EMA12'] = exponential_moving_average_12(data['Close'])
    data['EMA26'] = exponential_moving_average_26(data['Close'])
    data['MACD'] = moving_average_convergence_divergence(data['EMA12'], data['EMA26'])
    data['Momentum'] = momentum(data['Close'])
    data['Change'] = change(data['Close'])
    data['Direction'] = direction(data['Change'])
    data['RSI'] = relative_strength_index(data['Direction'])
    data['CCI'] = channel_commodity_index(data, 14)
    data['DI'] = disparity_index(data['Close'])
    data['ADX'] = get_adx(data['High'], data['Low'], data['Close'], 14)
    data['Aroon'] = aroon(data['Close'])    
    return data

In [31]:
complete_data = compute_all_indicators(data)
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
0,5/15/1997,2.437500,2.500000,1.927083,1.958333,1.958333,72156000,1.958333,1.958333,,...,-0.229166,0,,2.128472,,,,,,0.0
1,5/16/1997,1.968750,1.979167,1.708333,1.729167,1.729167,14700000,1.843750,1.843750,,...,-0.020834,0,,1.805556,,,,,,-4.0
2,5/19/1997,1.760417,1.770833,1.625000,1.708333,1.708333,6106800,1.798611,1.798611,,...,-0.072916,0,,1.701389,,,,,,-8.0
3,5/20/1997,1.729167,1.750000,1.635417,1.635417,1.635417,5467200,1.757812,1.757812,,...,-0.208334,0,,1.673611,,,,,,-12.0
4,5/21/1997,1.635417,1.645833,1.375000,1.427083,1.427083,18853200,1.691667,1.691667,,...,-0.031250,0,,1.482639,,,,,,-16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6152,10/25/2021,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,...,55.699951,1,0.675676,3321.956706,3336.762172,62.706724,-15.740435,-0.538225,22.462941,40.0
6153,10/26/2021,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,...,16.419922,1,0.675676,3378.723388,3346.552647,58.910708,36.406218,0.884063,21.835137,40.0
6154,10/27/2021,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,...,54.080078,1,0.910010,3400.313314,3353.450265,58.707812,53.216142,1.180228,20.421165,40.0
6155,10/28/2021,3402.100098,3479.000000,3386.000000,3446.570068,3446.570068,5708700,3374.210058,3402.103027,86.638952,...,-74.140136,0,0.910010,3437.190023,3363.291219,58.017979,84.914831,2.448428,19.442353,40.0


In [32]:
complete_data = complete_data.dropna()

In [33]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
14,6/5/1997,1.416667,1.541667,1.375000,1.541667,1.541667,5672400,1.489584,1.496354,34.375033,...,0.114583,1,0.282646,1.486111,1.540303,0.086788,-41.627398,0.558137,50.226281,-20.0
15,6/6/1997,1.515625,1.708333,1.510417,1.656250,1.656250,7807200,1.520834,1.522396,75.000055,...,0.031250,1,0.381194,1.625000,1.527406,0.070206,92.673942,8.400292,43.640597,-20.0
16,6/9/1997,1.656250,1.708333,1.656250,1.687500,1.687500,2352000,1.556250,1.541146,85.714286,...,-0.104167,0,0.381194,1.684028,1.526166,0.068612,153.386632,10.553252,30.868253,-20.0
17,6/10/1997,1.708333,1.708333,1.531250,1.583333,1.583333,5458800,1.577083,1.541146,68.421026,...,-0.041666,0,0.381194,1.607639,1.521453,0.062553,91.852991,3.982389,24.801838,-20.0
18,6/11/1997,1.593750,1.604167,1.531250,1.541667,1.541667,1188000,1.602083,1.542188,57.894870,...,0.062500,1,0.507614,1.559028,1.526910,0.063244,33.856459,0.704759,24.430750,-20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,10/22/2021,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800,3415.302002,3356.330029,51.820291,...,-15.179932,0,0.675676,3365.563395,3330.055501,68.455299,34.580127,0.129382,23.568887,40.0
6152,10/25/2021,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,...,55.699951,1,0.675676,3321.956706,3336.762172,62.706724,-15.740435,-0.538225,22.462941,40.0
6153,10/26/2021,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,...,16.419922,1,0.675676,3378.723388,3346.552647,58.910708,36.406218,0.884063,21.835137,40.0
6154,10/27/2021,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,...,54.080078,1,0.910010,3400.313314,3353.450265,58.707812,53.216142,1.180228,20.421165,40.0


In [34]:
complete_data = complete_data.drop(['TP', 'sma', 'mad', 'Change'], axis=1)

In [35]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,EMA12,EMA26,MACD,Momentum,Direction,RSI,CCI,DI,ADX,Aroon
14,6/5/1997,1.416667,1.541667,1.375000,1.541667,1.541667,5672400,1.489584,1.496354,34.375033,...,1.550516,1.668338,-0.117822,78.723435,1,0.282646,-41.627398,0.558137,50.226281,-20.0
15,6/6/1997,1.515625,1.708333,1.510417,1.656250,1.656250,7807200,1.520834,1.522396,75.000055,...,1.566783,1.667443,-0.100660,95.783114,1,0.381194,92.673942,8.400292,43.640597,-20.0
16,6/9/1997,1.656250,1.708333,1.656250,1.687500,1.687500,2352000,1.556250,1.541146,85.714286,...,1.585355,1.668929,-0.083574,98.780507,0,0.381194,153.386632,10.553252,30.868253,-20.0
17,6/10/1997,1.708333,1.708333,1.531250,1.583333,1.583333,5458800,1.577083,1.541146,68.421026,...,1.585044,1.662588,-0.077545,96.815247,0,0.381194,91.852991,3.982389,24.801838,-20.0
18,6/11/1997,1.593750,1.604167,1.531250,1.541667,1.541667,1188000,1.602083,1.542188,57.894870,...,1.578370,1.653631,-0.075261,108.029246,1,0.507614,33.856459,0.704759,24.430750,-20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,10/22/2021,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800,3415.302002,3356.330029,51.820291,...,3363.883172,3356.629660,7.253513,104.569908,0,0.675676,34.580127,0.129382,23.568887,40.0
6152,10/25/2021,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,...,3357.188856,3353.943768,3.245089,103.085070,1,0.675676,-15.740435,-0.538225,22.462941,40.0
6153,10/26/2021,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,...,3360.093658,3355.582753,4.510905,103.496619,1,0.675676,36.406218,0.884063,21.835137,40.0
6154,10/27/2021,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,...,3365.077709,3358.316622,6.761087,102.727085,1,0.910010,53.216142,1.180228,20.421165,40.0


In [36]:
#corrMatrix = complete_data.corr()
#print(corrMatrix)

In [37]:
#sn.heatmap(corrMatrix, annot=False)
#plt.show()

In [38]:
#corr_pairs = corrMatrix.unstack()
#corr_pairs

In [39]:
#sorted_pairs = corr_pairs.sort_values(kind="quicksort")
#sorted_pairs

In [40]:
#strong_pairs = sorted_pairs[abs(sorted_pairs) > 0.5]

#print(strong_pairs)

In [41]:
## strong_pairs[strong_pairs.index[0][0] == strong_pairs.index[0][1]]

#removed_diagonal = [(i, j) for (i, j) in strong_pairs.index if i!=j]
#len(removed_diagonal)

In [42]:
## # Create correlation matrix
## corr_matrix = complete_data.corr().abs()

## # Select upper triangle of correlation matrix
## upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool8))

## # Find index of feature columns with correlation greater than 0.95
## to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
## to_drop

In [43]:
## Find index of feature columns with correlation greater than 0.8

#correlated_features = set()
#for i in range(len(corrMatrix.columns)):
 #   for j in range(i):
  #      if abs(corrMatrix.iloc[i, j]) > 0.8:
   #         colname = corrMatrix.columns[i]
    #        correlated_features.add(colname)

In [44]:
#correlated_features

In [45]:
#complete_data = complete_data.drop(labels=correlated_features, axis=1)

In [46]:
#complete_data

In [47]:
target = complete_data['Direction']
complete_data = complete_data.drop(['Date', 'Direction'], axis=1)
complete_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,WMA10,EMA12,EMA26,MACD,Momentum,RSI,CCI,DI,ADX,Aroon
14,1.416667,1.541667,1.375000,1.541667,1.541667,5672400,1.489584,1.496354,34.375033,21.573495,...,1.497633,1.550516,1.668338,-0.117822,78.723435,0.282646,-41.627398,0.558137,50.226281,-20.0
15,1.515625,1.708333,1.510417,1.656250,1.656250,7807200,1.520834,1.522396,75.000055,39.382348,...,1.526705,1.566783,1.667443,-0.100660,95.783114,0.381194,92.673942,8.400292,43.640597,-20.0
16,1.656250,1.708333,1.656250,1.687500,1.687500,2352000,1.556250,1.541146,85.714286,65.029791,...,1.556724,1.585355,1.668929,-0.083574,98.780507,0.381194,153.386632,10.553252,30.868253,-20.0
17,1.708333,1.708333,1.531250,1.583333,1.583333,5458800,1.577083,1.541146,68.421026,76.378455,...,1.564394,1.585044,1.662588,-0.077545,96.815247,0.381194,91.852991,3.982389,24.801838,-20.0
18,1.593750,1.604167,1.531250,1.541667,1.541667,1188000,1.602083,1.542188,57.894870,70.676727,...,1.564489,1.578370,1.653631,-0.075261,108.029246,0.507614,33.856459,0.704759,24.430750,-20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800,3415.302002,3356.330029,51.820291,75.141833,...,3385.798202,3363.883172,3356.629660,7.253513,104.569908,0.675676,34.580127,0.129382,23.568887,40.0
6152,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,62.726254,...,3379.260036,3357.188856,3353.943768,3.245089,103.085070,0.675676,-15.740435,-0.538225,22.462941,40.0
6153,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,53.197158,...,3381.502406,3360.093658,3355.582753,4.510905,103.496619,0.675676,36.406218,0.884063,21.835137,40.0
6154,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,58.904560,...,3384.389489,3365.077709,3358.316622,6.761087,102.727085,0.910010,53.216142,1.180228,20.421165,40.0


### autofeat Classification

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
from autofeat import AutoFeatClassifier
X_train, X_test, y_train, y_test = train_test_split(complete_data,target,test_size=0.3)
model = AutoFeatClassifier()
df = model.fit_transform(X_train, y_train)
y_pred = model.predict(X_test)

  x = um.multiply(x, x, out=x)


In [52]:
df_test = model.transform(X_test)
model.score(df_test,y_test)

0.6451437873033098

In [53]:
df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,sqrt(LarryWilliamsR)*RSI,sqrt(RSI)/Momentum,sqrt(StochasticK)/RSI,RSI/ADX,EMA12/SMA10,1/RSI,Aroon*RSI,log(RSI)/ADX,sqrt(Low)*ROC,CCI**2/StochasticD
0,44.910000,45.290001,43.889999,44.490002,44.490002,6452700.0,45.104000,43.966000,65.517274,76.245211,...,2.980812,0.006840,15.945720,0.024205,1.002696,1.970000,16.243655,-0.032331,0.269648,57.022925
1,21.666668,23.083332,21.166668,21.270832,21.270832,27810600.0,21.700000,19.983333,63.270128,70.102682,...,2.310235,0.005999,20.866655,0.008357,1.008706,2.623333,16.772554,-0.021143,0.207197,62.360769
2,88.900002,90.419998,88.820000,89.650002,89.650002,5780800.0,88.416001,87.078000,83.613480,81.576494,...,5.098270,0.009849,7.260368,0.044912,0.992656,0.794000,120.906801,0.008226,0.792018,153.371440
3,32.500000,33.250000,30.187500,31.562500,31.562500,8871100.0,34.412500,36.850000,10.945274,16.791550,...,1.921972,0.006122,16.244068,0.011897,0.995953,4.910000,-17.922607,-0.092951,-0.869933,1908.747769
4,89.010002,89.059998,84.989998,85.089996,85.089996,7697700.0,89.720000,91.774000,0.913223,42.343579,...,6.725834,0.008268,1.414328,0.028168,0.977717,1.480000,40.540541,-0.016344,-0.419845,851.721342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4294,40.549999,41.279999,40.209999,40.860001,40.860001,9413300.0,40.400000,41.473000,37.362663,37.785816,...,5.347551,0.007759,9.046501,0.032757,0.985783,1.480000,24.324324,-0.019006,-0.268981,62.509848
4295,89.930000,90.650002,89.320000,89.760002,89.760002,9126200.0,90.044000,91.776000,18.953533,22.364372,...,3.431734,0.006425,11.420854,0.014054,0.992725,2.623333,22.871665,-0.035558,-0.275984,334.028314
4296,126.790001,127.480003,121.410004,122.550003,122.550003,5327700.0,125.602000,123.483000,50.561832,67.091169,...,1.987345,0.005346,25.157601,0.008871,1.007385,3.538000,-18.089316,-0.039658,-0.531419,0.168624
4297,341.760010,341.820007,337.679993,339.040009,339.040009,1939500.0,339.617993,336.802997,74.327040,80.770744,...,6.381422,0.010537,6.845323,0.061614,0.995993,0.794000,90.680101,0.011285,0.718643,85.020923


In [54]:
df_test

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,sqrt(LarryWilliamsR)*RSI,sqrt(RSI)/Momentum,sqrt(StochasticK)/RSI,RSI/ADX,EMA12/SMA10,1/RSI,Aroon*RSI,log(RSI)/ADX,sqrt(Low)*ROC,CCI**2/StochasticD
0,19.840000,20.139999,18.650000,18.940001,18.940001,14009200.0,18.284000,17.531000,72.912003,88.011545,...,3.516630,0.006827,12.637502,0.043158,1.008015,1.480000,35.135135,-0.025041,0.489996,365.035293
1,325.880005,326.519989,323.269989,324.570007,324.570007,2416200.0,325.820001,326.792001,20.443332,26.995044,...,4.527642,0.007189,8.907218,0.023453,0.995957,1.970000,10.152284,-0.031327,-0.570180,204.663806
2,183.059998,184.589996,179.419998,180.479996,180.479996,5334500.0,183.379999,182.481000,62.745065,78.122054,...,3.098316,0.006526,15.604721,0.016904,0.987368,1.970000,26.395939,-0.022579,0.744241,12.871082
3,1750.660034,1766.890015,1745.609985,1749.510010,1749.510010,2442800.0,1754.447974,1774.392981,20.301107,22.018406,...,2.523297,0.005326,15.941078,0.016813,0.993930,3.538000,4.522329,-0.075164,0.095264,77.579786
4,48.939999,50.200001,48.680000,50.090000,50.090000,8219900.0,49.212000,49.121000,80.640016,67.925161,...,2.972972,0.007496,13.290368,0.019237,0.991551,1.480000,21.621622,-0.011162,0.580922,69.903008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,21.680000,22.370001,21.639999,21.780001,21.780001,8207000.0,21.586001,21.818000,48.630137,60.137741,...,2.732126,0.005868,18.293897,0.027958,0.990120,2.623333,16.772554,-0.070735,0.168194,39.249142
1839,82.250000,83.750000,78.849998,81.160004,81.160004,9490600.0,80.112001,81.113001,48.687392,23.903027,...,3.636184,0.007296,13.745941,0.029234,0.999903,1.970000,10.152284,-0.039048,-0.104874,53.070748
1840,663.250000,675.960022,663.250000,673.250000,673.250000,5387300.0,660.688000,645.050000,96.637691,96.008372,...,2.309395,0.009400,7.805375,0.031883,0.990714,0.794000,120.906801,0.005839,2.735275,189.901760
1841,16.110001,16.350000,16.030001,16.110001,16.110001,5543700.0,16.218000,16.448000,70.347057,70.326511,...,2.764189,0.006554,16.523011,0.031663,0.967361,1.970000,22.335025,-0.042293,0.435370,11.121080


### Generate New Features Based on autofeat Results

In [81]:
import math

In [82]:
def sqrt_rsi_momentum(rsi, momentum):
    return (np.sqrt(rsi)/momentum)

In [83]:
def sqrt_stochastic_d_rsi(stochastic_d, rsi):
    return(np.sqrt(stochastic_d)/rsi)

In [84]:
def sqrt_lwr_rsi (larry_williams_r, rsi):
    return (np.sqrt(larry_williams_r)*rsi)

In [85]:
def momentum_3_rsi(momentum, rsi):
    return ((momentum**3)/rsi)

In [86]:
def ado_macd_2(ado, macd):
    return (ado*(macd**2))

In [87]:
def adx_abs_roc(adx, roc):
    return (adx*np.abs(roc))

In [88]:
def sqrt_stochastic_k_log_rsi (stochastic_k, rsi):
    return ((np.sqrt(stochastic_k))*np.log(rsi))

In [89]:
def compute_adv_indicators(complete_data):
    complete_data['SqrtRSI/Momentum'] = sqrt_rsi_momentum(complete_data['RSI'], complete_data['Momentum'])
    complete_data['SqrtStochasticD/RSI'] = sqrt_stochastic_d_rsi(complete_data['StochasticD'], complete_data['RSI'])
    complete_data['SqrtLarryWilliamsR*RSI'] = sqrt_lwr_rsi(complete_data['LarryWilliamsR'], complete_data['RSI'])
    complete_data['Momentum**3/RSI'] = momentum_3_rsi(complete_data['Momentum'], complete_data['RSI'])
    complete_data['ADO*MACD**2'] = ado_macd_2(complete_data['ADO'], complete_data['MACD'])
    complete_data['ADX*AbsROC'] = adx_abs_roc(complete_data['ADX'], complete_data['ROC'])
    complete_data['SqrtStochasticK*LogRSI'] = sqrt_stochastic_k_log_rsi(complete_data['StochasticK'], complete_data['RSI'])   
    return complete_data

In [90]:
enhanced_data = compute_adv_indicators(complete_data)
enhanced_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,DI,ADX,Aroon,SqrtRSI/Momentum,SqrtStochasticD/RSI,SqrtLarryWilliamsR*RSI,Momentum**3/RSI,ADO*MACD**2,ADX*AbsROC,SqrtStochasticK*LogRSI
14,1.416667,1.541667,1.375000,1.541667,1.541667,5672400,1.489584,1.496354,34.375033,21.573495,...,0.558137,50.226281,-20.0,0.006753,16.433047,2.289690,1.726116e+06,0.010412,4.900106,-7.408290
15,1.515625,1.708333,1.510417,1.656250,1.656250,7807200,1.520834,1.522396,75.000055,39.382348,...,8.400292,43.640597,-20.0,0.006446,16.462822,1.905970,2.305262e+06,0.008533,0.555922,-8.352348
16,1.656250,1.708333,1.656250,1.687500,1.687500,2352000,1.556250,1.541146,85.714286,65.029791,...,10.553252,30.868253,-20.0,0.006250,21.154836,1.440779,2.528525e+06,0.006985,5.632901,-8.929033
17,1.708333,1.708333,1.531250,1.583333,1.583333,5458800,1.577083,1.541146,68.421026,76.378455,...,3.982389,24.801838,-20.0,0.006377,22.926561,2.142129,2.380591e+06,0.000707,3.331591,-7.977606
18,1.593750,1.604167,1.531250,1.541667,1.541667,1188000,1.602083,1.542188,57.894870,70.676727,...,0.704759,24.430750,-20.0,0.006595,16.561682,3.293831,2.483649e+06,0.001618,0.678637,-5.159068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800,3415.302002,3356.330029,51.820291,75.141833,...,0.129382,23.568887,40.0,0.007861,12.829290,4.689973,1.692318e+06,-2.760375,0.531346,-2.822166
6152,3335.000000,3347.800049,3297.699951,3320.370117,3320.370117,2226000,3390.028027,3363.737036,46.075539,62.726254,...,-0.538225,22.462941,40.0,0.007974,11.721586,4.961708,1.621246e+06,2.574842,0.122028,-2.661141
6153,3349.510010,3416.120117,3343.979980,3376.070068,3376.070068,2698300,3376.412061,3376.611035,61.695644,53.197158,...,0.884063,21.835137,40.0,0.007942,10.794585,4.181791,1.640742e+06,27.007800,0.580633,-3.079356
6154,3388.000000,3437.000000,3371.449951,3392.489990,3392.489990,2702200,3371.898047,3387.432031,68.942496,58.904560,...,1.180228,20.421165,40.0,0.009286,8.433896,5.071419,1.191266e+06,42.490388,0.919622,-0.782985


### autofeat Feature Selection

In [91]:
from autofeat import FeatureSelector
fsel = FeatureSelector(verbose=1)
selected_data = fsel.fit_transform(pd.DataFrame(enhanced_data), pd.DataFrame(target))

  return f(**kwargs)


[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 23 features after 5 feature selection runs
[featsel] 13 features after correlation filtering
[featsel] 9 features after noise filtering


In [92]:
selected_data

Unnamed: 0,DI,ROC,RSI,Aroon,SqrtRSI/Momentum,SqrtLarryWilliamsR*RSI,SqrtStochasticK*LogRSI,MACD,SqrtStochasticD/RSI
0,0.558137,-0.097561,0.282646,-20.0,0.006753,2.289690,-7.408290,-0.117822,16.433047
1,8.400292,0.012739,0.381194,-20.0,0.006446,1.905970,-8.352348,-0.100660,16.462822
2,10.553252,0.182482,0.381194,-20.0,0.006250,1.440779,-8.929033,-0.083574,21.154836
3,3.982389,0.134328,0.381194,-20.0,0.006377,2.142129,-7.977606,-0.077545,22.926561
4,0.704759,0.027778,0.507614,-20.0,0.006595,3.293831,-5.159068,-0.075261,16.561682
...,...,...,...,...,...,...,...,...,...
6137,0.129382,0.022544,0.675676,40.0,0.007861,4.689973,-2.822166,7.253513,12.829290
6138,-0.538225,0.005432,0.675676,40.0,0.007974,4.961708,-2.661141,3.245089,11.721586
6139,0.884063,0.026592,0.675676,40.0,0.007942,4.181791,-3.079356,4.510905,10.794585
6140,1.180228,0.045033,0.910010,40.0,0.009286,5.071419,-0.782985,6.761087,8.433896


In [93]:
from sklearn.preprocessing import StandardScaler

In [94]:
scaler = StandardScaler()  
scaler.fit(selected_data)

StandardScaler()

In [95]:

selected_scaled_data = scaler.transform(selected_data)

In [96]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_scaled_data,target,test_size=0.3)

In [97]:
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 

In [98]:
selected_scaled_data_df = pd.DataFrame(selected_scaled_data)
selected_scaled_data_1250 = selected_scaled_data_df.tail(1250)
selected_scaled_data_250 = selected_scaled_data_df.tail(250)
selected_scaled_data_125 = selected_scaled_data_df.tail(125)
target_1250 = target.tail(1250)
target_250 = target.tail(250)
target_125 = target.tail(125)

In [99]:
X_train_1250, X_test_1250, y_train_1250, y_test_1250 = train_test_split(selected_scaled_data_1250,target_1250,test_size=0.3)

In [100]:
X_train_250, X_test_250, y_train_250, y_test_250 = train_test_split(selected_scaled_data_250,target_250,test_size=0.3)

In [101]:
X_train_125, X_test_125, y_train_125, y_test_125 = train_test_split(selected_scaled_data_125,target_125,test_size=0.3)

### Logistic Regression

#### Max Duration

In [102]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [103]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
trained_model_lr = lr.fit(X_train, y_train)
predictions_lr = trained_model_lr.predict(X_test)

Train_accuracy_lr = accuracy_score(y_train,trained_model_lr.predict(X_train))
Test_accuracy_lr = accuracy_score(y_test, predictions_lr)
Confusion_matrix_lr = confusion_matrix(y_test,predictions_lr)

In [104]:
Train_accuracy_lr

0.6408467085368691

In [105]:
Test_accuracy_lr

0.6592512208355942

#### 5 Years

In [106]:
trained_model_lr_1250 = lr.fit(X_train_1250, y_train_1250)
predictions_lr_1250 = trained_model_lr_1250.predict(X_test_1250)

Train_accuracy_lr_1250 = accuracy_score(y_train_1250,trained_model_lr_1250.predict(X_train_1250))
Test_accuracy_lr_1250 = accuracy_score(y_test_1250, predictions_lr_1250)
Confusion_matrix_lr_1250 = confusion_matrix(y_test_1250,predictions_lr_1250)

In [107]:
Train_accuracy_lr_1250

0.6514285714285715

In [108]:
Test_accuracy_lr_1250

0.6586666666666666

#### 1 Year

In [109]:
trained_model_lr_250 = lr.fit(X_train_250, y_train_250)
predictions_lr_250 = trained_model_lr_250.predict(X_test_250)

Train_accuracy_lr_250 = accuracy_score(y_train_250,trained_model_lr_250.predict(X_train_250))
Test_accuracy_lr_250 = accuracy_score(y_test_250, predictions_lr_250)
Confusion_matrix_lr_250 = confusion_matrix(y_test_250,predictions_lr_250)

In [110]:
Train_accuracy_lr_250

0.7314285714285714

In [111]:
Test_accuracy_lr_250

0.5866666666666667

#### 6 Months

In [112]:
trained_model_lr_125 = lr.fit(X_train_125, y_train_125)
predictions_lr_125 = trained_model_lr_125.predict(X_test_125)

Train_accuracy_lr_125 = accuracy_score(y_train_125,trained_model_lr_125.predict(X_train_125))
Test_accuracy_lr_125 = accuracy_score(y_test_125, predictions_lr_125)
Confusion_matrix_lr_125 = confusion_matrix(y_test_125,predictions_lr_125)

In [113]:
Train_accuracy_lr_125

0.632183908045977

In [114]:
Test_accuracy_lr_125

0.4473684210526316

#### Last 100 Days based on Best Model - Max Duration

In [115]:
last100 = selected_scaled_data_df.tail(100)

In [116]:
predictions_lr_100 = trained_model_lr_1250.predict(last100)
predictions_lr_100

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### SVM

In [117]:
svm = SVC(gamma='auto')
trained_model_svm = svm.fit(X_train, y_train)
predictions_svm = trained_model_svm.predict(X_test)

Train_accuracy_svm = accuracy_score(y_train,trained_model_svm.predict(X_train))
Test_accuracy_svm = accuracy_score(y_test, predictions_svm)
Confusion_matrix_svm = confusion_matrix(y_test,predictions_svm)

In [118]:
Train_accuracy_svm

0.8408932309839497

In [119]:
Test_accuracy_svm

0.5773195876288659

#### 5 Years

In [120]:
trained_model_svm_1250 = svm.fit(X_train_1250, y_train_1250)
predictions_svm_1250 = trained_model_svm_1250.predict(X_test_1250)

Train_accuracy_svm_1250 = accuracy_score(y_train_1250,trained_model_svm_1250.predict(X_train_1250))
Test_accuracy_svm_1250 = accuracy_score(y_test_1250, predictions_svm_1250)
Confusion_matrix_svm_1250 = confusion_matrix(y_test_1250,predictions_svm_1250)

In [121]:
Train_accuracy_svm_1250

0.664

In [122]:
Test_accuracy_svm_1250

0.6586666666666666

#### 1 Year

In [123]:
trained_model_svm_250 = svm.fit(X_train_250, y_train_250)
predictions_svm_250 = trained_model_svm_1250.predict(X_test_250)

Train_accuracy_svm_250 = accuracy_score(y_train_250,trained_model_svm_250.predict(X_train_250))
Test_accuracy_svm_250 = accuracy_score(y_test_250, predictions_svm_250)
Confusion_matrix_svm_250 = confusion_matrix(y_test_250,predictions_svm_250)

In [124]:
Train_accuracy_svm_250

0.7428571428571429

In [125]:
Test_accuracy_svm_250

0.6

#### 6 Months

In [126]:
trained_model_svm_125 = svm.fit(X_train_125, y_train_125)
predictions_svm_125 = trained_model_svm_1250.predict(X_test_125)

Train_accuracy_svm_125 = accuracy_score(y_train_125,trained_model_svm_125.predict(X_train_125))
Test_accuracy_svm_125 = accuracy_score(y_test_125, predictions_svm_125)
Confusion_matrix_svm_125 = confusion_matrix(y_test_125,predictions_svm_125)

In [127]:
Train_accuracy_svm_125

0.7011494252873564

In [128]:
Test_accuracy_svm_125

0.5

#### Last 100 Days based on Best Model - Max Duration

In [129]:
predictions_svm_100 = trained_model_svm.predict(last100)
predictions_svm_100

array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1])

### RandomForest

In [130]:
rf = RandomForestClassifier(n_estimators=10)
trained_model_rf = rf.fit(X_train, y_train)
predictions_rf = trained_model_rf.predict(X_test)

Train_accuracy_rf = accuracy_score(y_train, trained_model_rf.predict(X_train))
Test_accuracy_rf = accuracy_score(y_test, predictions_rf)
Confusion_matrix_rf = confusion_matrix(y_test, predictions_rf)

In [131]:
Train_accuracy_rf

0.981623633403117

In [132]:
Test_accuracy_rf

0.6044492674986435

#### 5 Years

In [133]:
trained_model_rf_1250 = rf.fit(X_train_1250, y_train_1250)
predictions_rf_1250 = trained_model_rf_1250.predict(X_test_1250)

Train_accuracy_rf_1250 = accuracy_score(y_train_1250,trained_model_rf_1250.predict(X_train_1250))
Test_accuracy_rf_1250 = accuracy_score(y_test_1250, predictions_rf_1250)
Confusion_matrix_rf_1250 = confusion_matrix(y_test_1250,predictions_rf_1250)

In [134]:
Train_accuracy_rf_1250

0.9725714285714285

In [135]:
Test_accuracy_rf_1250

0.6026666666666667

#### 1 Year

In [136]:
trained_model_rf_250 = rf.fit(X_train_250, y_train_250)
predictions_rf_250 = trained_model_rf_250.predict(X_test_250)

Train_accuracy_rf_250 = accuracy_score(y_train_250,trained_model_rf_250.predict(X_train_250))
Test_accuracy_rf_250 = accuracy_score(y_test_250, predictions_rf_250)
Confusion_matrix_rf_250 = confusion_matrix(y_test_250,predictions_rf_250)

In [137]:
Train_accuracy_rf_250

0.9714285714285714

In [138]:
Test_accuracy_rf_250

0.5066666666666667

#### 6 Months

In [139]:
trained_model_rf_125 = rf.fit(X_train_125, y_train_125)
predictions_rf_125 = trained_model_rf_125.predict(X_test_125)

Train_accuracy_rf_125 = accuracy_score(y_train_125,trained_model_rf_125.predict(X_train_125))
Test_accuracy_rf_125 = accuracy_score(y_test_125, predictions_rf_125)
Confusion_matrix_rf_125 = confusion_matrix(y_test_125,predictions_rf_125)

In [140]:
Train_accuracy_rf_125

0.9770114942528736

In [141]:
Test_accuracy_rf_125

0.5789473684210527

#### Last 100 Days based on Best Model

In [142]:
predictions_rf_100 = trained_model_rf_125.predict(last100)
predictions_rf_100

array([1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0])

### KNN

In [143]:
knn = KNeighborsClassifier(n_neighbors=5)
trained_model_knn = knn.fit(X_train, y_train)
predictions_knn = trained_model_knn.predict(X_test)

Train_accuracy_knn = accuracy_score(y_train, trained_model_knn.predict(X_train))
Test_accuracy_knn = accuracy_score(y_test, predictions_knn)
Confusion_matrix_knn = confusion_matrix(y_test, predictions_knn)

In [144]:
Train_accuracy_knn

0.7245871132821586

In [145]:
Test_accuracy_knn

0.5952251763429192

#### 5 Years

In [146]:
trained_model_knn_1250 = knn.fit(X_train_1250, y_train_1250)
predictions_knn_1250 = trained_model_knn_1250.predict(X_test_1250)

Train_accuracy_knn_1250 = accuracy_score(y_train_1250,trained_model_knn_1250.predict(X_train_1250))
Test_accuracy_knn_1250 = accuracy_score(y_test_1250, predictions_knn_1250)
Confusion_matrix_knn_1250 = confusion_matrix(y_test_1250,predictions_knn_1250)

In [147]:
Train_accuracy_knn_1250

0.7554285714285714

In [148]:
Test_accuracy_knn_1250

0.592

#### 1 Year

In [149]:
trained_model_knn_250 = knn.fit(X_train_250, y_train_250)
predictions_knn_250 = trained_model_knn_250.predict(X_test_250)

Train_accuracy_knn_250 = accuracy_score(y_train_250,trained_model_knn_250.predict(X_train_250))
Test_accuracy_knn_250 = accuracy_score(y_test_250, predictions_knn_250)
Confusion_matrix_knn_250 = confusion_matrix(y_test_250,predictions_knn_250)

In [150]:
Train_accuracy_knn_250

0.8

In [151]:
Test_accuracy_knn_250

0.4666666666666667

#### 6 Months

In [152]:
trained_model_knn_125 = knn.fit(X_train_125, y_train_125)
predictions_knn_125 = trained_model_knn_125.predict(X_test_125)

Train_accuracy_knn_125 = accuracy_score(y_train_125,trained_model_knn_125.predict(X_train_125))
Test_accuracy_knn_125 = accuracy_score(y_test_125, predictions_knn_125)
Confusion_matrix_knn_125 = confusion_matrix(y_test_125,predictions_knn_125)

In [153]:
Train_accuracy_knn_125

0.7126436781609196

In [154]:
Test_accuracy_knn_125

0.631578947368421

#### Last 100 Days based on Best Model

In [155]:
predictions_knn_100 = trained_model_knn.predict(last100)
predictions_knn_100

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1])

### XGBoost

In [156]:
xgb = XGBClassifier(use_label_encoder=False)
trained_model_xgb = xgb.fit(X_train, y_train)
predictions_xgb = trained_model_xgb.predict(X_test)

Train_accuracy_xgb = accuracy_score(y_train, trained_model_xgb.predict(X_train))
Test_accuracy_xgb = accuracy_score(y_test, predictions_xgb)
Confusion_matrix_xgb = confusion_matrix(y_test, predictions_xgb)



In [157]:
Train_accuracy_xgb

0.9648755524540591

In [158]:
Test_accuracy_xgb

0.6174715138361367

#### 5 Years

In [159]:
trained_model_xgb_1250 = xgb.fit(X_train_1250, y_train_1250)
predictions_xgb_1250 = trained_model_xgb_1250.predict(X_test_1250)

Train_accuracy_xgb_1250 = accuracy_score(y_train_1250,trained_model_xgb_1250.predict(X_train_1250))
Test_accuracy_xgb_1250 = accuracy_score(y_test_1250, predictions_xgb_1250)
Confusion_matrix_xgb_1250 = confusion_matrix(y_test_1250,predictions_xgb_1250)



In [160]:
Train_accuracy_xgb_1250

1.0

In [161]:
Test_accuracy_xgb_1250

0.6453333333333333

#### 1 Year

In [162]:
trained_model_xgb_250 = xgb.fit(X_train_250, y_train_250)
predictions_xgb_250 = trained_model_xgb_250.predict(X_test_250)

Train_accuracy_xgb_250 = accuracy_score(y_train_250,trained_model_xgb_250.predict(X_train_250))
Test_accuracy_xgb_250 = accuracy_score(y_test_250, predictions_xgb_250)
Confusion_matrix_xgb_250 = confusion_matrix(y_test_250,predictions_xgb_250)



In [163]:
Train_accuracy_xgb_250

1.0

In [164]:
Test_accuracy_xgb_250

0.4666666666666667

#### 6 Months

In [165]:
trained_model_xgb_125 = xgb.fit(X_train_125, y_train_125)
predictions_xgb_125 = trained_model_xgb_125.predict(X_test_125)

Train_accuracy_xgb_125 = accuracy_score(y_train_125,trained_model_xgb_125.predict(X_train_125))
Test_accuracy_xgb_125 = accuracy_score(y_test_125, predictions_xgb_125)
Confusion_matrix_xgb_125 = confusion_matrix(y_test_125,predictions_xgb_125)



In [166]:
Train_accuracy_xgb_125

1.0

In [167]:
Test_accuracy_xgb_125

0.42105263157894735

#### Last 100 Days based on Best Model

In [168]:
predictions_xgb_100 = trained_model_xgb.predict(last100)
predictions_xgb_100

array([1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0])

## KFold cross validation
### Basic example

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), complete_data, target, cv=10)

In [None]:
cross_val_score(SVC(gamma='auto'), complete_data, target, cv=10)

In [None]:
cross_val_score(RandomForestClassifier(n_estimators=5), complete_data, target, cv=10)

In [None]:
cross_val_score(KNeighborsClassifier(n_neighbors=5), complete_data, target, cv=10)

In [None]:
cross_val_score(XGBClassifier(use_label_encoder=False), complete_data, target, cv=10)