In [1]:
import numpy as np
import pandas as pd
import ta
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from ta import add_all_ta_features
from ta import momentum
from ta.utils import dropna

In [2]:
#pip install xgboost

In [3]:
pip install autofeat




In [4]:
def get_data(company_symbol):
    return pd.read_csv("dataset/"+company_symbol+".csv")

In [5]:
data = get_data("GOOGL")
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,8/19/2004,50.050049,52.082081,48.028027,50.220219,50.220219,44659096
1,8/20/2004,50.555557,54.594597,50.300301,54.209209,54.209209,22834343
2,8/23/2004,55.430431,56.796799,54.579578,54.754753,54.754753,18256126
3,8/24/2004,55.675674,55.855858,51.836838,52.487488,52.487488,15247337
4,8/25/2004,52.532532,54.054054,51.991993,53.053055,53.053055,9188602
...,...,...,...,...,...,...,...
4326,10/25/2021,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600
4327,10/26/2021,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800
4328,10/27/2021,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100
4329,10/28/2021,2942.689941,2943.159912,2892.540039,2916.979980,2916.979980,1810300


In [6]:
data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [7]:
data.isna().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [8]:
data[data.columns[data.isna().any()]]

0
1
2
3
4
...
4326
4327
4328
4329
4330


In [9]:
cond1 = data.Date.isna() # aapl['Date'].isna()
cond2 = data.Date.str.contains(r'^\s*$', na=False)
cond3 = data.Date == ''

data_checked = data.assign(cond1= cond1, cond2= cond2, cond3= cond3)
print (data_checked)

            Date         Open         High          Low        Close  \
0      8/19/2004    50.050049    52.082081    48.028027    50.220219   
1      8/20/2004    50.555557    54.594597    50.300301    54.209209   
2      8/23/2004    55.430431    56.796799    54.579578    54.754753   
3      8/24/2004    55.675674    55.855858    51.836838    52.487488   
4      8/25/2004    52.532532    54.054054    51.991993    53.053055   
...          ...          ...          ...          ...          ...   
4326  10/25/2021  2751.000000  2760.000000  2708.479980  2748.939941   
4327  10/26/2021  2785.270020  2801.659912  2766.090088  2786.169922   
4328  10/27/2021  2788.100098  2973.000000  2788.100098  2924.350098   
4329  10/28/2021  2942.689941  2943.159912  2892.540039  2916.979980   
4330  10/29/2021  2901.560059  2964.840088  2892.989990  2960.919922   

        Adj Close    Volume  cond1  cond2  cond3  
0       50.220219  44659096  False  False  False  
1       54.209209  22834343  Fals

In [10]:
def simple_moving_average_5(close):
    return close.rolling(5, min_periods=1).mean()

In [11]:
def simple_moving_average_10(close):
    return close.rolling(10, min_periods=1).mean()

In [12]:
def stochastic_k(high, low, close):
    return (((close-low.rolling(14).min())/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [13]:
def stochastic_d(stochastic_k):
    return stochastic_k.rolling(3,min_periods=1).mean()

In [14]:
def larry_williams_r(high, low, close):
    return (((high.rolling(14).max()-close)/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [15]:
def rate_of_change(close):
    return ((close-close.shift(12))/close.shift(12))

In [16]:
def price_volume_trend(close, volume):
    return (((close-close.shift(1))/close.shift(1))*volume)

In [17]:
def accumulation_distribution_oscillator(high, low, close):
    return((high-close.shift(1))/(high-low))

In [18]:
def weighted_moving_average_10(close):
    return ((10*close+9*close.shift(1)+8*close.shift(2)+7*close.shift(3)
             +6*close.shift(4)+5*close.shift(5)+4*close.shift(6)+3*close.shift(7)
             +2*close.shift(8)+close.shift(9))/(10+9+8+7+6+5+4+3+2+1))

In [19]:
def exponential_moving_average_12(close):
    return (close.ewm(span=12, adjust=False).mean())

In [20]:
def exponential_moving_average_26(close):
    return (close.ewm(span=26, adjust=False).mean())

In [21]:
def moving_average_convergence_divergence(ema_12, ema_26):
    return (ema_12 - ema_26)

In [22]:
def momentum(close):
    return ((close/close.shift(14))*100)

In [23]:
def change(close):
    return (close.shift(-1) - close)

In [24]:
def direction(change):
    return (np.where(change>0, 1, 0))

In [25]:
def relative_strength_index(direction):
    return (100-(100/(1+(direction.rolling(14).sum()/14)/(14-direction.rolling(14).sum())/14)))

In [26]:
def channel_commodity_index(df, ndays): 
    df['TP'] = (df['High'] + df['Low'] + df['Close']) / 3 
    df['sma'] = df['TP'].rolling(ndays).mean()
    df['mad'] = df['TP'].rolling(ndays).apply(lambda x: pd.Series(x).mad())
    df['CCI'] = (df['TP'] - df['sma']) / (0.015 * df['mad']) 
    return df['CCI']

In [27]:
def disparity_index(close):
    return (((close - close.rolling(14).mean())/(close.rolling(14).mean()))*100)

In [28]:
def get_adx(high, low, close, lookback):
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    tr1 = pd.DataFrame(high - low)
    tr2 = pd.DataFrame(abs(high - close.shift(1)))
    tr3 = pd.DataFrame(abs(low - close.shift(1)))
    frames = [tr1, tr2, tr3]
    tr = pd.concat(frames, axis = 1, join = 'inner').max(axis = 1)
    atr = tr.rolling(lookback).mean()
    
    plus_di = 100 * (plus_dm.ewm(alpha = 1/lookback).mean() / atr)
    minus_di = abs(100 * (minus_dm.ewm(alpha = 1/lookback).mean() / atr))
    dx = (abs(plus_di - minus_di) / abs(plus_di + minus_di)) * 100
    adx = ((dx.shift(1) * (lookback - 1)) + dx) / lookback
    adx_smooth = adx.ewm(alpha = 1/lookback).mean()
    return adx_smooth

In [29]:
def aroon(close):
    return (ta.trend.AroonIndicator(close, 25, True).aroon_indicator())

In [30]:
def compute_all_indicators(data):
    data['SMA5'] = simple_moving_average_5(data['Close'])
    data['SMA10'] = simple_moving_average_10(data['Close'])
    data['StochasticK'] = stochastic_k(data['High'], data['Low'], data['Close'])
    data['StochasticD'] = stochastic_d(data['StochasticK'])
    data['LarryWilliamsR'] = larry_williams_r(data['High'], data['Low'], data['Close'])
    data['ROC'] = rate_of_change(data['Close'])
    data['PVT'] = price_volume_trend(data['Close'], data['Volume'])
    data['ADO'] = accumulation_distribution_oscillator(data['High'], data['Low'], data['Close'])
    data['WMA10'] = weighted_moving_average_10(data['Close'])
    data['EMA12'] = exponential_moving_average_12(data['Close'])
    data['EMA26'] = exponential_moving_average_26(data['Close'])
    data['MACD'] = moving_average_convergence_divergence(data['EMA12'], data['EMA26'])
    data['Momentum'] = momentum(data['Close'])
    data['Change'] = change(data['Close'])
    data['Direction'] = direction(data['Change'])
    data['RSI'] = relative_strength_index(data['Direction'])
    data['CCI'] = channel_commodity_index(data, 14)
    data['DI'] = disparity_index(data['Close'])
    data['ADX'] = get_adx(data['High'], data['Low'], data['Close'], 14)
    data['Aroon'] = aroon(data['Close'])    
    return data

In [31]:
complete_data = compute_all_indicators(data)
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
0,8/19/2004,50.050049,52.082081,48.028027,50.220219,50.220219,44659096,50.220219,50.220219,,...,3.988990,1,,50.110109,,,,,,0.0
1,8/20/2004,50.555557,54.594597,50.300301,54.209209,54.209209,22834343,52.214714,52.214714,,...,0.545544,1,,53.034702,,,,,,4.0
2,8/23/2004,55.430431,56.796799,54.579578,54.754753,54.754753,18256126,53.061394,53.061394,,...,-2.267265,0,,55.377043,,,,,,8.0
3,8/24/2004,55.675674,55.855858,51.836838,52.487488,52.487488,15247337,52.917917,52.917917,,...,0.565567,1,,53.393395,,,,,,8.0
4,8/25/2004,52.532532,54.054054,51.991993,53.053055,53.053055,9188602,52.944945,52.944945,,...,0.955955,1,,53.033034,,,,,,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4326,10/25/2021,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,...,37.229981,1,0.910010,2739.139974,2793.822144,36.525489,-99.806410,-1.659154,15.283130,44.0
4327,10/26/2021,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,...,138.180176,1,0.910010,2784.639974,2797.551188,33.080892,-26.019480,-0.416017,15.934061,44.0
4328,10/27/2021,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,...,-7.370118,0,0.675676,2895.150065,2805.339047,39.235729,152.600736,4.150993,15.753143,68.0
4329,10/28/2021,2942.689941,2943.159912,2892.540039,2916.979980,2916.979980,1810300,2825.554004,2834.853003,78.822011,...,43.939942,1,0.910010,2917.559977,2814.053327,46.064779,149.798686,3.568992,16.191420,68.0


In [32]:
complete_data = complete_data.dropna()

In [33]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
14,9/9/2004,51.316315,51.406406,50.550552,51.206207,51.206207,4061734,50.821821,51.371371,23.177440,...,1.511509,1,0.910010,51.054388,52.055508,1.348866,-49.479574,-1.556034,13.189346,-36.0
15,9/10/2004,50.850849,53.333332,50.700703,52.717716,52.717716,8698892,51.204204,51.242242,43.947690,...,1.086086,1,0.910010,52.250584,51.999500,1.284856,13.027860,1.557848,13.278814,-36.0
16,9/13/2004,53.368366,54.259258,53.283283,53.803802,53.803802,7844148,51.953953,51.309809,67.614480,...,1.996998,1,1.259446,53.782114,51.885576,1.154658,109.500695,3.785942,14.363601,-36.0
17,9/14/2004,53.778778,56.056057,53.448448,55.800800,55.800800,10828960,52.945945,51.784284,96.094920,...,0.255257,1,1.259446,55.101768,52.007603,1.294117,159.396447,7.148936,17.732901,24.0
18,9/15/2004,55.335335,57.172173,55.155155,56.056057,56.056057,10713076,53.916916,52.266266,85.415306,...,0.985985,1,1.259446,56.127795,52.228657,1.546751,168.057138,7.197552,22.985583,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325,10/22/2021,2783.000000,2811.659912,2721.120117,2751.330078,2751.330078,2527700,2828.945996,2805.401001,36.641914,...,-2.390137,0,0.910010,2761.370036,2792.116194,38.231440,-53.614090,-1.501967,14.685884,44.0
4326,10/25/2021,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,...,37.229981,1,0.910010,2739.139974,2793.822144,36.525489,-99.806410,-1.659154,15.283130,44.0
4327,10/26/2021,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,...,138.180176,1,0.910010,2784.639974,2797.551188,33.080892,-26.019480,-0.416017,15.934061,44.0
4328,10/27/2021,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,...,-7.370118,0,0.675676,2895.150065,2805.339047,39.235729,152.600736,4.150993,15.753143,68.0


In [34]:
complete_data = complete_data.drop(['TP', 'sma', 'mad', 'Change'], axis=1)

In [35]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,EMA12,EMA26,MACD,Momentum,Direction,RSI,CCI,DI,ADX,Aroon
14,9/9/2004,51.316315,51.406406,50.550552,51.206207,51.206207,4061734,50.821821,51.371371,23.177440,...,51.241682,51.171435,0.070247,101.963329,1,0.910010,-49.479574,-1.556034,13.189346,-36.0
15,9/10/2004,50.850849,53.333332,50.700703,52.717716,52.717716,8698892,51.204204,51.242242,43.947690,...,51.468764,51.285974,0.182790,97.248635,1,0.910010,13.027860,1.557848,13.278814,-36.0
16,9/13/2004,53.368366,54.259258,53.283283,53.803802,53.803802,7844148,51.953953,51.309809,67.614480,...,51.828001,51.472480,0.355521,98.263254,1,1.259446,109.500695,3.785942,14.363601,-36.0
17,9/14/2004,53.778778,56.056057,53.448448,55.800800,55.800800,10828960,52.945945,51.784284,96.094920,...,52.439201,51.793096,0.646105,106.312575,1,1.259446,159.396447,7.148936,17.732901,24.0
18,9/15/2004,55.335335,57.172173,55.155155,56.056057,56.056057,10713076,53.916916,52.266266,85.415306,...,52.995640,52.108871,0.886769,105.660375,1,1.259446,168.057138,7.197552,22.985583,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325,10/22/2021,2783.000000,2811.659912,2721.120117,2751.330078,2751.330078,2527700,2828.945996,2805.401001,36.641914,...,2801.459443,2792.721461,8.737982,102.923105,0,0.910010,-53.614090,-1.501967,14.685884,44.0
4326,10/25/2021,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,...,2793.379519,2789.478385,3.901134,101.046881,1,0.910010,-99.806410,-1.659154,15.283130,44.0
4327,10/26/2021,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,...,2792.270350,2789.233314,3.037036,101.267396,1,0.910010,-26.019480,-0.416017,15.934061,44.0
4328,10/27/2021,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,...,2812.590312,2799.241965,13.348347,105.022449,0,0.675676,152.600736,4.150993,15.753143,68.0


In [36]:
#corrMatrix = complete_data.corr()
#print(corrMatrix)

In [37]:
#sn.heatmap(corrMatrix, annot=False)
#plt.show()

In [38]:
#corr_pairs = corrMatrix.unstack()
#corr_pairs

In [39]:
#sorted_pairs = corr_pairs.sort_values(kind="quicksort")
#sorted_pairs

In [40]:
#strong_pairs = sorted_pairs[abs(sorted_pairs) > 0.5]

#print(strong_pairs)

In [41]:
## strong_pairs[strong_pairs.index[0][0] == strong_pairs.index[0][1]]

#removed_diagonal = [(i, j) for (i, j) in strong_pairs.index if i!=j]
#len(removed_diagonal)

In [42]:
## # Create correlation matrix
## corr_matrix = complete_data.corr().abs()

## # Select upper triangle of correlation matrix
## upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool8))

## # Find index of feature columns with correlation greater than 0.95
## to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
## to_drop

In [43]:
## Find index of feature columns with correlation greater than 0.8

#correlated_features = set()
#for i in range(len(corrMatrix.columns)):
 #   for j in range(i):
  #      if abs(corrMatrix.iloc[i, j]) > 0.8:
   #         colname = corrMatrix.columns[i]
    #        correlated_features.add(colname)

In [44]:
#correlated_features

In [45]:
#complete_data = complete_data.drop(labels=correlated_features, axis=1)

In [46]:
#complete_data

In [47]:
target = complete_data['Direction']
complete_data = complete_data.drop(['Date', 'Direction'], axis=1)
complete_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,WMA10,EMA12,EMA26,MACD,Momentum,RSI,CCI,DI,ADX,Aroon
14,51.316315,51.406406,50.550552,51.206207,51.206207,4061734,50.821821,51.371371,23.177440,29.682331,...,50.983165,51.241682,51.171435,0.070247,101.963329,0.910010,-49.479574,-1.556034,13.189346,-36.0
15,50.850849,53.333332,50.700703,52.717716,52.717716,8698892,51.204204,51.242242,43.947690,34.437451,...,51.227955,51.468764,51.285974,0.182790,97.248635,0.910010,13.027860,1.557848,13.278814,-36.0
16,53.368366,54.259258,53.283283,53.803802,53.803802,7844148,51.953953,51.309809,67.614480,44.913203,...,51.693693,51.828001,51.472480,0.355521,98.263254,1.259446,109.500695,3.785942,14.363601,-36.0
17,53.778778,56.056057,53.448448,55.800800,55.800800,10828960,52.945945,51.784284,96.094920,69.219030,...,52.510237,52.439201,51.793096,0.646105,106.312575,1.259446,159.396447,7.148936,17.732901,24.0
18,55.335335,57.172173,55.155155,56.056057,56.056057,10713076,53.916916,52.266266,85.415306,83.041569,...,53.286923,52.995640,52.108871,0.886769,105.660375,1.259446,168.057138,7.197552,22.985583,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325,2783.000000,2811.659912,2721.120117,2751.330078,2751.330078,2527700,2828.945996,2805.401001,36.641914,69.181246,...,2815.316366,2801.459443,2792.721461,8.737982,102.923105,0.910010,-53.614090,-1.501967,14.685884,44.0
4326,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,51.449377,...,2805.050719,2793.379519,2789.478385,3.901134,101.046881,0.910010,-99.806410,-1.659154,15.283130,44.0
4327,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,38.527970,...,2802.087615,2792.270350,2789.233314,3.037036,101.267396,0.910010,-26.019480,-0.416017,15.934061,44.0
4328,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,53.516743,...,2823.208363,2812.590312,2799.241965,13.348347,105.022449,0.675676,152.600736,4.150993,15.753143,68.0


### autofeat Classification

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
np.any(np.isnan(complete_data))

False

In [50]:
np.all(np.isinf(complete_data))

False

In [51]:
np.any(np.isnan(target))

False

In [52]:
np.all(np.isinf(target))

False

In [53]:
count1 = np.isinf(complete_data).values.sum()
count1

0

In [54]:
count2 = np.isinf(target).values.sum()
count2

0

In [55]:
type(complete_data.columns.to_series())

pandas.core.series.Series

In [56]:
col_name = complete_data.columns.to_series()[np.isinf(complete_data).any()]
print(col_name)

Series([], dtype: object)


In [57]:
row_index = complete_data.index[np.isinf(complete_data).any(1)]
print(row_index)

Int64Index([], dtype='int64')


In [58]:
target.drop(complete_data[np.isinf(complete_data).any(1)].index, inplace = True)
complete_data.drop(complete_data[np.isinf(complete_data).any(1)].index, inplace = True)

In [59]:
complete_data = complete_data.reset_index(drop = True)
target = target.reset_index(drop = True)

In [60]:
count1 = np.isinf(complete_data).values.sum()
count1

0

In [61]:
from autofeat import AutoFeatClassifier
X_train, X_test, y_train, y_test = train_test_split(complete_data,target,test_size=0.3)
model = AutoFeatClassifier()
df = model.fit_transform(X_train, y_train)
y_pred = model.predict(X_test)

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  sqr = np.multiply(arr, arr, out=arr)


In [62]:
df_test = model.transform(X_test)
model.score(df_test,y_test)

0.5266409266409267

In [63]:
df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,sqrt(LarryWilliamsR)*RSI,Low/Close,MACD**2*PVT,sqrt(RSI)/ADX,ADO*Volume,ADO**2/MACD,MACD**3*PVT,log(RSI)/ADX,RSI**2/StochasticD,ADO**2*sqrt(LarryWilliamsR)
0,560.299988,563.140015,557.460022,557.950012,557.950012,1334200.0,559.422009,553.522998,71.375297,78.923872,...,2.715841,0.999122,-8.530530e+03,0.061944,1.073470e+06,0.269722,-2.047379e+04,-0.058950,0.003265,3.463449
1,241.276276,245.125122,237.737732,237.972977,237.972977,6944249.0,237.913913,245.042041,19.486484,24.658512,...,2.536160,0.999011,-1.807470e+06,0.013204,5.970356e+06,-0.083906,1.592306e+07,-0.031382,0.003240,6.632607
2,150.890884,153.323318,149.339340,151.626633,151.626633,3839157.0,152.005005,154.439438,59.453291,53.970696,...,3.232299,0.984915,2.771673e+04,0.036224,4.012783e+06,-1.653741,-1.831025e+04,-0.034473,0.004774,6.956608
3,759.780029,763.229980,757.229980,759.940002,759.940002,1403900.0,751.283996,755.283002,75.710832,63.406809,...,4.484898,0.996434,-3.687209e+04,0.029756,7.534193e+05,0.017056,-6.226075e+05,-0.002941,0.013060,1.419409
4,396.836823,396.946960,392.422424,395.590576,395.590576,4053343.0,392.003998,396.687186,43.261774,34.769419,...,2.871339,0.991991,5.633934e+03,0.029206,1.309284e+06,-0.045623,-1.288459e+04,-0.045623,0.004179,0.785921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3016,757.159973,770.039978,755.200012,769.669983,769.669983,1583700.0,764.016016,761.010004,77.835737,75.249006,...,2.389794,0.981200,1.192342e+06,0.064331,7.619657e+05,0.025132,1.098260e+07,-0.061222,0.003424,1.089812
3017,197.242249,198.393387,194.369370,194.964966,194.964966,5679914.0,200.247251,200.128630,28.698397,58.424947,...,4.286308,0.996945,-3.263882e+06,0.023574,-1.850940e+06,0.021582,-1.606007e+07,-0.022434,0.004410,0.896707
3018,441.661652,442.332336,437.307312,440.230225,440.230225,3120077.0,434.239240,431.870871,89.010937,89.480741,...,2.239848,0.993360,4.272153e+02,0.041041,1.311433e+06,-0.071995,-1.048348e+03,-0.019574,0.005102,0.585655
3019,1513.270020,1529.880005,1505.449951,1510.060059,1510.060059,1742600.0,1483.970020,1469.159009,82.043898,85.105931,...,3.856135,0.996947,1.038683e+06,0.028477,1.513624e+06,0.029769,2.632411e+07,-0.002815,0.009730,3.197030


In [64]:
df_test

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,sqrt(LarryWilliamsR)*RSI,Low/Close,MACD**2*PVT,sqrt(RSI)/ADX,ADO*Volume,ADO**2/MACD,MACD**3*PVT,log(RSI)/ADX,RSI**2/StochasticD,ADO**2*sqrt(LarryWilliamsR)
0,218.333328,219.164169,213.548553,216.516510,216.516510,6711082.0,219.378378,215.507008,74.293679,79.636092,...,4.613879,0.986292,-2.070605e+06,0.029067,1.357771e+06,0.006136,-1.381335e+07,-0.002873,0.010399,0.207533
1,532.599976,535.799988,527.880005,529.549988,529.549988,1324000.0,534.864002,531.697998,69.234044,74.112112,...,2.114374,0.996846,-1.523444e+04,0.028027,8.592670e+05,-0.179581,3.573102e+04,-0.043781,0.001961,2.336229
2,317.137146,321.026031,304.804810,311.311310,311.311310,23574402.0,306.930927,298.473471,76.157712,88.232808,...,3.299227,0.979100,-1.768041e+07,0.017192,1.165266e+07,0.020771,-2.079674e+08,-0.008200,0.005174,1.193006
3,1244.280029,1251.979980,1226.239990,1232.219971,1232.219971,1852600.0,1251.193994,1249.009998,21.302365,40.057665,...,3.381640,0.995147,-1.144168e+07,0.033510,-4.433600e+05,0.003308,-1.981028e+08,-0.052345,0.003627,0.508078
4,1211.290039,1220.550049,1209.040039,1219.449951,1219.449951,1051100.0,1202.334009,1195.682007,76.729193,66.433873,...,3.259448,0.991463,1.829103e+06,0.043621,8.894609e+05,0.045855,2.856400e+07,-0.020805,0.006872,3.454390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1290,305.005005,305.305298,296.621613,297.347351,297.347351,12815771.0,307.615619,308.720722,3.933869,43.387724,...,8.919312,0.997559,-6.319697e+06,0.022588,1.285239e+06,0.002185,-2.908605e+07,-0.002233,0.019086,0.098574
1291,95.985985,96.856857,95.985985,96.546547,96.546547,5350844.0,95.272272,92.936436,97.408296,96.220092,...,2.027553,0.994194,2.460943e+04,0.052883,2.337133e+06,0.075811,6.192872e+04,0.010870,0.016485,0.307125
1292,588.960022,591.989990,577.679993,579.549988,579.549988,2309500.0,593.208008,598.136005,8.421748,41.921945,...,2.704820,0.996773,-1.364914e+06,0.032636,-5.567997e+05,0.012351,-6.423268e+06,-0.077565,0.001906,0.556235
1293,152.632629,154.134140,151.521515,151.791794,151.791794,14141444.0,150.433432,152.954454,41.176482,35.272317,...,2.923627,0.998219,-1.439221e+03,0.026088,8.018917e+06,2.393535,-1.933444e+02,-0.040752,0.004120,2.466149


### Generate New Features Based on autofeat Results

In [None]:
import math

In [65]:
df_test.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'SMA5', 'SMA10',
       'StochasticK', 'StochasticD', 'LarryWilliamsR', 'ROC', 'PVT', 'ADO',
       'WMA10', 'EMA12', 'EMA26', 'MACD', 'Momentum', 'RSI', 'CCI', 'DI',
       'ADX', 'Aroon', 'sqrt(StochasticD)/RSI', 'StochasticD/RSI',
       'sqrt(Volume)*log(RSI)', 'sqrt(LarryWilliamsR)*RSI', 'Low/Close',
       'MACD**2*PVT', 'sqrt(RSI)/ADX', 'ADO*Volume', 'ADO**2/MACD',
       'MACD**3*PVT', 'log(RSI)/ADX', 'RSI**2/StochasticD',
       'ADO**2*sqrt(LarryWilliamsR)'],
      dtype='object')

In [79]:
def sqrt_stochastic_d_rsi (stochastic_d, rsi):
    return (np.sqrt(stochastic_d)/rsi)

In [67]:
def stochastic_d_rsi (stochastic_d, rsi): 
    return (stochastic_d / rsi)

In [68]:
def sqrt_volume_log_rsi (volume, rsi):
    return (np.sqrt(volume)*np.log(rsi))

In [82]:
def sqrt_lwr_rsi (larry_williams_r, rsi):
    return (np.sqrt(larry_williams_r)*rsi)

In [69]:
def low_close (low, close):
    return (low / close)

In [70]:
def macd_2_pvt (macd, pvt):
    return ((macd**2)*pvt)

In [71]:
def sqrt_rsi_adx (rsi, adx):
    return (np.sqrt(rsi)/adx)

In [72]:
def ado_volume (ado, volume):
    return (ado * volume)

In [73]:
def ado_2_macd (ado, macd):
    return ((ado**2)/macd)

In [74]:
def macd_3_pvt (macd, pvt):
    return ((macd**3)* pvt)

In [75]:
def log_rsi_adx (rsi, adx): 
    return (np.log(rsi)/adx)

In [83]:
def rsi_2_stochastic_d (rsi, stochastic_d):
    return ((rsi**2) / stochastic_d)

In [76]:
def ado_2_sqrt_lwr (ado, larrywilliamsr):
    return ((ado**2)*np.sqrt(larrywilliamsr))

In [86]:
def compute_adv_indicators(complete_data):
    complete_data['sqrt(StochasticD)/RSI'] = sqrt_stochastic_d_rsi(complete_data['StochasticD'], complete_data['RSI'])
    complete_data['StochasticD/RSI'] = stochastic_d_rsi(complete_data['StochasticD'], complete_data['RSI'])
    complete_data['sqrt(Volume)*log(RSI)'] =  sqrt_volume_log_rsi(complete_data['Volume'], complete_data['RSI'])
    complete_data['sqrt(LarryWilliamsR)*RSI'] = sqrt_lwr_rsi(complete_data['LarryWilliamsR'], complete_data['RSI'])
    complete_data['Low/Close'] = low_close(complete_data['Low'], complete_data['Close'])
    complete_data['MACD**2*PVT'] = macd_2_pvt(complete_data['MACD'], complete_data['PVT'])
    complete_data['sqrt(RSI)/ADX'] = sqrt_rsi_adx(complete_data['RSI'], complete_data['ADX'])
    complete_data['ADO*Volume'] =  ado_volume(complete_data['ADO'], complete_data['Volume'])
    complete_data['ADO**2/MACD'] = ado_2_macd(complete_data['ADO'], complete_data['MACD']) 
    complete_data['MACD**3*PVT'] = macd_3_pvt(complete_data['MACD'], complete_data['PVT'])
    complete_data['log(RSI)/ADX'] = log_rsi_adx(complete_data['RSI'], complete_data['ADX'])
    complete_data['RSI**2/StochasticD'] = rsi_2_stochastic_d(complete_data['RSI'], complete_data['StochasticD'])
    complete_data['ADO**2*sqrt(LarryWilliamsR)'] = ado_2_sqrt_lwr(complete_data['ADO'], complete_data['LarryWilliamsR'])
    return complete_data

In [87]:
enhanced_data = compute_adv_indicators(complete_data)
enhanced_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,sqrt(LarryWilliamsR)*RSI,Low/Close,MACD**2*PVT,sqrt(RSI)/ADX,ADO*Volume,ADO**2/MACD,MACD**3*PVT,log(RSI)/ADX,RSI**2/StochasticD,ADO**2*sqrt(LarryWilliamsR)
0,51.316315,51.406406,50.550552,51.206207,51.206207,4061734,50.821821,51.371371,23.177440,29.682331,...,7.976100,0.987196,1.959274e+00,0.072327,9.738624e+05,0.818356,1.376338e-01,-0.007150,0.027899,0.503868
1,50.850849,53.333332,50.700703,52.717716,52.717716,8698892,51.204204,51.242242,43.947690,34.437451,...,6.813072,0.961739,8.579413e+03,0.071840,7.028575e+06,3.571527,1.568232e+03,-0.007102,0.024047,4.887688
2,53.368366,54.259258,53.283283,53.803802,53.803802,7844148,51.953953,51.309809,67.614480,44.913203,...,7.167289,0.990326,2.042602e+04,0.078132,1.238975e+07,7.017269,7.261878e+03,0.016059,0.035317,14.197399
3,53.778778,56.056057,53.448448,55.800800,55.800800,10828960,52.945945,51.784284,96.094920,69.219030,...,2.488826,0.957844,1.677865e+05,0.063286,9.353235e+06,1.154642,1.084076e+05,0.013008,0.022916,1.474230
4,55.335335,57.172173,55.155155,56.056057,56.056057,10713076,53.916916,52.266266,85.415306,83.041569,...,4.809813,0.983929,3.853652e+04,0.048824,7.283833e+06,0.521292,3.417299e+04,0.010035,0.019101,1.765389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4311,2783.000000,2811.659912,2721.120117,2751.330078,2751.330078,2527700,2828.945996,2805.401001,36.641914,69.181246,...,7.243480,0.989020,-5.875451e+06,0.064957,-7.275476e+05,0.009481,-5.133958e+07,-0.006421,0.011970,0.659436
4312,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,51.449377,...,7.515633,0.985282,-2.274792e+04,0.062418,2.895470e+05,0.007259,-8.874268e+04,-0.006170,0.016096,0.233882
4313,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,38.527970,...,6.615559,0.992793,3.075249e+05,0.059868,3.648768e+06,0.723331,9.339644e+05,-0.005918,0.021494,15.970084
4314,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,53.516743,...,2.897677,0.953408,3.790175e+07,0.052180,4.333874e+06,0.076488,5.059257e+08,-0.024887,0.008531,4.378566


### autofeat Feature Selection

In [88]:
from autofeat import FeatureSelector
fsel = FeatureSelector(verbose=1)
selected_data = fsel.fit_transform(pd.DataFrame(enhanced_data), pd.DataFrame(target))

  return f(**kwargs)


[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 15 features after 5 feature selection runs
[featsel] 13 features after correlation filtering
[featsel] 10 features after noise filtering


In [89]:
selected_data

Unnamed: 0,Momentum,ADO*Volume,log(RSI)/ADX,sqrt(RSI)/ADX,sqrt(Volume)*log(RSI),sqrt(StochasticD)/RSI,sqrt(LarryWilliamsR)*RSI,Low/Close,MACD**2*PVT,StochasticD/RSI
0,101.963329,9.738624e+05,-0.007150,0.072327,-190.048937,5.986911,7.976100,0.987196,1.959274e+00,32.617584
1,97.248635,7.028575e+06,-0.007102,0.071840,-278.126054,6.448657,6.813072,0.961739,8.579413e+03,37.842932
2,98.263254,1.238975e+07,0.016059,0.078132,646.051929,5.321175,7.167289,0.990326,2.042602e+04,35.661084
3,106.312575,9.353235e+06,0.013008,0.063286,759.080636,6.605919,2.488826,0.957844,1.677865e+05,54.959910
4,105.660375,7.283833e+06,0.010035,0.048824,755.008135,7.235495,4.809813,0.983929,3.853652e+04,65.935006
...,...,...,...,...,...,...,...,...,...,...
4311,102.923105,-7.275476e+05,-0.006421,0.064957,-149.924452,9.140037,7.243480,0.989020,-5.875451e+06,76.022503
4312,101.046881,2.895470e+05,-0.006170,0.062418,-123.694293,7.882135,7.515633,0.985282,-2.274792e+04,56.537149
4313,101.267396,3.648768e+06,-0.005918,0.059868,-147.957195,6.820903,6.615559,0.992793,3.075249e+05,42.337958
4314,105.022449,4.333874e+06,-0.024887,0.052180,-811.924748,10.826961,2.897677,0.953408,3.790175e+07,79.204780


In [90]:
from sklearn.preprocessing import StandardScaler

In [91]:
scaler = StandardScaler()  
scaler.fit(selected_data)

StandardScaler()

In [92]:

selected_scaled_data = scaler.transform(selected_data)

In [93]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_scaled_data,target,test_size=0.3)

In [94]:
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 

In [95]:
selected_scaled_data_df = pd.DataFrame(selected_scaled_data)
selected_scaled_data_1250 = selected_scaled_data_df.tail(1250)
selected_scaled_data_250 = selected_scaled_data_df.tail(250)
selected_scaled_data_125 = selected_scaled_data_df.tail(125)
target_1250 = target.tail(1250)
target_250 = target.tail(250)
target_125 = target.tail(125)

In [96]:
X_train_1250, X_test_1250, y_train_1250, y_test_1250 = train_test_split(selected_scaled_data_1250,target_1250,test_size=0.3)

In [97]:
X_train_250, X_test_250, y_train_250, y_test_250 = train_test_split(selected_scaled_data_250,target_250,test_size=0.3)

In [98]:
X_train_125, X_test_125, y_train_125, y_test_125 = train_test_split(selected_scaled_data_125,target_125,test_size=0.3)

### Logistic Regression

#### Max Duration

In [173]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [174]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
trained_model_lr = lr.fit(X_train, y_train)
predictions_lr = trained_model_lr.predict(X_test)

Train_accuracy_lr = accuracy_score(y_train,trained_model_lr.predict(X_train))
Test_accuracy_lr = accuracy_score(y_test, predictions_lr)
Confusion_matrix_lr = confusion_matrix(y_test,predictions_lr)

In [175]:
Train_accuracy_lr

0.6401853690830851

In [176]:
Test_accuracy_lr

0.6154440154440154

#### 5 Years

In [177]:
trained_model_lr_1250 = lr.fit(X_train_1250, y_train_1250)
predictions_lr_1250 = trained_model_lr_1250.predict(X_test_1250)

Train_accuracy_lr_1250 = accuracy_score(y_train_1250,trained_model_lr_1250.predict(X_train_1250))
Test_accuracy_lr_1250 = accuracy_score(y_test_1250, predictions_lr_1250)
Confusion_matrix_lr_1250 = confusion_matrix(y_test_1250,predictions_lr_1250)

In [178]:
Train_accuracy_lr_1250

0.6651428571428571

In [179]:
Test_accuracy_lr_1250

0.6293333333333333

#### 1 Year

In [180]:
trained_model_lr_250 = lr.fit(X_train_250, y_train_250)
predictions_lr_250 = trained_model_lr_250.predict(X_test_250)

Train_accuracy_lr_250 = accuracy_score(y_train_250,trained_model_lr_250.predict(X_train_250))
Test_accuracy_lr_250 = accuracy_score(y_test_250, predictions_lr_250)
Confusion_matrix_lr_250 = confusion_matrix(y_test_250,predictions_lr_250)

In [181]:
Train_accuracy_lr_250

0.64

In [182]:
Test_accuracy_lr_250

0.72

#### 6 Months

In [183]:
trained_model_lr_125 = lr.fit(X_train_125, y_train_125)
predictions_lr_125 = trained_model_lr_125.predict(X_test_125)

Train_accuracy_lr_125 = accuracy_score(y_train_125,trained_model_lr_125.predict(X_train_125))
Test_accuracy_lr_125 = accuracy_score(y_test_125, predictions_lr_125)
Confusion_matrix_lr_125 = confusion_matrix(y_test_125,predictions_lr_125)

In [184]:
Train_accuracy_lr_125

0.6781609195402298

In [185]:
Test_accuracy_lr_125

0.6052631578947368

#### Last 100 Days based on Best Model - Max Duration

In [186]:
last100 = selected_scaled_data_df.tail(100)

In [240]:
predictions_lr_100 = trained_model_lr_250.predict(last100)
predictions_lr_100

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])

### SVM

In [188]:
svm = SVC(gamma='auto')
trained_model_svm = svm.fit(X_train, y_train)
predictions_svm = trained_model_svm.predict(X_test)

Train_accuracy_svm = accuracy_score(y_train,trained_model_svm.predict(X_train))
Test_accuracy_svm = accuracy_score(y_test, predictions_svm)
Confusion_matrix_svm = confusion_matrix(y_test,predictions_svm)

In [189]:
Train_accuracy_svm

0.982787156570672

In [190]:
Test_accuracy_svm

0.518918918918919

#### 5 Years

In [191]:
trained_model_svm_1250 = svm.fit(X_train_1250, y_train_1250)
predictions_svm_1250 = trained_model_svm_1250.predict(X_test_1250)

Train_accuracy_svm_1250 = accuracy_score(y_train_1250,trained_model_svm_1250.predict(X_train_1250))
Test_accuracy_svm_1250 = accuracy_score(y_test_1250, predictions_svm_1250)
Confusion_matrix_svm_1250 = confusion_matrix(y_test_1250,predictions_svm_1250)

In [192]:
Train_accuracy_svm_1250

0.68

In [193]:
Test_accuracy_svm_1250

0.624

#### 1 Year

In [194]:
trained_model_svm_250 = svm.fit(X_train_250, y_train_250)
predictions_svm_250 = trained_model_svm_1250.predict(X_test_250)

Train_accuracy_svm_250 = accuracy_score(y_train_250,trained_model_svm_250.predict(X_train_250))
Test_accuracy_svm_250 = accuracy_score(y_test_250, predictions_svm_250)
Confusion_matrix_svm_250 = confusion_matrix(y_test_250,predictions_svm_250)

In [195]:
Train_accuracy_svm_250

0.7028571428571428

In [196]:
Test_accuracy_svm_250

0.7066666666666667

#### 6 Months

In [197]:
trained_model_svm_125 = svm.fit(X_train_125, y_train_125)
predictions_svm_125 = trained_model_svm_1250.predict(X_test_125)

Train_accuracy_svm_125 = accuracy_score(y_train_125,trained_model_svm_125.predict(X_train_125))
Test_accuracy_svm_125 = accuracy_score(y_test_125, predictions_svm_125)
Confusion_matrix_svm_125 = confusion_matrix(y_test_125,predictions_svm_125)

In [198]:
Train_accuracy_svm_125

0.632183908045977

In [199]:
Test_accuracy_svm_125

0.5526315789473685

#### Last 100 Days based on Best Model - Max Duration

In [241]:
predictions_svm_100 = trained_model_svm_250.predict(last100)
predictions_svm_100

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### RandomForest

In [201]:
rf = RandomForestClassifier(n_estimators=10)
trained_model_rf = rf.fit(X_train, y_train)
predictions_rf = trained_model_rf.predict(X_test)

Train_accuracy_rf = accuracy_score(y_train, trained_model_rf.predict(X_train))
Test_accuracy_rf = accuracy_score(y_test, predictions_rf)
Confusion_matrix_rf = confusion_matrix(y_test, predictions_rf)

In [202]:
Train_accuracy_rf

0.9817941079112876

In [203]:
Test_accuracy_rf

0.5783783783783784

#### 5 Years

In [204]:
trained_model_rf_1250 = rf.fit(X_train_1250, y_train_1250)
predictions_rf_1250 = trained_model_rf_1250.predict(X_test_1250)

Train_accuracy_rf_1250 = accuracy_score(y_train_1250,trained_model_rf_1250.predict(X_train_1250))
Test_accuracy_rf_1250 = accuracy_score(y_test_1250, predictions_rf_1250)
Confusion_matrix_rf_1250 = confusion_matrix(y_test_1250,predictions_rf_1250)

In [205]:
Train_accuracy_rf_1250

0.9805714285714285

In [206]:
Test_accuracy_rf_1250

0.52

#### 1 Year

In [207]:
trained_model_rf_250 = rf.fit(X_train_250, y_train_250)
predictions_rf_250 = trained_model_rf_250.predict(X_test_250)

Train_accuracy_rf_250 = accuracy_score(y_train_250,trained_model_rf_250.predict(X_train_250))
Test_accuracy_rf_250 = accuracy_score(y_test_250, predictions_rf_250)
Confusion_matrix_rf_250 = confusion_matrix(y_test_250,predictions_rf_250)

In [208]:
Train_accuracy_rf_250

0.9942857142857143

In [209]:
Test_accuracy_rf_250

0.6533333333333333

#### 6 Months

In [210]:
trained_model_rf_125 = rf.fit(X_train_125, y_train_125)
predictions_rf_125 = trained_model_rf_125.predict(X_test_125)

Train_accuracy_rf_125 = accuracy_score(y_train_125,trained_model_rf_125.predict(X_train_125))
Test_accuracy_rf_125 = accuracy_score(y_test_125, predictions_rf_125)
Confusion_matrix_rf_125 = confusion_matrix(y_test_125,predictions_rf_125)

In [211]:
Train_accuracy_rf_125

0.9885057471264368

In [212]:
Test_accuracy_rf_125

0.6578947368421053

#### Last 100 Days based on Best Model

In [213]:
predictions_rf_100 = trained_model_rf_125.predict(last100)
predictions_rf_100

array([1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1])

### KNN

In [214]:
knn = KNeighborsClassifier(n_neighbors=5)
trained_model_knn = knn.fit(X_train, y_train)
predictions_knn = trained_model_knn.predict(X_test)

Train_accuracy_knn = accuracy_score(y_train, trained_model_knn.predict(X_train))
Test_accuracy_knn = accuracy_score(y_test, predictions_knn)
Confusion_matrix_knn = confusion_matrix(y_test, predictions_knn)

In [215]:
Train_accuracy_knn

0.7361800728235683

In [216]:
Test_accuracy_knn

0.5459459459459459

#### 5 Years

In [217]:
trained_model_knn_1250 = knn.fit(X_train_1250, y_train_1250)
predictions_knn_1250 = trained_model_knn_1250.predict(X_test_1250)

Train_accuracy_knn_1250 = accuracy_score(y_train_1250,trained_model_knn_1250.predict(X_train_1250))
Test_accuracy_knn_1250 = accuracy_score(y_test_1250, predictions_knn_1250)
Confusion_matrix_knn_1250 = confusion_matrix(y_test_1250,predictions_knn_1250)

In [218]:
Train_accuracy_knn_1250

0.7154285714285714

In [219]:
Test_accuracy_knn_1250

0.5066666666666667

#### 1 Year

In [220]:
trained_model_knn_250 = knn.fit(X_train_250, y_train_250)
predictions_knn_250 = trained_model_knn_250.predict(X_test_250)

Train_accuracy_knn_250 = accuracy_score(y_train_250,trained_model_knn_250.predict(X_train_250))
Test_accuracy_knn_250 = accuracy_score(y_test_250, predictions_knn_250)
Confusion_matrix_knn_250 = confusion_matrix(y_test_250,predictions_knn_250)

In [221]:
Train_accuracy_knn_250

0.6914285714285714

In [222]:
Test_accuracy_knn_250

0.6133333333333333

#### 6 Months

In [223]:
trained_model_knn_125 = knn.fit(X_train_125, y_train_125)
predictions_knn_125 = trained_model_knn_125.predict(X_test_125)

Train_accuracy_knn_125 = accuracy_score(y_train_125,trained_model_knn_125.predict(X_train_125))
Test_accuracy_knn_125 = accuracy_score(y_test_125, predictions_knn_125)
Confusion_matrix_knn_125 = confusion_matrix(y_test_125,predictions_knn_125)

In [224]:
Train_accuracy_knn_125

0.7241379310344828

In [225]:
Test_accuracy_knn_125

0.5

#### Last 100 Days based on Best Model

In [243]:
predictions_knn_100 = trained_model_knn_250.predict(last100)
predictions_knn_100

array([1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### XGBoost

In [227]:
xgb = XGBClassifier(use_label_encoder=False)
trained_model_xgb = xgb.fit(X_train, y_train)
predictions_xgb = trained_model_xgb.predict(X_test)

Train_accuracy_xgb = accuracy_score(y_train, trained_model_xgb.predict(X_train))
Test_accuracy_xgb = accuracy_score(y_test, predictions_xgb)
Confusion_matrix_xgb = confusion_matrix(y_test, predictions_xgb)



In [228]:
Train_accuracy_xgb

0.9923866269447202

In [229]:
Test_accuracy_xgb

0.5814671814671815

#### 5 Years

In [230]:
trained_model_xgb_1250 = xgb.fit(X_train_1250, y_train_1250)
predictions_xgb_1250 = trained_model_xgb_1250.predict(X_test_1250)

Train_accuracy_xgb_1250 = accuracy_score(y_train_1250,trained_model_xgb_1250.predict(X_train_1250))
Test_accuracy_xgb_1250 = accuracy_score(y_test_1250, predictions_xgb_1250)
Confusion_matrix_xgb_1250 = confusion_matrix(y_test_1250,predictions_xgb_1250)



In [231]:
Train_accuracy_xgb_1250

1.0

In [232]:
Test_accuracy_xgb_1250

0.5333333333333333

#### 1 Year

In [233]:
trained_model_xgb_250 = xgb.fit(X_train_250, y_train_250)
predictions_xgb_250 = trained_model_xgb_250.predict(X_test_250)

Train_accuracy_xgb_250 = accuracy_score(y_train_250,trained_model_xgb_250.predict(X_train_250))
Test_accuracy_xgb_250 = accuracy_score(y_test_250, predictions_xgb_250)
Confusion_matrix_xgb_250 = confusion_matrix(y_test_250,predictions_xgb_250)



In [234]:
Train_accuracy_xgb_250

1.0

In [235]:
Test_accuracy_xgb_250

0.64

#### 6 Months

In [236]:
trained_model_xgb_125 = xgb.fit(X_train_125, y_train_125)
predictions_xgb_125 = trained_model_xgb_125.predict(X_test_125)

Train_accuracy_xgb_125 = accuracy_score(y_train_125,trained_model_xgb_125.predict(X_train_125))
Test_accuracy_xgb_125 = accuracy_score(y_test_125, predictions_xgb_125)
Confusion_matrix_xgb_125 = confusion_matrix(y_test_125,predictions_xgb_125)



In [237]:
Train_accuracy_xgb_125

1.0

In [238]:
Test_accuracy_xgb_125

0.5263157894736842

#### Last 100 Days based on Best Model

In [242]:
predictions_xgb_100 = trained_model_xgb_250.predict(last100)
predictions_xgb_100

array([1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1])

## KFold cross validation
### Basic example

In [166]:
from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), complete_data, target, cv=10)

array([0.53703704, 0.51157407, 0.57407407, 0.54398148, 0.52546296,
       0.52546296, 0.52204176, 0.57540603, 0.53828306, 0.51508121])

In [167]:
cross_val_score(SVC(gamma='auto'), complete_data, target, cv=10)

array([0.52777778, 0.52777778, 0.52546296, 0.52546296, 0.52546296,
       0.52546296, 0.52668213, 0.52668213, 0.52668213, 0.52668213])

In [168]:
cross_val_score(RandomForestClassifier(n_estimators=5), complete_data, target, cv=10)

array([0.54861111, 0.60416667, 0.57407407, 0.56712963, 0.58101852,
       0.59722222, 0.58236659, 0.57076566, 0.5638051 , 0.52668213])

In [169]:
cross_val_score(KNeighborsClassifier(n_neighbors=5), complete_data, target, cv=10)

array([0.49305556, 0.53009259, 0.53935185, 0.51388889, 0.46990741,
       0.47685185, 0.49883991, 0.49883991, 0.44547564, 0.49419954])