In [1]:
import numpy as np
import pandas as pd
import ta
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from ta import add_all_ta_features
from ta import momentum
from ta.utils import dropna

In [2]:
#pip install xgboost

In [3]:
pip install autofeat

Note: you may need to restart the kernel to use updated packages.


In [4]:
def get_data(company_symbol):
    return pd.read_csv("dataset/"+company_symbol+".csv")

In [5]:
data = get_data("AAPL")
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,12/12/1980,0.128348,0.128906,0.128348,0.128348,0.100453,469033600
1,12/15/1980,0.122210,0.122210,0.121652,0.121652,0.095213,175884800
2,12/16/1980,0.113281,0.113281,0.112723,0.112723,0.088224,105728000
3,12/17/1980,0.115513,0.116071,0.115513,0.115513,0.090408,86441600
4,12/18/1980,0.118862,0.119420,0.118862,0.118862,0.093029,73449600
...,...,...,...,...,...,...,...
10321,11/18/2021,153.710007,158.669998,153.050003,157.869995,157.869995,137827700
10322,11/19/2021,157.649994,161.020004,156.529999,160.550003,160.550003,117147500
10323,11/22/2021,161.679993,165.699997,161.000000,161.020004,161.020004,117467900
10324,11/23/2021,161.119995,161.800003,159.059998,161.410004,161.410004,95933900


In [6]:
data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [7]:
data.isna().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [8]:
data[data.columns[data.isna().any()]]

0
1
2
3
4
...
10321
10322
10323
10324
10325


In [9]:
cond1 = data.Date.isna() # aapl['Date'].isna()
cond2 = data.Date.str.contains(r'^\s*$', na=False)
cond3 = data.Date == ''

data_checked = data.assign(cond1= cond1, cond2= cond2, cond3= cond3)
print (data_checked)

             Date        Open        High         Low       Close   Adj Close  \
0      12/12/1980    0.128348    0.128906    0.128348    0.128348    0.100453   
1      12/15/1980    0.122210    0.122210    0.121652    0.121652    0.095213   
2      12/16/1980    0.113281    0.113281    0.112723    0.112723    0.088224   
3      12/17/1980    0.115513    0.116071    0.115513    0.115513    0.090408   
4      12/18/1980    0.118862    0.119420    0.118862    0.118862    0.093029   
...           ...         ...         ...         ...         ...         ...   
10321  11/18/2021  153.710007  158.669998  153.050003  157.869995  157.869995   
10322  11/19/2021  157.649994  161.020004  156.529999  160.550003  160.550003   
10323  11/22/2021  161.679993  165.699997  161.000000  161.020004  161.020004   
10324  11/23/2021  161.119995  161.800003  159.059998  161.410004  161.410004   
10325  11/24/2021  160.750000  162.139999  159.639999  161.940002  161.940002   

          Volume  cond1  co

In [10]:
def simple_moving_average_5(close):
    return close.rolling(5, min_periods=1).mean()

In [11]:
def simple_moving_average_10(close):
    return close.rolling(10, min_periods=1).mean()

In [12]:
def stochastic_k(high, low, close):
    return (((close-low.rolling(14).min())/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [13]:
def stochastic_d(stochastic_k):
    return stochastic_k.rolling(3,min_periods=1).mean()

In [14]:
def larry_williams_r(high, low, close):
    return (((high.rolling(14).max()-close)/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [15]:
def rate_of_change(close):
    return ((close-close.shift(12))/close.shift(12))

In [16]:
def price_volume_trend(close, volume):
    return (((close-close.shift(1))/close.shift(1))*volume)

In [17]:
def accumulation_distribution_oscillator(high, low, close):
    return((high-close.shift(1))/(high-low))

In [18]:
def weighted_moving_average_10(close):
    return ((10*close+9*close.shift(1)+8*close.shift(2)+7*close.shift(3)
             +6*close.shift(4)+5*close.shift(5)+4*close.shift(6)+3*close.shift(7)
             +2*close.shift(8)+close.shift(9))/(10+9+8+7+6+5+4+3+2+1))

In [19]:
def exponential_moving_average_12(close):
    return (close.ewm(span=12, adjust=False).mean())

In [20]:
def exponential_moving_average_26(close):
    return (close.ewm(span=26, adjust=False).mean())

In [21]:
def moving_average_convergence_divergence(ema_12, ema_26):
    return (ema_12 - ema_26)

In [22]:
def momentum(close):
    return ((close/close.shift(14))*100)

In [23]:
def change(close):
    return (close.shift(-1) - close)

In [24]:
def direction(change):
    return (np.where(change>0, 1, 0))

In [25]:
def relative_strength_index(direction):
    return (100-(100/(1+(direction.rolling(14).sum()/14)/(14-direction.rolling(14).sum())/14)))

In [26]:
def channel_commodity_index(df, ndays): 
    df['TP'] = (df['High'] + df['Low'] + df['Close']) / 3 
    df['sma'] = df['TP'].rolling(ndays).mean()
    df['mad'] = df['TP'].rolling(ndays).apply(lambda x: pd.Series(x).mad())
    df['CCI'] = (df['TP'] - df['sma']) / (0.015 * df['mad']) 
    return df['CCI']

In [27]:
def disparity_index(close):
    return (((close - close.rolling(14).mean())/(close.rolling(14).mean()))*100)

In [28]:
def get_adx(high, low, close, lookback):
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    tr1 = pd.DataFrame(high - low)
    tr2 = pd.DataFrame(abs(high - close.shift(1)))
    tr3 = pd.DataFrame(abs(low - close.shift(1)))
    frames = [tr1, tr2, tr3]
    tr = pd.concat(frames, axis = 1, join = 'inner').max(axis = 1)
    atr = tr.rolling(lookback).mean()
    
    plus_di = 100 * (plus_dm.ewm(alpha = 1/lookback).mean() / atr)
    minus_di = abs(100 * (minus_dm.ewm(alpha = 1/lookback).mean() / atr))
    dx = (abs(plus_di - minus_di) / abs(plus_di + minus_di)) * 100
    adx = ((dx.shift(1) * (lookback - 1)) + dx) / lookback
    adx_smooth = adx.ewm(alpha = 1/lookback).mean()
    return adx_smooth

In [29]:
def aroon(close):
    return (ta.trend.AroonIndicator(close, 25, True).aroon_indicator())

In [30]:
def compute_all_indicators(data):
    data['SMA5'] = simple_moving_average_5(data['Close'])
    data['SMA10'] = simple_moving_average_10(data['Close'])
    data['StochasticK'] = stochastic_k(data['High'], data['Low'], data['Close'])
    data['StochasticD'] = stochastic_d(data['StochasticK'])
    data['LarryWilliamsR'] = larry_williams_r(data['High'], data['Low'], data['Close'])
    data['ROC'] = rate_of_change(data['Close'])
    data['PVT'] = price_volume_trend(data['Close'], data['Volume'])
    data['ADO'] = accumulation_distribution_oscillator(data['High'], data['Low'], data['Close'])
    data['WMA10'] = weighted_moving_average_10(data['Close'])
    data['EMA12'] = exponential_moving_average_12(data['Close'])
    data['EMA26'] = exponential_moving_average_26(data['Close'])
    data['MACD'] = moving_average_convergence_divergence(data['EMA12'], data['EMA26'])
    data['Momentum'] = momentum(data['Close'])
    data['Change'] = change(data['Close'])
    data['Direction'] = direction(data['Change'])
    data['RSI'] = relative_strength_index(data['Direction'])
    data['CCI'] = channel_commodity_index(data, 14)
    data['DI'] = disparity_index(data['Close'])
    data['ADX'] = get_adx(data['High'], data['Low'], data['Close'], 14)
    data['Aroon'] = aroon(data['Close'])    
    return data

In [31]:
complete_data = compute_all_indicators(data)
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
0,12/12/1980,0.128348,0.128906,0.128348,0.128348,0.100453,469033600,0.128348,0.128348,,...,-0.006696,0,,0.128534,,,,,,0.0
1,12/15/1980,0.122210,0.122210,0.121652,0.121652,0.095213,175884800,0.125000,0.125000,,...,-0.008929,0,,0.121838,,,,,,-4.0
2,12/16/1980,0.113281,0.113281,0.112723,0.112723,0.088224,105728000,0.120908,0.120908,,...,0.002790,1,,0.112909,,,,,,-8.0
3,12/17/1980,0.115513,0.116071,0.115513,0.115513,0.090408,86441600,0.119559,0.119559,,...,0.003349,1,,0.115699,,,,,,-8.0
4,12/18/1980,0.118862,0.119420,0.118862,0.118862,0.093029,73449600,0.119420,0.119420,,...,0.007254,1,,0.119048,,,,,,-8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10321,11/18/2021,153.710007,158.669998,153.050003,157.869995,157.869995,137827700,152.470001,151.067000,92.850734,...,2.680008,1,1.259446,156.529999,150.785238,1.340068,285.794377,4.643715,15.458962,96.0
10322,11/19/2021,157.649994,161.020004,156.529999,160.550003,160.550003,117147500,154.582001,151.994000,96.528798,...,0.470001,1,1.259446,159.366669,151.538572,2.062994,252.968849,5.839366,17.452579,96.0
10323,11/22/2021,161.679993,165.699997,161.000000,161.020004,161.020004,117467900,156.786001,153.052000,74.313981,...,0.390000,1,1.259446,162.573334,152.430953,3.129456,216.062687,5.602220,19.791095,28.0
10324,11/23/2021,161.119995,161.800003,159.059998,161.410004,161.410004,95933900,158.868002,154.112001,76.454485,...,0.529998,1,1.836394,160.756668,153.121190,3.825817,133.051798,5.368344,22.560291,32.0


In [32]:
complete_data = complete_data.dropna()

In [33]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
14,1/5/1981,0.151228,0.151228,0.150670,0.150670,0.117924,35728000,0.154911,0.147433,78.162269,...,-0.006697,0,0.910010,0.150856,0.138991,0.015240,51.905087,8.558577,39.422947,32.0
15,1/6/1981,0.144531,0.144531,0.143973,0.143973,0.112683,45158400,0.151563,0.149219,64.367958,...,-0.006138,0,0.910010,0.144159,0.140585,0.014156,16.831178,2.555255,34.025492,32.0
16,1/7/1981,0.138393,0.138393,0.137835,0.137835,0.107879,55686400,0.147768,0.149777,48.781660,...,-0.002790,0,0.675676,0.138021,0.142379,0.012106,-23.998205,-3.055644,26.178222,32.0
17,1/8/1981,0.135603,0.135603,0.135045,0.135045,0.105695,39827200,0.144308,0.149498,38.158453,...,0.007254,1,0.675676,0.135231,0.143774,0.010511,-54.181713,-5.940912,19.002172,32.0
18,1/9/1981,0.142299,0.142857,0.142299,0.142299,0.111372,21504000,0.141964,0.149219,46.031972,...,-0.001116,0,0.507614,0.142485,0.145448,0.008807,-22.428997,-2.030802,15.894879,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10320,11/17/2021,151.000000,155.000000,150.990005,153.490005,153.490005,88807000,150.470001,150.376001,82.421470,...,4.379990,1,0.910010,153.160003,150.227143,1.070885,182.581771,2.130675,14.297264,96.0
10321,11/18/2021,153.710007,158.669998,153.050003,157.869995,157.869995,137827700,152.470001,151.067000,92.850734,...,2.680008,1,1.259446,156.529999,150.785238,1.340068,285.794377,4.643715,15.458962,96.0
10322,11/19/2021,157.649994,161.020004,156.529999,160.550003,160.550003,117147500,154.582001,151.994000,96.528798,...,0.470001,1,1.259446,159.366669,151.538572,2.062994,252.968849,5.839366,17.452579,96.0
10323,11/22/2021,161.679993,165.699997,161.000000,161.020004,161.020004,117467900,156.786001,153.052000,74.313981,...,0.390000,1,1.259446,162.573334,152.430953,3.129456,216.062687,5.602220,19.791095,28.0


In [34]:
complete_data = complete_data.drop(['TP', 'sma', 'mad', 'Change'], axis=1)

In [35]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,EMA12,EMA26,MACD,Momentum,Direction,RSI,CCI,DI,ADX,Aroon
14,1/5/1981,0.151228,0.151228,0.150670,0.150670,0.117924,35728000,0.154911,0.147433,78.162269,...,0.145433,0.138131,0.007302,117.391779,0,0.910010,51.905087,8.558577,39.422947,32.0
15,1/6/1981,0.144531,0.144531,0.143973,0.143973,0.112683,45158400,0.151563,0.149219,64.367958,...,0.145208,0.138564,0.006645,118.348239,0,0.910010,16.831178,2.555255,34.025492,32.0
16,1/7/1981,0.138393,0.138393,0.137835,0.137835,0.107879,55686400,0.147768,0.149777,48.781660,...,0.144074,0.138510,0.005564,122.277619,0,0.675676,-23.998205,-3.055644,26.178222,32.0
17,1/8/1981,0.135603,0.135603,0.135045,0.135045,0.105695,39827200,0.144308,0.149498,38.158453,...,0.142685,0.138253,0.004432,116.908919,1,0.675676,-54.181713,-5.940912,19.002172,32.0
18,1/9/1981,0.142299,0.142857,0.142299,0.142299,0.111372,21504000,0.141964,0.149219,46.031972,...,0.142626,0.138553,0.004073,119.717824,0,0.507614,-22.428997,-2.030802,15.894879,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10320,11/17/2021,151.000000,155.000000,150.990005,153.490005,153.490005,88807000,150.470001,150.376001,82.421470,...,150.348567,149.181217,1.167349,100.603001,1,0.910010,182.581771,2.130675,14.297264,96.0
10321,11/18/2021,153.710007,158.669998,153.050003,157.869995,157.869995,137827700,152.470001,151.067000,92.850734,...,151.505710,149.824831,1.680879,105.387177,1,1.259446,285.794377,4.643715,15.458962,96.0
10322,11/19/2021,157.649994,161.020004,156.529999,160.550003,160.550003,117147500,154.582001,151.994000,96.528798,...,152.897139,150.619288,2.277852,107.780609,1,1.259446,252.968849,5.839366,17.452579,96.0
10323,11/22/2021,161.679993,165.699997,161.000000,161.020004,161.020004,117467900,156.786001,153.052000,74.313981,...,154.146811,151.389711,2.757100,107.332355,1,1.259446,216.062687,5.602220,19.791095,28.0


In [36]:
#corrMatrix = complete_data.corr()
#print(corrMatrix)

In [37]:
#sn.heatmap(corrMatrix, annot=False)
#plt.show()

In [38]:
#corr_pairs = corrMatrix.unstack()
#corr_pairs

In [39]:
#sorted_pairs = corr_pairs.sort_values(kind="quicksort")
#sorted_pairs

In [40]:
#strong_pairs = sorted_pairs[abs(sorted_pairs) > 0.5]

#print(strong_pairs)

In [41]:
## strong_pairs[strong_pairs.index[0][0] == strong_pairs.index[0][1]]

#removed_diagonal = [(i, j) for (i, j) in strong_pairs.index if i!=j]
#len(removed_diagonal)

In [42]:
## # Create correlation matrix
## corr_matrix = complete_data.corr().abs()

## # Select upper triangle of correlation matrix
## upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool8))

## # Find index of feature columns with correlation greater than 0.95
## to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
## to_drop

In [43]:
## Find index of feature columns with correlation greater than 0.8

#correlated_features = set()
#for i in range(len(corrMatrix.columns)):
 #   for j in range(i):
  #      if abs(corrMatrix.iloc[i, j]) > 0.8:
   #         colname = corrMatrix.columns[i]
    #        correlated_features.add(colname)

In [44]:
#correlated_features

In [45]:
#complete_data = complete_data.drop(labels=correlated_features, axis=1)

In [46]:
#complete_data

In [47]:
target = complete_data['Direction']
complete_data = complete_data.drop(['Date', 'Direction'], axis=1)
complete_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,WMA10,EMA12,EMA26,MACD,Momentum,RSI,CCI,DI,ADX,Aroon
14,0.151228,0.151228,0.150670,0.150670,0.117924,35728000,0.154911,0.147433,78.162269,81.610332,...,0.151826,0.145433,0.138131,0.007302,117.391779,0.910010,51.905087,8.558577,39.422947,32.0
15,0.144531,0.144531,0.143973,0.143973,0.112683,45158400,0.151563,0.149219,64.367958,75.862874,...,0.151197,0.145208,0.138564,0.006645,118.348239,0.910010,16.831178,2.555255,34.025492,32.0
16,0.138393,0.138393,0.137835,0.137835,0.107879,55686400,0.147768,0.149777,48.781660,63.770629,...,0.149127,0.144074,0.138510,0.005564,122.277619,0.675676,-23.998205,-3.055644,26.178222,32.0
17,0.135603,0.135603,0.135045,0.135045,0.105695,39827200,0.144308,0.149498,38.158453,50.436024,...,0.146449,0.142685,0.138253,0.004432,116.908919,0.675676,-54.181713,-5.940912,19.002172,32.0
18,0.142299,0.142857,0.142299,0.142299,0.111372,21504000,0.141964,0.149219,46.031972,44.324028,...,0.145140,0.142626,0.138553,0.004073,119.717824,0.507614,-22.428997,-2.030802,15.894879,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10320,151.000000,155.000000,150.990005,153.490005,153.490005,88807000,150.470001,150.376001,82.421470,67.809125,...,150.522365,150.348567,149.181217,1.167349,100.603001,0.910010,182.581771,2.130675,14.297264,96.0
10321,153.710007,158.669998,153.050003,157.869995,157.869995,137827700,152.470001,151.067000,92.850734,81.057205,...,151.884909,151.505710,149.824831,1.680879,105.387177,1.259446,285.794377,4.643715,15.458962,96.0
10322,157.649994,161.020004,156.529999,160.550003,160.550003,117147500,154.582001,151.994000,96.528798,90.600334,...,153.609091,152.897139,150.619288,2.277852,107.780609,1.259446,252.968849,5.839366,17.452579,96.0
10323,161.679993,165.699997,161.000000,161.020004,161.020004,117467900,156.786001,153.052000,74.313981,87.897838,...,155.250183,154.146811,151.389711,2.757100,107.332355,1.259446,216.062687,5.602220,19.791095,28.0


### autofeat Classification

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
np.any(np.isnan(complete_data))

False

In [50]:
np.all(np.isinf(complete_data))

False

In [51]:
np.any(np.isnan(target))

False

In [52]:
np.all(np.isinf(target))

False

In [53]:
count1 = np.isinf(complete_data).values.sum()
count1

25

In [54]:
count2 = np.isinf(target).values.sum()
count2

0

In [55]:
type(complete_data.columns.to_series())

pandas.core.series.Series

In [56]:
col_name = complete_data.columns.to_series()[np.isinf(complete_data).any()]
print(col_name)

ADO    ADO
dtype: object


In [57]:
row_index = complete_data.index[np.isinf(complete_data).any(1)]
print(row_index)

Int64Index([ 999, 1006, 1049, 1067, 1082, 1111, 1112, 1132, 1142, 1146, 1151,
            1155, 1162, 1177, 1184, 1188, 1189, 1192, 1196, 1200, 1204, 1205,
            1217, 1221, 1233],
           dtype='int64')


In [58]:
target.drop(complete_data[np.isinf(complete_data).any(1)].index, inplace = True)
complete_data.drop(complete_data[np.isinf(complete_data).any(1)].index, inplace = True)

In [59]:
complete_data = complete_data.reset_index(drop = True)
target = target.reset_index(drop = True)

In [60]:
count1 = np.isinf(complete_data).values.sum()
count1

0

In [61]:
from autofeat import AutoFeatClassifier
X_train, X_test, y_train, y_test = train_test_split(complete_data,target,test_size=0.3)
model = AutoFeatClassifier()
df = model.fit_transform(X_train, y_train)
y_pred = model.predict(X_test)

  x = um.multiply(x, x, out=x)
  sqr = np.multiply(arr, arr, out=arr)


In [62]:
df_test = model.transform(X_test)
model.score(df_test,y_test)

0.5981853532080363

In [63]:
df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,ADX/RSI,DI/Aroon,ADO*Aroon,WMA10/High,Volume/Aroon,1/(Aroon*RSI),Momentum**3/ADX,sqrt(RSI)*Abs(CCI),Aroon*LarryWilliamsR,LarryWilliamsR**3*StochasticK**2
0,0.190290,0.195313,0.189732,0.195313,0.152865,262080000.0,0.188058,0.180971,100.000000,97.058979,...,37.829582,0.123937,96.000000,0.950855,2.730000e+06,0.011447,55537.071778,108.962832,0.000000,0.000000e+00
1,0.067522,0.068080,0.066964,0.066964,0.052410,82812800.0,0.067522,0.068973,0.000000,1.851667,...,131.123367,-0.119223,22.000000,1.001642,1.882109e+06,0.163561,43167.113010,36.019154,4400.000000,0.000000e+00
2,0.267857,0.274554,0.265625,0.273438,0.228570,531596800.0,0.285268,0.282924,50.000830,58.333610,...,24.597847,-0.028879,8.002688,1.035992,1.661240e+07,0.034340,63414.056861,19.077334,1599.973451,3.124948e+08
3,21.589643,21.657143,21.454643,21.596786,19.372286,234836000.0,21.353571,21.221250,90.296354,83.831317,...,39.261495,0.018096,30.645096,0.984004,2.446208e+06,0.011447,30121.299448,165.970431,931.550047,7.449812e+06
4,0.090960,0.090960,0.089844,0.089844,0.070318,41910400.0,0.095424,0.101228,0.000000,1.801715,...,157.315522,0.146196,480.000000,1.074956,-4.365667e+05,-0.074965,23864.522365,52.526562,-9600.000000,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7193,6.871429,6.964286,6.850714,6.929643,5.941235,384801200.0,6.795714,6.942643,36.614830,24.381072,...,34.728537,-0.017097,73.785863,0.986118,6.871450e+06,0.026429,46812.515331,25.615523,3549.569542,3.414106e+08
7194,0.397321,0.404018,0.397321,0.399554,0.320066,88558400.0,0.401786,0.401786,41.176936,37.254695,...,45.658139,0.012176,-29.328953,0.996685,-2.012691e+06,-0.059621,53770.360047,7.589348,-2588.214832,3.451048e+08
7195,0.089844,0.090402,0.089844,0.089844,0.070318,38035200.0,0.086161,0.090792,21.278786,9.015873,...,145.956203,0.113599,-432.086022,0.979458,-7.924000e+05,-0.102292,17032.898438,22.250576,-3778.618271,2.208864e+08
7196,19.177500,19.236786,18.779642,18.821428,16.280138,594333600.0,18.999214,18.747071,47.926322,70.944120,...,77.144738,-0.006812,21.687346,0.981347,-7.429170e+06,-0.032792,33489.884642,30.381721,-4165.894240,3.243418e+08


In [64]:
df_test

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,ADX/RSI,DI/Aroon,ADO*Aroon,WMA10/High,Volume/Aroon,1/(Aroon*RSI),Momentum**3/ADX,sqrt(RSI)*Abs(CCI),Aroon*LarryWilliamsR,LarryWilliamsR**3*StochasticK**2
0,0.303571,0.308594,0.281250,0.287388,0.246396,551880000.0,0.322433,0.332310,7.913159,10.355999,...,15.295408,-0.165073,-13.060269,1.055399,6.898500e+06,0.013736,43021.773058,204.890721,7366.947284,4.889816e+07
1,0.101563,0.102121,0.101563,0.101563,0.079490,45740800.0,0.107478,0.111998,6.559737,2.186579,...,102.702023,0.168131,-400.143369,1.064376,-5.717600e+05,-0.044225,16418.552579,63.430593,-7475.221057,3.510549e+07
2,0.372768,0.377232,0.369420,0.370536,0.293161,102681600.0,0.373438,0.364844,75.001120,81.667488,...,44.394540,-0.044171,-34.285714,0.980474,-1.711360e+06,-0.032833,46441.895196,51.871864,-1499.932799,8.788144e+07
3,0.321429,0.325893,0.319196,0.325893,0.259233,115920000.0,0.327679,0.326451,29.412075,25.686866,...,79.531818,0.027359,-34.661490,1.002117,-2.229231e+06,-0.068038,37906.517975,38.630879,-3670.572114,3.042584e+08
4,0.397321,0.401786,0.395089,0.397321,0.317495,56291200.0,0.397545,0.390960,89.998432,87.498600,...,24.340140,0.044948,53.337315,0.982322,7.036400e+05,0.013736,54180.702777,77.511743,800.125440,8.103528e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3081,2.183929,2.183929,2.141786,2.142857,1.837211,605785600.0,2.177286,2.223464,13.702995,26.384444,...,78.860842,0.067642,29.389270,1.008780,-8.908612e+06,-0.052029,31106.662295,67.331368,-5868.196316,1.206752e+08
3082,13.980357,14.125000,13.936786,13.952143,11.962082,376356400.0,13.952928,13.701536,82.623165,81.333733,...,19.438434,-0.056882,-58.383500,0.980537,-7.840758e+06,-0.030833,84441.716085,74.663764,-834.088067,3.581918e+07
3083,15.998571,16.418215,15.982143,16.261429,14.335449,399380800.0,16.505286,17.156178,13.344337,6.187120,...,42.874739,0.075430,-59.692152,1.025166,-5.255011e+06,-0.019474,27936.807883,108.558461,-6585.830424,1.158737e+08
3084,0.146205,0.149554,0.145089,0.147879,0.126786,230417600.0,0.146763,0.147265,40.905759,28.785534,...,40.970128,-0.006744,15.001120,0.983985,1.152088e+07,0.131167,63309.720099,11.589308,1181.884825,3.453062e+08


### Generate New Features Based on autofeat Results

In [None]:
import math

In [89]:
df_test.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'SMA5', 'SMA10',
       'StochasticK', 'StochasticD', 'LarryWilliamsR', 'ROC', 'PVT', 'ADO',
       'WMA10', 'EMA12', 'EMA26', 'MACD', 'Momentum', 'RSI', 'CCI', 'DI',
       'ADX', 'Aroon', 'LarryWilliamsR*RSI', 'sqrt(StochasticK)*log(RSI)',
       'RSI/ADX', '1/(ADX*RSI)', 'Momentum**3/RSI', 'sqrt(StochasticD)/RSI',
       'log(RSI)', 'Aroon*Volume', 'ROC*StochasticD**3', 'ROC**2/Aroon',
       'Aroon*StochasticD**3', 'ADX/RSI', 'DI/Aroon', 'ADO*Aroon',
       'WMA10/High', 'Volume/Aroon', '1/(Aroon*RSI)', 'Momentum**3/ADX',
       'sqrt(RSI)*Abs(CCI)', 'Aroon*LarryWilliamsR',
       'LarryWilliamsR**3*StochasticK**2'],
      dtype='object')

In [66]:
def lwr_rsi (larry_williams_r, rsi):
    return (larry_williams_r*rsi)

In [67]:
def sqrt_stochastic_k_log_rsi (stochastic_k, rsi):
    return np.sqrt(stochastic_k)*np.log(rsi)

In [68]:
def rsi_adx (rsi, adx):
    return rsi / adx

In [69]:
def inverse_adx_rsi (adx, rsi):
    return 1 / (adx * rsi)

In [70]:
def momentum_3_rsi (momentum, rsi):
    return (momentum**3) / rsi

In [71]:
def sqrt_stochastic_d_rsi (stochastic_d, rsi):
    return (np.sqrt(stochastic_d)/rsi)

In [72]:
def log_rsi (rsi):
    return np.log(rsi)

In [73]:
def aroon_volume (aroon, volume):
    return aroon * volume

In [75]:
def roc_stochastic_d_3 (roc, stochastic_d):
    return roc * stochastic_d**3

In [74]:
def roc_2_aroon (roc, aroon):
    return ((roc**2)/aroon)

In [76]:
def aroon_stochastic_d_3 (aroon, stochastic_d):
    return aroon*stochastic_d**3

In [77]:
def adx_rsi (adx, rsi):
    return adx / rsi

In [78]:
def di_aroon (di, aroon):
    return di / aroon

In [79]:
def ado_aroon (ado, aroon):
    return ado * aroon

In [81]:
def wma10_high (wma10, high):
    return wma10 / high

In [82]:
def volume_aroon (volume, aroon):
    return volume / aroon

In [84]:
def inverse_aroon_rsi (aroon, rsi):
    return 1 / (aroon * rsi)

In [85]:
def momentum_3_adx (momentum, adx):
    return ((momentum**3)/adx)

In [91]:
def sqrt_rsi_abs_cci (rsi, cci):
    return (np.sqrt(rsi)*np.abs(cci))

In [87]:
def aroon_lwr (aroon, larry_williams_r):
    return (aroon * larry_williams_r)

In [88]:
def lwr_3_stochastic_k_2 (larry_williams_r, stochastic_k):
    return ((larry_williams_r**3)*(stochastic_k**2))

In [93]:
def compute_adv_indicators(complete_data):
    complete_data['LarryWilliamsR*RSI'] = lwr_rsi(complete_data['LarryWilliamsR'], complete_data['RSI'])
    complete_data['sqrt(StochasticK)*log(RSI)'] = sqrt_stochastic_k_log_rsi(complete_data['StochasticK'], complete_data['RSI'])
    complete_data['RSI/ADX'] = rsi_adx(complete_data['RSI'], complete_data['ADX'])
    complete_data['1/(ADX*RSI)'] = inverse_adx_rsi(complete_data['ADX'], complete_data['RSI'])
    complete_data['Momentum**3/RSI'] = momentum_3_rsi(complete_data['Momentum'], complete_data['RSI'])
    complete_data['sqrt(StochasticD)/RSI'] = sqrt_stochastic_d_rsi(complete_data['StochasticD'], complete_data['RSI'])
    complete_data['log(RSI)'] = log_rsi(complete_data['RSI'])
    complete_data['Aroon*Volume'] = aroon_volume(complete_data['Aroon'], complete_data['Volume'])
    complete_data['ROC*StochasticD**3'] =  roc_stochastic_d_3(complete_data['ROC'], complete_data['StochasticD'])
    complete_data['ROC**2/Aroon'] = roc_2_aroon(complete_data['ROC'], complete_data['Aroon'])
    complete_data['Aroon*StochasticD**3'] = aroon_stochastic_d_3(complete_data['Aroon'], complete_data['StochasticD'])
    complete_data['ADX/RSI'] = adx_rsi(complete_data['ADX'], complete_data['RSI'])
    complete_data['DI/Aroon'] = di_aroon(complete_data['DI'], complete_data['Aroon'])
    complete_data['ADO*Aroon'] = ado_aroon(complete_data['ADO'], complete_data['Aroon'])
    complete_data['WMA10/High'] = wma10_high(complete_data['WMA10'], complete_data['High'])
    complete_data['Volume/Aroon'] = volume_aroon(complete_data['Volume'], complete_data['Aroon'])
    complete_data['1/(Aroon*RSI)'] = inverse_aroon_rsi(complete_data['Aroon'], complete_data['RSI'])
    complete_data['Momentum**3/ADX'] = momentum_3_adx(complete_data['Momentum'], complete_data['ADX'])
    complete_data['sqrt(RSI)*Abs(CCI)'] = sqrt_rsi_abs_cci(complete_data['RSI'], complete_data['CCI'])
    complete_data['Aroon*LarryWilliamsR'] = aroon_lwr(complete_data['Aroon'], complete_data['LarryWilliamsR'])
    complete_data['LarryWilliamsR**3*StochasticK**2'] = lwr_3_stochastic_k_2(complete_data['LarryWilliamsR'], complete_data['StochasticK'])
    return complete_data


In [95]:
enhanced_data = compute_adv_indicators(complete_data)
enhanced_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,ADX/RSI,DI/Aroon,ADO*Aroon,WMA10/High,Volume/Aroon,1/(Aroon*RSI),Momentum**3/ADX,sqrt(RSI)*Abs(CCI),Aroon*LarryWilliamsR,LarryWilliamsR**3*StochasticK**2
0,0.151228,0.151228,0.150670,0.150670,0.117924,35728000,0.154911,0.147433,78.162269,81.610332,...,43.321438,0.267456,-160.000000,1.003957,1.116500e+06,0.034340,41035.900341,49.514572,698.807390,6.362340e+07
1,0.144531,0.144531,0.143973,0.143973,0.112683,45158400,0.151563,0.149219,64.367958,75.862874,...,37.390235,0.079852,-352.057348,1.046123,1.411200e+06,0.034340,48717.051162,16.056009,1140.225339,1.874397e+08
2,0.138393,0.138393,0.137835,0.137835,0.107879,55686400,0.147768,0.149777,48.781660,63.770629,...,38.743768,-0.095489,-320.000000,1.077565,1.740200e+06,0.046250,69839.443784,19.726403,1638.986866,3.197346e+08
3,0.135603,0.135603,0.135045,0.135045,0.105695,39827200,0.144308,0.149498,38.158453,50.436024,...,28.123215,-0.185653,-128.000000,1.079983,1.244600e+06,0.046250,84089.096079,44.537094,1978.929498,3.443678e+08
4,0.142299,0.142857,0.142299,0.142299,0.111372,21504000,0.141964,0.149219,46.031972,44.324028,...,31.312912,-0.063463,448.000000,1.015982,6.720000e+05,0.061563,107949.145840,15.979999,1726.976903,3.330649e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10279,151.000000,155.000000,150.990005,153.490005,153.490005,88807000,150.470001,150.376001,82.421470,67.809125,...,15.711104,0.022195,95.760718,0.971112,9.250729e+05,0.011447,71216.376092,174.172876,1687.538853,3.690017e+07
10280,153.710007,158.669998,153.050003,157.869995,157.869995,137827700,152.470001,151.067000,92.850734,81.057205,...,12.274416,0.048372,88.483945,0.957238,1.435705e+06,0.008271,75715.183966,320.732840,686.329529,3.150323e+06
10281,157.649994,161.020004,156.529999,160.550003,160.550003,117147500,154.582001,151.994000,96.528798,90.600334,...,13.857348,0.060827,67.349783,0.953975,1.220286e+06,0.008271,71740.151267,283.894378,333.235372,3.897206e+05
10282,161.679993,165.699997,161.000000,161.020004,161.020004,117467900,156.786001,153.052000,74.313981,87.897838,...,15.714129,0.200079,30.680835,0.936935,4.195282e+06,0.028357,62477.286181,242.476426,719.208545,9.359044e+07


### autofeat Feature Selection

In [96]:
from autofeat import FeatureSelector
fsel = FeatureSelector(verbose=1)
selected_data = fsel.fit_transform(pd.DataFrame(enhanced_data), pd.DataFrame(target))

  return f(**kwargs)


[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 30 features after 5 feature selection runs
[featsel] 26 features after correlation filtering
[featsel] 16 features after noise filtering


In [97]:
selected_data

Unnamed: 0,ROC,log(RSI),WMA10/High,ROC**2/Aroon,sqrt(RSI)*Abs(CCI),LarryWilliamsR*RSI,ROC*StochasticD**3,Aroon*StochasticD**3,Aroon*Volume,Aroon*LarryWilliamsR,sqrt(StochasticK)*log(RSI),ADO*Aroon,1/(ADX*RSI),sqrt(StochasticD)/RSI,RSI/ADX,Momentum**3/RSI
0,0.336639,-0.094300,1.003957,0.003541,49.514572,19.872556,182978.616638,1.739344e+07,1.143296e+09,698.807390,-0.833697,-160.000000,0.027874,9.927190,0.023083,1.777734e+06
1,0.246379,-0.094300,1.046123,0.001897,16.056009,32.425518,107570.182685,1.397133e+07,1.445069e+09,1140.225339,-0.756562,-352.057348,0.032296,9.571245,0.026745,1.821542e+06
2,0.159622,-0.392042,1.077565,0.000796,19.726403,34.606986,41395.685567,8.298739e+06,1.781965e+09,1638.986866,-2.738174,-320.000000,0.056536,11.818764,0.025811,2.705843e+06
3,0.070800,-0.392042,1.079983,0.000157,44.537094,41.784829,9083.540593,4.105561e+06,1.274470e+09,1978.929498,-2.421743,-128.000000,0.077886,10.510712,0.035558,2.364856e+06
4,0.075952,-0.678034,1.015982,0.000180,15.979999,27.394938,6613.917914,2.786555e+06,6.881280e+08,1726.976903,-4.600245,448.000000,0.123939,13.115530,0.031936,3.380202e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10279,0.030411,-0.094300,0.971112,0.000010,174.172876,15.996640,9481.842930,2.993199e+07,8.525472e+09,1687.538853,-0.856111,95.760718,0.076860,9.048943,0.063649,1.118888e+06
10280,0.052326,0.230672,0.957238,0.000029,320.732840,9.004113,27867.297407,5.112650e+07,1.323146e+10,686.329529,2.222733,88.483945,0.051362,7.148523,0.081470,9.293597e+05
10281,0.059806,0.230672,0.953975,0.000037,283.894378,4.371791,44476.798718,7.139382e+07,1.124616e+10,333.235372,2.266329,67.349783,0.045495,7.557626,0.072164,9.941282e+05
10282,0.066640,0.230672,0.936935,0.000159,242.476426,32.350150,45255.411385,1.901484e+07,3.289101e+09,719.208545,1.988519,30.680835,0.040119,7.444055,0.063637,9.817762e+05


In [98]:
from sklearn.preprocessing import StandardScaler

In [99]:
scaler = StandardScaler()  
scaler.fit(selected_data)

StandardScaler()

In [100]:

selected_scaled_data = scaler.transform(selected_data)

In [101]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_scaled_data,target,test_size=0.3)

In [102]:
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 

In [103]:
selected_scaled_data_df = pd.DataFrame(selected_scaled_data)
selected_scaled_data_1250 = selected_scaled_data_df.tail(1250)
selected_scaled_data_250 = selected_scaled_data_df.tail(250)
selected_scaled_data_125 = selected_scaled_data_df.tail(125)
target_1250 = target.tail(1250)
target_250 = target.tail(250)
target_125 = target.tail(125)

In [104]:
X_train_1250, X_test_1250, y_train_1250, y_test_1250 = train_test_split(selected_scaled_data_1250,target_1250,test_size=0.3)

In [105]:
X_train_250, X_test_250, y_train_250, y_test_250 = train_test_split(selected_scaled_data_250,target_250,test_size=0.3)

In [106]:
X_train_125, X_test_125, y_train_125, y_test_125 = train_test_split(selected_scaled_data_125,target_125,test_size=0.3)

### Logistic Regression

#### Max Duration

In [107]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [108]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
trained_model_lr = lr.fit(X_train, y_train)
predictions_lr = trained_model_lr.predict(X_test)

Train_accuracy_lr = accuracy_score(y_train,trained_model_lr.predict(X_train))
Test_accuracy_lr = accuracy_score(y_test, predictions_lr)
Confusion_matrix_lr = confusion_matrix(y_test,predictions_lr)

In [109]:
Train_accuracy_lr

0.6578216171158655

In [110]:
Test_accuracy_lr

0.6642903434867142

#### 5 Years

In [111]:
trained_model_lr_1250 = lr.fit(X_train_1250, y_train_1250)
predictions_lr_1250 = trained_model_lr_1250.predict(X_test_1250)

Train_accuracy_lr_1250 = accuracy_score(y_train_1250,trained_model_lr_1250.predict(X_train_1250))
Test_accuracy_lr_1250 = accuracy_score(y_test_1250, predictions_lr_1250)
Confusion_matrix_lr_1250 = confusion_matrix(y_test_1250,predictions_lr_1250)

In [112]:
Train_accuracy_lr_1250

0.6948571428571428

In [113]:
Test_accuracy_lr_1250

0.6586666666666666

#### 1 Year

In [114]:
trained_model_lr_250 = lr.fit(X_train_250, y_train_250)
predictions_lr_250 = trained_model_lr_250.predict(X_test_250)

Train_accuracy_lr_250 = accuracy_score(y_train_250,trained_model_lr_250.predict(X_train_250))
Test_accuracy_lr_250 = accuracy_score(y_test_250, predictions_lr_250)
Confusion_matrix_lr_250 = confusion_matrix(y_test_250,predictions_lr_250)

In [115]:
Train_accuracy_lr_250

0.7085714285714285

In [116]:
Test_accuracy_lr_250

0.6

#### 6 Months

In [117]:
trained_model_lr_125 = lr.fit(X_train_125, y_train_125)
predictions_lr_125 = trained_model_lr_125.predict(X_test_125)

Train_accuracy_lr_125 = accuracy_score(y_train_125,trained_model_lr_125.predict(X_train_125))
Test_accuracy_lr_125 = accuracy_score(y_test_125, predictions_lr_125)
Confusion_matrix_lr_125 = confusion_matrix(y_test_125,predictions_lr_125)

In [118]:
Train_accuracy_lr_125

0.7011494252873564

In [119]:
Test_accuracy_lr_125

0.7368421052631579

#### Last 100 Days based on Best Model - Max Duration

In [120]:
last100 = selected_scaled_data_df.tail(100)

In [174]:
predictions_lr_100 = trained_model_lr_125.predict(last100)
predictions_lr_100

array([1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### SVM

In [122]:
svm = SVC(gamma='auto')
trained_model_svm = svm.fit(X_train, y_train)
predictions_svm = trained_model_svm.predict(X_test)

Train_accuracy_svm = accuracy_score(y_train,trained_model_svm.predict(X_train))
Test_accuracy_svm = accuracy_score(y_test, predictions_svm)
Confusion_matrix_svm = confusion_matrix(y_test,predictions_svm)

In [123]:
Train_accuracy_svm

0.9973603778827452

In [124]:
Test_accuracy_svm

0.5029163966299417

#### 5 Years

In [125]:
trained_model_svm_1250 = svm.fit(X_train_1250, y_train_1250)
predictions_svm_1250 = trained_model_svm_1250.predict(X_test_1250)

Train_accuracy_svm_1250 = accuracy_score(y_train_1250,trained_model_svm_1250.predict(X_train_1250))
Test_accuracy_svm_1250 = accuracy_score(y_test_1250, predictions_svm_1250)
Confusion_matrix_svm_1250 = confusion_matrix(y_test_1250,predictions_svm_1250)

In [126]:
Train_accuracy_svm_1250

0.7085714285714285

In [127]:
Test_accuracy_svm_1250

0.6426666666666667

#### 1 Year

In [128]:
trained_model_svm_250 = svm.fit(X_train_250, y_train_250)
predictions_svm_250 = trained_model_svm_1250.predict(X_test_250)

Train_accuracy_svm_250 = accuracy_score(y_train_250,trained_model_svm_250.predict(X_train_250))
Test_accuracy_svm_250 = accuracy_score(y_test_250, predictions_svm_250)
Confusion_matrix_svm_250 = confusion_matrix(y_test_250,predictions_svm_250)

In [129]:
Train_accuracy_svm_250

0.7314285714285714

In [130]:
Test_accuracy_svm_250

0.6133333333333333

#### 6 Months

In [131]:
trained_model_svm_125 = svm.fit(X_train_125, y_train_125)
predictions_svm_125 = trained_model_svm_1250.predict(X_test_125)

Train_accuracy_svm_125 = accuracy_score(y_train_125,trained_model_svm_125.predict(X_train_125))
Test_accuracy_svm_125 = accuracy_score(y_test_125, predictions_svm_125)
Confusion_matrix_svm_125 = confusion_matrix(y_test_125,predictions_svm_125)

In [132]:
Train_accuracy_svm_125

0.6896551724137931

In [133]:
Test_accuracy_svm_125

0.7105263157894737

#### Last 100 Days based on Best Model - Max Duration

In [176]:
predictions_svm_100 = trained_model_svm_125.predict(last100)
predictions_svm_100

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### RandomForest

In [135]:
rf = RandomForestClassifier(n_estimators=10)
trained_model_rf = rf.fit(X_train, y_train)
predictions_rf = trained_model_rf.predict(X_test)

Train_accuracy_rf = accuracy_score(y_train, trained_model_rf.predict(X_train))
Test_accuracy_rf = accuracy_score(y_test, predictions_rf)
Confusion_matrix_rf = confusion_matrix(y_test, predictions_rf)

In [136]:
Train_accuracy_rf

0.9805501528202278

In [137]:
Test_accuracy_rf

0.6386908619572261

#### 5 Years

In [138]:
trained_model_rf_1250 = rf.fit(X_train_1250, y_train_1250)
predictions_rf_1250 = trained_model_rf_1250.predict(X_test_1250)

Train_accuracy_rf_1250 = accuracy_score(y_train_1250,trained_model_rf_1250.predict(X_train_1250))
Test_accuracy_rf_1250 = accuracy_score(y_test_1250, predictions_rf_1250)
Confusion_matrix_rf_1250 = confusion_matrix(y_test_1250,predictions_rf_1250)

In [139]:
Train_accuracy_rf_1250

0.9828571428571429

In [140]:
Test_accuracy_rf_1250

0.6453333333333333

#### 1 Year

In [141]:
trained_model_rf_250 = rf.fit(X_train_250, y_train_250)
predictions_rf_250 = trained_model_rf_250.predict(X_test_250)

Train_accuracy_rf_250 = accuracy_score(y_train_250,trained_model_rf_250.predict(X_train_250))
Test_accuracy_rf_250 = accuracy_score(y_test_250, predictions_rf_250)
Confusion_matrix_rf_250 = confusion_matrix(y_test_250,predictions_rf_250)

In [142]:
Train_accuracy_rf_250

0.9885714285714285

In [143]:
Test_accuracy_rf_250

0.6666666666666666

#### 6 Months

In [144]:
trained_model_rf_125 = rf.fit(X_train_125, y_train_125)
predictions_rf_125 = trained_model_rf_125.predict(X_test_125)

Train_accuracy_rf_125 = accuracy_score(y_train_125,trained_model_rf_125.predict(X_train_125))
Test_accuracy_rf_125 = accuracy_score(y_test_125, predictions_rf_125)
Confusion_matrix_rf_125 = confusion_matrix(y_test_125,predictions_rf_125)

In [145]:
Train_accuracy_rf_125

0.9770114942528736

In [146]:
Test_accuracy_rf_125

0.5263157894736842

#### Last 100 Days based on Best Model

In [177]:
predictions_rf_100 = trained_model_rf_250.predict(last100)
predictions_rf_100

array([0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1])

### KNN

In [148]:
knn = KNeighborsClassifier(n_neighbors=5)
trained_model_knn = knn.fit(X_train, y_train)
predictions_knn = trained_model_knn.predict(X_test)

Train_accuracy_knn = accuracy_score(y_train, trained_model_knn.predict(X_train))
Test_accuracy_knn = accuracy_score(y_test, predictions_knn)
Confusion_matrix_knn = confusion_matrix(y_test, predictions_knn)

In [149]:
Train_accuracy_knn

0.7254792998055015

In [150]:
Test_accuracy_knn

0.5729099157485418

#### 5 Years

In [151]:
trained_model_knn_1250 = knn.fit(X_train_1250, y_train_1250)
predictions_knn_1250 = trained_model_knn_1250.predict(X_test_1250)

Train_accuracy_knn_1250 = accuracy_score(y_train_1250,trained_model_knn_1250.predict(X_train_1250))
Test_accuracy_knn_1250 = accuracy_score(y_test_1250, predictions_knn_1250)
Confusion_matrix_knn_1250 = confusion_matrix(y_test_1250,predictions_knn_1250)

In [152]:
Train_accuracy_knn_1250

0.736

In [153]:
Test_accuracy_knn_1250

0.608

#### 1 Year

In [154]:
trained_model_knn_250 = knn.fit(X_train_250, y_train_250)
predictions_knn_250 = trained_model_knn_250.predict(X_test_250)

Train_accuracy_knn_250 = accuracy_score(y_train_250,trained_model_knn_250.predict(X_train_250))
Test_accuracy_knn_250 = accuracy_score(y_test_250, predictions_knn_250)
Confusion_matrix_knn_250 = confusion_matrix(y_test_250,predictions_knn_250)

In [155]:
Train_accuracy_knn_250

0.7428571428571429

In [156]:
Test_accuracy_knn_250

0.5333333333333333

#### 6 Months

In [157]:
trained_model_knn_125 = knn.fit(X_train_125, y_train_125)
predictions_knn_125 = trained_model_knn_125.predict(X_test_125)

Train_accuracy_knn_125 = accuracy_score(y_train_125,trained_model_knn_125.predict(X_train_125))
Test_accuracy_knn_125 = accuracy_score(y_test_125, predictions_knn_125)
Confusion_matrix_knn_125 = confusion_matrix(y_test_125,predictions_knn_125)

In [158]:
Train_accuracy_knn_125

0.7586206896551724

In [159]:
Test_accuracy_knn_125

0.7105263157894737

#### Last 100 Days based on Best Model

In [178]:
predictions_knn_100 = trained_model_knn_125.predict(last100)
predictions_knn_100

array([1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### XGBoost

In [161]:
xgb = XGBClassifier(use_label_encoder=False)
trained_model_xgb = xgb.fit(X_train, y_train)
predictions_xgb = trained_model_xgb.predict(X_test)

Train_accuracy_xgb = accuracy_score(y_train, trained_model_xgb.predict(X_train))
Test_accuracy_xgb = accuracy_score(y_test, predictions_xgb)
Confusion_matrix_xgb = confusion_matrix(y_test, predictions_xgb)



In [162]:
Train_accuracy_xgb

0.9394276187829953

In [163]:
Test_accuracy_xgb

0.6373946856772521

#### 5 Years

In [164]:
trained_model_xgb_1250 = xgb.fit(X_train_1250, y_train_1250)
predictions_xgb_1250 = trained_model_xgb_1250.predict(X_test_1250)

Train_accuracy_xgb_1250 = accuracy_score(y_train_1250,trained_model_xgb_1250.predict(X_train_1250))
Test_accuracy_xgb_1250 = accuracy_score(y_test_1250, predictions_xgb_1250)
Confusion_matrix_xgb_1250 = confusion_matrix(y_test_1250,predictions_xgb_1250)



In [165]:
Train_accuracy_xgb_1250

1.0

In [166]:
Test_accuracy_xgb_1250

0.6426666666666667

#### 1 Year

In [167]:
trained_model_xgb_250 = xgb.fit(X_train_250, y_train_250)
predictions_xgb_250 = trained_model_xgb_250.predict(X_test_250)

Train_accuracy_xgb_250 = accuracy_score(y_train_250,trained_model_xgb_250.predict(X_train_250))
Test_accuracy_xgb_250 = accuracy_score(y_test_250, predictions_xgb_250)
Confusion_matrix_xgb_250 = confusion_matrix(y_test_250,predictions_xgb_250)



In [168]:
Train_accuracy_xgb_250

1.0

In [169]:
Test_accuracy_xgb_250

0.6

#### 6 Months

In [170]:
trained_model_xgb_125 = xgb.fit(X_train_125, y_train_125)
predictions_xgb_125 = trained_model_xgb_125.predict(X_test_125)

Train_accuracy_xgb_125 = accuracy_score(y_train_125,trained_model_xgb_125.predict(X_train_125))
Test_accuracy_xgb_125 = accuracy_score(y_test_125, predictions_xgb_125)
Confusion_matrix_xgb_125 = confusion_matrix(y_test_125,predictions_xgb_125)



In [171]:
Train_accuracy_xgb_125

1.0

In [172]:
Test_accuracy_xgb_125

0.631578947368421

#### Last 100 Days based on Best Model

In [179]:
predictions_xgb_100 = trained_model_xgb_1250.predict(last100)
predictions_xgb_100

array([1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

## KFold cross validation
### Basic example

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), complete_data, target, cv=10)

In [None]:
cross_val_score(SVC(gamma='auto'), complete_data, target, cv=10)

In [None]:
cross_val_score(RandomForestClassifier(n_estimators=5), complete_data, target, cv=10)

In [None]:
cross_val_score(KNeighborsClassifier(n_neighbors=5), complete_data, target, cv=10)

In [None]:
cross_val_score(XGBClassifier(use_label_encoder=False), complete_data, target, cv=10)