In [1]:
import numpy as np
import pandas as pd
import ta
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from ta import add_all_ta_features
from ta import momentum
from ta.utils import dropna

In [2]:
#pip install xgboost

In [3]:
pip install autofeat

Note: you may need to restart the kernel to use updated packages.


In [4]:
def get_data(company_symbol):
    return pd.read_csv("dataset/"+company_symbol+".csv")

In [5]:
data = get_data("GOOGL")
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,8/19/2004,50.050049,52.082081,48.028027,50.220219,50.220219,44659096
1,8/20/2004,50.555557,54.594597,50.300301,54.209209,54.209209,22834343
2,8/23/2004,55.430431,56.796799,54.579578,54.754753,54.754753,18256126
3,8/24/2004,55.675674,55.855858,51.836838,52.487488,52.487488,15247337
4,8/25/2004,52.532532,54.054054,51.991993,53.053055,53.053055,9188602
...,...,...,...,...,...,...,...
4326,10/25/2021,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600
4327,10/26/2021,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800
4328,10/27/2021,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100
4329,10/28/2021,2942.689941,2943.159912,2892.540039,2916.979980,2916.979980,1810300


In [6]:
data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [7]:
data.isna().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [8]:
data[data.columns[data.isna().any()]]

0
1
2
3
4
...
4326
4327
4328
4329
4330


In [9]:
cond1 = data.Date.isna() # aapl['Date'].isna()
cond2 = data.Date.str.contains(r'^\s*$', na=False)
cond3 = data.Date == ''

data_checked = data.assign(cond1= cond1, cond2= cond2, cond3= cond3)
print (data_checked)

            Date         Open         High          Low        Close  \
0      8/19/2004    50.050049    52.082081    48.028027    50.220219   
1      8/20/2004    50.555557    54.594597    50.300301    54.209209   
2      8/23/2004    55.430431    56.796799    54.579578    54.754753   
3      8/24/2004    55.675674    55.855858    51.836838    52.487488   
4      8/25/2004    52.532532    54.054054    51.991993    53.053055   
...          ...          ...          ...          ...          ...   
4326  10/25/2021  2751.000000  2760.000000  2708.479980  2748.939941   
4327  10/26/2021  2785.270020  2801.659912  2766.090088  2786.169922   
4328  10/27/2021  2788.100098  2973.000000  2788.100098  2924.350098   
4329  10/28/2021  2942.689941  2943.159912  2892.540039  2916.979980   
4330  10/29/2021  2901.560059  2964.840088  2892.989990  2960.919922   

        Adj Close    Volume  cond1  cond2  cond3  
0       50.220219  44659096  False  False  False  
1       54.209209  22834343  Fals

In [10]:
def simple_moving_average_5(close):
    return close.rolling(5, min_periods=1).mean()

In [11]:
def simple_moving_average_10(close):
    return close.rolling(10, min_periods=1).mean()

In [12]:
def stochastic_k(high, low, close):
    return (((close-low.rolling(14).min())/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [13]:
def stochastic_d(stochastic_k):
    return stochastic_k.rolling(3,min_periods=1).mean()

In [14]:
def larry_williams_r(high, low, close):
    return (((high.rolling(14).max()-close)/(high.rolling(14).max()-low.rolling(14).min()))*100)

In [15]:
def rate_of_change(close):
    return ((close-close.shift(12))/close.shift(12))

In [16]:
def price_volume_trend(close, volume):
    return (((close-close.shift(1))/close.shift(1))*volume)

In [17]:
def accumulation_distribution_oscillator(high, low, close):
    return((high-close.shift(1))/(high-low))

In [18]:
def weighted_moving_average_10(close):
    return ((10*close+9*close.shift(1)+8*close.shift(2)+7*close.shift(3)
             +6*close.shift(4)+5*close.shift(5)+4*close.shift(6)+3*close.shift(7)
             +2*close.shift(8)+close.shift(9))/(10+9+8+7+6+5+4+3+2+1))

In [19]:
def exponential_moving_average_12(close):
    return (close.ewm(span=12, adjust=False).mean())

In [20]:
def exponential_moving_average_26(close):
    return (close.ewm(span=26, adjust=False).mean())

In [21]:
def moving_average_convergence_divergence(ema_12, ema_26):
    return (ema_12 - ema_26)

In [22]:
def momentum(close):
    return ((close/close.shift(14))*100)

In [23]:
def change(close):
    return (close.shift(-1) - close)

In [24]:
def direction(change):
    return (np.where(change>0, 1, 0))

In [25]:
def relative_strength_index(direction):
    return (100-(100/(1+(direction.rolling(14).sum()/14)/(14-direction.rolling(14).sum())/14)))

In [26]:
def channel_commodity_index(df, ndays): 
    df['TP'] = (df['High'] + df['Low'] + df['Close']) / 3 
    df['sma'] = df['TP'].rolling(ndays).mean()
    df['mad'] = df['TP'].rolling(ndays).apply(lambda x: pd.Series(x).mad())
    df['CCI'] = (df['TP'] - df['sma']) / (0.015 * df['mad']) 
    return df['CCI']

In [27]:
def disparity_index(close):
    return (((close - close.rolling(14).mean())/(close.rolling(14).mean()))*100)

In [28]:
def get_adx(high, low, close, lookback):
    plus_dm = high.diff()
    minus_dm = low.diff()
    plus_dm[plus_dm < 0] = 0
    minus_dm[minus_dm > 0] = 0
    
    tr1 = pd.DataFrame(high - low)
    tr2 = pd.DataFrame(abs(high - close.shift(1)))
    tr3 = pd.DataFrame(abs(low - close.shift(1)))
    frames = [tr1, tr2, tr3]
    tr = pd.concat(frames, axis = 1, join = 'inner').max(axis = 1)
    atr = tr.rolling(lookback).mean()
    
    plus_di = 100 * (plus_dm.ewm(alpha = 1/lookback).mean() / atr)
    minus_di = abs(100 * (minus_dm.ewm(alpha = 1/lookback).mean() / atr))
    dx = (abs(plus_di - minus_di) / abs(plus_di + minus_di)) * 100
    adx = ((dx.shift(1) * (lookback - 1)) + dx) / lookback
    adx_smooth = adx.ewm(alpha = 1/lookback).mean()
    return adx_smooth

In [29]:
def aroon(close):
    return (ta.trend.AroonIndicator(close, 25, True).aroon_indicator())

In [30]:
def compute_all_indicators(data):
    data['SMA5'] = simple_moving_average_5(data['Close'])
    data['SMA10'] = simple_moving_average_10(data['Close'])
    data['StochasticK'] = stochastic_k(data['High'], data['Low'], data['Close'])
    data['StochasticD'] = stochastic_d(data['StochasticK'])
    data['LarryWilliamsR'] = larry_williams_r(data['High'], data['Low'], data['Close'])
    data['ROC'] = rate_of_change(data['Close'])
    data['PVT'] = price_volume_trend(data['Close'], data['Volume'])
    data['ADO'] = accumulation_distribution_oscillator(data['High'], data['Low'], data['Close'])
    data['WMA10'] = weighted_moving_average_10(data['Close'])
    data['EMA12'] = exponential_moving_average_12(data['Close'])
    data['EMA26'] = exponential_moving_average_26(data['Close'])
    data['MACD'] = moving_average_convergence_divergence(data['EMA12'], data['EMA26'])
    data['Momentum'] = momentum(data['Close'])
    data['Change'] = change(data['Close'])
    data['Direction'] = direction(data['Change'])
    data['RSI'] = relative_strength_index(data['Direction'])
    data['CCI'] = channel_commodity_index(data, 14)
    data['DI'] = disparity_index(data['Close'])
    data['ADX'] = get_adx(data['High'], data['Low'], data['Close'], 14)
    data['Aroon'] = aroon(data['Close'])    
    return data

In [31]:
complete_data = compute_all_indicators(data)
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
0,8/19/2004,50.050049,52.082081,48.028027,50.220219,50.220219,44659096,50.220219,50.220219,,...,3.988990,1,,50.110109,,,,,,0.0
1,8/20/2004,50.555557,54.594597,50.300301,54.209209,54.209209,22834343,52.214714,52.214714,,...,0.545544,1,,53.034702,,,,,,4.0
2,8/23/2004,55.430431,56.796799,54.579578,54.754753,54.754753,18256126,53.061394,53.061394,,...,-2.267265,0,,55.377043,,,,,,8.0
3,8/24/2004,55.675674,55.855858,51.836838,52.487488,52.487488,15247337,52.917917,52.917917,,...,0.565567,1,,53.393395,,,,,,8.0
4,8/25/2004,52.532532,54.054054,51.991993,53.053055,53.053055,9188602,52.944945,52.944945,,...,0.955955,1,,53.033034,,,,,,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4326,10/25/2021,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,...,37.229981,1,0.910010,2739.139974,2793.822144,36.525489,-99.806410,-1.659154,15.283130,44.0
4327,10/26/2021,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,...,138.180176,1,0.910010,2784.639974,2797.551188,33.080892,-26.019480,-0.416017,15.934061,44.0
4328,10/27/2021,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,...,-7.370118,0,0.675676,2895.150065,2805.339047,39.235729,152.600736,4.150993,15.753143,68.0
4329,10/28/2021,2942.689941,2943.159912,2892.540039,2916.979980,2916.979980,1810300,2825.554004,2834.853003,78.822011,...,43.939942,1,0.910010,2917.559977,2814.053327,46.064779,149.798686,3.568992,16.191420,68.0


In [32]:
complete_data = complete_data.dropna()

In [33]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,Change,Direction,RSI,TP,sma,mad,CCI,DI,ADX,Aroon
14,9/9/2004,51.316315,51.406406,50.550552,51.206207,51.206207,4061734,50.821821,51.371371,23.177440,...,1.511509,1,0.910010,51.054388,52.055508,1.348866,-49.479574,-1.556034,13.189346,-36.0
15,9/10/2004,50.850849,53.333332,50.700703,52.717716,52.717716,8698892,51.204204,51.242242,43.947690,...,1.086086,1,0.910010,52.250584,51.999500,1.284856,13.027860,1.557848,13.278814,-36.0
16,9/13/2004,53.368366,54.259258,53.283283,53.803802,53.803802,7844148,51.953953,51.309809,67.614480,...,1.996998,1,1.259446,53.782114,51.885576,1.154658,109.500695,3.785942,14.363601,-36.0
17,9/14/2004,53.778778,56.056057,53.448448,55.800800,55.800800,10828960,52.945945,51.784284,96.094920,...,0.255257,1,1.259446,55.101768,52.007603,1.294117,159.396447,7.148936,17.732901,24.0
18,9/15/2004,55.335335,57.172173,55.155155,56.056057,56.056057,10713076,53.916916,52.266266,85.415306,...,0.985985,1,1.259446,56.127795,52.228657,1.546751,168.057138,7.197552,22.985583,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325,10/22/2021,2783.000000,2811.659912,2721.120117,2751.330078,2751.330078,2527700,2828.945996,2805.401001,36.641914,...,-2.390137,0,0.910010,2761.370036,2792.116194,38.231440,-53.614090,-1.501967,14.685884,44.0
4326,10/25/2021,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,...,37.229981,1,0.910010,2739.139974,2793.822144,36.525489,-99.806410,-1.659154,15.283130,44.0
4327,10/26/2021,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,...,138.180176,1,0.910010,2784.639974,2797.551188,33.080892,-26.019480,-0.416017,15.934061,44.0
4328,10/27/2021,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,...,-7.370118,0,0.675676,2895.150065,2805.339047,39.235729,152.600736,4.150993,15.753143,68.0


In [34]:
complete_data = complete_data.drop(['TP', 'sma', 'mad', 'Change'], axis=1)

In [35]:
complete_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,...,EMA12,EMA26,MACD,Momentum,Direction,RSI,CCI,DI,ADX,Aroon
14,9/9/2004,51.316315,51.406406,50.550552,51.206207,51.206207,4061734,50.821821,51.371371,23.177440,...,51.241682,51.171435,0.070247,101.963329,1,0.910010,-49.479574,-1.556034,13.189346,-36.0
15,9/10/2004,50.850849,53.333332,50.700703,52.717716,52.717716,8698892,51.204204,51.242242,43.947690,...,51.468764,51.285974,0.182790,97.248635,1,0.910010,13.027860,1.557848,13.278814,-36.0
16,9/13/2004,53.368366,54.259258,53.283283,53.803802,53.803802,7844148,51.953953,51.309809,67.614480,...,51.828001,51.472480,0.355521,98.263254,1,1.259446,109.500695,3.785942,14.363601,-36.0
17,9/14/2004,53.778778,56.056057,53.448448,55.800800,55.800800,10828960,52.945945,51.784284,96.094920,...,52.439201,51.793096,0.646105,106.312575,1,1.259446,159.396447,7.148936,17.732901,24.0
18,9/15/2004,55.335335,57.172173,55.155155,56.056057,56.056057,10713076,53.916916,52.266266,85.415306,...,52.995640,52.108871,0.886769,105.660375,1,1.259446,168.057138,7.197552,22.985583,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325,10/22/2021,2783.000000,2811.659912,2721.120117,2751.330078,2751.330078,2527700,2828.945996,2805.401001,36.641914,...,2801.459443,2792.721461,8.737982,102.923105,0,0.910010,-53.614090,-1.501967,14.685884,44.0
4326,10/25/2021,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,...,2793.379519,2789.478385,3.901134,101.046881,1,0.910010,-99.806410,-1.659154,15.283130,44.0
4327,10/26/2021,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,...,2792.270350,2789.233314,3.037036,101.267396,1,0.910010,-26.019480,-0.416017,15.934061,44.0
4328,10/27/2021,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,...,2812.590312,2799.241965,13.348347,105.022449,0,0.675676,152.600736,4.150993,15.753143,68.0


In [36]:
#corrMatrix = complete_data.corr()
#print(corrMatrix)

In [37]:
#sn.heatmap(corrMatrix, annot=False)
#plt.show()

In [38]:
#corr_pairs = corrMatrix.unstack()
#corr_pairs

In [39]:
#sorted_pairs = corr_pairs.sort_values(kind="quicksort")
#sorted_pairs

In [40]:
#strong_pairs = sorted_pairs[abs(sorted_pairs) > 0.5]

#print(strong_pairs)

In [41]:
## strong_pairs[strong_pairs.index[0][0] == strong_pairs.index[0][1]]

#removed_diagonal = [(i, j) for (i, j) in strong_pairs.index if i!=j]
#len(removed_diagonal)

In [42]:
## # Create correlation matrix
## corr_matrix = complete_data.corr().abs()

## # Select upper triangle of correlation matrix
## upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool8))

## # Find index of feature columns with correlation greater than 0.95
## to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
## to_drop

In [43]:
## Find index of feature columns with correlation greater than 0.8

#correlated_features = set()
#for i in range(len(corrMatrix.columns)):
 #   for j in range(i):
  #      if abs(corrMatrix.iloc[i, j]) > 0.8:
   #         colname = corrMatrix.columns[i]
    #        correlated_features.add(colname)

In [44]:
#correlated_features

In [45]:
#complete_data = complete_data.drop(labels=correlated_features, axis=1)

In [46]:
#complete_data

In [47]:
target = complete_data['Direction']
complete_data = complete_data.drop(['Date', 'Direction'], axis=1)
complete_data

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA5,SMA10,StochasticK,StochasticD,...,WMA10,EMA12,EMA26,MACD,Momentum,RSI,CCI,DI,ADX,Aroon
14,51.316315,51.406406,50.550552,51.206207,51.206207,4061734,50.821821,51.371371,23.177440,29.682331,...,50.983165,51.241682,51.171435,0.070247,101.963329,0.910010,-49.479574,-1.556034,13.189346,-36.0
15,50.850849,53.333332,50.700703,52.717716,52.717716,8698892,51.204204,51.242242,43.947690,34.437451,...,51.227955,51.468764,51.285974,0.182790,97.248635,0.910010,13.027860,1.557848,13.278814,-36.0
16,53.368366,54.259258,53.283283,53.803802,53.803802,7844148,51.953953,51.309809,67.614480,44.913203,...,51.693693,51.828001,51.472480,0.355521,98.263254,1.259446,109.500695,3.785942,14.363601,-36.0
17,53.778778,56.056057,53.448448,55.800800,55.800800,10828960,52.945945,51.784284,96.094920,69.219030,...,52.510237,52.439201,51.793096,0.646105,106.312575,1.259446,159.396447,7.148936,17.732901,24.0
18,55.335335,57.172173,55.155155,56.056057,56.056057,10713076,53.916916,52.266266,85.415306,83.041569,...,53.286923,52.995640,52.108871,0.886769,105.660375,1.259446,168.057138,7.197552,22.985583,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325,2783.000000,2811.659912,2721.120117,2751.330078,2751.330078,2527700,2828.945996,2805.401001,36.641914,69.181246,...,2815.316366,2801.459443,2792.721461,8.737982,102.923105,0.910010,-53.614090,-1.501967,14.685884,44.0
4326,2751.000000,2760.000000,2708.479980,2748.939941,2748.939941,1720600,2807.621973,2802.466992,31.791463,51.449377,...,2805.050719,2793.379519,2789.478385,3.901134,101.046881,0.910010,-99.806410,-1.659154,15.283130,44.0
4327,2785.270020,2801.659912,2766.090088,2786.169922,2786.169922,2461800,2791.907959,2808.185986,47.150533,38.527970,...,2802.087615,2792.270350,2789.233314,3.037036,101.267396,0.910010,-26.019480,-0.416017,15.934061,44.0
4328,2788.100098,2973.000000,2788.100098,2924.350098,2924.350098,4289100,2809.702002,2825.457007,81.608234,53.516743,...,2823.208363,2812.590312,2799.241965,13.348347,105.022449,0.675676,152.600736,4.150993,15.753143,68.0


### autofeat Classification

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
from autofeat import AutoFeatClassifier
X_train, X_test, y_train, y_test = train_test_split(complete_data,target,test_size=0.3)
model = AutoFeatClassifier()
df = model.fit_transform(X_train, y_train)
y_pred = model.predict(X_test)

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  sqr = np.multiply(arr, arr, out=arr)
  return abs(PVT)/PVT


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
df_test = model.transform(X_test)
model.score(df_test,y_test)

In [None]:
df

In [None]:
df_test

### Generate New Features Based on autofeat Results

In [None]:
import math

In [None]:
def sqrt_rsi_momentum(rsi, momentum):
    return (np.sqrt(rsi)/momentum)

In [None]:
def sqrt_stochastic_d_rsi(stochastic_d, rsi):
    return(np.sqrt(stochastic_d)/rsi)

In [None]:
def sqrt_lwr_rsi (larry_williams_r, rsi):
    return (np.sqrt(larry_williams_r)*rsi)

In [None]:
def momentum_3_rsi(momentum, rsi):
    return ((momentum**3)/rsi)

In [None]:
def ado_macd_2(ado, macd):
    return (ado*(macd**2))

In [None]:
def adx_abs_roc(adx, roc):
    return (adx*np.abs(roc))

In [None]:
def sqrt_stochastic_k_log_rsi (stochastic_k, rsi):
    return ((np.sqrt(stochastic_k))*np.log(rsi))

In [None]:
def compute_adv_indicators(complete_data):
    complete_data['SqrtRSI/Momentum'] = sqrt_rsi_momentum(complete_data['RSI'], complete_data['Momentum'])
    complete_data['SqrtStochasticD/RSI'] = sqrt_stochastic_d_rsi(complete_data['StochasticD'], complete_data['RSI'])
    complete_data['SqrtLarryWilliamsR*RSI'] = sqrt_lwr_rsi(complete_data['LarryWilliamsR'], complete_data['RSI'])
    complete_data['Momentum**3/RSI'] = momentum_3_rsi(complete_data['Momentum'], complete_data['RSI'])
    complete_data['ADO*MACD**2'] = ado_macd_2(complete_data['ADO'], complete_data['MACD'])
    complete_data['ADX*AbsROC'] = adx_abs_roc(complete_data['ADX'], complete_data['ROC'])
    complete_data['SqrtStochasticK*LogRSI'] = sqrt_stochastic_k_log_rsi(complete_data['StochasticK'], complete_data['RSI'])   
    return complete_data

In [None]:
enhanced_data = compute_adv_indicators(complete_data)
enhanced_data

### autofeat Feature Selection

In [None]:
from autofeat import FeatureSelector
fsel = FeatureSelector(verbose=1)
selected_data = fsel.fit_transform(pd.DataFrame(enhanced_data), pd.DataFrame(target))

In [None]:
selected_data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()  
scaler.fit(selected_data)

In [None]:

selected_scaled_data = scaler.transform(selected_data)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_scaled_data,target,test_size=0.3)

In [None]:
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test) 

In [None]:
selected_scaled_data_df = pd.DataFrame(selected_scaled_data)
selected_scaled_data_1250 = selected_scaled_data_df.tail(1250)
selected_scaled_data_250 = selected_scaled_data_df.tail(250)
selected_scaled_data_125 = selected_scaled_data_df.tail(125)
target_1250 = target.tail(1250)
target_250 = target.tail(250)
target_125 = target.tail(125)

In [None]:
X_train_1250, X_test_1250, y_train_1250, y_test_1250 = train_test_split(selected_scaled_data_1250,target_1250,test_size=0.3)

In [None]:
X_train_250, X_test_250, y_train_250, y_test_250 = train_test_split(selected_scaled_data_250,target_250,test_size=0.3)

In [None]:
X_train_125, X_test_125, y_train_125, y_test_125 = train_test_split(selected_scaled_data_125,target_125,test_size=0.3)

### Logistic Regression

#### Max Duration

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
trained_model_lr = lr.fit(X_train, y_train)
predictions_lr = trained_model_lr.predict(X_test)

Train_accuracy_lr = accuracy_score(y_train,trained_model_lr.predict(X_train))
Test_accuracy_lr = accuracy_score(y_test, predictions_lr)
Confusion_matrix_lr = confusion_matrix(y_test,predictions_lr)

In [None]:
Train_accuracy_lr

In [None]:
Test_accuracy_lr

#### 5 Years

In [None]:
trained_model_lr_1250 = lr.fit(X_train_1250, y_train_1250)
predictions_lr_1250 = trained_model_lr_1250.predict(X_test_1250)

Train_accuracy_lr_1250 = accuracy_score(y_train_1250,trained_model_lr_1250.predict(X_train_1250))
Test_accuracy_lr_1250 = accuracy_score(y_test_1250, predictions_lr_1250)
Confusion_matrix_lr_1250 = confusion_matrix(y_test_1250,predictions_lr_1250)

In [None]:
Train_accuracy_lr_1250

In [None]:
Test_accuracy_lr_1250

#### 1 Year

In [None]:
trained_model_lr_250 = lr.fit(X_train_250, y_train_250)
predictions_lr_250 = trained_model_lr_250.predict(X_test_250)

Train_accuracy_lr_250 = accuracy_score(y_train_250,trained_model_lr_250.predict(X_train_250))
Test_accuracy_lr_250 = accuracy_score(y_test_250, predictions_lr_250)
Confusion_matrix_lr_250 = confusion_matrix(y_test_250,predictions_lr_250)

In [None]:
Train_accuracy_lr_250

In [None]:
Test_accuracy_lr_250

#### 6 Months

In [None]:
trained_model_lr_125 = lr.fit(X_train_125, y_train_125)
predictions_lr_125 = trained_model_lr_125.predict(X_test_125)

Train_accuracy_lr_125 = accuracy_score(y_train_125,trained_model_lr_125.predict(X_train_125))
Test_accuracy_lr_125 = accuracy_score(y_test_125, predictions_lr_125)
Confusion_matrix_lr_125 = confusion_matrix(y_test_125,predictions_lr_125)

In [None]:
Train_accuracy_lr_125

In [None]:
Test_accuracy_lr_125

#### Last 100 Days based on Best Model - Max Duration

In [None]:
last100 = selected_scaled_data_df.tail(100)

In [None]:
predictions_lr_100 = trained_model_lr_1250.predict(last100)
predictions_lr_100

### SVM

In [None]:
svm = SVC(gamma='auto')
trained_model_svm = svm.fit(X_train, y_train)
predictions_svm = trained_model_svm.predict(X_test)

Train_accuracy_svm = accuracy_score(y_train,trained_model_svm.predict(X_train))
Test_accuracy_svm = accuracy_score(y_test, predictions_svm)
Confusion_matrix_svm = confusion_matrix(y_test,predictions_svm)

In [None]:
Train_accuracy_svm

In [None]:
Test_accuracy_svm

#### 5 Years

In [None]:
trained_model_svm_1250 = svm.fit(X_train_1250, y_train_1250)
predictions_svm_1250 = trained_model_svm_1250.predict(X_test_1250)

Train_accuracy_svm_1250 = accuracy_score(y_train_1250,trained_model_svm_1250.predict(X_train_1250))
Test_accuracy_svm_1250 = accuracy_score(y_test_1250, predictions_svm_1250)
Confusion_matrix_svm_1250 = confusion_matrix(y_test_1250,predictions_svm_1250)

In [None]:
Train_accuracy_svm_1250

In [None]:
Test_accuracy_svm_1250

#### 1 Year

In [None]:
trained_model_svm_250 = svm.fit(X_train_250, y_train_250)
predictions_svm_250 = trained_model_svm_1250.predict(X_test_250)

Train_accuracy_svm_250 = accuracy_score(y_train_250,trained_model_svm_250.predict(X_train_250))
Test_accuracy_svm_250 = accuracy_score(y_test_250, predictions_svm_250)
Confusion_matrix_svm_250 = confusion_matrix(y_test_250,predictions_svm_250)

In [None]:
Train_accuracy_svm_250

In [None]:
Test_accuracy_svm_250

#### 6 Months

In [None]:
trained_model_svm_125 = svm.fit(X_train_125, y_train_125)
predictions_svm_125 = trained_model_svm_1250.predict(X_test_125)

Train_accuracy_svm_125 = accuracy_score(y_train_125,trained_model_svm_125.predict(X_train_125))
Test_accuracy_svm_125 = accuracy_score(y_test_125, predictions_svm_125)
Confusion_matrix_svm_125 = confusion_matrix(y_test_125,predictions_svm_125)

In [None]:
Train_accuracy_svm_125

In [None]:
Test_accuracy_svm_125

#### Last 100 Days based on Best Model - Max Duration

In [None]:
predictions_svm_100 = trained_model_svm.predict(last100)
predictions_svm_100

### RandomForest

In [None]:
rf = RandomForestClassifier(n_estimators=10)
trained_model_rf = rf.fit(X_train, y_train)
predictions_rf = trained_model_rf.predict(X_test)

Train_accuracy_rf = accuracy_score(y_train, trained_model_rf.predict(X_train))
Test_accuracy_rf = accuracy_score(y_test, predictions_rf)
Confusion_matrix_rf = confusion_matrix(y_test, predictions_rf)

In [None]:
Train_accuracy_rf

In [None]:
Test_accuracy_rf

#### 5 Years

In [None]:
trained_model_rf_1250 = rf.fit(X_train_1250, y_train_1250)
predictions_rf_1250 = trained_model_rf_1250.predict(X_test_1250)

Train_accuracy_rf_1250 = accuracy_score(y_train_1250,trained_model_rf_1250.predict(X_train_1250))
Test_accuracy_rf_1250 = accuracy_score(y_test_1250, predictions_rf_1250)
Confusion_matrix_rf_1250 = confusion_matrix(y_test_1250,predictions_rf_1250)

In [None]:
Train_accuracy_rf_1250

In [None]:
Test_accuracy_rf_1250

#### 1 Year

In [None]:
trained_model_rf_250 = rf.fit(X_train_250, y_train_250)
predictions_rf_250 = trained_model_rf_250.predict(X_test_250)

Train_accuracy_rf_250 = accuracy_score(y_train_250,trained_model_rf_250.predict(X_train_250))
Test_accuracy_rf_250 = accuracy_score(y_test_250, predictions_rf_250)
Confusion_matrix_rf_250 = confusion_matrix(y_test_250,predictions_rf_250)

In [None]:
Train_accuracy_rf_250

In [None]:
Test_accuracy_rf_250

#### 6 Months

In [None]:
trained_model_rf_125 = rf.fit(X_train_125, y_train_125)
predictions_rf_125 = trained_model_rf_125.predict(X_test_125)

Train_accuracy_rf_125 = accuracy_score(y_train_125,trained_model_rf_125.predict(X_train_125))
Test_accuracy_rf_125 = accuracy_score(y_test_125, predictions_rf_125)
Confusion_matrix_rf_125 = confusion_matrix(y_test_125,predictions_rf_125)

In [None]:
Train_accuracy_rf_125

In [None]:
Test_accuracy_rf_125

#### Last 100 Days based on Best Model

In [None]:
predictions_rf_100 = trained_model_rf_125.predict(last100)
predictions_rf_100

### KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
trained_model_knn = knn.fit(X_train, y_train)
predictions_knn = trained_model_knn.predict(X_test)

Train_accuracy_knn = accuracy_score(y_train, trained_model_knn.predict(X_train))
Test_accuracy_knn = accuracy_score(y_test, predictions_knn)
Confusion_matrix_knn = confusion_matrix(y_test, predictions_knn)

In [None]:
Train_accuracy_knn

In [None]:
Test_accuracy_knn

#### 5 Years

In [None]:
trained_model_knn_1250 = knn.fit(X_train_1250, y_train_1250)
predictions_knn_1250 = trained_model_knn_1250.predict(X_test_1250)

Train_accuracy_knn_1250 = accuracy_score(y_train_1250,trained_model_knn_1250.predict(X_train_1250))
Test_accuracy_knn_1250 = accuracy_score(y_test_1250, predictions_knn_1250)
Confusion_matrix_knn_1250 = confusion_matrix(y_test_1250,predictions_knn_1250)

In [None]:
Train_accuracy_knn_1250

In [None]:
Test_accuracy_knn_1250

#### 1 Year

In [None]:
trained_model_knn_250 = knn.fit(X_train_250, y_train_250)
predictions_knn_250 = trained_model_knn_250.predict(X_test_250)

Train_accuracy_knn_250 = accuracy_score(y_train_250,trained_model_knn_250.predict(X_train_250))
Test_accuracy_knn_250 = accuracy_score(y_test_250, predictions_knn_250)
Confusion_matrix_knn_250 = confusion_matrix(y_test_250,predictions_knn_250)

In [None]:
Train_accuracy_knn_250

In [None]:
Test_accuracy_knn_250

#### 6 Months

In [None]:
trained_model_knn_125 = knn.fit(X_train_125, y_train_125)
predictions_knn_125 = trained_model_knn_125.predict(X_test_125)

Train_accuracy_knn_125 = accuracy_score(y_train_125,trained_model_knn_125.predict(X_train_125))
Test_accuracy_knn_125 = accuracy_score(y_test_125, predictions_knn_125)
Confusion_matrix_knn_125 = confusion_matrix(y_test_125,predictions_knn_125)

In [None]:
Train_accuracy_knn_125

In [None]:
Test_accuracy_knn_125

#### Last 100 Days based on Best Model

In [None]:
predictions_knn_100 = trained_model_knn.predict(last100)
predictions_knn_100

### XGBoost

In [None]:
xgb = XGBClassifier(use_label_encoder=False)
trained_model_xgb = xgb.fit(X_train, y_train)
predictions_xgb = trained_model_xgb.predict(X_test)

Train_accuracy_xgb = accuracy_score(y_train, trained_model_xgb.predict(X_train))
Test_accuracy_xgb = accuracy_score(y_test, predictions_xgb)
Confusion_matrix_xgb = confusion_matrix(y_test, predictions_xgb)

In [None]:
Train_accuracy_xgb

In [None]:
Test_accuracy_xgb

#### 5 Years

In [None]:
trained_model_xgb_1250 = xgb.fit(X_train_1250, y_train_1250)
predictions_xgb_1250 = trained_model_xgb_1250.predict(X_test_1250)

Train_accuracy_xgb_1250 = accuracy_score(y_train_1250,trained_model_xgb_1250.predict(X_train_1250))
Test_accuracy_xgb_1250 = accuracy_score(y_test_1250, predictions_xgb_1250)
Confusion_matrix_xgb_1250 = confusion_matrix(y_test_1250,predictions_xgb_1250)

In [None]:
Train_accuracy_xgb_1250

In [None]:
Test_accuracy_xgb_1250

#### 1 Year

In [None]:
trained_model_xgb_250 = xgb.fit(X_train_250, y_train_250)
predictions_xgb_250 = trained_model_xgb_250.predict(X_test_250)

Train_accuracy_xgb_250 = accuracy_score(y_train_250,trained_model_xgb_250.predict(X_train_250))
Test_accuracy_xgb_250 = accuracy_score(y_test_250, predictions_xgb_250)
Confusion_matrix_xgb_250 = confusion_matrix(y_test_250,predictions_xgb_250)

In [None]:
Train_accuracy_xgb_250

In [None]:
Test_accuracy_xgb_250

#### 6 Months

In [None]:
trained_model_xgb_125 = xgb.fit(X_train_125, y_train_125)
predictions_xgb_125 = trained_model_xgb_125.predict(X_test_125)

Train_accuracy_xgb_125 = accuracy_score(y_train_125,trained_model_xgb_125.predict(X_train_125))
Test_accuracy_xgb_125 = accuracy_score(y_test_125, predictions_xgb_125)
Confusion_matrix_xgb_125 = confusion_matrix(y_test_125,predictions_xgb_125)

In [None]:
Train_accuracy_xgb_125

In [None]:
Test_accuracy_xgb_125

#### Last 100 Days based on Best Model

In [None]:
predictions_xgb_100 = trained_model_xgb.predict(last100)
predictions_xgb_100

## KFold cross validation
### Basic example

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), complete_data, target, cv=10)

In [None]:
cross_val_score(SVC(gamma='auto'), complete_data, target, cv=10)

In [None]:
cross_val_score(RandomForestClassifier(n_estimators=5), complete_data, target, cv=10)

In [None]:
cross_val_score(KNeighborsClassifier(n_neighbors=5), complete_data, target, cv=10)

In [None]:
cross_val_score(XGBClassifier(use_label_encoder=False), complete_data, target, cv=10)