# Decision Trees
(Reon)

### Contents:
1. Data Exploration
2. Feature Engineering
3. Training

## Packages


In [1]:
from sklearn.ensemble import *
from sklearn.metrics import *

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ta import *

## Data Exploration

In [3]:
# This is from Samuel's part
df = pd.read_csv('GOOG.csv') #Read the data in
df.Date = pd.to_datetime(df.Date, format='%Y-%m-%d') #Set the date column to datetime
#df.set_index('Date', inplace=True) #Set the index to the date column
df = df.rename(columns = {'Adj Close':'Adj_Close'})
df = df.rename(columns = {'Date':'Timestamp'})
df.head(20) #Observe a few rows of data

Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume
0,2004-08-19,49.813286,51.835709,47.800831,49.982655,49.982655,44871300
1,2004-08-20,50.316402,54.336334,50.062355,53.95277,53.95277,22942800
2,2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342800
3,2004-08-24,55.4123,55.591629,51.591621,52.239193,52.239193,15319700
4,2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232100
5,2004-08-26,52.279045,53.773445,52.134586,53.753517,53.753517,7128600
6,2004-08-27,53.848164,54.107193,52.647663,52.876804,52.876804,6241200
7,2004-08-30,52.443428,52.548038,50.814533,50.814533,50.814533,5221400
8,2004-08-31,50.958992,51.661362,50.889256,50.993862,50.993862,4941200
9,2004-09-01,51.158245,51.292744,49.648903,49.93782,49.93782,9181600


In [4]:
#Target - 5 day later price
target =  list(df["Adj_Close"])[4:] + [0,0,0,0]
df["Binary_Target"] = target > df["Adj_Close"]

df.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume,Binary_Target
0,2004-08-19,49.813286,51.835709,47.800831,49.982655,49.982655,44871300,True
1,2004-08-20,50.316402,54.336334,50.062355,53.95277,53.95277,22942800,False
2,2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342800,False
3,2004-08-24,55.4123,55.591629,51.591621,52.239193,52.239193,15319700,False
4,2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232100,False


## Feature Engineering

In this section we will be creating technical indicators that are commonly used for stock technical analysis. We use the package "ta" to create the technical indicators.

### Indicators:

#### Volume
1. Accumulation/Distribution Index (ADI)
2. On-Balance Volume (OBV)
3. Chaikin Money Flow (CMF)
4. Force Index (FI)
5. Ease of Movement (EoM, EMV)
6. Volume-price Trend (VPT)
7. Negative Volume Index (NVI)

#### Volatility
1. Average True Range (ATR)
2. Bollinger Bands (BB)
3. Keltner Channel (KC)
4. Donchian Channel (DC)

#### Trend
1. Moving Average Convergence Divergence (MACD)
2. Average Directional Movement Index (ADX)
3. Vortex Indicator (VI)
4. Trix (TRIX)
5. Mass Index (MI)
6. Commodity Channel Index (CCI)
7. Detrended Price Oscillator (DPO)
8. KST Oscillator (KST)
9. Ichimoku Kinkō Hyō (Ichimoku)

#### Momentum
1. Money Flow Index (MFI)
2. Relative Strength Index (RSI)
3. True strength index (TSI)
4. Ultimate Oscillator (UO)
5. Stochastic Oscillator (SR)
6. Williams %R (WR)
7. Awesome Oscillator (AO)
8. Kaufman's Adaptive Moving Average (KAMA)

#### Others
1. Daily Return (DR)
2. Daily Log Return (DLR)
3. Cumulative Return (CR)

For a start, we simply add every single indicator into our dataset. The decision tree algorithm will conduct feature selection automatically.


In [5]:
df= add_all_ta_features(df, "Open", "High", "Low", "Close", "Volume", fillna=True)
df.head()

  dip[i] = 100 * (dip_mio[i]/trs[i])
  dip[i] = 100 * (dip_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  dx = 100 * np.abs((dip - din) / (dip + din))


Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume,Binary_Target,volume_adi,volume_obv,...,momentum_mfi,momentum_tsi,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,others_dr,others_dlr,others_cr
0,2004-08-19,49.813286,51.835709,47.800831,49.982655,49.982655,44871300,True,3768562.0,0.0,...,0.0,-100.0,0.5056,54.074101,54.074101,-45.925899,0.0,-89.572437,0.0,0.0
1,2004-08-20,50.316402,54.336334,50.062355,53.95277,53.95277,22942800,False,22481050.0,22942800.0,...,35.11312,-99.337671,1.411368,94.131071,74.102586,-5.868929,0.0,7.942985,7.643299,7.942985
2,2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342800,False,3380460.0,41285600.0,...,49.82179,-99.040366,1.526912,76.712316,74.972496,-23.287684,0.0,1.006371,1.001341,9.029292
3,2004-08-24,55.4123,55.591629,51.591621,52.239193,52.239193,15319700,False,-25803790.0,25965900.0,...,42.131154,-98.877689,1.659466,50.856148,73.899845,-49.143852,0.0,-4.140768,-4.22894,4.514642
4,2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232100,False,-10090500.0,35198000.0,...,47.025751,-98.740367,1.889377,57.305953,61.624806,-42.694047,0.0,1.07753,1.071766,5.640819


For all the features we would also like to implement some feature scaling. This helps prevent bias in our data set. We will use normalization in this case for our data.

In [6]:
from sklearn.preprocessing import * 

In [7]:
features = pd.DataFrame(normalize(df.copy().drop('Timestamp', axis =1).drop('Binary_Target', axis =1)))
features["Timestamp"] = df["Timestamp"]
features["Binary_Target"] = df["Binary_Target"]
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,Timestamp,Binary_Target
0,8.253353e-07,8.588439e-07,7.919918e-07,8.281415e-07,8.281415e-07,0.743454,0.06244,0.0,1.350041e-09,0.0,...,8.377081e-09,8.959309e-07,8.959309e-07,-7.609268e-07,0.0,-1.484088e-06,0.0,0.0,2004-08-19,True
1,9.140287e-07,9.870533e-07,9.094138e-07,9.800856e-07,9.800856e-07,0.41677,0.408382,0.41677,6.022084e-09,0.0,...,2.563837e-08,1.709949e-06,1.34612e-06,-1.066127e-07,0.0,1.442893e-07,1.388453e-07,1.442893e-07,2004-08-20,False
2,4.309156e-07,4.415377e-07,4.243011e-07,4.256629e-07,4.256629e-07,0.143274,0.026405,0.322479,6.379402e-10,-0.935167,...,1.192662e-08,5.991952e-07,5.856056e-07,-1.818987e-07,0.0,7.860702e-09,7.821412e-09,7.052725e-08,2004-08-23,False
3,1.326278e-06,1.33057e-06,1.234831e-06,1.25033e-06,1.25033e-06,0.366673,-0.617606,0.621486,-7.837164e-10,0.312654,...,3.971884e-08,1.217227e-06,1.768772e-06,-1.176244e-06,0.0,-9.910811e-08,-1.012185e-07,1.080567e-07,2004-08-24,False
4,1.28159e-06,1.31871e-06,1.268403e-06,1.294289e-06,1.294289e-06,0.226298,-0.247339,0.862776,-6.761529e-10,0.37823,...,4.631257e-08,1.404688e-06,1.510552e-06,-1.04652e-06,0.0,2.641251e-08,2.627122e-08,1.382682e-07,2004-08-25,False




## Iteration 1 (All 31 Features, Random Forest)
We will split the data into train and test set. We then use a randomforest classifier to predict our binary target with randomly selected features.

In [21]:
from sklearn.model_selection import train_test_split

In [31]:
# Split into independent and dependent variables
X = df.copy().drop('Binary_Target', axis =1)
y = df[['Timestamp','Binary_Target']]

# Get Training set
#X_train = X[X["Timestamp"] <= '2018-08-31']
#y_train = Y[Y["Timestamp"] <= '2018-08-31']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Training our Classifier
The decision tree algorithm automatically does feature selection for us, by picking features that best split the data. In this case, we use the GINI Index to calculate our information gain.

In [30]:
clf = RandomForestClassifier(n_estimators=200, random_state=0)

In [32]:
X_train.set_index('Timestamp', inplace=True) #Set the index to the date column
y_train.set_index('Timestamp', inplace=True) #Set the index to the date column
y_train = np.ravel(y_train)

In [33]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [34]:
#Test set
#X_test = X[X["Timestamp"] > '2018-08-31']
#y_test = Y[Y["Timestamp"] > '2018-08-31']
X_test.set_index('Timestamp', inplace=True) #Set the index to the date column
y_test.set_index('Timestamp', inplace=True) #Set the index to the date column

In [35]:
predicted = clf.predict(X_test)

## Prediction Accuracy
In this part we evaluate our model training accuracy 

In [36]:
pd.crosstab(y_test["Binary_Target"],predicted, rownames=['Actual'], colnames=['Predicted'])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,373,166
True,138,584


In [37]:
print(classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      1151
        True       1.00      1.00      1.00      1409

    accuracy                           1.00      2560
   macro avg       1.00      1.00      1.00      2560
weighted avg       1.00      1.00      1.00      2560



In [38]:
print(classification_report(y_test["Binary_Target"], predicted))

              precision    recall  f1-score   support

       False       0.73      0.69      0.71       539
        True       0.78      0.81      0.79       722

    accuracy                           0.76      1261
   macro avg       0.75      0.75      0.75      1261
weighted avg       0.76      0.76      0.76      1261



Our model accuracy is pretty bad - an accuracy of 0.46 is worse than random chance, given that this is a binary classification problem.

## Iteration 2 (xgBoost, all 31 features)

In [39]:
xgb_clf = GradientBoostingClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

In [40]:
predicted_xgb = xgb_clf.predict(X_test)

In [41]:
pd.crosstab(y_test["Binary_Target"],predicted_xgb, rownames=['Actual'], colnames=['Predicted'])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,241,298
True,165,557


In [42]:
print(classification_report(y_test["Binary_Target"], predicted_xgb))

              precision    recall  f1-score   support

       False       0.59      0.45      0.51       539
        True       0.65      0.77      0.71       722

    accuracy                           0.63      1261
   macro avg       0.62      0.61      0.61      1261
weighted avg       0.63      0.63      0.62      1261



## Iteration 3 (Random Forest, with feature selection)


In [43]:
from sklearn.feature_selection import *

In [44]:
k = 15
X_train_kbest = pd.DataFrame(SelectKBest(f_classif, k=k).fit(X_train, y_train).transform(X_train))

  f = msb / msw


In [45]:
X_train_kbest

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1366400.0,9.496190e+05,1.235660e+09,2.926552e-05,20891.448529,0.0,0.0,23.553758,0.816909,20.0,20.0,4.0,79.810455,86.431052,59.655510
1,4613900.0,5.472704e+06,1.047373e+09,-1.847244e-07,146753.690229,0.0,0.0,-7.558651,-1.162565,20.0,20.0,80.0,38.057863,19.542667,-18.381032
2,3671800.0,3.122212e+05,1.194429e+09,1.506428e-06,70025.857987,0.0,0.0,-0.144647,1.351592,20.0,20.0,84.0,57.269619,83.784742,-4.829150
3,4186700.0,8.950555e+05,1.066413e+09,-1.808323e-07,-54115.952253,0.0,0.0,-2.130560,0.216734,20.0,20.0,24.0,44.233905,32.870101,-4.362397
4,3618100.0,6.324028e+05,9.894532e+08,4.204312e-07,38509.402013,0.0,0.0,1.690351,0.860167,20.0,20.0,20.0,56.284524,77.146912,5.357829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,8324300.0,7.027938e+06,9.474040e+08,7.338854e-07,32782.946011,0.0,0.0,6.695911,0.736296,20.0,20.0,28.0,58.630106,66.430185,18.155242
2556,8639800.0,1.269951e+06,1.056943e+09,5.295495e-07,312746.275365,0.0,0.0,5.092331,0.391154,20.0,20.0,4.0,45.467303,79.950180,9.366187
2557,17143100.0,-2.498273e+07,1.384948e+09,-3.508565e-06,-421391.865970,0.0,1.0,-9.420349,-3.771130,20.0,20.0,100.0,17.926190,5.654462,-25.034095
2558,2127800.0,1.195248e+06,1.261248e+09,1.240763e-04,143833.377055,1.0,0.0,28.439135,8.245835,20.0,20.0,40.0,72.747102,95.370147,59.774558


In [46]:
clf3 = RandomForestClassifier(n_estimators=200, random_state=0)
clf3.fit(X_train_kbest, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [47]:
predicted3 = clf3.predict(pd.DataFrame(SelectKBest(f_classif, k=k).fit(X_train, y_train).transform(X_test)))

  f = msb / msw


In [48]:
print(classification_report(y_test["Binary_Target"], predicted3))

              precision    recall  f1-score   support

       False       0.62      0.54      0.58       539
        True       0.69      0.75      0.72       722

    accuracy                           0.66      1261
   macro avg       0.65      0.65      0.65      1261
weighted avg       0.66      0.66      0.66      1261

