# Decision Trees
(Reon)

### Contents:
1. Data Exploration
2. Feature Engineering
3. Training

## Packages


In [1]:
from sklearn.ensemble import *
from sklearn.metrics import *

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ta import *

## Data Exploration

In [3]:
# This is from Samuel's part
df = pd.read_csv('AAPL.csv') #Read the data in
df.Date = pd.to_datetime(df.Date, format='%Y-%m-%d') #Set the date column to datetime
#df.set_index('Date', inplace=True) #Set the index to the date column
df = df.rename(columns = {'Adj Close':'Adj_Close'})
df = df.rename(columns = {'Date':'Timestamp'})
df.head(20) #Observe a few rows of data

Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume
0,2000-08-31,4.212054,4.392857,4.209821,4.352679,3.798641,104899200
1,2000-09-01,4.379464,4.544643,4.366071,4.53125,3.954481,64218000
2,2000-09-05,4.475446,4.580357,4.446429,4.459821,3.892146,74660600
3,2000-09-06,4.383929,4.455357,4.125,4.174107,3.642799,88851000
4,2000-09-07,4.223214,4.46875,4.160714,4.428571,3.864874,54366200
5,2000-09-08,4.401786,4.401786,4.178571,4.205357,3.670071,48879600
6,2000-09-11,4.191964,4.3125,4.151786,4.174107,3.642799,46845400
7,2000-09-12,4.095982,4.290179,4.071429,4.125,3.599942,46999400
8,2000-09-13,4.053571,4.25,4.053571,4.142857,3.615526,76496000
9,2000-09-14,4.183036,4.258929,4.058036,4.061384,3.544423,106638000


In [4]:
#Target - 5 day later price
target =  list(df["Adj_Close"])[4:] + [0,0,0,0]
df["Binary_Target"] = target > df["Adj_Close"]

df.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume,Binary_Target
0,2000-08-31,4.212054,4.392857,4.209821,4.352679,3.798641,104899200,True
1,2000-09-01,4.379464,4.544643,4.366071,4.53125,3.954481,64218000,False
2,2000-09-05,4.475446,4.580357,4.446429,4.459821,3.892146,74660600,False
3,2000-09-06,4.383929,4.455357,4.125,4.174107,3.642799,88851000,False
4,2000-09-07,4.223214,4.46875,4.160714,4.428571,3.864874,54366200,False


## Feature Engineering

In this section we will be creating technical indicators that are commonly used for stock technical analysis. We use the package "ta" to create the technical indicators.

### Indicators:

#### Volume
1. Accumulation/Distribution Index (ADI)
2. On-Balance Volume (OBV)
3. Chaikin Money Flow (CMF)
4. Force Index (FI)
5. Ease of Movement (EoM, EMV)
6. Volume-price Trend (VPT)
7. Negative Volume Index (NVI)

#### Volatility
1. Average True Range (ATR)
2. Bollinger Bands (BB)
3. Keltner Channel (KC)
4. Donchian Channel (DC)

#### Trend
1. Moving Average Convergence Divergence (MACD)
2. Average Directional Movement Index (ADX)
3. Vortex Indicator (VI)
4. Trix (TRIX)
5. Mass Index (MI)
6. Commodity Channel Index (CCI)
7. Detrended Price Oscillator (DPO)
8. KST Oscillator (KST)
9. Ichimoku Kinkō Hyō (Ichimoku)

#### Momentum
1. Money Flow Index (MFI)
2. Relative Strength Index (RSI)
3. True strength index (TSI)
4. Ultimate Oscillator (UO)
5. Stochastic Oscillator (SR)
6. Williams %R (WR)
7. Awesome Oscillator (AO)
8. Kaufman's Adaptive Moving Average (KAMA)

#### Others
1. Daily Return (DR)
2. Daily Log Return (DLR)
3. Cumulative Return (CR)

For a start, we simply add every single indicator into our dataset. The decision tree algorithm will conduct feature selection automatically.


In [5]:
df= add_all_ta_features(df, "Open", "High", "Low", "Close", "Volume", fillna=True)
df.head()

  dip[i] = 100 * (dip_mio[i]/trs[i])
  dip[i] = 100 * (dip_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  dx = 100 * np.abs((dip - din) / (dip + din))


Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume,Binary_Target,volume_adi,volume_obv,...,momentum_mfi,momentum_tsi,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,others_dr,others_dlr,others_cr
0,2000-08-31,4.212054,4.392857,4.209821,4.352679,3.798641,104899200,True,62944820.0,0.0,...,0.0,-100.0,0.264866,78.049127,78.049127,-21.950873,0.0,-92.514194,0.0,0.0
1,2000-09-01,4.379464,4.544643,4.366071,4.53125,3.954481,64218000,False,113431800.0,64218000.0,...,38.844715,-99.761719,0.593832,95.999964,87.024546,-4.000036,0.0,4.102554,4.020632,4.102554
2,2000-09-05,4.475446,4.580357,4.446429,4.459821,3.892146,74660600,False,-5144144.0,-10442600.0,...,26.732066,-99.673667,0.617047,67.469827,80.506306,-32.530173,0.0,-1.576364,-1.588921,2.461519
3,2000-09-06,4.383929,4.455357,4.125,4.174107,3.642799,88851000,False,-122165300.0,-99293600.0,...,19.787697,-99.625467,0.703208,10.784286,58.084692,-89.215714,0.0,-6.406401,-6.620819,-4.102577
4,2000-09-07,4.223214,4.46875,4.160714,4.428571,3.864874,54366200,False,-22252330.0,-44927400.0,...,31.01415,-99.482044,1.187121,66.666593,48.306902,-33.333407,0.0,6.09625,5.917652,1.74357


For all the features we would also like to implement some feature scaling. This helps prevent bias in our data set. We will use normalization in this case for our data.

In [6]:
from sklearn.preprocessing import * 

In [7]:
features = pd.DataFrame(normalize(df.copy().drop('Timestamp', axis =1).drop('Binary_Target', axis =1)))
features["Timestamp"] = df["Timestamp"]
features["Binary_Target"] = df["Binary_Target"]
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,Timestamp,Binary_Target
0,2.697066e-08,2.812838e-08,2.695636e-08,2.787111e-08,2.432349e-08,0.671691,0.403049,0.0,3.592088e-09,0.0,...,1.695992e-09,4.997648e-07,4.997648e-07,-1.40556e-07,0.0,-5.923876e-07,0.0,0.0,2000-08-31,True
1,2.527292e-08,2.622613e-08,2.519563e-08,2.614884e-08,2.282044e-08,0.370588,0.65459,0.370588,3.87063e-09,0.0,...,3.426875e-09,5.539946e-07,5.021994e-07,-2.308332e-08,0.0,2.367493e-08,2.320218e-08,2.367493e-08,2000-09-01,False
2,5.916292e-08,6.054979e-08,5.877933e-08,5.895637e-08,5.145202e-08,0.986972,-0.068003,-0.138045,2.912148e-09,-0.042829,...,8.15702e-09,8.919138e-07,1.064249e-06,-4.300309e-07,0.0,-2.083866e-08,-2.100466e-08,3.253991e-08,2000-09-05,False
3,2.420522e-08,2.45996e-08,2.277558e-08,2.304672e-08,2.011318e-08,0.490578,-0.674518,-0.548235,-1.449674e-10,-0.048574,...,3.882659e-09,5.954385e-08,3.207061e-07,-4.925914e-07,0.0,-3.5372e-08,-3.655588e-08,-2.265177e-08,2000-09-06,False
4,5.70737e-08,6.039195e-08,5.622906e-08,5.984896e-08,5.2231e-08,0.73472,-0.300724,-0.607162,1.098273e-09,0.008571,...,1.604308e-08,9.009511e-07,6.528331e-07,-4.50477e-07,0.0,8.238644e-08,7.997281e-08,2.356309e-08,2000-09-07,False




## Iteration 1 (All 31 Features, Random Forest)
We will split the data into train and test set. We then use a randomforest classifier to predict our binary target with randomly selected features.

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Split into independent and dependent variables
X = df.copy().drop('Binary_Target', axis =1)
y = df[['Timestamp','Binary_Target']]

# Get Training set
#X_train = X[X["Timestamp"] <= '2018-08-31']
#y_train = Y[Y["Timestamp"] <= '2018-08-31']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Training our Classifier
The decision tree algorithm automatically does feature selection for us, by picking features that best split the data. In this case, we use the GINI Index to calculate our information gain.

In [10]:
clf = RandomForestClassifier(n_estimators=200, random_state=0)

In [11]:
X_train.set_index('Timestamp', inplace=True) #Set the index to the date column
y_train.set_index('Timestamp', inplace=True) #Set the index to the date column
y_train = np.ravel(y_train)

In [12]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [13]:
#Test set
#X_test = X[X["Timestamp"] > '2018-08-31']
#y_test = Y[Y["Timestamp"] > '2018-08-31']
X_test.set_index('Timestamp', inplace=True) #Set the index to the date column
y_test.set_index('Timestamp', inplace=True) #Set the index to the date column

In [14]:
predicted = clf.predict(X_test)

## Prediction Accuracy
In this part we evaluate our model training accuracy 

In [15]:
pd.crosstab(y_test["Binary_Target"],predicted, rownames=['Actual'], colnames=['Predicted'])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,459,231
True,125,763


In [16]:
print(classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      1423
        True       1.00      1.00      1.00      1778

    accuracy                           1.00      3201
   macro avg       1.00      1.00      1.00      3201
weighted avg       1.00      1.00      1.00      3201



In [17]:
print(classification_report(y_test["Binary_Target"], predicted))

              precision    recall  f1-score   support

       False       0.79      0.67      0.72       690
        True       0.77      0.86      0.81       888

    accuracy                           0.77      1578
   macro avg       0.78      0.76      0.77      1578
weighted avg       0.78      0.77      0.77      1578



Our model accuracy is pretty bad - an accuracy of 0.46 is worse than random chance, given that this is a binary classification problem.

## Iteration 2 (xgBoost, all 31 features)

In [18]:
xgb_clf = GradientBoostingClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

In [19]:
predicted_xgb = xgb_clf.predict(X_test)

In [20]:
pd.crosstab(y_test["Binary_Target"],predicted_xgb, rownames=['Actual'], colnames=['Predicted'])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,227,463
True,143,745


In [21]:
print(classification_report(y_test["Binary_Target"], predicted_xgb))

              precision    recall  f1-score   support

       False       0.61      0.33      0.43       690
        True       0.62      0.84      0.71       888

    accuracy                           0.62      1578
   macro avg       0.62      0.58      0.57      1578
weighted avg       0.62      0.62      0.59      1578



## Iteration 3 (Random Forest, with feature selection)


In [22]:
from sklearn.feature_selection import *

In [23]:
k = 15
X_train_kbest = pd.DataFrame(SelectKBest(f_classif, k=k).fit(X_train, y_train).transform(X_train))

  f = msb / msw


In [24]:
X_train_kbest

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-2.222489e+05,-1.720716e-11,-2.765488e+06,0.061869,0.0,20.0,-169.138570,-28.709844,6.763595,-35.473439,44.0,-56.0,28.475585,1.015883,-0.057509
1,4.174152e+05,5.032203e-10,1.858959e+05,0.906078,0.0,20.0,62.039616,130.008012,126.820119,3.187893,76.0,40.0,60.862062,30.498954,2.795303
2,-2.717447e+06,1.730016e-11,4.930590e+06,0.173833,0.0,20.0,97.685413,81.903117,68.028662,13.874455,100.0,84.0,73.488363,28.862636,0.344082
3,-1.654800e+05,-4.038319e-13,-1.099071e+06,0.063822,0.0,20.0,-184.225542,-83.914616,-97.152902,13.238286,8.0,-88.0,36.159940,-13.168558,-0.038105
4,1.366825e+06,6.309604e-12,2.169306e+06,0.069259,0.0,20.0,177.212550,116.321688,102.931882,13.389806,100.0,96.0,69.390016,0.167761,0.116950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196,-5.732375e+06,-1.464565e-08,-6.075913e+05,4.361446,0.0,20.0,-26.565291,-15.042453,-1.532554,-13.509899,24.0,-36.0,48.523351,-0.717568,-4.074824
3197,2.047500e+04,8.838406e-12,7.758316e+05,0.078102,0.0,20.0,51.167337,-165.323987,-209.284781,43.960793,20.0,-28.0,48.862260,-20.409454,-0.109261
3198,-3.305606e+08,-6.696333e-09,7.902064e+06,2.576285,0.0,20.0,-75.551335,-67.327528,-49.049405,-18.278123,36.0,-56.0,45.273599,-18.449204,-4.657437
3199,-1.319060e+08,-3.705649e-08,-3.697814e+06,4.252946,0.0,20.0,-78.298196,-95.490030,-91.365664,-4.124366,4.0,-72.0,38.083798,-19.151642,-6.730942


In [25]:
clf3 = RandomForestClassifier(n_estimators=200, random_state=0)
clf3.fit(X_train_kbest, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [26]:
predicted3 = clf3.predict(pd.DataFrame(SelectKBest(f_classif, k=k).fit(X_train, y_train).transform(X_test)))

  f = msb / msw


In [27]:
print(classification_report(y_test["Binary_Target"], predicted3))

              precision    recall  f1-score   support

       False       0.72      0.55      0.63       690
        True       0.71      0.83      0.76       888

    accuracy                           0.71      1578
   macro avg       0.71      0.69      0.69      1578
weighted avg       0.71      0.71      0.70      1578

