# Decision Trees
(Reon)

### Contents:
1. Data Exploration
2. Feature Engineering
3. Training

## Packages


In [1]:
from sklearn.ensemble import *
from sklearn.metrics import *

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ta import *

## Data Exploration

In [3]:
# This is from Samuel's part
df = pd.read_csv('AAPL.csv') #Read the data in
df.Date = pd.to_datetime(df.Date, format='%Y-%m-%d') #Set the date column to datetime
#df.set_index('Date', inplace=True) #Set the index to the date column
df = df.rename(columns = {'Adj Close':'Adj_Close'})
df = df.rename(columns = {'Date':'Timestamp'})
df.head(5) #Observe a few rows of data

Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume
0,2000-08-31,4.212054,4.392857,4.209821,4.352679,3.798641,104899200
1,2000-09-01,4.379464,4.544643,4.366071,4.53125,3.954481,64218000
2,2000-09-05,4.475446,4.580357,4.446429,4.459821,3.892146,74660600
3,2000-09-06,4.383929,4.455357,4.125,4.174107,3.642799,88851000
4,2000-09-07,4.223214,4.46875,4.160714,4.428571,3.864874,54366200


In [4]:
#Target - next day price
target =  list(df["Adj_Close"])[1:] + [0]
#df["Target"] = target
df["Binary_Target"] = target > df["Adj_Close"]

df.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume,Binary_Target
0,2000-08-31,4.212054,4.392857,4.209821,4.352679,3.798641,104899200,True
1,2000-09-01,4.379464,4.544643,4.366071,4.53125,3.954481,64218000,False
2,2000-09-05,4.475446,4.580357,4.446429,4.459821,3.892146,74660600,False
3,2000-09-06,4.383929,4.455357,4.125,4.174107,3.642799,88851000,True
4,2000-09-07,4.223214,4.46875,4.160714,4.428571,3.864874,54366200,False


## Feature Engineering

In this section we will be creating technical indicators that are commonly used for stock technical analysis. We use the package "ta" to create the technical indicators.

### Indicators:

#### Volume
1. Accumulation/Distribution Index (ADI)
2. On-Balance Volume (OBV)
3. Chaikin Money Flow (CMF)
4. Force Index (FI)
5. Ease of Movement (EoM, EMV)
6. Volume-price Trend (VPT)
7. Negative Volume Index (NVI)

#### Volatility
1. Average True Range (ATR)
2. Bollinger Bands (BB)
3. Keltner Channel (KC)
4. Donchian Channel (DC)

#### Trend
1. Moving Average Convergence Divergence (MACD)
2. Average Directional Movement Index (ADX)
3. Vortex Indicator (VI)
4. Trix (TRIX)
5. Mass Index (MI)
6. Commodity Channel Index (CCI)
7. Detrended Price Oscillator (DPO)
8. KST Oscillator (KST)
9. Ichimoku Kinkō Hyō (Ichimoku)

#### Momentum
1. Money Flow Index (MFI)
2. Relative Strength Index (RSI)
3. True strength index (TSI)
4. Ultimate Oscillator (UO)
5. Stochastic Oscillator (SR)
6. Williams %R (WR)
7. Awesome Oscillator (AO)
8. Kaufman's Adaptive Moving Average (KAMA)

#### Others
1. Daily Return (DR)
2. Daily Log Return (DLR)
3. Cumulative Return (CR)

For a start, we simply add every single indicator into our dataset. The decision tree algorithm will conduct feature selection automatically.


In [5]:
df= add_all_ta_features(df, "Open", "High", "Low", "Close", "Volume", fillna=True)
df.head(15)

  dip[i] = 100 * (dip_mio[i]/trs[i])
  dip[i] = 100 * (dip_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  dx = 100 * np.abs((dip - din) / (dip + din))


Unnamed: 0,Timestamp,Open,High,Low,Close,Adj_Close,Volume,Binary_Target,volume_adi,volume_obv,...,momentum_mfi,momentum_tsi,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,others_dr,others_dlr,others_cr
0,2000-08-31,4.212054,4.392857,4.209821,4.352679,3.798641,104899200,True,62944820.0,0.0,...,0.0,-100.0,0.264866,78.049127,78.049127,-21.950873,0.0,-92.514194,0.0,0.0
1,2000-09-01,4.379464,4.544643,4.366071,4.53125,3.954481,64218000,False,113431800.0,64218000.0,...,38.844715,-99.761719,0.593832,95.999964,87.024546,-4.000036,0.0,4.102554,4.020632,4.102554
2,2000-09-05,4.475446,4.580357,4.446429,4.459821,3.892146,74660600,False,-5144144.0,-10442600.0,...,26.732066,-99.673667,0.617047,67.469827,80.506306,-32.530173,0.0,-1.576364,-1.588921,2.461519
3,2000-09-06,4.383929,4.455357,4.125,4.174107,3.642799,88851000,True,-122165300.0,-99293600.0,...,19.787697,-99.625467,0.703208,10.784286,58.084692,-89.215714,0.0,-6.406401,-6.620819,-4.102577
4,2000-09-07,4.223214,4.46875,4.160714,4.428571,3.864874,54366200,False,-22252330.0,-44927400.0,...,31.01415,-99.482044,1.187121,66.666593,48.306902,-33.333407,0.0,6.09625,5.917652,1.74357
5,2000-09-08,4.401786,4.401786,4.178571,4.205357,3.670071,48879600,False,3035167.0,-93807000.0,...,27.612057,-99.378545,1.230305,17.647033,31.699304,-82.352967,0.011905,-5.040317,-5.171777,-3.384628
6,2000-09-11,4.191964,4.3125,4.151786,4.174107,3.642799,46845400,False,-70981420.0,-140652400.0,...,25.01277,-99.298794,1.267083,10.784286,31.699304,-89.215714,-0.014349,-0.7431,-0.745875,-4.102577
7,2000-09-12,4.095982,4.290179,4.071429,4.125,3.599942,46999400,True,-57812490.0,-187651800.0,...,22.877998,-99.234557,22.449698,10.526243,12.985854,-89.473757,-0.060658,-1.176467,-1.183442,-5.230779
8,2000-09-13,4.053571,4.25,4.053571,4.142857,3.615526,76496000,False,-30933580.0,-111155800.0,...,32.257838,-99.176272,19.271963,16.949198,12.753242,-83.050802,-0.069395,0.432897,0.431963,-4.820525
9,2000-09-14,4.183036,4.258929,4.058036,4.061384,3.544423,106638000,False,-110037700.0,-217793800.0,...,27.603366,-99.126577,18.181522,1.483145,9.652862,-98.516855,-0.086161,-1.96659,-1.986184,-6.692315


In [6]:
# Split into independent and dependent variables
df = df[df["Timestamp"] > '2000-08-31']
X = df.copy().drop('Binary_Target', axis =1)
y = df[['Timestamp','Binary_Target']]

# Get Training set
X_train = X[X["Timestamp"] <= '2018-08-31']
y_train = y[y["Timestamp"] <= '2018-08-31']

X_test = X[X["Timestamp"] > '2018-08-31']
y_test = y[y["Timestamp"] > '2018-08-31']

In [7]:
#Test set
X_test.set_index('Timestamp', inplace=True) #Set the index to the date column
y_test.set_index('Timestamp', inplace=True) #Set the index to the date column

In [8]:
X_train.set_index('Timestamp', inplace=True) #Set the index to the date column
y_train.set_index('Timestamp', inplace=True) #Set the index to the date column
y_train = np.ravel(y_train)

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
clf0 = DecisionTreeClassifier(min_samples_split = 5)
clf0.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [11]:
print(classification_report(y_test["Binary_Target"], clf0.predict(X_test)))

              precision    recall  f1-score   support

       False       0.48      0.36      0.41       119
        True       0.53      0.64      0.58       131

    accuracy                           0.51       250
   macro avg       0.50      0.50      0.49       250
weighted avg       0.50      0.51      0.50       250



## Iteration 1 (All 31 Features, Random Forest)
For the first iteration, no featue selection will be conducted. Instead, the decision tree algorithm automatically does feature selection for us by picking features that best split the data. In this case, we use the GINI Index to calculate our information gain.

In [12]:
clf = RandomForestClassifier(n_estimators=100, bootstrap = "False" ,min_samples_split = 5)

In [13]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap='False', class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
predicted = clf.predict(X_test)

In [15]:
pd.crosstab(y_test["Binary_Target"],predicted, rownames=['Actual'], colnames=['Predicted'])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,27,92
True,21,110


In [16]:
print(classification_report(y_test["Binary_Target"], predicted))

              precision    recall  f1-score   support

       False       0.56      0.23      0.32       119
        True       0.54      0.84      0.66       131

    accuracy                           0.55       250
   macro avg       0.55      0.53      0.49       250
weighted avg       0.55      0.55      0.50       250



Our model accuracy is pretty bad -  given that this is a binary classification problem, the nearer the accuracy is to 50%, the worse the model is.

## Iteration 2 (xgBoost, all 31 features)

In [67]:
xgb_clf = GradientBoostingClassifier(n_estimators=300, learning_rate = 0.1, min_samples_split = 5).fit(X_train, y_train)

In [68]:
predicted_xgb = xgb_clf.predict(X_test)

In [69]:
pd.crosstab(y_test["Binary_Target"],predicted_xgb, rownames=['Actual'], colnames=['Predicted'])

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,22,97
True,23,108


In [70]:
print(classification_report(y_test["Binary_Target"], predicted_xgb))

              precision    recall  f1-score   support

       False       0.49      0.18      0.27       119
        True       0.53      0.82      0.64       131

    accuracy                           0.52       250
   macro avg       0.51      0.50      0.46       250
weighted avg       0.51      0.52      0.46       250



## Iteration 3 (Random Forest, with feature selection)


In [21]:
from sklearn.feature_selection import *

In [88]:
k = 9
X_train_kbest = pd.DataFrame(SelectKBest(f_classif, k=k).fit(X_train, y_train).transform(X_train))

  f = msb / msw


In [89]:
X_train_kbest

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.134318e+08,0.0,0.0,0.622373,1.993932,-923.606395,-924.374168,4.0,-99.761719
1,-5.144144e+06,0.0,0.0,0.365246,2.925131,-923.504028,-924.084121,4.0,-99.673667
2,-1.221653e+08,0.0,0.0,0.068974,4.083825,-924.681284,-924.233412,-8.0,-99.625467
3,-2.225233e+07,0.0,0.0,0.007939,5.277060,-924.512375,-924.289204,-8.0,-99.482044
4,3.035167e+06,0.0,0.0,-0.061208,6.391636,-925.039580,-924.414267,-8.0,-99.378545
...,...,...,...,...,...,...,...,...,...
4523,1.009841e+07,1.0,20.0,0.423318,25.914690,110.762158,104.912862,80.0,32.435996
4524,6.054898e+06,1.0,20.0,0.425141,25.834287,111.747007,107.113618,84.0,33.292511
4525,1.959744e+07,1.0,20.0,0.429232,25.841784,113.189658,109.135601,88.0,34.412831
4526,1.544511e+07,1.0,20.0,0.435695,26.004257,113.258200,110.529937,92.0,35.629117


In [90]:
clf3 = RandomForestClassifier(n_estimators=200, random_state=0)
clf3.fit(X_train_kbest, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [91]:
predicted3 = clf3.predict(pd.DataFrame(SelectKBest(f_classif, k=k).fit(X_train, y_train).transform(X_test)))

  f = msb / msw


In [92]:
print(classification_report(y_test["Binary_Target"], predicted3))

              precision    recall  f1-score   support

       False       0.49      0.48      0.48       119
        True       0.53      0.54      0.54       131

    accuracy                           0.51       250
   macro avg       0.51      0.51      0.51       250
weighted avg       0.51      0.51      0.51       250



## Iteration 4 (xgboost, feature selection)

In [95]:
xgb_clf2 = GradientBoostingClassifier(n_estimators=300, learning_rate = 0.1, min_samples_split = 5).fit(X_train_kbest , y_train)

In [96]:
predicted_xgb2 = xgb_clf2.predict(pd.DataFrame(SelectKBest(f_classif, k=k).fit(X_train, y_train).transform(X_test)))

  f = msb / msw


In [97]:
print(classification_report(y_test["Binary_Target"], predicted_xgb2))

              precision    recall  f1-score   support

       False       0.41      0.36      0.38       119
        True       0.48      0.53      0.50       131

    accuracy                           0.45       250
   macro avg       0.44      0.44      0.44       250
weighted avg       0.44      0.45      0.44       250

