In [29]:
import numpy as np
import talib
import pandas as pd
import pathlib
from talib import abstract
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [30]:
PATH = r'./data.csv'
df = pd.read_csv(PATH, thousands=',')

In [31]:
df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,"Dec 29, 2017",14392.6,14398.5,15109.8,13951.1,118.88K,-0.04%
1,"Dec 28, 2017",14398.7,15416.3,15505.5,13466.1,170.37K,-6.60%
2,"Dec 27, 2017",15416.6,15757.0,16514.6,14534.7,138.71K,-2.16%
3,"Dec 26, 2017",15756.6,13830.2,16094.7,13748.5,143.14K,13.90%
4,"Dec 25, 2017",13833.5,13790.0,14467.4,13010.7,107.48K,0.32%
...,...,...,...,...,...,...,...
2185,"Jan 05, 2012",6.9,5.6,7.2,5.6,182.33K,24.78%
2186,"Jan 04, 2012",5.6,4.9,5.7,4.8,131.17K,14.14%
2187,"Jan 03, 2012",4.9,5.2,5.3,4.7,125.17K,-6.51%
2188,"Jan 02, 2012",5.2,5.3,5.5,4.8,69.15K,-0.95%


### Splitting

Whole data set: 01.01.2012 - 29.12.2017 | 2190 (should be 2168)

- 02.01.2012 - 29.04.2012 | 120
- 30.04.2012 - 19.07.2016 (training) | 1539
- 20.07.2016 - 29.12.2017 (test) | 509

In [32]:
df = df.rename(columns={"Date": "date", 'Price': 'close', 'Open': 'open', "High": "high", "Low": "low", "Vol.": "volume", "Change %": "change"})

In [33]:
df['date'] = pd.to_datetime(df['date'])
df = df.set_index(df['date'])
df = df.sort_index()

In [34]:
df["change"] = df["change"].str.rstrip('%').astype('float')

In [35]:
df.volume = (df.volume.replace(r'[KMB]+$', '', regex=True).astype(float) * df.volume.str.extract(r'[\d\.]+([KMB]+)', expand=False).fillna(1).replace(['K','M', 'B'], [10**3, 10**6, 10**9]).astype(int))

In [36]:
df["close"] = df["close"].astype('float')

In [37]:
df = df.astype({"close": "float64", "low": "float64", "high": "float64", "open": "float64"})

In [38]:
bins = [-float("inf"), -100, -11, -9, -7, -5, -3, -1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1, 3 , 5, 7, 9, 11, 100, float("inf")]
# Use the bins below to produce a binary result: Price will go UP or DOWN
# bins = [-float("inf"), -100, 0, 100, float("inf")]
df['binned'] = pd.cut(df['change'], bins)

In [39]:
labelencoder = LabelEncoder()
df['change_cat'] = labelencoder.fit_transform(df['binned'])

In [40]:
df

Unnamed: 0_level_0,date,close,open,high,low,volume,change,binned,change_cat
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-01,2012-01-01,5.3,4.7,5.5,4.6,108510.0,11.65,"(11.0, 100.0]",21
2012-01-02,2012-01-02,5.2,5.3,5.5,4.8,69150.0,-0.95,"(-1.0, -0.8]",6
2012-01-03,2012-01-03,4.9,5.2,5.3,4.7,125170.0,-6.51,"(-7.0, -5.0]",3
2012-01-04,2012-01-04,5.6,4.9,5.7,4.8,131170.0,14.14,"(11.0, 100.0]",21
2012-01-05,2012-01-05,6.9,5.6,7.2,5.6,182330.0,24.78,"(11.0, 100.0]",21
...,...,...,...,...,...,...,...,...,...
2017-12-25,2017-12-25,13833.5,13790.0,14467.4,13010.7,107480.0,0.32,"(0.2, 0.4]",12
2017-12-26,2017-12-26,15756.6,13830.2,16094.7,13748.5,143140.0,13.90,"(11.0, 100.0]",21
2017-12-27,2017-12-27,15416.6,15757.0,16514.6,14534.7,138710.0,-2.16,"(-3.0, -1.0]",5
2017-12-28,2017-12-28,14398.7,15416.3,15505.5,13466.1,170370.0,-6.60,"(-7.0, -5.0]",3


In [41]:
indicators = [
    {
        "name": "BBANDS"
    },
    {
        "name": "DEMA"
    },
    {
        "name": "EMA"
    },
    {
        "name": "HT_TRENDLINE"
    },
    {
        "name": "KAMA"
    },
    {
        "name": "MIDPOINT"
    },
    {
        "name": "MIDPRICE"
    },
    {
        "name": "SAR"
    },
    {
        "name": "SAREXT"
    },
    {
        "name": "SMA",
        "display_name": "SMA_3",
        "params": {
            "timeperiod": 3
        }
    },
    {
        "name": "SMA",
        "display_name": "SMA_5",
        "params": {
            "timeperiod": 5
        }
    },
    {
        "name": "SMA",
        "display_name": "SMA_10",
        "params": {
            "timeperiod": 10
        }
    },
    {
        "name": "SMA",
        "display_name": "SMA_20",
        "params": {
            "timeperiod": 20
        }
    },
    {
        "name": "T3"
    },
    {
        "name": "TEMA"
    },
    {
        "name": "TRIMA"
    },
    {
        "name": "WMA"
    },
    {
        "name": "ADX",
        "display_name": "ADX_14",
        "params": {
            "timeperiod": 14
        }
    },
    {
        "name": "ADX",
        "display_name": "ADX_20",
        "params": {
            "timeperiod": 20
        }
    },
    {
        "name": "ADXR"
    },
    {
        "name": "APO"
    },
    {
        "name": "AROONOSC"
    },
    {
        "name": "BOP"
    },
    {
        "name": "CCI",
        "display_name": "CCI_3",
        "params": {
            "timeperiod": 3
        }
    },
    {
        "name": "CCI",
        "display_name": "CCI_5",
        "params": {
            "timeperiod": 5
        }
    },
    {
        "name": "CCI",
        "display_name": "CCI_10",
        "params": {
            "timeperiod": 10
        }
    },
    {
        "name": "CCI",
        "display_name": "CCI_14",
        "params": {
            "timeperiod": 14
        }
    },
    {
        "name": "CMO"
    },
    {
        "name": "DX"
    },
    {
        "name": "BOP"
    },
    {
        "name": "MACD"
    },
    {
        "name": "MINUS_DI"
    },
    {
        "name": "MINUS_DM"
    },
    {
        "name": "MOM",
        "display_name": "MOM_1",
        "params": {
            "timeperiod": 1
        }
    },
    {
        "name": "MOM",
        "display_name": "MOM_3",
        "params": {
            "timeperiod": 3
        }
    },
    {
        "name": "MOM",
        "display_name": "MOM_5",
        "params": {
            "timeperiod": 5
        }
    },
    {
        "name": "MOM",
        "display_name": "MOM_10",
        "params": {
            "timeperiod": 10
        }
    },
    {
        "name": "PLUS_DI"
    },
    {
        "name": "PLUS_DM"
    },
    {
        "name": "PPO"
    },
    {
        "name": "ROC"
    },
    {
        "name": "ROCR"
    },
    {
        "name": "ROCR100"
    },
    {
        "name": "RSI",
        "display_name": "RSI_5",
        "params": {
            "timeperiod": 5
        }
    },
    {
        "name": "RSI",
        "display_name": "RSI_10",
        "params": {
            "timeperiod": 10
        }
    },
    {
        "name": "RSI",
        "display_name": "RSI_15",
        "params": {
            "timeperiod": 15
        }
    },
    {
        "name": "STOCH"
    },
    {
        "name": "STOCHF"
    },
    {
        "name": "TRIX"
    },
    {
        "name": "ULTOSC"
    },
    {
        "name": "WILLR"
    },
    {
        "name": "ATR"
    },
    {
        "name": "NATR"
    },
    {
        "name": "TRANGE"
    },
    {
        "name": "CDL2CROWS"
    },
    {
        "name": "CDL3BLACKCROWS"
    },
    {
        "name": "CDL3INSIDE"
    },
    {
        "name": "CDL3LINESTRIKE"
    },
    {
        "name": "CDL3OUTSIDE"
    },
    {
        "name": "CDL3STARSINSOUTH"
    },
    {
        "name": "CDL3WHITESOLDIERS"
    },
    {
        "name": "CDLABANDONEDBABY"
    },
    {
        "name": "CDLADVANCEBLOCK"
    },
    {
        "name": "CDLBELTHOLD"
    },
    {
        "name": "CDLBREAKAWAY"
    },
    {
        "name": "CDLCLOSINGMARUBOZU"
    },
    {
        "name": "CDLCONCEALBABYSWALL"
    },
    {
        "name": "CDLCOUNTERATTACK"
    },
    {
        "name": "CDLDARKCLOUDCOVER"
    },
    {
        "name": "CDLDOJI"
    },
    {
        "name": "CDLDOJISTAR"
    },
    {
        "name": "CDLDRAGONFLYDOJI"
    },
    {
        "name": "CDLENGULFING"
    },
    {
        "name": "CDLEVENINGDOJISTAR"
    },
    {
        "name": "CDLEVENINGSTAR"
    },
    {
        "name": "CDLGAPSIDESIDEWHITE"
    },
    {
        "name": "CDLGRAVESTONEDOJI"
    },
    {
        "name": "CDLHAMMER"
    },
    {
        "name": "CDLHANGINGMAN"
    },
    {
        "name": "CDLHARAMI"
    },
    {
        "name": "CDLHARAMICROSS"
    },
    {
        "name": "CDLHIGHWAVE"
    },
    {
        "name": "CDLHIKKAKE"
    },
    {
        "name": "CDLHIKKAKEMOD"
    },
    {
        "name": "CDLHOMINGPIGEON"
    },
    {
        "name": "CDLIDENTICAL3CROWS"
    },
    {
        "name": "CDLINNECK"
    },
    {
        "name": "CDLINVERTEDHAMMER"
    },
    {
        "name": "CDLKICKING"
    },
    {
        "name": "CDLKICKINGBYLENGTH"
    },
    {
        "name": "CDLLADDERBOTTOM"
    },
    {
        "name": "CDLLONGLEGGEDDOJI"
    },
    {
        "name": "CDLLONGLINE"
    },
    {
        "name": "CDLMARUBOZU"
    },
    {
        "name": "CDLMATCHINGLOW"
    },
    {
        "name": "CDLMATHOLD"
    },
    {
        "name": "CDLMORNINGDOJISTAR"
    },
    {
        "name": "CDLMORNINGSTAR"
    },
    {
        "name": "CDLONNECK"
    },
    {
        "name": "CDLPIERCING"
    },
    {
        "name": "CDLRICKSHAWMAN"
    },
    {
        "name": "CDLRISEFALL3METHODS"
    },
    {
        "name": "CDLSEPARATINGLINES"
    },
    {
        "name": "CDLSHOOTINGSTAR"
    },
    {
        "name": "CDLSHORTLINE"
    },
    {
        "name": "CDLSPINNINGTOP"
    },
    {
        "name": "CDLSTALLEDPATTERN"
    },
    {
        "name": "CDLSTICKSANDWICH"
    },
    {
        "name": "CDLTAKURI"
    },
    {
        "name": "CDLTASUKIGAP"
    },
    {
        "name": "CDLTHRUSTING"
    },
    {
        "name": "CDLTRISTAR"
    },
    {
        "name": "CDLUNIQUE3RIVER"
    },
    {
        "name": "CDLUPSIDEGAP2CROWS"
    },
    {
        "name": "CDLXSIDEGAP3METHODS"
    },
    {
        "name": "HT_DCPERIOD"
    },
    {
        "name": "HT_DCPHASE"
    },
    {
        "name": "HT_TRENDMODE"
    }
]

## Indicators

In [42]:
for indicator in indicators:
    name = indicator.get("name")
    params = indicator.get("params")
    display_name = indicator.get("display_name")
    
    fn = abstract.Function(name)
    
    if params == None:
        res = fn(df)
    else:
        res = fn(df, **params)
    
    name = display_name if display_name != None else name
    
    if isinstance(res, pd.core.series.Series):
        res = res.rename(name)
    elif isinstance(res, pd.core.frame.DataFrame):
        new_columns = {}
        for col in res:
            new_columns[col] = name + "_" + col
        res.rename(columns=new_columns, inplace=True)

    df =  pd.concat([df, res], axis=1)


In [43]:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [44]:
df.describe()

Unnamed: 0,close,open,high,low,volume,change,change_cat,BBANDS_upperband,BBANDS_middleband,BBANDS_lowerband,...,CDLTAKURI,CDLTASUKIGAP,CDLTHRUSTING,CDLTRISTAR,CDLUNIQUE3RIVER,CDLUPSIDEGAP2CROWS,CDLXSIDEGAP3METHODS,HT_DCPERIOD,HT_DCPHASE,HT_TRENDMODE
count,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2186.0,2186.0,2186.0,...,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2158.0,2127.0,2190.0
mean,913.453881,906.881598,943.633379,869.993744,57980.694064,0.608178,11.143836,978.248713,901.638747,825.02878,...,2.785388,0.0,0.0,0.0,0.0,0.0,0.0,23.253379,139.881984,0.835616
std,2104.922934,2085.197192,2194.919523,1965.082588,55408.184755,9.219659,5.504751,2303.025809,2059.696648,1825.517557,...,16.459165,0.0,0.0,0.0,0.0,0.0,0.0,5.750955,90.841002,0.370708
min,4.2,4.2,4.4,3.9,400.0,-57.21,0.0,4.469666,4.32,-35.663811,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.351935,-44.872591,0.0
25%,115.0,114.35,120.2,108.2,22102.5,-1.06,5.0,124.560808,115.57,104.480753,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.945852,61.63238,1.0
50%,375.8,375.2,384.15,365.05,43440.0,0.16,11.0,392.849605,375.34,348.974139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.090524,166.12547,1.0
75%,655.075,654.65,668.925,636.7,75200.0,2.03,16.0,684.487793,654.725,620.977702,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.225334,198.345008,1.0
max,19345.5,19346.6,19870.6,18750.9,572350.0,336.84,22.0,20484.508314,18502.4,16949.990271,...,100.0,0.0,0.0,0.0,0.0,0.0,0.0,45.457506,314.373168,1.0


In [45]:
init_start = "2012-01-02"
init_end = "2012-04-29"

train_start = "2012-04-30"
train_end = "2016-07-19"

test_start = "2016-07-20"
test_end = "2017-12-29"

In [46]:
df.sum()

close                  2.000464e+06
open                   1.986071e+06
high                   2.066557e+06
low                    1.905286e+06
volume                 1.269777e+08
                           ...     
CDLUPSIDEGAP2CROWS     0.000000e+00
CDLXSIDEGAP3METHODS    0.000000e+00
HT_DCPERIOD            5.018079e+04
HT_DCPHASE             2.975290e+05
HT_TRENDMODE           1.830000e+03
Length: 131, dtype: float64

In [47]:
# create train test partition
init = df[init_start:init_end]
train = df[train_start:train_end]
test  = df[test_start:test_end]

print('Init Dataset:',init.shape)
print('Train Dataset:',train.shape)
print('Test Dataset:',test.shape)

Init Dataset: (119, 133)
Train Dataset: (1542, 133)
Test Dataset: (528, 133)


# Machine learning magic

In [48]:
features = [c for c in df.columns if c not in ["change_cat", "binned", "date", "close", "open", "high", "low", "volume", "change"]]

In [49]:
X_train = train[features]
y_train = train["change_cat"]

In [50]:
clf = DecisionTreeClassifier(criterion="entropy", splitter="random", max_features="sqrt", )

In [51]:
tree = BaggingClassifier(clf, n_estimators=1000)

In [52]:
tree.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                        max_features='sqrt',
                                                        splitter='random'),
                  n_estimators=1000)

In [53]:
X_test = test[features]
y_test = test["change_cat"]

In [54]:
y_pred = tree.predict(X_test)

In [55]:
y_test.describe()

count    528.000000
mean      11.676136
std        5.382454
min        0.000000
25%        7.000000
50%       12.000000
75%       16.000000
max       21.000000
Name: change_cat, dtype: float64

In [56]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.4185606060606061


# Trading Strategy

In [62]:
labelencoder.classes_

array([Interval(-100.0, -11.0, closed='right'),
       Interval(-11.0, -9.0, closed='right'),
       Interval(-9.0, -7.0, closed='right'),
       Interval(-7.0, -5.0, closed='right'),
       Interval(-5.0, -3.0, closed='right'),
       Interval(-3.0, -1.0, closed='right'),
       Interval(-1.0, -0.8, closed='right'),
       Interval(-0.8, -0.6, closed='right'),
       Interval(-0.6, -0.4, closed='right'),
       Interval(-0.4, -0.2, closed='right'),
       Interval(-0.2, 0.0, closed='right'),
       Interval(0.0, 0.2, closed='right'),
       Interval(0.2, 0.4, closed='right'),
       Interval(0.4, 0.6, closed='right'),
       Interval(0.6, 0.8, closed='right'),
       Interval(0.8, 1.0, closed='right'),
       Interval(1.0, 3.0, closed='right'),
       Interval(3.0, 5.0, closed='right'),
       Interval(5.0, 7.0, closed='right'),
       Interval(7.0, 9.0, closed='right'),
       Interval(9.0, 11.0, closed='right'),
       Interval(11.0, 100.0, closed='right'),
       Interval(100.0, in

In [75]:
trading_data_X = train[features]
trading_data_X["prediction"] = tree.predict(trading_data_X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trading_data_X["prediction"] = tree.predict(trading_data_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trading_data_X["prediction"] = labelencoder.inverse_transform(trading_data_X["prediction"])
