# Train a Deep NN to predict Asset Price movements

## Setup Docker for GPU acceleration

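Start a TensorFlow GPU container that publishes the container's port 8888 on host port 8889 and mounts the notebook directory at `/cnn` (replace `/path/to/` with the location of your local checkout):

`docker run -it -p 8889:8888 -v /path/to/machine-learning-for-trading/16_convolutions_neural_nets/cnn:/cnn --name tensorflow tensorflow/tensorflow:latest-gpu-py3 bash`

Depending on your Docker setup, the container may also need `--gpus all` (or `--runtime=nvidia` on older installations) to access the GPU.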

## Imports & Settings

In [9]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
import os
from pathlib import Path
from importlib import reload
from joblib import dump, load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from keras.models import Sequential
from keras import backend as K
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Dropout, Activation
from keras.models import load_model
from keras.callbacks import Callback, EarlyStopping, TensorBoard, ModelCheckpoint

Using TensorFlow backend.


In [11]:
# Seed NumPy's global (legacy) random state so NumPy-based sampling is repeatable.
np.random.seed(seed=42)

## Build Dataset

In [32]:
# Read the Quandl Wiki price table from the shared HDF5 store, keep only the
# adjusted close, pivot the innermost index level into columns, and retain
# rows from 2007 onwards.
prices = (
    pd.read_hdf('../data/assets.h5', key='quandl/wiki/prices')
    ['adj_close']
    .unstack()
    .loc['2007':]
)
prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2896 entries, 2007-01-01 to 2018-03-27
Columns: 3199 entries, A to ZUMZ
dtypes: float64(3199)
memory usage: 70.7 MB


In [38]:
# Percentage returns: take the last price in each 'W' resampling bin, convert
# to period-over-period changes, keep 2008-2017, drop every column that still
# has a missing value in that span, and order the rows newest-first.
# NOTE(review): the stored output of this cell lists consecutive business days
# and the result is later saved under the key 'returns_daily', which does not
# match a weekly resample - confirm the intended frequency and re-run.
returns = (
    prices
    .resample('W').last()
    .pct_change()
    .loc['2008':'2017']
    .dropna(axis='columns')
    .sort_index(ascending=False)
)
returns.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2576 entries, 2017-12-29 to 2008-01-01
Columns: 2489 entries, A to ZUMZ
dtypes: float64(2489)
memory usage: 48.9 MB


In [39]:
# Preview the first and last five rows of the returns frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat yields the same stacked frame on every pandas version.
pd.concat([returns.head(), returns.tail()])

ticker,A,AAL,AAN,AAON,AAP,AAPL,AAWW,ABAX,ABC,ABCB,...,ZEUS,ZIGO,ZINC,ZION,ZIOP,ZIXI,ZLC,ZMH,ZQK,ZUMZ
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-29,-0.007116,-0.008197,-0.009692,-0.002717,-0.000201,-0.010814,-0.008453,-0.005622,-0.008316,-0.011282,...,-0.009677,0.0,0.0,-0.009934,0.035,-0.006803,0.0,0.0,0.0,-0.017689
2017-12-28,0.002229,0.001145,-0.008867,0.008219,-0.000601,0.002814,-0.004209,0.015912,-0.000108,0.006192,...,-0.005955,0.0,0.0,0.012424,-0.007444,-0.032895,0.0,0.0,0.0,0.002364
2017-12-27,0.000743,-0.008515,0.005946,-0.002732,-0.021479,0.000176,-0.007519,-0.012092,-0.006971,-0.003086,...,-0.016667,0.0,0.0,-0.002949,-0.009828,0.027027,0.0,0.0,0.0,-0.032037
2017-12-26,-0.001485,0.004944,0.002484,-0.004082,0.014023,-0.02537,-0.004988,0.000202,0.008544,-0.01119,...,0.002257,0.0,0.0,-0.009156,0.004938,-0.002247,0.0,0.0,0.0,0.018648
2017-12-22,-0.002518,-0.003789,-0.007152,0.006849,0.004195,0.0,0.005013,0.012862,-0.0057,-0.00203,...,-0.01643,0.0,0.0,-0.002526,-0.026442,0.00907,0.0,0.0,0.0,-0.002326
2008-01-07,0.016393,-0.014504,0.016119,0.037457,0.045136,-0.013385,-0.001922,-0.039675,0.013242,0.054017,...,-0.035795,-0.041885,-0.095238,0.020755,-0.092179,-0.089921,0.108407,0.032481,0.032043,-0.091567
2008-01-04,-0.032554,-0.024371,-0.038737,-0.021655,-0.049328,-0.054517,-0.017378,-0.069631,-0.022758,-0.044974,...,-0.060872,-0.014617,0.002387,-0.044615,0.011299,-0.047945,-0.055052,0.00211,-0.040973,-0.080498
2008-01-03,-0.009917,-0.045045,-0.002378,-0.041001,0.009577,0.000462,0.006464,0.028177,0.006061,-0.055,...,0.025466,-0.059822,-0.001787,-0.020093,0.017241,-0.004545,-0.060249,0.001056,-0.084408,-0.086486
2008-01-02,-0.011976,-0.094494,-0.025492,-0.052472,-0.010529,-0.016357,-0.029878,-0.030117,-0.007132,-0.050445,...,0.015453,-0.007223,-0.010607,-0.029985,-0.016949,-0.043478,-0.049191,0.002116,-0.005828,-0.08867
2008-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
T = 21                  # each sample window spans T + 1 consecutive rows of `returns`
n = len(returns)        # number of rows (return periods) available
tcols = [*range(T)]     # integer column labels 0 .. T-1

In [41]:
# Build one long frame with a row per (window, ticker).
#
# Each window is T + 1 consecutive rows of `returns`. Transposing it gives one
# row per ticker with integer columns 0..T, where column 0 is the first row of
# the window. `year` and `month` are taken from that first row's timestamp.
#
# The per-window frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies the ever-growing result on every
# iteration, which is quadratic in the number of windows.
#
# NOTE(review): range(n - T - 1) stops one short of the last full window
# (start index n - T - 1 would still yield T + 1 rows). Kept as-is so the
# generated dataset is unchanged - confirm whether that is intentional.
windows = []
for i in range(n - T - 1):
    if i % 50 == 0:
        # lightweight progress indicator
        print(i, end=' ', flush=True)
    window = returns.iloc[i:i + T + 1]
    window_start = window.index[0]
    windows.append(window
                   .reset_index(drop=True)
                   .transpose()
                   .reset_index()
                   .assign(year=window_start.year,
                           month=window_start.month))

# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the incremental version produced when there were no windows.
data = pd.concat(windows, ignore_index=True) if windows else pd.DataFrame()
data.info()

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 <class 'pandas.core.frame.DataFrame'>
RangeIndex: 6356906 entries, 0 to 6356905
Data columns (total 25 columns):
ticker    object
0         float64
1         float64
2         float64
3         float64
4         float64
5         float64
6         float64
7         float64
8         float64
9         float64
10        float64
11        float64
12        float64
13        float64
14        float64
15        float64
16        float64
17        float64
18        float64
19        float64
20        float64
21        float64
year      int64
month     int64
dtypes: float64(22), int64(2), object(1)
memory usage: 1.2+ GB


In [42]:
# Winsorize every return column at its own 1st / 99th percentile.
# The window frames carry T + 1 return columns (0 .. T); `tcols` only covers
# 0 .. T-1, so using it here left lag column T unclipped.
return_cols = list(range(T + 1))
data[return_cols] = data[return_cols].apply(
    lambda x: x.clip(lower=x.quantile(.01), upper=x.quantile(.99)))

# Replace ticker symbols with integer codes (numbered by first appearance).
data.ticker = pd.factorize(data.ticker)[0]

# Binary target: 1 when the return in column 0 is positive, else 0.
data['label'] = (data[0] > 0).astype(int)

# First-of-month timestamp per row. Only the two source columns are copied
# (instead of the whole frame) to add the constant `day` column.
data['date'] = pd.to_datetime(data[['year', 'month']].assign(day=1))

# Drop column 0 (the label's source) from the features, index by date, and
# one-hot encode year and month. The dummy dtype is pinned to uint8 because
# pandas >= 2.0 would otherwise produce bool columns.
data = pd.get_dummies((data.drop(0, axis=1)
                       .set_index('date')
                       .apply(pd.to_numeric)),
                      columns=['year', 'month'],
                      dtype=np.uint8).sort_index()
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6356906 entries, 2008-01-01 to 2017-12-01
Data columns (total 45 columns):
ticker       int64
1            float64
2            float64
3            float64
4            float64
5            float64
6            float64
7            float64
8            float64
9            float64
10           float64
11           float64
12           float64
13           float64
14           float64
15           float64
16           float64
17           float64
18           float64
19           float64
20           float64
21           float64
label        int64
year_2008    uint8
year_2009    uint8
year_2010    uint8
year_2011    uint8
year_2012    uint8
year_2013    uint8
year_2014    uint8
year_2015    uint8
year_2016    uint8
year_2017    uint8
month_1      uint8
month_2      uint8
month_3      uint8
month_4      uint8
month_5      uint8
month_6      uint8
month_7      uint8
month_8      uint8
month_9      uint8
month_10     uint8
month_11     

In [43]:
# Persist the engineered dataset to a local HDF5 file under the 'returns_daily' key.
data.to_hdf('data.h5', key='returns_daily')

In [8]:
# Display (rows, columns) of the final dataset.
# NOTE(review): the stored output of this cell, (1167341, 74), and its execution
# count predate the cells above, whose info() reports 6,356,906 rows - re-run
# the notebook top to bottom to refresh it.
data.shape

(1167341, 74)