In [60]:
from math import sqrt
from numpy import split
from numpy import array
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dropout

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from time import time
from tensorflow.python.keras.callbacks import TensorBoard

tensorboard = TensorBoard(log_dir='./final_logs/{}'.format(time()))

## Reading in the data

In [2]:
data = pd.read_csv('../data/events_prices/events_prices/Consumer Discretionary.csv')
data.name = 'ConsumerDiscretion'
data.head(5)

Unnamed: 0.1,Unnamed: 0,height,width,distance,left_slope,right_slope,event_sector,target_sector,target_sector_average_price
0,0,0.053152,3e-06,0.0,0.113784,0.964781,Consumer Discretionary,Health Care,0.017173
1,0,0.089049,2.3e-05,0.01676,0.10441,0.992334,Consumer Discretionary,Health Care,0.116957
2,0,0.07572,7e-06,0.022346,0.033813,0.976758,Consumer Discretionary,Health Care,0.116957
3,0,0.023592,5e-06,0.044693,0.007957,0.964307,Consumer Discretionary,Health Care,0.231336
4,0,0.023596,3e-06,0.01676,0.01415,0.976824,Consumer Discretionary,Health Care,0.231336


In [12]:
X = data.iloc[:, 1:6]
X.head()

Unnamed: 0,height,width,distance,left_slope,right_slope
0,0.053152,3e-06,0.0,0.113784,0.964781
1,0.089049,2.3e-05,0.01676,0.10441,0.992334
2,0.07572,7e-06,0.022346,0.033813,0.976758
3,0.023592,5e-06,0.044693,0.007957,0.964307
4,0.023596,3e-06,0.01676,0.01415,0.976824


In [13]:
y = data.iloc[:, -1]
y.head()

0    0.017173
1    0.116957
2    0.116957
3    0.231336
4    0.231336
Name: target_sector_average_price, dtype: float64

## Data Pre-processing

We reshape the data for 1D convolution, with 5 features. We have variable **image_height**, which indicates the height of the convolution window. The default image height is 5.


In [39]:
# reshapes data into n (len(data)/image_height) image_heightx5 data values.
def split_reshape_dataset(X, y, image_height=5):
    
    X = X.sample(frac=0.05, random_state=100)
    y = y.sample(frac=0.05, random_state=100)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    
    X_train = make_dataset_whole(X_train,image_height)
    X_test = make_dataset_whole(X_test,image_height)
    y_train = make_dataset_whole(y_train,image_height)
    y_test = make_dataset_whole(y_test,image_height)
    
    X_train = np.reshape(X_train.values, (-1, image_height, 5))
    X_test = np.reshape(X_test.values, (-1, image_height, 5))
    y_train = np.reshape(y_train.values, (-1, image_height))
    y_test = np.reshape(y_test.values, (-1, image_height))
    
    return X_train, X_test, y_train, y_test

# to make sure we have uniform sized images. 
# eg. If there are 161 rows of observations and we want to make predictions on 5 time steps each,
# this will reduce the dataset to 160 observations as 160%5==0
def make_dataset_whole(X, image_height=5):
    X = X.reset_index(drop='index')
    x_shape=X.shape[0]
    i = x_shape%image_height
    if i != 0:
        X = X.drop(list(range(x_shape-1, x_shape-i-1,-1)),axis=0)
    
    return X

## Split data into train and test

In [40]:
X_train, X_test, y_train, y_test = split_reshape_dataset(X,y)

y_train.shape, X_train.shape

((1124, 5), (1124, 5, 5))

In [8]:
data1 = pd.read_csv('../data/events_prices/events_prices/Consumer Discretionary.csv')

In [41]:
def get_target_dfs(data):
    df_list = []
    event = str(data.event_sector[0])
    for target in pd.unique(data.target_sector):
        
        target_name = str(target)
        df = data[data.target_sector == target]
        df.name = event+'-'+target_name
        df_list.append(df)
        
    return df_list

target_dflist = get_target_dfs(data1)

## Build basic model using 1 1D convolution layer with kernel_size = 3

In [42]:
def build_model(X_train, y_train):
    verbose, epochs, batch_size = 0, 20, 4
    n_timesteps, n_features, n_outputs = X_train.shape[1], X_train.shape[2], y_train.shape[1]
    model = Sequential()
    model.add(Conv1D(filters=16, kernel_size=2, activation='relu',
                     input_shape=(n_timesteps, n_features)))
    
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(n_outputs))
    model.compile(loss='mse', optimizer='adam')
    model.fit(X_train, y_train, epochs=epochs,
              batch_size=batch_size, verbose=verbose)
    return model

In [43]:
i = 0
# target_dflist = [data1, data2, data3, data4, data5, data6]
index = []
cols = []
mse_dict = {}
while i < len(target_dflist):
    data = target_dflist[i]
    name = data.name
    event_sector = name.split('-')[0]
    target_sector = name.split('-')[1]
    index+=[event_sector]
    cols+=[target_sector]
    X = data.iloc[:, 1:6]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = split_reshape_dataset(X,y)
    model = build_model(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_dict[name] = mse
    print(mse)
    i+=1
print(mse_dict)

0.0013794300178229424
0.0012250820105853889
0.0006843781874176421
0.002825744763597792
0.003363960698824228
0.0012336542774464023
0.0028413736164810384
0.0031752534141794354
0.003479179458192337
0.014152756029514258
{'Consumer Discretionary-Health Care': 0.0013794300178229424, 'Consumer Discretionary-Industrials': 0.0012250820105853889, 'Consumer Discretionary-Information Technology': 0.0006843781874176421, 'Consumer Discretionary-Consumer Staples': 0.002825744763597792, 'Consumer Discretionary-Utilities': 0.003363960698824228, 'Consumer Discretionary-Financials': 0.0012336542774464023, 'Consumer Discretionary-Real Estate': 0.0028413736164810384, 'Consumer Discretionary-Materials': 0.0031752534141794354, 'Consumer Discretionary-Energy': 0.003479179458192337, 'Consumer Discretionary-Telecommunications Services': 0.014152756029514258}


In [45]:
# consumer_dict = pd.DataFrame(mse_dict)
# consumer_dict.to_csv('CD_model1.csv')
np.mean(list(mse_dict.values()))

0.003436081247406146

## Model with 2 convolution layers with kernel_size=2

In [48]:
def build_model_2(X_train, y_train):
    verbose, epochs, batch_size = 0, 20, 4
    n_timesteps, n_features, n_outputs = X_train.shape[1], X_train.shape[2], y_train.shape[1]
    # define model
    model = Sequential()
    model.add(Conv1D(filters=20, kernel_size=2, activation='relu',
                     input_shape=(n_timesteps, n_features)))
    model.add(Conv1D(filters=10, kernel_size=2, activation='relu'))
    model.add(MaxPooling1D(pool_size=1))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs))
    model.compile(loss='mse', optimizer='adam')
    # fit network
    model.fit(X_train, y_train, epochs=epochs,
              batch_size=batch_size, verbose=verbose)
    return model

In [49]:
i = 0
# target_dflist = [data1, data2, data3, data4, data5, data6]
index = []
cols = []
mse_dict = {}
while i < len(target_dflist):
    data = target_dflist[i]
    name = data.name
    event_sector = name.split('-')[0]
    target_sector = name.split('-')[1]
    index+=[event_sector]
    cols+=[target_sector]
    X = data.iloc[:, 1:6]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = split_reshape_dataset(X,y)
    model = build_model_2(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_dict[name] = mse
    print(mse)
    i+=1
print(mse_dict)

0.00141969758149393
0.0012219664118559704
0.0006834738584785911
0.0026699880355763508
0.003395965483112876
0.0013021739886564464
0.0028086327098332635
0.0032137529862367293
0.003528651510155088
0.014578122250404446
{'Consumer Discretionary-Health Care': 0.00141969758149393, 'Consumer Discretionary-Industrials': 0.0012219664118559704, 'Consumer Discretionary-Information Technology': 0.0006834738584785911, 'Consumer Discretionary-Consumer Staples': 0.0026699880355763508, 'Consumer Discretionary-Utilities': 0.003395965483112876, 'Consumer Discretionary-Financials': 0.0013021739886564464, 'Consumer Discretionary-Real Estate': 0.0028086327098332635, 'Consumer Discretionary-Materials': 0.0032137529862367293, 'Consumer Discretionary-Energy': 0.003528651510155088, 'Consumer Discretionary-Telecommunications Services': 0.014578122250404446}


## Changing the timestep size from 5 to 10 and building model with 3 convolution layers.

## Increasing the timestep allows us to use more convolution layers.

In [50]:
def build_model_3(X_train, y_train):
    verbose, epochs, batch_size = 0, 20, 4
    n_timesteps, n_features, n_outputs = X_train.shape[1], X_train.shape[2], y_train.shape[1]
    # define model
    model = Sequential()
    
    model.add(Conv1D(filters=20, kernel_size=3, activation='relu',
                     input_shape=(n_timesteps, n_features)))
    model.add(Conv1D(filters=10, kernel_size=3, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Conv1D(filters=5, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=1))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs))
    model.compile(loss='mse', optimizer='adam')
    # fit network
    model.fit(X_train, y_train, epochs=epochs,
              batch_size=batch_size, verbose=verbose, callbacks=[tensorboard])
    return model

### Testing the changed timestep size with build_model 2

In [53]:
i = 0
# target_dflist = [data1, data2, data3, data4, data5, data6]
index = []
cols = []
mse_dict = {}
while i < len(target_dflist):
    data = target_dflist[i]
    name = data.name
    event_sector = name.split('-')[0]
    target_sector = name.split('-')[1]
    index+=[event_sector]
    cols+=[target_sector]
    X = data.iloc[:, 1:6]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = split_reshape_dataset(X,y, image_height=10) ## change image height
    model = build_model_2(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_dict[name] = mse
    print(mse)
    i+=1
print(mse_dict)
np.mean(list(mse_dict.values()))

0.001531568287631271
0.00133000158947943
0.000739482175976337
0.0030756871047207617
0.0035288014208112567
0.0013566928827960854
0.003003303418114345
0.0034887569093828147
0.003821921532877616
0.015269900625870548
{'Consumer Discretionary-Health Care': 0.001531568287631271, 'Consumer Discretionary-Industrials': 0.00133000158947943, 'Consumer Discretionary-Information Technology': 0.000739482175976337, 'Consumer Discretionary-Consumer Staples': 0.0030756871047207617, 'Consumer Discretionary-Utilities': 0.0035288014208112567, 'Consumer Discretionary-Financials': 0.0013566928827960854, 'Consumer Discretionary-Real Estate': 0.003003303418114345, 'Consumer Discretionary-Materials': 0.0034887569093828147, 'Consumer Discretionary-Energy': 0.003821921532877616, 'Consumer Discretionary-Telecommunications Services': 0.015269900625870548}


0.0037146115947660464

In [54]:
## Final CNN
## Testing with build_model_3
i = 0
# target_dflist = [data1, data2, data3, data4, data5, data6]
index = []
cols = []
mse_dict = {}
while i < len(target_dflist):
    data = target_dflist[i]
    name = data.name
    event_sector = name.split('-')[0]
    target_sector = name.split('-')[1]
    index+=[event_sector]
    cols+=[target_sector]
    X = data.iloc[:, 1:6]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = split_reshape_dataset(X,y, image_height=10) ## change image height
    model = build_model_3(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_dict[name] = mse
    print(mse)
    i+=1
print(mse_dict)
np.mean(list(mse_dict.values()))

0.0013897012092876646
0.0012282276224936783
0.0006891808668886771
0.002681524017329346
0.0031754255677073026
0.0012258341006400892
0.0028631484347698037
0.0031624380596742864
0.00365463521475482
0.014358575152452768
{'Consumer Discretionary-Health Care': 0.0013897012092876646, 'Consumer Discretionary-Industrials': 0.0012282276224936783, 'Consumer Discretionary-Information Technology': 0.0006891808668886771, 'Consumer Discretionary-Consumer Staples': 0.002681524017329346, 'Consumer Discretionary-Utilities': 0.0031754255677073026, 'Consumer Discretionary-Financials': 0.0012258341006400892, 'Consumer Discretionary-Real Estate': 0.0028631484347698037, 'Consumer Discretionary-Materials': 0.0031624380596742864, 'Consumer Discretionary-Energy': 0.00365463521475482, 'Consumer Discretionary-Telecommunications Services': 0.014358575152452768}


0.003442869024599844

In [56]:
# Using model 3 to build the rest of the models

In [57]:
data2 = pd.read_csv('../data/events_prices/events_prices/Health Care.csv')
data3 = pd.read_csv('../data/events_prices/events_prices/Industrials.csv')
data4 = pd.read_csv('../data/events_prices/events_prices/Information Technology.csv')
data5 = pd.read_csv('../data/events_prices/events_prices/Consumer Staples.csv')
data6 = pd.read_csv('../data/events_prices/events_prices/Utilities.csv')
data7 = pd.read_csv('../data/events_prices/events_prices/Financials.csv')
data8 = pd.read_csv('../data/events_prices/events_prices/Real Estate.csv')
data9 = pd.read_csv('../data/events_prices/events_prices/Materials.csv')
data10 = pd.read_csv('../data/events_prices/events_prices/Energy.csv')
data11 = pd.read_csv('../data/events_prices/events_prices/Telecommunications Services.csv')

In [None]:
dataList = [data2, data3, data4, data5, data6, data7, data8, data9, data10, data11]
mse_list = []

for df in dataList:
    target_dflist = get_target_dfs(df)
    i = 0
    # target_dflist = [data1, data2, data3, data4, data5, data6]
    index = []
    cols = []
    mse_dict = {}
    while i < len(target_dflist):
        data = target_dflist[i]
        name = data.name
        event_sector = name.split('-')[0]
        target_sector = name.split('-')[1]
        index+=[event_sector]
        cols+=[target_sector]
        X = data.iloc[:, 1:6]
        y = data.iloc[:, -1]
        X_train, X_test, y_train, y_test = split_reshape_dataset(X,y, image_height=10) ## change image height
        model = build_model_2(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_dict[name] = mse
        print(mse)
        i+=1
    print(mse_dict)
    mse_list.append(mse_dict)
#     np.mean(list(mse_dict.values()))
    
    

0.0016250920525311939
0.0004672604387827129
0.000833812141797443
