[View in Colaboratory](https://colab.research.google.com/github/richard-cartwright/personal/blob/master/Neural_Net_for_S&P500_prediction.ipynb)

In [0]:
# This script takes minute-by-minute S&P500 data and predicts the price change for the following minute.
# This is only a simplistic first attempt, so it shows zero optimisation. Notably, I have not used cross-validation to tune hyperparameters.
# I have also done only very basic feature engineering as this is a first attempt.
# Features: 10min Moving Averages, and their pct changes, for price and volume.
# Target: Pct change of S&P500 in the following minute.
# I use out-of-the-box Lasso linear regression, Random Forest and a basic fully-connected neural network - Random Forest is pretty effective w/ a test R^2 of 0.15

# 1) Basic imports, including ML libraries
# 2) Interacting with Kaggle API
# 3) Download Kaggle dataset
# 4) Read in data
# 5) Create S&P500 Index
# 6) Create Moving Averages
# 7) Create new df of only features & targets
# 8) Add in pct changes & define target variable
# 9) Split into train-test and scale features
# 10) Linear Regression
# 11) Random Forest
# 12) Basic neural network (Keras Sequential, fully-connected layers)

In [0]:
# 1) Basic imports, including ML libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Setting plotting styles: apply matplotlib's 'fivethirtyeight' style sheet first,
# then seaborn's 'white' style on top of it
plt.style.use('fivethirtyeight')
sns.set_style('white')

# Show the result of every top-level expression in a cell, not just the last one.
# Later cells rely on this, e.g. data.head(2) is followed by data.info() in the same cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Sklearn
from sklearn.preprocessing import StandardScaler, normalize, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Tensorflow & Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [0]:
# 2) Interacting with Kaggle API
# From: https://medium.com/@move37timm/using-kaggle-api-for-google-colaboratory-d18645f93648

!pip install kaggle

from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
auth.authenticate_user()
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
filename = "/root/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
fh = io.FileIO(filename, 'wb')
downloader = MediaIoBaseDownload(fh, request)
done = False
while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))
os.chmod(filename, 600)

In [0]:
# 3) Download Kaggle dataset
# Uses the kaggle CLI, which needs the API token written to /root/.kaggle/kaggle.json.

# Optional: search Kaggle for candidate datasets (left disabled)
# !kaggle datasets list -s 'S&P'

# Download the dataset archive into the current working directory
!kaggle datasets download nickdl/snp-500-intraday-data

# Unpack the downloaded archive, then the dataset.zip it is expected to contain
# (presumably this yields the dataset.csv read in the next step - verify with the ls output)
!unzip snp-500-intraday-data.zip
!unzip dataset.zip

# List the working directory to confirm which files were extracted
!ls

In [0]:
# 4) Read in csv with correct index & columns
# The file has a two-row header (read as a two-level column index) and the
# first column is parsed as dates and used as the row index.
# skipfooter drops the last 30000 rows to reduce the size of the dataset. It is only
# supported by the python parser, so that engine is requested explicitly rather than
# relying on pandas' warn-and-fall-back behaviour.
# (The deprecated infer_datetime_format argument is omitted; it does not change the parsed result.)
data = pd.read_csv('dataset.csv', header=[0,1], index_col=0, parse_dates=True,
                   skipfooter=30000, engine='python')

# Sense check
data.head(2)
data.info()

# Impute missing data.
# Forward-fill first so a gap is filled with the last value that was actually known at
# that time; back-filling would copy a FUTURE observation into the past, which leaks
# information into a model that is meant to predict the next minute.
# The trailing bfill only fills gaps at the very start of a column, where no earlier
# observation exists.
data = data.ffill(axis=0).bfill(axis=0)

In [0]:
# 5) Create S&P500 index by multiplying 'close' and 'volume' at each time period
# and summing the products across all tickers.
# NOTE(review): close * volume is the value traded in that minute, not a
# capitalisation-weighted index level - confirm this proxy is what is intended.

# Create dfs for just 'close' and 'volume' columns (second level of the column index).
# .copy() makes them independent frames so relabelling their columns below cannot
# touch, or warn about, the parent `data` frame.
close_df = data.loc[:,(slice(None), 'close')].copy()
volume_df = data.loc[:,(slice(None), 'volume')].copy()

# Drop tier 2 column index so both frames are labelled by the first level only
# and can be multiplied column-by-column
close_df.columns = close_df.columns.droplevel(level=1)
volume_df.columns = volume_df.columns.droplevel(level=1)

# Add new column of S&P500 index to dataframe.
# The intermediate name must be a valid identifier: the previous `S&P_index = ...`
# parsed as `S & P_index = ...` and was a SyntaxError.
sp500_index = close_df*volume_df
data['S&P500'] = sp500_index.sum(axis=1)

In [0]:
# 6) Feature Engineering: create Moving Averages

# Moving Average over the last 10 rows (10 mins of minute data) for each of the tickers,
# computed separately for close (price) and for volume.
# The first 9 rows of each column are NaN because the window is not yet full.
close_MA10_df = close_df.rolling(window=10).mean()
volume_MA10_df = volume_df.rolling(window=10).mean()

# Build the suffixed column names. Both lists are derived from close_df's column
# order, so volume_df is expected to list the tickers in the same order.
columns_close_MA10 = [ticker+'_close_MA10' for ticker in close_df.columns]
columns_volume_MA10 = [ticker+'_volume_MA10' for ticker in close_df.columns]

# Relabel the moving-average frames with the suffixed names
close_MA10_df.columns = columns_close_MA10
volume_MA10_df.columns = columns_volume_MA10

In [0]:
# 7) Create new df of only features & target

# Take the index column on its own. Selecting with a list keeps a DataFrame, and the
# copy makes it independent of `data`. Its columns still carry two levels, so the
# second level is dropped to leave the plain label 'S&P500'.
sp500_only = data.loc[:,['S&P500']].copy()
sp500_only.columns = sp500_only.columns.droplevel(level=1)

# 10min Moving Average of the S&P500 column itself
sp500_only['S&P500_MA10'] = sp500_only['S&P500'].rolling(window=10).mean()

# Side-by-side join on the shared time index: index value, its MA, then the per-ticker
# 10-period MAs for each of 'close' and 'volume'
model_df = pd.concat([sp500_only, close_MA10_df, volume_MA10_df], axis=1)

In [0]:
# 8) Add in pct changes & define target variable
# NOTE: this cell extends model_df, so re-running it without first re-running the
# cell that builds model_df would add '_change' columns of the '_change' columns.

# Create one-period pct change for all variables, named '<column>_change'
change_df = model_df.pct_change()
columns_change = [col+'_change' for col in model_df.columns]
change_df.columns = columns_change

# Concat pct changes to absolute values df
model_df = pd.concat([model_df,change_df], axis=1)

# This is the target variable.
# shift(-1) moves the NEXT period's change onto the current row, so each row pairs
# this period's features with the change we want to predict. The last row becomes NaN.
model_df['S&P500_change'] = model_df['S&P500_change'].shift(-1)

# pct_change produces +/-inf whenever the previous value is 0 (e.g. a zero volume MA).
# dropna() alone keeps those rows and the scaler / estimators then reject them,
# so convert infinities to NaN first.
# Then drop the NAs caused by windows, pct changes and the target shift.
model_df = model_df.replace([np.inf, -np.inf], np.nan).dropna()

In [0]:
# 9) Split into train-test and scale features

# Target is the next-period S&P500 change; features are every other column except
# the raw S&P500 level
y = model_df['S&P500_change']
X = model_df.drop(['S&P500_change', 'S&P500'], axis=1)

# First 70% of rows (in time order) train, the remainder test
train_size = 0.7
train_num = round(train_size*len(X))

# Positional, time-ordered split rather than a shuffled train_test_split:
# the test set must come after the train set to avoid temporal leakage
X_train, X_test = X.iloc[:train_num], X.iloc[train_num:]
y_train, y_test = y.iloc[:train_num], y.iloc[train_num:]

# Standardise features so they are of comparable magnitude.
# The scaler is fitted on the training rows only and then applied to the test rows,
# so no test-set statistics reach the model.
# From here on X_train / X_test are numpy arrays; column names live on X.columns.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
# 10) Baseline linear model: Lasso, i.e. linear regression with L1 regularisation
# alpha is an untuned placeholder - no cross validation has been done yet

lasso = Lasso(alpha=0.01, random_state=42)

lasso.fit(X_train, y_train)

# Score on both splits (R^2); a large train/test gap would indicate overfitting
lasso_train_r2 = lasso.score(X_train, y_train)
lasso_test_r2 = lasso.score(X_test, y_test)

print('\n')
print('Linreg train R^2: ', lasso_train_r2)
print('Linreg test R^2: ', lasso_test_r2)

In [0]:
# 11) Random Forest as an out-of-the-box non-linear regressor
# Hyperparameters are untuned placeholders - no validation has been done yet

# Every size-related limit is set to 10 to keep the trees small; n_jobs=-1 uses all cores
rf_params = dict(
    n_estimators=100,
    max_features=10,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=10,
    max_leaf_nodes=10,
    n_jobs=-1,
    random_state=42,
)
rfr = RandomForestRegressor(**rf_params)

rfr.fit(X_train, y_train)
print('Random Forest train R^2: ',rfr.score(X_train, y_train))
print('Random Forest test R^2: ',rfr.score(X_test, y_test))

# Feature importances from the fitted forest, one value per column of X
importances = rfr.feature_importances_

# How many of the most important features to show
top_features = 20

# argsort is ascending, so reverse it and keep the leading entries
top_idx = np.argsort(importances)[::-1][:top_features]

# X_train is a bare array after scaling, so the names come from the original frame X
top_labels = np.array(X.columns)[top_idx]
bar_positions = range(top_features)

# Bar chart of the top feature importances, most important first
plt.figure(figsize=(12,6))
plt.bar(bar_positions, importances[top_idx], tick_label=top_labels)
plt.xlabel('Features')
plt.ylabel('Importances')
plt.title('Top predictive features from Random Forest')

# Feature names are long, so rotate the tick labels to vertical
plt.xticks(rotation=90)
plt.show();

In [0]:
# 12) Basic Keras Sequential NN with 100-20-1 structure, only 10 epochs for speed
# Fully-connected (Dense) layers only.
# Not optimised any hyperparams (including model structure) b/c not done validation for the moment
# NOTE: no random seed is set in this cell, so the numbers will vary from run to run.

# L1 penalty strength applied to the kernel weights of every layer
reg_param = 0.0001

basic_nn = Sequential()
# Input width is the number of feature columns
basic_nn.add(Dense(100, input_dim=X.shape[1], activation='relu', kernel_regularizer=keras.regularizers.l1(reg_param)))
basic_nn.add(Dense(20, activation='relu', kernel_regularizer=keras.regularizers.l1(reg_param)))

# Linear activation on final node for regression
basic_nn.add(Dense(1, activation='linear', kernel_regularizer=keras.regularizers.l1(reg_param)))

# Fit the model
basic_nn.compile(optimizer='adam', loss='mse')
history = basic_nn.fit(X_train, y_train, epochs=10)


# Plot the per-epoch training loss from the fit
plt.plot(history.history['loss'])

# Use the last loss as the title
plt.title('basic NN loss:' + str(round(history.history['loss'][-1], 4)))
plt.show();

# Calculate R^2 score.
# The labels name the neural network; they previously said 'Random Forest',
# a copy-paste slip that mislabelled these results.
train_preds = basic_nn.predict(X_train)
test_preds = basic_nn.predict(X_test)
print('Basic NN train R^2: ',r2_score(y_train, train_preds))
print('Basic NN test R^2: ',r2_score(y_test, test_preds))