# Forecasting Notebook

## 1. Setup
Importing relevant libraries

In [27]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from matplotlib import figure as figure
from statsmodels.tools.eval_measures import mse, rmse, meanabs

Loading in data

In [4]:
# Location of the source workbook (machine-specific absolute path)
file_path = "/Users/robinvandenberg/Desktop/Thesis/Data/main.xlsx"
# Read the workbook and discard the 'Unnamed: 0' column, an index column that is not needed
main = pd.read_excel(file_path).drop(columns='Unnamed: 0')

## 2. Preparation

In [6]:
# Log returns of the current quarter future (CQF) close price
main['returns'] = np.log(main['CQF Close']).diff()
# One-period lag of the log returns. Built from the 'returns' column computed on the line above
# (not the pre-existing 'Returns' column), so the feature and the target share one definition.
main['lag_returns'] = main['returns'].shift(1)
# One-period lag of the CQF trading volume
main['lag_cqf_vol'] = main['CQF Volume'].shift(1)
# Drop the first 2 rows (by position): diff() and shift(1) leave NaN there in the returns columns.
# NOTE(review): this cell reassigns `main`, so running it twice drops two more rows each time.
main = main.iloc[2:]
main

Unnamed: 0,time,Spot Close,Spot Volume,CQF Close,CQF Volume,Returns,LaggedReturns,returns,lag_returns,lag_cqf_vol
2,2019-07-21 13:30:00,10568.0,0.000000e+00,10686.00,0.0000,-0.001449,0.004214,-0.001449,0.004214,0.0000
3,2019-07-21 13:45:00,10535.5,0.000000e+00,10648.00,0.0000,-0.003562,-0.001449,-0.003562,-0.001449,0.0000
4,2019-07-21 14:00:00,10531.0,0.000000e+00,10637.25,0.0000,-0.001010,-0.003562,-0.001010,-0.003562,0.0000
5,2019-07-21 14:15:00,10448.0,0.000000e+00,10546.75,21216.5000,-0.008544,-0.001010,-0.008544,-0.001010,0.0000
6,2019-07-21 14:30:00,10416.5,0.000000e+00,10513.75,0.0000,-0.003134,-0.008544,-0.003134,-0.008544,21216.5000
...,...,...,...,...,...,...,...,...,...,...
97359,2022-04-30 21:00:00,38278.0,6.148997e+06,38380.00,479102.9521,-0.001224,-0.000364,-0.001224,-0.000364,114937.9568
97360,2022-04-30 21:15:00,38325.0,4.012060e+06,38431.00,94022.4783,0.001328,-0.001224,0.001328,-0.001224,479102.9521
97361,2022-04-30 21:30:00,38355.0,3.339375e+06,38460.00,41562.1715,0.000754,0.001328,0.000754,0.001328,94022.4783
97362,2022-04-30 21:45:00,38304.0,1.276453e+06,38416.00,45640.5218,-0.001145,0.000754,-0.001145,0.000754,41562.1715


## 3. Training

In [10]:
# Training sample: the first 10000 rows of the prepared data
train = main[:10000]

# Predictors as a 2D list, one [lagged return, lagged trading volume] pair per training row
X = [
    [ret, vol]
    for ret, vol in zip(train['lag_returns'], train['lag_cqf_vol'])
]

**TODO:** Run a grid search to optimise the SVR hyperparameters.

### 3.1 Linear Kernel

In [11]:
# Support vector regressor with a linear kernel, fitted to the training predictors and log returns.
# NOTE(review): hyperparameters are left at their defaults (epsilon=0.1 in the parameters shown
# by this cell) while the returns displayed in the preparation step are of order 1e-3;
# check that the fitted model is not degenerate. TODO confirm
lin_svr = SVR(kernel='linear').fit(X, train['returns'])
# Display the parameters the model was created with
lin_svr.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

### 3.2 Polynomial Kernel

In [12]:
# Support vector regressor with a polynomial kernel, fitted to the training predictors and log returns
pol_svr = SVR(kernel='poly').fit(X, train['returns'])
# Display the parameters the model was created with
pol_svr.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

### 3.3 Radial Basis Function Kernel

In [14]:
# Support vector regressor with an RBF kernel, fitted to the training predictors and log returns
rbf_svr = SVR(kernel='rbf').fit(X, train['returns'])
# Display the parameters the model was created with
rbf_svr.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

## 4. Testing

In [18]:
# Testing sample: every row after the first 10000 (the rows not used for training)
test = main[10000:]

# Predictors as a 2D list, one [lagged return, lagged trading volume] pair per testing row
Xtest = [
    [ret, vol]
    for ret, vol in zip(test['lag_returns'], test['lag_cqf_vol'])
]

In [19]:
# Predict the returns for the testing data set with each of the 3 fitted SVR models
lin_pred, pol_pred, rbf_pred = (
    model.predict(Xtest) for model in (lin_svr, pol_svr, rbf_svr)
)

### 4.1 Forecast Performance

In [24]:
# Mean squared error (MSE) of each model's forecast on the testing sample.
# The realised values come from 'returns', the column the models were fitted on,
# so training and evaluation use the same target definition.
y_test = np.array(test['returns'])

lin_mse = round(mse(y_test, lin_pred), ndigits=10)
pol_mse = round(mse(y_test, pol_pred), ndigits=10)
rbf_mse = round(mse(y_test, rbf_pred), ndigits=10)

print("Linear MSE: ", lin_mse)
print("Poly MSE:   ", pol_mse)
print("RBF MSE:    ", rbf_mse)

Linear MSE:  0.0001392491
Poly MSE:    0.0001392491
RBF MSE:     0.0001392491


In [25]:
# Root mean squared error (RMSE) of each model's forecast on the testing sample.
# The realised values come from 'returns', the column the models were fitted on,
# so training and evaluation use the same target definition.
y_test = np.array(test['returns'])

lin_rmse = round(rmse(y_test, lin_pred), ndigits=10)
pol_rmse = round(rmse(y_test, pol_pred), ndigits=10)
rbf_rmse = round(rmse(y_test, rbf_pred), ndigits=10)

print("Linear RMSE: ", lin_rmse)
print("Poly RMSE:   ", pol_rmse)
print("RBF RMSE:    ", rbf_rmse)

Linear RMSE:  0.0118003846
Poly RMSE:    0.0118003846
RBF RMSE:     0.0118003846


In [28]:
# Mean absolute error (MAE) of each model's forecast on the testing sample.
# The realised values come from 'returns', the column the models were fitted on,
# so training and evaluation use the same target definition.
y_test = np.array(test['returns'])

lin_mae = round(meanabs(y_test, lin_pred), ndigits=10)
pol_mae = round(meanabs(y_test, pol_pred), ndigits=10)
rbf_mae = round(meanabs(y_test, rbf_pred), ndigits=10)

print("Linear MAE: ", lin_mae)
print("Poly MAE:   ", pol_mae)
print("RBF MAE:    ", rbf_mae)

Linear MAE:  0.0110723684
Poly MAE:    0.0110723684
RBF MAE:     0.0110723684
