In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as stats
import tensorflow as tf
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm

In [None]:
def normalizeSpread(spread, ask):
    """Convert a spread from price space to a fraction of the ask price.

    Parameters
    ----------
    spread : number
        Spread in price units.
    ask : number
        Ask price used as the denominator.

    Returns
    -------
    float
        ``spread / ask`` when ``ask`` is strictly positive, otherwise
        ``np.nan`` (zero, negative or NaN ask).

    Notes
    -----
    Negative spreads are NOT clipped or removed here; they are passed through
    as negative fractions.
    """
    # `not ask > 0` is also true for a NaN ask, so every unusable denominator
    # yields NaN instead of silently falling through and returning None.
    if not ask > 0:
        return np.nan
    return float(spread / ask)

def splitData(data, features, target, test_fraction, set_seed=False):
    """Split a frame into shuffled train and test feature/target sets.

    Parameters
    ----------
    data : DataFrame
        Source rows.
    features : list of str
        Column names used as model inputs.
    target : str
        Column name of the quantity to predict.
    test_fraction : float
        Passed to ``train_test_split`` as ``test_size``.
    set_seed : bool, default False
        When true, a fixed integer seed is used so the split is reproducible;
        otherwise the split differs from run to run.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x = data[features]
    y = data[target]
    # random_state must be an int, a RandomState instance or None; the string
    # '12262021' used previously made every seeded call raise.
    seed = 12262021 if set_seed else None
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_fraction, shuffle=True, random_state=seed
    )
    return (x_train, x_test, y_train, y_test)

def build_and_compile_model(norm):
    """Assemble and compile the baseline dense network used for DNN fitting.

    The network is ``norm`` followed by one 64-unit ReLU layer and a single
    linear output unit. It is compiled with mean-absolute-error loss and an
    Adam optimizer (0.001 passed as its first argument); the loss string can
    be swapped for mean squared error if desired.

    Parameters
    ----------
    norm : Keras layer
        First layer of the stack (the callers in this notebook pass an
        already-adapted ``Normalization`` layer).

    Returns
    -------
    keras.Sequential
        The compiled, untrained model.
    """
    hidden_layer = layers.Dense(64, activation='relu')
    output_layer = layers.Dense(1)
    network = keras.Sequential([norm, hidden_layer, output_layer])

    adam = tf.keras.optimizers.Adam(0.001)
    network.compile(optimizer=adam, loss='mean_absolute_error')
    return network

In [None]:
datapath = ''     # path to the out-of-sample (testing) parquet file -- set before running
featurepath = ''  # path to the feature-description CSV -- set before running

# Training data comes from the fixed file name below (the copy on the dropbox);
# the testing data and the feature table come from the paths configured above.
TrainingDataSet = pd.read_parquet("spreads_data_train.par")
omcDataSet = pd.read_parquet(datapath)
FeatureSet = pd.read_csv(featurepath)

# Order both datasets by local time.
TrainingDataSet = TrainingDataSet.sort_values(by='localtime')
# Fixed: the sorted frame used to be assigned to a misspelled name
# (`tomcDataSet`), leaving omcDataSet unsorted.
omcDataSet = omcDataSet.sort_values(by='localtime')

tickers = list(TrainingDataSet['symbol'].unique())

# Move the index into an 'ordinal_times' column. sort_values keeps the original
# index labels, so this column carries the pre-sort index values.
TrainingDataSet = TrainingDataSet.rename_axis("ordinal_times").reset_index()
omcDataSet = omcDataSet.rename_axis("ordinal_times").reset_index()

In [None]:
def _prepare_spread_frame(frame):
    """Filter unusable rows and append the log-transformed columns.

    Keeps rows whose 'fut_spread', 'spread' and 'numEvents' are all strictly
    positive, drops rows containing any missing value, then adds
    'log_fut_spread', 'log_ordinal_times' (log of ordinal_times + 1),
    'log_median_width' and 'log_spread'.

    Returns a new DataFrame; the input frame is not modified.
    """
    positive_spreads = (frame['fut_spread'] > 0) & (frame['spread'] > 0)
    cleaned = frame[positive_spreads]
    cleaned = cleaned[cleaned['numEvents'] > 0]
    cleaned = cleaned.dropna()

    # Vectorised np.log on whole columns replaces the former row-wise
    # DataFrame.apply(..., axis=1): same values, without a Python call per row.
    # NOTE(review): 'median_width' is not filtered for positivity, so
    # non-positive values would produce -inf/NaN here -- confirm the data
    # guarantees median_width > 0.
    return cleaned.assign(
        log_fut_spread=np.log(cleaned['fut_spread']),
        log_ordinal_times=np.log(cleaned['ordinal_times'] + 1),
        log_median_width=np.log(cleaned['median_width']),
        log_spread=np.log(cleaned['spread']),
    )


TrainingDataSet = _prepare_spread_frame(TrainingDataSet)
omcDataSet = _prepare_spread_frame(omcDataSet)

In [None]:
# Feature sets and target used by the ridge-regression cells.
staticFeatures = ['log_ordinal_times', 'log_median_width']
allFeatures = [*staticFeatures, 'log_spread']   # static features plus the current log spread
target = 'log_fut_spread'

# User inputs.
symbol_type = ''          # symbol type of interest -- set before running
regularizer = 10**-3      # ridge regularization strength; 10**-3 was found to work well

Ridge regression fit using only the static features (log_ordinal_times, log_median_width)

In [None]:
testsize = 0.1

# Restrict the training rows to the selected symbol type, then split.
# Fixed: the row-selection bracket was never closed, which was a syntax error.
type_rows = TrainingDataSet[TrainingDataSet['symbol_type'] == symbol_type]
x_train, x_test, y_train, y_test = splitData(type_rows, staticFeatures, target, testsize)

# Ridge(normalize=True) was deprecated in scikit-learn 1.0 and removed in 1.2.
# The documented equivalent is a StandardScaler(with_mean=False) step followed
# by Ridge with alpha multiplied by the number of training samples.
Predictor = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=regularizer * len(x_train)),
).fit(x_train, y_train)

y_test_pred = Predictor.predict(x_test)

# Scatter rather than a line plot: the test rows are shuffled, so joining them
# with line segments only produces a scribble.
plt.scatter(y_test_pred, y_test, s=4)
plt.plot([-5, 5], [-5, 5], color='C1', label='truth')  # identity line: perfect prediction
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.legend()
plt.xlabel('predicted log(fut spread)')
plt.ylabel('true log(fut spread)');

In [None]:
# Evaluate the fitted ridge model on the out-of-sample rows of the chosen symbol type.
# NOTE: `staticFeatures` is redefined by the DNN feature cell further down, so
# this cell must be run before that one (i.e. in notebook order).
omc_rows = omcDataSet[omcDataSet['symbol_type'] == symbol_type]  # mask computed once
omc_x_data = omc_rows[staticFeatures]
omc_true_spread = omc_rows[target]

omc_prediction = Predictor.predict(omc_x_data)

# Scatter rather than a line plot: joining unordered points with line segments
# obscures the prediction-vs-truth relationship.
plt.scatter(omc_prediction, omc_true_spread, s=4)
plt.plot([-5, 5], [-5, 5], color='C1', label='truth')  # identity line: perfect prediction
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.legend()
plt.xlabel('predicted log(fut spread)')
plt.ylabel('true log(fut spread)');

Ridge regression fit using all features (the static features plus log_spread)

In [None]:
testsize = 0.1

# Restrict the training rows to the selected symbol type, then split.
# Fixed: the row-selection bracket was never closed, which was a syntax error.
type_rows = TrainingDataSet[TrainingDataSet['symbol_type'] == symbol_type]
x_train, x_test, y_train, y_test = splitData(type_rows, allFeatures, target, testsize)

# Ridge(normalize=True) was deprecated in scikit-learn 1.0 and removed in 1.2.
# The documented equivalent is a StandardScaler(with_mean=False) step followed
# by Ridge with alpha multiplied by the number of training samples.
Predictor = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=regularizer * len(x_train)),
).fit(x_train, y_train)

y_test_pred = Predictor.predict(x_test)

# Scatter rather than a line plot: the test rows are shuffled, so joining them
# with line segments only produces a scribble.
plt.scatter(y_test_pred, y_test, s=4)
plt.plot([-5, 5], [-5, 5], color='C1', label='truth')  # identity line: perfect prediction
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.legend()
plt.xlabel('predicted log(fut spread)')
plt.ylabel('true log(fut spread)');

In [None]:
# Evaluate the all-feature ridge model on the out-of-sample rows of the chosen symbol type.
omc_rows = omcDataSet[omcDataSet['symbol_type'] == symbol_type]  # mask computed once
omc_x_data = omc_rows[allFeatures]
omc_true_spread = omc_rows[target]

omc_prediction = Predictor.predict(omc_x_data)

# Scatter rather than a line plot: joining unordered points with line segments
# obscures the prediction-vs-truth relationship.
plt.scatter(omc_prediction, omc_true_spread, s=4)
plt.plot([-5, 5], [-5, 5], color='C1', label='truth')  # identity line: perfect prediction
plt.xlim([-5, 5])
plt.ylim([-5, 5])
plt.legend()
plt.xlabel('predicted log(fut spread)')
plt.ylabel('true log(fut spread)');

DNN fitting (one model per symbol) using only the static features

In [None]:
# Build the DNN feature lists from the feature-description table.
# NOTE: this rebinds `staticFeatures`, replacing the list used by the ridge cells above.
is_static = FeatureSet['Category'] == 'Static'
is_float_or_int = (FeatureSet['Type'] == 'float64') | (FeatureSet['Type'] == 'int64')
is_trade_or_quote = (FeatureSet['Category'] == 'Trade') | (FeatureSet['Category'] == 'Quote')

# Static columns of float64/int64 type, plus the ordinal time column.
staticFeatures = list(FeatureSet[is_static & is_float_or_int]['Column']) + ['ordinal_times']
# Trade and quote columns.
# NOTE(review): unlike the static list, these are not filtered by 'Type' --
# confirm every Trade/Quote column is numeric.
TandQFeatures = list(FeatureSet[is_trade_or_quote]['Column'])
allNumericalFeatures = staticFeatures + TandQFeatures

In [None]:
symbol = ''  # ticker to model -- set before running

# Train on the rows of the selected symbol only.
data = TrainingDataSet[TrainingDataSet['symbol'] == symbol]
# Fixed: this used to reference the undefined name `StaticFeatures` (NameError).
x_train, x_test, y_train, y_test = splitData(data, staticFeatures, target, 0.1, set_seed=False)

# Adapt the normalization layer on the training inputs only.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(x_train))

dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()
history = dnn_model.fit(
    x_train,
    y_train,
    validation_split=0.2,  # fraction of the training rows held out for validation
    verbose=0, epochs=100)

y_test_pred = dnn_model.predict(x_test).flatten()

a = plt.axes(aspect='equal')
plt.scatter(y_test, y_test_pred)
# Fixed labels: the target is 'log_fut_spread', so both axes are in log space.
plt.xlabel('True log(fut spread)')
plt.ylabel('Predicted log(fut spread)')
# NOTE(review): these limits apply to log values, so points below 0 are
# hidden -- confirm the range suits the data.
lims = [0, 2]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)  # identity line: perfect prediction

In [None]:
# Evaluate the static-feature DNN on the out-of-sample rows of the same symbol.
omc_symbol_rows = omcDataSet[omcDataSet['symbol'] == symbol]  # mask computed once
omc_test_data = omc_symbol_rows[staticFeatures]
omc_test_true = omc_symbol_rows[target]

omc_prediction = dnn_model.predict(omc_test_data).flatten()

a = plt.axes(aspect='equal')
plt.scatter(omc_test_true, omc_prediction)
# Fixed labels: the target is 'log_fut_spread', so both axes are in log space.
plt.xlabel('True log(fut spread)')
plt.ylabel('Predicted log(fut spread)')
# NOTE(review): these limits apply to log values, so points below 0 are
# hidden -- confirm the range suits the data.
lims = [0, 2]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)  # identity line: perfect prediction

DNN fitting (one model per symbol) using all numerical features (static plus trade and quote features)

In [None]:
symbol = ''  # ticker to model -- set before running

# Train on the rows of the selected symbol only, using every numerical feature.
data = TrainingDataSet[TrainingDataSet['symbol'] == symbol]
x_train, x_test, y_train, y_test = splitData(data, allNumericalFeatures, target, 0.1, set_seed=False)

# Adapt the normalization layer on the training inputs only.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(x_train))

dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()
history = dnn_model.fit(
    x_train,
    y_train,
    validation_split=0.2,  # fraction of the training rows held out for validation
    verbose=0, epochs=100)

y_test_pred = dnn_model.predict(x_test).flatten()

a = plt.axes(aspect='equal')
plt.scatter(y_test, y_test_pred)
# Fixed labels: the target is 'log_fut_spread', so both axes are in log space.
plt.xlabel('True log(fut spread)')
plt.ylabel('Predicted log(fut spread)')
# NOTE(review): these limits apply to log values, so points below 0 are
# hidden -- confirm the range suits the data.
lims = [0, 2]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)  # identity line: perfect prediction

In [None]:
# Evaluate the all-feature DNN on the out-of-sample rows of the same symbol.
omc_symbol_rows = omcDataSet[omcDataSet['symbol'] == symbol]  # mask computed once
omc_test_data = omc_symbol_rows[allNumericalFeatures]
omc_test_true = omc_symbol_rows[target]

omc_prediction = dnn_model.predict(omc_test_data).flatten()

a = plt.axes(aspect='equal')
plt.scatter(omc_test_true, omc_prediction)
# Fixed labels: the target is 'log_fut_spread', so both axes are in log space.
plt.xlabel('True log(fut spread)')
plt.ylabel('Predicted log(fut spread)')
# NOTE(review): these limits apply to log values, so points below 0 are
# hidden -- confirm the range suits the data.
lims = [0, 2]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)  # identity line: perfect prediction