## Base Model


Baseline model that runs a linear regression on the sequence of past yield spreads to predict the yield spread at the next timestamp.

In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import os
import plotly.graph_objects as go
from IPython.display import display, HTML

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Seed NumPy's global RNG so that the random train/test split drawn with
# np.random.rand in DataLoader.processData is repeatable on a fresh kernel.
# NOTE(review): DataFrame.sample without random_state presumably draws from
# this same global RNG - confirm against the pandas documentation.
np.random.seed(10)

In [2]:
# BigQuery client handed to DataLoader.processData further down.
# NOTE(review): no project or credentials are passed here, so they are
# presumably resolved from the environment - confirm before sharing.
bq_client = bigquery.Client()

#### Hyper-parameters for the model

In [24]:
# Threshold for the random split: a row goes to the training set when its
# uniform draw in [0, 1) is <= this value, otherwise to the test set.
TRAIN_TEST_SPLIT = 0.85
# Number of consecutive trades per window. The first SEQUENCE_LENGTH - 1 trades
# become the features and the last trade supplies the target.
SEQUENCE_LENGTH = 32
# Stride between the start positions of consecutive windows.
STEP_SIZE = 1
# Transform applied to days_ago: None keeps the raw value, 'LOG' uses
# log(1 + days_ago), 'SQRT' uses sqrt(days_ago) (matched case-insensitively).
DAYS_AGO_PROCESSING = None

#### Query for BigQuery

In [25]:
# SQL text sent to BigQuery by DataLoader.fetchData. For every
# (rtrs_control_number, publish_datetime) of a `latest` trade message it
# aggregates the joined `past` rows of the same CUSIP into an array of structs
# named `recent`, ordered by past.trade_datetime ascending.
# NOTE(review): the WHERE filters on past.trade_date discard rows for which the
# LEFT JOIN found no match, so the join behaves like an INNER JOIN - confirm
# that this is intended.
# NOTE(review): LIMIT is applied without an ORDER BY, so which 10000 groups are
# returned is not guaranteed to be stable between runs - confirm acceptable.
DATA_QUERY = """ SELECT
  --Combination of rtrs_control_number and publish_datetime uniquely identifies the trade messages
  --and therefore we are groupping transaction messages based on these two fields
  --Note: there are few records that have same rtrs_control_number and publish_datetime, but different sequence_number and transaction_type that need to be explored later (e.g. 2020120307600500, 2021020407278700, 2020111902310500, 2021020201831300)
  latest.rtrs_control_number,
  latest.publish_datetime,
  --Previous transactions
  ARRAY_AGG(STRUCT (past.rtrs_control_number,
      past.cusip,
      past.trade_datetime,
      past.publish_datetime,
      past.msrb_valid_from_date,
      past.msrb_valid_to_date,
      past.yield_spread,
      past.yield,
      past.dollar_price,
      past.par_traded,
      past.trade_type,
      DATE_DIFF(latest.trade_date,past.trade_date, day) AS days_ago )
  ORDER BY
    past.trade_datetime ASC ) AS recent
FROM
  eng-reactor-287421.MSRB.msrb_transforms latest
LEFT JOIN
  eng-reactor-287421.MSRB.msrb_transforms past
ON
  latest.cusip = past.cusip
  AND latest.trade_datetime BETWEEN past.msrb_valid_from_date
  AND past.msrb_valid_to_date
WHERE
  --filter to show the most recent message for each trade
  latest.MSRB_INST_ORDR_DESC = 1
  --This date can be updated show all trades that has accored during the desired dates
  AND past.trade_date >= '2021-01-01'
  AND past.trade_date < '2021-04-01'
GROUP BY
  latest.rtrs_control_number,
  latest.publish_datetime
LIMIT
  10000
        """

The DataLoader class grabs the data from BigQuery and returns the dataset as a pair of pandas DataFrames (train and test). The query creates an array of all trades for every CUSIP. The query drops the trades that have been canceled. We calculate the yield spreads by taking the difference between the bond's yield and the yield of the [S&P Municipal Bond Index](https://www.spglobal.com/spdji/en/indices/fixed-income/sp-municipal-bond-index/#overview). To keep test runs small, the query is limited to 10,000 rows; this can easily be changed by editing or removing the `LIMIT` in `DATA_QUERY`.

The main driver of the class is the `processData` function, which is implemented as a class method so that it can be called from other files without first creating an instance of the class.

The dataset is split into training and testing sets, with roughly 85% of the data being used to train the model. The parameter that decides the train/test split ratio is defined in the cells above. We create sequences of `SEQUENCE_LENGTH` trades (currently 32): the first 31 trades of each sequence are fed into the model as features and the yield spread of the last trade is the target. The sequence length is defined as a hyper-parameter and can be easily changed. In this baseline each trade in the sequence contributes the number of days ago the trade was executed and its yield spread (in basis points); the prices, par traded value, and type of trade returned by the query are not used yet. Additional features can be added with a few minor tweaks to the source code.


In [35]:
class DataLoader(object):
    '''
    Class to load the data from big query
    and process it to create train and test data.

    The methods read the module-level hyper-parameters SEQUENCE_LENGTH,
    STEP_SIZE, DAYS_AGO_PROCESSING and TRAIN_TEST_SPLIT at call time.
    '''
    def __init__(self, query, client):
        '''
        parameters:
        query  - SQL text handed to client.query
        client - object exposing query(sql).result().to_dataframe()
        '''
        self.query = query
        self.trade_dataframe = None
        self.client = client

    @staticmethod
    def createSequence(x):
        '''
        Creates sequences of historical trades.

        parameters:
        x - sliceable sequence of trades (list or array)

        Returns a list with every window of exactly SEQUENCE_LENGTH
        consecutive trades, with window starts STEP_SIZE apart. The list is
        empty when x holds fewer than SEQUENCE_LENGTH trades.
        '''
        # Only start positions that still leave room for a full window are
        # visited, so no short slice is built just to be thrown away.
        last_start = len(x) - SEQUENCE_LENGTH
        return [x[start:start + SEQUENCE_LENGTH]
                for start in range(0, last_start + 1, STEP_SIZE)]

    @staticmethod
    def tradeDictToList(trade_dict: dict) -> list:
        '''
        This function converts one recent-trade dictionary to the list
        [days_ago feature, yield spread in basis points].

        parameters:
        trade_dict : dict with the keys 'days_ago' and 'yield_spread'

        raises:
        NotImplementedError when DAYS_AGO_PROCESSING is neither None,
        'LOG' nor 'SQRT' (compared case-insensitively)
        '''
        days_ago = trade_dict['days_ago']

        if DAYS_AGO_PROCESSING is None:
            days_ago_feature = days_ago
        else:
            processing = DAYS_AGO_PROCESSING.upper()
            if processing == 'LOG':
                # 1 + days_ago keeps same-day trades (days_ago == 0) finite
                days_ago_feature = np.log(1 + days_ago)
            elif processing == 'SQRT':
                days_ago_feature = np.sqrt(days_ago)
            else:
                raise NotImplementedError("The provided processing type does not match any implemented")

        # A few blunt normalizations, will experiment with others as well.
        # Multiplying the yield spreads by 100 to convert to basis points
        return [days_ago_feature, trade_dict['yield_spread'] * 100]

    @staticmethod
    def tradeListToArray(trade_history):
        '''
        parameters:
        trade_history - list of trade dictionaries

        The floating values from BigQuery come as
        decimal data type. We convert it to a 32 bit
        float.

        Returns a float32 array with one row per trade, or an empty
        float32 array when there is no history.
        '''
        if len(trade_history) == 0:
            return np.array([], dtype=np.float32)

        trades_list = [DataLoader.tradeDictToList(entry) for entry in trade_history]
        # The explicit dtype performs the documented conversion; without it
        # Decimal values would be kept as Python objects in an object array.
        return np.array(trades_list, dtype=np.float32)

    @staticmethod
    def getFeatures(x, ind) -> str:
        '''
        Returns the feature sequence as a comma separated string
        parameters:
        x - list of per-trade feature lists
        ind - int, position of the feature inside each trade
        '''
        return ','.join(str(trade[ind]) for trade in x)

    @staticmethod
    def create_features(x):
        '''
        Returns the first SEQUENCE_LENGTH - 1 trades of a window,
        which are used as the model input.
        '''
        return x[:SEQUENCE_LENGTH - 1]

    @staticmethod
    def create_target(x):
        '''
        Returns the value at position 1 (the yield spread written by
        tradeDictToList) of the last trade of a window.
        '''
        return x[SEQUENCE_LENGTH - 1][1]

    def fetchData(self):
        '''
        Fills self.trade_dataframe from the local file 'base.pkl' when it
        exists, otherwise from BigQuery, and replaces the raw 'recent'
        column with a numeric 'trade_history' array column.
        '''
        # NOTE(review): unpickling can execute arbitrary code, so 'base.pkl'
        # must come from a trusted source. Nothing in this class writes that
        # file; it has to be produced elsewhere.
        if os.path.isfile('base.pkl'):
            raw_frame = pd.read_pickle('base.pkl')
        else:
            raw_frame = self.client.query(self.query).result().to_dataframe()

        raw_frame['trade_history'] = raw_frame['recent'].apply(self.tradeListToArray)
        self.trade_dataframe = raw_frame.drop(columns=['recent'])

    # Class functions do not need an instance of the class to be called.
    # They are methods associated with the class and not the instance
    # and can be called by the class directly
    @classmethod
    def processData(cls, query, client):
        '''
        Class method to process training and test data.
        This function fetches the trades, cuts every trade history into
        windows of SEQUENCE_LENGTH trades, derives the features and the
        target of every window and randomly splits the windows.

        parameters:
        query  - SQL text handed to the client
        client - BigQuery client (or a compatible object)

        Returns (train_data, test_data), two dataframes with the columns
        'rtrs_control_number', 'features' and 'target'. Both are empty when
        no history is long enough to form a window.
        '''
        instance = cls(query, client)
        instance.fetchData()

        frame = instance.trade_dataframe
        frame['trade_history'] = frame['trade_history'].apply(instance.createSequence)
        # One row per window; histories without any full window explode to
        # NaN and are dropped.
        frame = (
            frame[['rtrs_control_number', 'trade_history']]
            .explode("trade_history", ignore_index=True)
            .dropna()
        )

        frame['features'] = frame['trade_history'].apply(instance.create_features)
        frame['target'] = frame['trade_history'].apply(instance.create_target)
        frame = frame.drop(columns=['trade_history'])

        # The emptiness guard avoids an IndexError from iloc[0] when no
        # window survived the steps above.
        if len(frame) > 0 and len(frame['features'].iloc[0].shape) > 1:
            frame['features'] = frame['features'].apply(lambda window: window.flatten())

        instance.trade_dataframe = frame

        # NOTE(review): windows are assigned to train/test independently, so
        # overlapping windows of the same history can land on both sides of
        # the split - confirm this is acceptable for the evaluation.
        random_selection = np.random.rand(len(frame.index)) <= TRAIN_TEST_SPLIT
        train_data = frame[random_selection]
        test_data = frame[~random_selection]
        return train_data, test_data

In [36]:
# Fetch the trades, build the windows and split them randomly into the
# training and the test dataframe.
train_dataframe, test_dataframe = DataLoader.processData(DATA_QUERY,bq_client)

In [37]:
# Shuffle the training rows (frac=1 keeps every row).
# NOTE(review): the frame is reassigned to itself, so re-running this cell
# reshuffles again; a full Restart & Run All is needed for a repeatable order.
train_dataframe = train_dataframe.sample(frac=1)

In [39]:
# Stack the per-row feature vectors into 2-D matrices (one row per window)
# and pull the targets out as 1-D arrays, for train and test alike.
train_data = np.stack(train_dataframe["features"].tolist())
target = train_dataframe["target"].to_numpy()

test_data = np.stack(test_dataframe["features"].tolist())
test_target = test_dataframe["target"].to_numpy()

In [40]:
# Least-squares linear model fitted without an intercept term
# (fit_intercept=False), so predictions are a pure weighted sum of the features.
reg = LinearRegression(fit_intercept=False).fit(train_data, target)

In [41]:
# Predict the target of every held-out window.
predicted_target= reg.predict(test_data)

In [42]:
# Mean absolute error on the held-out windows, in the units of the target
# (yield_spread * 100, as built by DataLoader.tradeDictToList).
mean_absolute_error(test_target,predicted_target)

24.193020739081778

In [43]:
reg.coef_

array([-0.47387869,  0.02912572,  0.84481038, -0.03643999,  0.59516149,
        0.17631237, -1.08789602,  0.01165806, -0.4696235 , -0.09141885,
        1.23573229, -0.01088993, -0.63246414,  0.08061162, -1.30069041,
       -0.09451323,  0.92780613, -0.02184098,  0.44507619,  0.11487903,
       -0.13335717, -0.07788781,  0.69930775,  0.07081154, -1.06834229,
       -0.14576499,  0.28378074,  0.08758424,  0.63579619,  0.00759879,
       -0.42647782,  0.02603858, -0.19117437, -0.05901626,  0.03572699,
       -0.11834497, -0.01381655,  0.20872997,  0.36427898,  0.00189678,
       -0.63728922, -0.03708282, -0.19395008,  0.11366395,  0.43969185,
        0.11015604, -0.18291194,  0.01576893,  0.02793643,  0.00323986,
        0.13084619,  0.06293257,  0.45619042, -0.02209613, -0.04776599,
        0.06647938,  1.08867538,  0.1594163 , -0.84623941,  0.03080845,
       -0.40889909,  0.29119417])