In [1]:
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import pickle
from lightgbm import LGBMClassifier
import lightgbm
from sklearn.metrics import accuracy_score, precision_score, recall_score


from ficc.utils.auxiliary_variables import IDENTIFIERS, NON_CAT_FEATURES_DOLLAR_PRICE, BINARY_DOLLAR_PRICE, CATEGORICAL_FEATURES_DOLLAR_PRICE,PREDICTORS_DOLLAR_PRICE
from ficc.utils.auxiliary_variables import PREDICTORS, NON_CAT_FEATURES, BINARY, CATEGORICAL_FEATURES, IDENTIFIERS, PURPOSE_CLASS_DICT

Initializing pandarallel with 16.0 cores
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Loading the data. The file contains trades from May to July with predictions from both dollar price model and yield spread model. We use May and June to train the model and test the results on July.

In [2]:
data = pd.read_pickle("ys_dp_predictions.pkl")

In [3]:
ys_variants = ["max_ys", "min_ys", "max_qty", "min_ago", "D_min_ago", "P_min_ago", "S_min_ago"]
ys_feats = ["_ys", "_ttypes", "_ago", "_qdiff"]
D_prev = dict()
P_prev = dict()
S_prev = dict()

def get_trade_history_columns():
    '''
    This function is used to create a list of columns
    '''
    YS_COLS = []
    for prefix in ys_variants:
        for suffix in ys_feats:
            YS_COLS.append(prefix + suffix)
    return YS_COLS

YS_COLS = get_trade_history_columns()

for col in YS_COLS:
    if 'ttypes' in col and col not in PREDICTORS:
        PREDICTORS.append(col)
        CATEGORICAL_FEATURES.append(col)
    elif col not in PREDICTORS:
        NON_CAT_FEATURES.append(col)
        PREDICTORS.append(col)

additional_features = ['dv01','approx_dpd','overage','de_minimis_gap']
for i in additional_features:
    if i not in NON_CAT_FEATURES:
        NON_CAT_FEATURES.append(i)
        PREDICTORS.append(i)

In [4]:
data['yield'] /= 100

In [5]:
data['delta_ys_model'] = np.abs(data['yield'] - data['predicted_yield'])
data['delta_dp_model'] = np.abs(data['yield'] - data['converted_yield_dollar_price'])

In [6]:
data['ys_better_than_dp'] = True
data.loc[data.delta_ys_model > data.delta_dp_model,'ys_better_than_dp'] = False

In [7]:
DECIDING_MODEL_NON_CAT_FEATURE = list(set(NON_CAT_FEATURES + NON_CAT_FEATURES_DOLLAR_PRICE))
DECIDING_MODEL_BINARY_FEATURE = list(set(BINARY + BINARY_DOLLAR_PRICE))
DECIDING_MODEL_CATEGORICAL_FEATURES = list(set(CATEGORICAL_FEATURES + CATEGORICAL_FEATURES_DOLLAR_PRICE))
DECIDING_MODEL_PREDICTORS = list(set(PREDICTORS + PREDICTORS_DOLLAR_PRICE))

In [8]:
train_data = data[data.trade_date < '2023-07-01']
test_data = data[data.trade_date >= '2023-07-01']

## Training GBM model

In [9]:
def gbmprep(df):
    df[DECIDING_MODEL_CATEGORICAL_FEATURES ] = df[DECIDING_MODEL_CATEGORICAL_FEATURES].astype('category')
    df[DECIDING_MODEL_NON_CAT_FEATURE + DECIDING_MODEL_BINARY_FEATURE ] = df[DECIDING_MODEL_NON_CAT_FEATURE + DECIDING_MODEL_BINARY_FEATURE].astype('float64')
    df = df.drop(columns = ['trade_history','target_attention_features'])
    return df

In [10]:
train_label = train_data['ys_better_than_dp']
test_label = test_data['ys_better_than_dp']

In [11]:
%%time
gbt_td = gbmprep(train_data[DECIDING_MODEL_PREDICTORS])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


CPU times: user 4.99 s, sys: 1.29 s, total: 6.28 s
Wall time: 6.31 s


In [12]:
gbtmodel = LGBMClassifier(num_iterations=300,
                          max_depth=12, 
                          num_leaves=300, 
                          objective='binary', 
                          verbosity=0)
gbtmodel.fit(gbt_td, train_label) 



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


LGBMClassifier(max_depth=12, num_iterations=300, num_leaves=300,
               objective='binary', verbosity=0)

In [13]:
y_preds_train = gbtmodel.predict(gbt_td)

In [14]:
accuracy_score(y_preds_train, train_label)

0.7112887647538083

In [15]:
train_data[train_data.ys_better_than_dp == True].shape[0] / train_data.shape[0]

0.647201231052855

In [16]:
y_preds_test = gbtmodel.predict(gbmprep(test_data[DECIDING_MODEL_PREDICTORS]))
accuracy_score(y_preds_test, test_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


0.6309351765175019

In [17]:
test_data[test_data.ys_better_than_dp == True].shape[0] / test_data.shape[0]

0.6219162818058152

In [18]:
test_data['predicted_model'] = y_preds_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
test_data['prediced_yield_deciding_model'] = test_data.apply(lambda x: x.converted_yield_dollar_price if x.predicted_model == False else x.predicted_yield, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
np.mean(np.abs(test_data['prediced_yield_deciding_model']- test_data['yield'])) * 100

9.732109317682339

In [21]:
np.mean(np.abs(test_data['predicted_yield']- test_data['yield'])) * 100

9.785138707743087

In [22]:
np.mean(np.abs(test_data['converted_yield_dollar_price']- test_data['yield'])) * 100

2517.621511763567

In [23]:
temp = test_data[test_data.delta_dp_model > 25]

In [24]:
temp[temp.predicted_model == False][['converted_yield_dollar_price','predicted_yield','yield','callable_at_cav']]

Unnamed: 0,converted_yield_dollar_price,predicted_yield,yield,callable_at_cav
351092,4.114,11.365303,37.63,False
351129,4.066,4.130902,37.63,False
351135,4.101,4.133581,37.63,False
351136,4.066,4.130902,37.63,False


In [36]:
test_data[test_data.converted_yield_dollar_price > 1000][['cusip','trade_date','yield','converted_yield_dollar_price','predicted_dollar_price','dollar_price']]

Unnamed: 0,cusip,trade_date,yield,converted_yield_dollar_price,predicted_dollar_price,dollar_price
133715,720135NJ7,2023-07-26,3.624,14972.51,37.068699,39.73
170240,714369BM6,2023-07-25,0.798,2155.63,47.34182,49.977
510303,058219SS6,2023-07-13,6.113,17545440.0,4.871481,11.778
530742,204709ES4,2023-07-12,4.37,2224.52,41.431992,52.333
530743,204709ES4,2023-07-12,4.37,4605.139,38.839912,52.333
765775,244811BG9,2023-07-05,0.677,5892.474,28.640026,46.913


In [43]:
data[data.cusip == '058219SS6'][['par_call_price','next_call_price','callable_at_cav']]

Unnamed: 0,par_call_price,next_call_price,callable_at_cav
510185,100.0,100.0,False
510303,100.0,100.0,False


In [42]:
data.callable_at_cav.describe()

count     2295416
unique          2
top         False
freq      2286524
Name: callable_at_cav, dtype: object