Author: Gil Shulman
Date: 2024-11-15
Last Edit Date: 2024-11-15

In [4]:
# Sentinel code used for the 'Unknown' and 'Not Applicable' frequency labels.
LARGE_NUMBER = 1e6

# Numeric code for each interest-payment-frequency label found in the
# reference data. For the regular schedules the code reads as coupon
# payments per year ('Semiannually' -> 2, 'Every 4 years' -> 1/4).
# NOTE(review): the codes for 'Changeable' (44) and 'Daily' (360) are not
# explained anywhere in this notebook -- confirm their intended meaning.
COUPON_FREQUENCY_TYPE = {
    'Unknown': LARGE_NUMBER,
    'Semiannually': 2,
    'Monthly': 12,
    'Annually': 1,
    'Weekly': 52,
    'Quarterly': 4,
    'Every 2 years': 1 / 2,
    'Every 3 years': 1 / 3,
    'Every 4 years': 1 / 4,
    'Every 5 years': 1 / 5,
    'Every 7 years': 1 / 7,
    'Every 8 years': 1 / 8,
    'Every 10 years': 1 / 10,
    'Biweekly': 26,
    'Changeable': 44,
    'Daily': 360,
    'Interest at maturity': 0,
    'Not Applicable': LARGE_NUMBER,
}

In [5]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/gil/git/ficc/creds.json"
import pandas as pd

from datetime import datetime
from google.cloud import bigquery
from deloitte_ycl import add_yield_curve

# Shared BigQuery client used by sqltodf(). No credentials are passed here;
# GOOGLE_APPLICATION_CREDENTIALS is set in the environment just above.
bqclient = bigquery.Client()
# GCP project id. NOTE(review): not referenced by any code visible in this
# notebook -- confirm whether it is still needed.
project = "eng-reactor-287421"

def sqltodf(sql, limit=""):
    """Run a SQL statement on BigQuery and return the rows as a DataFrame.

    Args:
        sql: Query text sent through the module-level ``bqclient``.
        limit: When not the empty string, `` ORDER BY RAND() LIMIT <limit>``
            is appended to ``sql`` so that a random sample of that many rows
            is returned. The value is interpolated into the query as-is.

    Returns:
        The query result converted with ``to_dataframe()``.
    """
    sampling_clause = f" ORDER BY RAND() LIMIT {limit}" if limit != "" else ""
    query_job = bqclient.query(sql + sampling_clause)
    return query_job.result().to_dataframe()

In [None]:
# df = sqltodf(f'''
# SELECT
#   msrb.trade_date,
#   msrb.time_of_trade,
#   msrb.trade_datetime,
#   msrb.dollar_price,
#   msrb.yield,
#   msrb.trade_type,
#   msrb.cusip,
#   msrb.par_traded,
#   msrb.settlement_date,
#   ref_data_v1.file_received_from_provider_timestamp,
#   ref_data_v1.accrual_date,
#   ref_data_v1.next_coupon_payment_date,
#   ref_data_v1.interest_payment_frequency,
#   ref_data_v1.current_coupon_rate AS coupon,
#   ref_data_v1.incorporated_state_code,
#   ref_data_v1.coupon_type,
#   ref_data_v1.is_callable,
#   ref_data_v1.sink_indicator,
#   ref_data_v1.is_general_obligation,
#   ref_data_v1.callable_at_cav,
#   ref_data_v1.sp_long,
#   ref_data_v1.purpose_class,
#   ref_data_v1.maturity_date AS maturity_date,
#   ref_data_v1.next_call_date,
#   ref_data_v1.par_call_date,
#   ref_data_v1.instrument_primary_name
# FROM
#   `auxiliary_views_v2.msrb_final` msrb
# INNER JOIN
#   `reference_data_v1.reference_data_flat` ref_data_v1
# ON
#   msrb.cusip = ref_data_v1.cusip
#   AND timestamp(msrb.trade_datetime, "America/New_York") BETWEEN ref_data_v1.ref_valid_from_date AND ref_data_v1.ref_valid_to_date
#   AND msrb.trade_date > "2023-01-01"
# ''')

# df.to_pickle("lgbm_data_file.pkl")

# Light GBM Model:

# Ensemble Model for Bond Price Prediction

## Model Architecture
This study employs an ensemble of Light Gradient Boosting Machine (LightGBM) regressors for predicting municipal bond prices. The ensemble is constructed using a Voting Regressor, which aggregates predictions from multiple base models to enhance generalization and mitigate overfitting.

## Key Components
1. **Base Model**: LightGBM Regressor
   - Objective: Mean Absolute Error (MAE) minimization
   - Key parameters: max_depth, num_leaves, and n_estimators, all derived from a single configurable depth setting
   - Incorporates subsampling for robustness

2. **Ensemble Method**: Voting Regressor
   - Combines multiple LightGBM models with varied random seeds
   - Leverages parallel processing for efficient computation

3. **Feature Engineering**
   - Limited set of reference-data fields
   - Label encoding for categorical
   - Date columns converted to day counts relative to the trade date

4. **Temporal Considerations**
   - Implements time series cross-validation for robust performance estimation

## Methodology Highlights
- Gradient boosting 
- Ensemble methods to reduce model variance and enhance predictive stability

## Performance Evaluation
- Primary metric: Mean Absolute Error (MAE)
- Secondary analysis: Feature importance quantification
- Cross-validation strategy accounting for temporal structure

In [7]:
# import pandas as pd
# from datetime import datetime, timedelta

# # Load the data
# data = pd.read_pickle('/Users/gil/git/ficc/notebooks/gil_modeling/lgbm_data_file.pkl')

# # Convert trade_date to datetime if it's not already
# data['trade_date'] = pd.to_datetime(data['trade_date'])

# # Calculate the date 9 months ago from the most recent date in the dataset
# most_recent_date = data['trade_date'].max()
# seven_months_ago = most_recent_date - timedelta(days=9*30)  # Approximating 9 months as 9*30 days (the variable name is a leftover from a 7-month window)

# recent_data = data[data['trade_date'] > seven_months_ago]

# print(f"Original dataset size: {len(data)}")
# print(f"Recent dataset size (last 9 months): {len(recent_data)}")
# print(f"Date range: from {recent_data['trade_date'].min()} to {recent_data['trade_date'].max()}")

In [8]:
# recent_data.to_pickle("nine_months_lgbm_data_file.pkl")

In [9]:
# Load the pickled trade + reference-data frame used for training.
# NOTE(review): presumably the file written by the commented-out
# `df.to_pickle("lgbm_data_file.pkl")` query cell earlier -- confirm.
data = pd.read_pickle('/Users/gil/git/ficc/notebooks/gil_modeling/lgbm_data_file.pkl')
# data = add_yield_curve(data)
# data

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder

# Load the data
# data = pd.read_pickle('/Users/gil/git/ficc/notebooks/gil_modeling/seven_months_lgbm_data_file.pkl')

print(f"Number of rows in the dataset: {len(data)}")

# Data preprocessing (keep this part as is)
# quantity: log10 of the traded par amount.
data['quantity'] = np.log10(data.par_traded.astype(np.float32))
# has_sinking_fund: 1 when sink_indicator is populated, 0 otherwise.
data['has_sinking_fund'] = data['sink_indicator'].notna().astype(int)
data['is_zerocoupon'] = (data['coupon'] == 0).astype(int)
# Replace the frequency label by its numeric code; labels that are missing
# from COUPON_FREQUENCY_TYPE (or null) fall back to the 'Unknown' code.
data['interest_payment_frequency'] = data['interest_payment_frequency'].map(COUPON_FREQUENCY_TYPE)
data['interest_payment_frequency'] = data['interest_payment_frequency'].fillna(COUPON_FREQUENCY_TYPE['Unknown'])

# Feature categories
BINARY = ['is_callable', 'has_sinking_fund', 'is_zerocoupon', 'is_general_obligation', 'callable_at_cav']
CATEGORICAL_FEATURES = ['sp_long', 'incorporated_state_code', 'purpose_class', 'coupon_type','trade_type']
NON_CAT_FEATURES = ['coupon', 'interest_payment_frequency', 'quantity']
DATE_COLS = ['trade_date', 'accrual_date', 'next_coupon_payment_date', 'maturity_date', 'next_call_date', 'par_call_date']

# Filter features: keep only the columns that are actually present.
BINARY = [col for col in BINARY if col in data.columns]
CATEGORICAL_FEATURES = [col for col in CATEGORICAL_FEATURES if col in data.columns]
NON_CAT_FEATURES = [col for col in NON_CAT_FEATURES if col in data.columns]
DATE_COLS = [col for col in DATE_COLS if col in data.columns]

# Handle missing values
# NOTE(review): these statistics are computed over the whole frame (train and
# test rows alike) and apply to every float/int/object column, including the
# target `dollar_price` -- confirm that this is intended.
float_cols = data.select_dtypes(include=['float64']).columns
int_cols = data.select_dtypes(include=['int64', 'int32']).columns
data[float_cols] = data[float_cols].fillna(data[float_cols].mean())
data[int_cols] = data[int_cols].fillna(data[int_cols].median().astype(int))

categorical_cols = data.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode().iloc[0])

# Convert features
for col in BINARY:
    data[col] = data[col].fillna(0).astype(int)
for col in CATEGORICAL_FEATURES:
    data[col] = data[col].astype('category')
for col in DATE_COLS:
    data[col] = pd.to_datetime(data[col], errors='coerce')

# Feature engineering: every non-trade date becomes a day count relative to
# the trade date and is added to the numeric feature list.
for col in DATE_COLS:
    if col != 'trade_date':
        data[f'days_to_{col}'] = (data[col] - data['trade_date']).dt.days
        NON_CAT_FEATURES.append(f'days_to_{col}')

# Combine all features
features = BINARY + CATEGORICAL_FEATURES + NON_CAT_FEATURES

# Time-based split: trades up to and including split_date train the model,
# later trades form the hold-out set.
data['trade_date'] = pd.to_datetime(data['trade_date'])
split_date = pd.to_datetime('2024-09-29')
train_mask = data['trade_date'] <= split_date
X_train = data.loc[train_mask, features].copy()
y_train = data.loc[train_mask, 'dollar_price'].copy()
X_test = data.loc[~train_mask, features].copy()
y_test = data.loc[~train_mask, 'dollar_price'].copy()

# Handle remaining missing values
for dataset in [X_train, X_test]:
    for col in dataset.columns:
        if dataset[col].dtype.name == 'category':
            # add_categories raises if the category already exists, so only
            # add 'Unknown' when it is not there yet.
            if 'Unknown' not in dataset[col].cat.categories:
                dataset[col] = dataset[col].cat.add_categories('Unknown')
            dataset[col] = dataset[col].fillna('Unknown')
        elif dataset[col].dtype in ['int64', 'Int64']:
            dataset[col] = dataset[col].fillna(dataset[col].median()).astype(int)
        else:
            dataset[col] = dataset[col].fillna(dataset[col].mean())

# Encode categorical variables using LabelEncoder
# Encoders are fitted on the training rows only. LabelEncoder.transform raises
# ValueError on labels it has not seen, so hold-out rows are encoded through
# an explicit lookup and unseen labels receive UNSEEN_LABEL_CODE instead.
UNSEEN_LABEL_CODE = -1
label_encoders = {}
for col in CATEGORICAL_FEATURES:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    code_of = {label: code for code, label in enumerate(le.classes_)}
    # Go through object dtype so that the lookup yields NaN for unknown labels.
    test_codes = X_test[col].astype(object).map(code_of)
    n_unseen = int(test_codes.isna().sum())
    if n_unseen:
        print(f"Warning: {n_unseen} hold-out rows in '{col}' carry labels not seen "
              f"in training; encoding them as {UNSEEN_LABEL_CODE}.")
    X_test[col] = test_codes.fillna(UNSEEN_LABEL_CODE).astype(int)
    label_encoders[col] = le

# Define myLGBM and mkensemble functions
def myLGBM(seed=11, depth=8):
    """Build one unfitted LightGBM regressor for the ensemble.

    Args:
        seed: Value passed as ``random_state``.
        depth: Maximum tree depth. The number of leaves (``depth * 10``) and
            the number of boosting rounds (``depth * 30``) scale with it.

    Returns:
        An ``LGBMRegressor`` with the 'mae' objective, silent logging and
        subsampling configured (``subsample=0.5``, ``subsample_freq=10``).
    """
    hyperparams = dict(
        max_depth=depth,
        num_leaves=depth * 10,
        n_estimators=depth * 30,
        objective='mae',
        verbosity=-1,
        subsample=0.5,
        subsample_freq=10,
        random_state=seed,
    )
    return LGBMRegressor(**hyperparams)

def mkensemble(n=10, seed=11, depth=10):
    """Assemble a VotingRegressor of ``n`` LightGBM models.

    Member ``j`` (1-based) is named ``'m<j>'`` and built by
    ``myLGBM(seed + j, depth)``, so the members differ only in random seed.

    Args:
        n: Number of base regressors.
        seed: Base seed; member ``j`` uses ``seed + j``.
        depth: Depth forwarded to every member.

    Returns:
        An unfitted ``VotingRegressor`` with ``n_jobs=-1``.
    """
    members = [(f'm{idx}', myLGBM(seed + idx, depth)) for idx in range(1, n + 1)]
    return VotingRegressor(members, n_jobs=-1, verbose=False)

# Build the 10-member ensemble and train it on the pre-split training rows
ensemble_model = mkensemble(n=10, seed=42, depth=10)
ensemble_model.fit(X_train, y_train)

# Score the hold-out rows and report the mean absolute error
y_pred = ensemble_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

# Feature importance: element-wise mean of the members' importances
per_member_importances = [
    member.feature_importances_
    for member in ensemble_model.named_estimators_.values()
]
importances = np.mean(per_member_importances, axis=0)

# Rank the features from most to least important and show the top 20
feature_importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importances}
).sort_values(by='importance', ascending=False)
print(feature_importance.head(20))

# Time-based cross-validation with ensemble
# TimeSeriesSplit builds its folds purely from row position (earlier rows
# train, later rows validate). X_train keeps the row order of `data`, which
# is never sorted by date, so order the training rows chronologically first;
# otherwise the folds are not guaranteed to respect time.
# `train_mask` selects exactly the rows of X_train, in the same order.
cv_order = np.argsort(data.loc[train_mask, 'trade_date'].to_numpy(), kind='stable')
X_train_sorted = X_train.iloc[cv_order]
y_train_sorted = y_train.iloc[cv_order]

tscv = TimeSeriesSplit(n_splits=5)
cv_scores = []

for train_index, val_index in tscv.split(X_train_sorted):
    X_train_cv, X_val_cv = X_train_sorted.iloc[train_index], X_train_sorted.iloc[val_index]
    y_train_cv, y_val_cv = y_train_sorted.iloc[train_index], y_train_sorted.iloc[val_index]

    # A fresh ensemble per fold, configured like the final model.
    ensemble_cv = mkensemble(n=10, seed=42, depth=10)
    ensemble_cv.fit(X_train_cv, y_train_cv)

    y_pred_cv = ensemble_cv.predict(X_val_cv)
    cv_scores.append(mean_absolute_error(y_val_cv, y_pred_cv))

print(f"Cross-validation MAE scores: {cv_scores}")
print(f"Mean CV MAE: {np.mean(cv_scores):.2f}")


In [None]:
import pickle
from joblib import dump

# Method 1: Using pickle
def save_model_pickle(model, filename):
    """Serialize ``model`` with pickle into ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(model, sink)

# Write the fitted ensemble to 'ensemble_model.pkl' (relative path, so it
# lands in the current working directory).
save_model_pickle(ensemble_model, 'ensemble_model.pkl')

# Method 2: Using joblib (recommended for large arrays)
def save_model_joblib(model, filename):
    """Persist ``model`` to ``filename`` using ``joblib.dump``."""
    dump(value=model, filename=filename)

# Write the same fitted ensemble a second time, in joblib format
save_model_joblib(ensemble_model, 'ensemble_model.joblib')

# To save label encoders as well
def save_label_encoders(label_encoders, filename):
    """Pickle the ``label_encoders`` object into ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(label_encoders, sink)

# Persist the fitted per-column label encoders alongside the model so the
# same label-to-integer mapping can be reloaded later.
save_label_encoders(label_encoders, 'label_encoders.pkl')

print("Model and label encoders saved successfully.")

In [None]:
# Fetch the reference-data rows for every CUSIP listed in
# `jesse_test.cusips_deloitte`, restricted to records whose validity window
# (ref_valid_from_date .. ref_valid_to_date) contains 2024-09-29.
# No `limit` is passed, so sqltodf returns the full result set.
# NOTE(review): the f-string prefix is unused here -- the query contains no
# placeholders.
df = sqltodf(f'''
SELECT
  ref_data_v1.cusip,
  ref_data_v1.file_received_from_provider_timestamp,
  ref_data_v1.accrual_date,
  ref_data_v1.next_coupon_payment_date,
  ref_data_v1.interest_payment_frequency,
  ref_data_v1.current_coupon_rate AS coupon,
  ref_data_v1.incorporated_state_code,
  ref_data_v1.coupon_type,
  ref_data_v1.is_callable,
  ref_data_v1.sink_indicator,
  ref_data_v1.is_general_obligation,
  ref_data_v1.callable_at_cav,
  ref_data_v1.sp_long,
  ref_data_v1.purpose_class,
  ref_data_v1.maturity_date AS maturity_date,
  ref_data_v1.next_call_date,
  ref_data_v1.par_call_date,
  ref_data_v1.instrument_primary_name
FROM
  `reference_data_v1.reference_data_flat` ref_data_v1
WHERE
  cusip IN (
  SELECT
    cusip
  FROM
    `jesse_test.cusips_deloitte` )
AND "2024-09-29" BETWEEN ref_data_v1.ref_valid_from_date AND ref_data_v1.ref_valid_to_date
''')

# Cache the result locally; the prediction cell reads "deloitte.pkl" back.
df.to_pickle("deloitte.pkl")

In [13]:
# import pickle
# from joblib import load

# # Load model saved with pickle
# def load_model_pickle(filename):
#     with open(filename, 'rb') as file:
#         return pickle.load(file)

# # Load the pickle model
# loaded_model_pickle = load_model_pickle('ensemble_model.pkl')

# # Load model saved with joblib
# def load_model_joblib(filename):
#     return load(filename)

# # Load the joblib model
# loaded_model_joblib = load_model_joblib('ensemble_model.joblib')

# # Load label encoders
# def load_label_encoders(filename):
#     with open(filename, 'rb') as file:
#         return pickle.load(file)

# # Load the label encoders
# loaded_label_encoders = load_label_encoders('label_encoders.pkl')

# print("Model and label encoders loaded successfully.")

In [None]:
# Notebook display of the reference-data frame returned by the query cell.
df 

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Define the exact 18 features the model expects
EXPECTED_FEATURES = [
    'is_callable', 'has_sinking_fund', 'is_zerocoupon', 'is_general_obligation', 'callable_at_cav',
    'sp_long', 'incorporated_state_code', 'purpose_class', 'coupon_type', 'trade_type',
    'coupon', 'interest_payment_frequency', 'quantity',
    'days_to_accrual_date', 'days_to_next_coupon_payment_date', 'days_to_maturity_date',
    'days_to_next_call_date', 'days_to_par_call_date'
]

# Integer code given to a categorical label the training encoder never saw.
UNSEEN_LABEL_CODE = -1


def preprocess_input_data(df, label_encoders=None):
    """Turn raw reference/trade columns into the model's feature matrix.

    Args:
        df: Input frame. Must contain ``par_traded``, ``sink_indicator``,
            ``coupon``, ``interest_payment_frequency``, ``trade_date`` and
            the five date columns listed in ``date_cols`` below. Any other
            expected feature that is absent is added with value 0 and a
            warning is printed. The frame is copied, not modified.
        label_encoders: Optional mapping ``column name -> fitted
            LabelEncoder`` produced during training. When a column has an
            entry, its labels are encoded with the training classes so the
            integer codes match what the model was fitted on; labels the
            encoder does not know get ``UNSEEN_LABEL_CODE``. When omitted
            (or when a column has no entry), a new encoder is fitted on
            ``df`` itself; such codes only match the training codes by
            coincidence.

    Returns:
        A DataFrame holding exactly ``EXPECTED_FEATURES``, in that order.
    """
    df = df.copy()

    # Data preprocessing
    df['quantity'] = np.log10(df['par_traded'].astype(np.float32))
    df['has_sinking_fund'] = df['sink_indicator'].notna().astype(int)
    df['is_zerocoupon'] = (df['coupon'] == 0).astype(int)
    df['interest_payment_frequency'] = df['interest_payment_frequency'].map(COUPON_FREQUENCY_TYPE)
    df['interest_payment_frequency'] = df['interest_payment_frequency'].fillna(COUPON_FREQUENCY_TYPE['Unknown'])

    # Feature engineering for date columns. Done before the missing-column
    # check so the derived days_to_* features are not reported as missing.
    date_cols = ['accrual_date', 'next_coupon_payment_date', 'maturity_date', 'next_call_date', 'par_call_date']
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        # Unparseable or missing dates yield NaT and end up as 0 days.
        df[f'days_to_{col}'] = (df[col] - df['trade_date']).dt.days.fillna(0).astype(int)

    # Handle missing values and convert features
    for col in EXPECTED_FEATURES:
        if col not in df.columns:
            print(f"Warning: {col} is missing. Adding it with default values.")
            df[col] = 0  # or another appropriate default value
        elif df[col].dtype == 'object':
            df[col] = df[col].fillna(df[col].mode().iloc[0])
        elif df[col].dtype == 'float64':
            df[col] = df[col].fillna(df[col].mean())
        elif df[col].dtype in ['int64', 'Int64']:
            df[col] = df[col].fillna(df[col].median()).astype(int)

    # Binary flags are fed to the model as 0/1 integers, as in training.
    binary_features = ['is_callable', 'has_sinking_fund', 'is_zerocoupon',
                       'is_general_obligation', 'callable_at_cav']
    for col in binary_features:
        df[col] = df[col].fillna(False).astype(int)

    # Encode categorical variables
    categorical_features = ['sp_long', 'incorporated_state_code', 'purpose_class', 'coupon_type', 'trade_type']
    for col in categorical_features:
        labels = df[col].astype(str)
        fitted = None if label_encoders is None else label_encoders.get(col)
        if fitted is None:
            # No training encoder available for this column: fit one on the
            # input itself.
            df[col] = LabelEncoder().fit_transform(labels)
            continue
        code_of = {str(label): code for code, label in enumerate(fitted.classes_)}
        codes = labels.map(code_of)
        n_unseen = int(codes.isna().sum())
        if n_unseen:
            print(f"Warning: {n_unseen} rows in '{col}' carry labels not seen in "
                  f"training; encoding them as {UNSEEN_LABEL_CODE}.")
        df[col] = codes.fillna(UNSEEN_LABEL_CODE).astype(int)

    return df[EXPECTED_FEATURES]

# Main prediction code
inputs_df = pd.read_pickle("deloitte.pkl")

# Add trade_type, par_traded, and trade_date to the input dataframe
inputs_df['trade_type'] = 'P'  # Bid Side
inputs_df['par_traded'] = 1000000  # As specified
inputs_df['trade_date'] = pd.to_datetime('2024-08-31')  # Set the trade date for all rows
# inputs_df = add_yield_curve(inputs_df)

# Preprocess the input data, reusing the encoders fitted on the training set
# so that categorical codes mean the same thing they meant during training.
X_pred = preprocess_input_data(inputs_df, label_encoders=label_encoders)
print("X_pred columns:", X_pred.columns)
print("X_pred shape:", X_pred.shape)
print("X_pred dtypes:", X_pred.dtypes)

# Make predictions using the ensemble model
predictions = ensemble_model.predict(X_pred)

# Add predictions to the original dataframe
inputs_df['predicted_dollar_price'] = predictions

# Display results
print(inputs_df[['cusip', 'trade_type', 'par_traded', 'predicted_dollar_price']])

# Optionally, save the results to a CSV file
inputs_df[['cusip', 'trade_type', 'par_traded', 'predicted_dollar_price']].to_csv('deloitte_predictions.csv', index=False)
print("deloitte_predictions.csv")

In [None]:
# Summary statistics of the predicted prices
predicted = inputs_df['predicted_dollar_price']
mean_price = predicted.mean()
median_price = predicted.median()
min_price = predicted.min()
max_price = predicted.max()
std_dev = predicted.std()
count = predicted.count()
# Number of predictions below zero, as a sanity check
negative_values_count = predicted.lt(0).sum()

# Report
print(f"Predicted Dollar Price Statistics:")
print(f"Mean: {mean_price:.2f}")
print(f"Median: {median_price:.2f}")
print(f"Min: {min_price:.2f}")
print(f"Max: {max_price:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Total Count: {count}")
print(f"Negative Values Count: {negative_values_count}")

In [None]:
# Actual trades to benchmark the predictions against
actual_trades = pd.read_csv('msrb_res.csv')

# Assuming inputs_df is your dataframe with predictions
# If not, load it here:
# inputs_df = pd.read_csv('deloitte_predictions_ycl.csv')

# Make sure trade_date is a datetime column in both frames
for frame in (actual_trades, inputs_df):
    frame['trade_date'] = pd.to_datetime(frame['trade_date'])

# Pair each prediction with the actual trades for the same CUSIP. Columns
# present in both frames get the '_pred' / '_actual' suffixes.
merged_df = inputs_df.merge(
    actual_trades,
    on=['cusip'],
    how='inner',
    suffixes=('_pred', '_actual'),
)

# Signed error (prediction minus actual) and its magnitude
merged_df['price_delta'] = merged_df['predicted_dollar_price'].sub(merged_df['dollar_price'])
merged_df['absolute_error'] = merged_df['price_delta'].abs()

# Mean Absolute Error over all matched rows
mae = merged_df['absolute_error'].mean()

print(f"Mean Absolute Error: {mae:.4f}")

# Distribution of the signed error
print("\nPrice Delta Statistics:")
print(merged_df['price_delta'].describe())

# How many predictions landed within 1, 2, and 5 points of the actual price
abs_error = merged_df['absolute_error']
within_1 = abs_error.le(1).sum()
within_2 = abs_error.le(2).sum()
within_5 = abs_error.le(5).sum()

total_predictions = len(merged_df)

print(f"\nOut of {total_predictions} matched predictions:")
print(f"Within 1 point: {within_1} ({within_1/total_predictions:.2%})")
print(f"Within 2 points: {within_2} ({within_2/total_predictions:.2%})")
print(f"Within 5 points: {within_5} ({within_5/total_predictions:.2%})")

# Keep the row-level comparison for further analysis
merged_df.to_csv('prediction_vs_actual_comparison.csv', index=False)
print("\nDetailed comparison saved to 'prediction_vs_actual_comparison.csv'")

In [18]:
# Coerce both date columns to datetime; unparseable values become NaT
for date_col in ('trade_date', 'maturity_date'):
    inputs_df[date_col] = pd.to_datetime(inputs_df[date_col], errors='coerce')

# Years to maturity: whole days between the two dates divided by 365.25
time_to_maturity = inputs_df['maturity_date'] - inputs_df['trade_date']
inputs_df['years_to_maturity'] = time_to_maturity.dt.days / 365.25

In [None]:
# Create coupon rate bins
coupon_bins = [0, 1, 2, 3, 4, 5, np.inf]
coupon_labels = ['0-1%', '1-2%', '2-3%', '3-4%', '4-5%', '5%+']

# Create maturity bins (in years)
maturity_bins = [0, 5, 10, 15, 20, 30, np.inf]
maturity_labels = ['0-5 years', '5-10 years', '10-15 years', '15-20 years', '20-30 years', '30+ years']

# Create the groupings. include_lowest=True puts a value of exactly 0 into
# the first bucket; values below 0 fall outside every bin and become NaN.
inputs_df['coupon_group'] = pd.cut(inputs_df['coupon'], bins=coupon_bins, labels=coupon_labels, include_lowest=True)
inputs_df['maturity_group'] = pd.cut(inputs_df['years_to_maturity'], bins=maturity_bins, labels=maturity_labels, include_lowest=True)

# Group by coupon and maturity; compute the average predicted price and the
# number of bonds per bucket.
# observed=False is spelled out because both keys are categorical: it keeps
# every coupon x maturity combination (empty ones included) in the result,
# and it avoids depending on the deprecated implicit default.
grouped_df = (
    inputs_df
    .groupby(['coupon_group', 'maturity_group'], observed=False)
    .agg(
        predicted_dollar_price=('predicted_dollar_price', 'mean'),
        bond_count=('cusip', 'count'),  # count of bonds in each group
    )
    .reset_index()
)

# Sort the results
grouped_df = grouped_df.sort_values(['coupon_group', 'maturity_group'])

# Display the results
print(grouped_df.to_string(index=False))

# Optionally, save to CSV
grouped_df.to_csv('grouped_bond_predictions.csv', index=False)
print("\nGrouped predictions saved to 'grouped_bond_predictions.csv'")

# Create a pivot table for easier viewing: coupon buckets as rows, maturity
# buckets as columns, average predicted price as the cell value.
pivot_df = grouped_df.pivot(index='coupon_group', columns='maturity_group', values='predicted_dollar_price')
print("\nPivot Table of Average Predicted Prices:")
print(pivot_df.round(2).to_string())

# Optionally, save pivot table to CSV
pivot_df.to_csv('pivot_bond_predictions.csv')
print("\nPivot table saved to 'pivot_bond_predictions.csv'")