In [5]:
# Import Libraries
import pandas as pd
import lightgbm as lgb
import joblib
import numpy as np

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the CatBoost model and the training and testing data file.

# Model pipeline
model_pipeline = joblib.load('Output Files\\Price Prediction model files\\predict_price_model_catBoost.joblib')

# Training data on which the pipeline was fitted
X_train = pd.read_csv('Output Files\\X_train_for_CatBoost.csv')
y_train_log = pd.read_csv('Output Files\\y_train_for_CatBoost.csv').squeeze()  # y was originally transformed in log space to train the catboost model


# Testing data on which the pipeline performance was evaluated
X_test = pd.read_csv('Output Files\\X_test_for_CatBoost.csv')
y_test_log = pd.read_csv('Output Files\\y_test_for_CatBoost.csv').squeeze()

In [7]:
# Extract the preprocessor from the pipeline
preprocessor = model_pipeline.named_steps['preprocessor']

In [8]:
# Transform the training and testing data
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Uncertainty Quantification

#### Uncertainty Quantification using Quantile Regression with LightGBM

In [9]:
# Quantiles for a 95% prediction interval
lower_alpha = 0.025
upper_alpha = 0.975

In [10]:
# LightGBM parameters
params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'n_jobs': -1,
    'seed': 42
}

In [11]:
# Train the lower bound model on the log-transformed y_train
lgb_lower = lgb.LGBMRegressor(objective='quantile', alpha=lower_alpha, **params)
lgb_lower.fit(X_train_transformed, y_train_log)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2838
[LightGBM] [Info] Number of data points in the train set: 5024, number of used features: 52
[LightGBM] [Info] Start training from score 0.246860


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,'quantile'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [12]:
# Train the upper bound model on the log-transformed y_train
lgb_upper = lgb.LGBMRegressor(objective='quantile', alpha=upper_alpha, **params)
lgb_upper.fit(X_train_transformed, y_train_log)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2838
[LightGBM] [Info] Number of data points in the train set: 5024, number of used features: 52
[LightGBM] [Info] Start training from score 2.507257


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,'quantile'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [13]:
# Predict the lower and upper bounds on the transformed test data (predictions will be in log transformed space)
lower_bound_preds_log = lgb_lower.predict(X_test_transformed)
upper_bound_preds_log = lgb_upper.predict(X_test_transformed)

In [14]:
# Inverse transform the predictions back to the original price scale (Crores)
y_test_orig = np.expm1(y_test_log)
lower_bound_preds_orig_qr = np.expm1(lower_bound_preds_log)
upper_bound_preds_orig_qr = np.expm1(upper_bound_preds_log)

In [15]:
# Construct results DataFrame
results_df = pd.DataFrame({
    'True Price (Crores)': y_test_orig,
    'Lower Bound (Crores)': lower_bound_preds_orig_qr,
    'Upper Bound (Crores)': upper_bound_preds_orig_qr
})

In [16]:
# Ensure the lower bound is always less than the upper bound
results_df['Lower Bound (Crores)'] = np.minimum(results_df['Lower Bound (Crores)'], results_df['Upper Bound (Crores)'])
results_df['Upper Bound (Crores)'] = np.maximum(results_df['Lower Bound (Crores)'], results_df['Upper Bound (Crores)'])

In [17]:
# Display 10 samples
print('---- Generated prediction intervals using Quantile Regression (sample) ----\n')
results_df.sample(10)

---- Generated prediction intervals using Quantile Regression (sample) ----



Unnamed: 0,True Price (Crores),Lower Bound (Crores),Upper Bound (Crores)
498,1.5,1.583167,2.894796
275,3.6192,2.061587,4.169243
49,7.5,2.561775,9.242269
807,0.35,0.305562,0.596934
348,0.36,0.142952,0.897232
791,0.5,0.520122,0.970831
778,5.75,2.635107,6.253448
418,0.72,0.52203,1.366999
220,1.25,0.687045,1.566275
474,1.65,1.374159,2.699137


#### Uncertainty quantification using Conformal Prediction

In [18]:
# Define variables
calibration_set_size = 0.2  # Using 20% of the training data for calibration
alpha = 0.05  # For a 95% prediction interval (1 - 0.05)

In [19]:
# Data loading

# Full training data on which the pipeline was fitted
X_train_full = pd.read_csv('Output Files\\X_train_for_CatBoost.csv')
y_train_full_log = pd.read_csv('Output Files\\y_train_for_CatBoost.csv').squeeze()  # Log-transformed space

# Testing data on which the pipeline performance was evaluated
X_test = pd.read_csv('Output Files\\X_test_for_CatBoost.csv')
y_test_log = pd.read_csv('Output Files\\y_test_for_CatBoost.csv').squeeze()  # Log-transformed space

In [20]:
# Split the full training data into a new, smaller training set and a calibration set

X_train_proper, X_cal, y_train_proper_log, y_cal_log = train_test_split(X_train_full, y_train_full_log, test_size=calibration_set_size, random_state=42)
print(f"New training set size: {len(X_train_proper)}")
print(f"Calibration set size: {len(X_cal)}")

New training set size: 4019
Calibration set size: 1005


In [21]:
# Model Re-training 
model_pipeline = joblib.load('Output Files\\Price Prediction model files\\predict_price_model_catBoost.joblib')
model_pipeline.fit(X_train_proper, y_train_proper_log)
print("Model re-trained successfully.")

Model re-trained successfully.


In [22]:
# Calibration
print("Calibrating the model to find the uncertainty margin")

# Get predictions on the calibration set (in log space)
cal_preds_log = model_pipeline.predict(X_cal)

# Calculate the non-conformity scores (absolute errors in log space)
non_conformity_scores = np.abs(y_cal_log - cal_preds_log)

# Calculate the uncertainty margin (q_hat), this is the (1 - alpha) quantile of the scores
n_cal = len(X_cal)
q_hat = np.quantile(non_conformity_scores, np.ceil((1 - alpha) * (n_cal + 1)) / n_cal, interpolation='higher')
print(f"Calculated uncertainty margin (q_hat) in log space: {q_hat:.4f}")

Calibrating the model to find the uncertainty margin
Calculated uncertainty margin (q_hat) in log space: 0.2994


In [23]:
# Generate Intervals on Test Set

# Get point predictions on the test set (in log space)
test_preds_log = model_pipeline.predict(X_test)

# Construct the prediction intervals in log space
lower_bound_preds_log = test_preds_log - q_hat
upper_bound_preds_log = test_preds_log + q_hat

In [24]:
# Inverse transform everything back to the original price scale (Crores)
y_test_orig = np.expm1(y_test_log)
lower_bound_preds_orig_cp = np.expm1(lower_bound_preds_log)
upper_bound_preds_orig_cp = np.expm1(upper_bound_preds_log)

In [25]:
# Construct results DataFrame
results_df_conformal = pd.DataFrame({
    'True Price (Crores)': y_test_orig,
    'Lower Bound (Crores)': lower_bound_preds_orig_cp,
    'Upper Bound (Crores)': upper_bound_preds_orig_cp
})

print('--- Generated prediction intervals using Conformal Prediction (sample) ---')

# Sample
results_df_conformal.sample(5)

--- Generated prediction intervals using Conformal Prediction (sample) ---


Unnamed: 0,True Price (Crores),Lower Bound (Crores),Upper Bound (Crores)
685,0.7,0.305826,1.376647
791,0.5,0.128581,1.054055
229,1.95,0.536905,1.797218
15,1.25,0.739124,2.165264
425,0.55,-0.002572,0.815352


### Evaluate and compare the Quantile regression and Conformal Prediction method for Uncertainty quantification

Quantile regression metrics:

In [26]:
# Coverage: Check if the true price is within the interval for each property
covered_qr = (y_test_orig >= lower_bound_preds_orig_qr) & (y_test_orig <= upper_bound_preds_orig_qr)
coverage_qr = np.mean(covered_qr) * 100

# Width
width_qr = np.mean(upper_bound_preds_orig_qr - lower_bound_preds_orig_qr)

Conformal prediction metrics:

In [27]:
# Coverage
covered_cp = (y_test_orig >= lower_bound_preds_orig_cp) & (y_test_orig <= upper_bound_preds_orig_cp)
coverage_cp = np.mean(covered_cp) * 100

# Width
width_cp = np.mean(upper_bound_preds_orig_cp - lower_bound_preds_orig_cp)

Comparison of Uncertainty Quantification Methods:

In [31]:
# Construct dataframe for comparison
results_table = pd.DataFrame({
    'Method': ['Quantile Regression', 'Conformal Prediction'],
    'Coverage (%)': [f"{coverage_qr:.2f}%", f"{coverage_cp:.2f}%"],
    'Average Interval Width (Crores)': [f"{width_qr:.2f}", f"{width_cp:.2f}"]
})
results_table.set_index('Method', inplace=True)

print(f"\nOur target coverage was { (1 - alpha) * 100 }%\n")
# Display
results_table




Our target coverage was 95.0%



Unnamed: 0_level_0,Coverage (%),Average Interval Width (Crores)
Method,Unnamed: 1_level_1,Unnamed: 2_level_1
Quantile Regression,86.31%,1.75
Conformal Prediction,94.67%,2.06


-----------------

# Summary

The results clearly indicate the superior performance of the Conformal Prediction method over Quantile regression method. It achieved coverage of 94.67% which is very close to the target coverage of 95%.