In [1]:
# Import Libraries
import pandas as pd
import lightgbm as lgb
import joblib
import numpy as np

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the custom regressor pipeline and the train/test splits stored next to it.
# Forward slashes are used as separators so these relative paths resolve on
# Windows, Linux and macOS alike (a backslash is only a separator on Windows).

# Model pipeline
# NOTE: joblib.load unpickles arbitrary Python objects - only load artifacts from a trusted source.
model_pipeline = joblib.load('Output Files/Price Prediction model files/custom_ensemble_price_predict_model.joblib')

# Training data on which the pipeline was fitted
X_train = pd.read_csv('Output Files/Price Prediction model files/X_train_for_CatBoost.csv')
# y is stored in log space (it was transformed to train the CatBoost model);
# squeeze() collapses a single-column frame into a Series.
y_train_log = pd.read_csv('Output Files/Price Prediction model files/y_train_for_CatBoost.csv').squeeze()


# Testing data on which the pipeline performance was evaluated
X_test = pd.read_csv('Output Files/Price Prediction model files/X_test_for_CatBoost.csv')
y_test_log = pd.read_csv('Output Files/Price Prediction model files/y_test_for_CatBoost.csv').squeeze()

In [3]:
# Look up the pipeline step registered under the name 'preprocessor' so the
# feature transformation can be applied on its own, outside the full pipeline.
pipeline_steps = model_pipeline.named_steps
preprocessor = pipeline_steps['preprocessor']

In [4]:
# Run both splits through the extracted preprocessor (transform only, no fit call)
# so the LightGBM models below receive the preprocessed feature matrices.
X_train_transformed, X_test_transformed = (
    preprocessor.transform(X_train),
    preprocessor.transform(X_test),
)

# Uncertainty Quantification

#### Uncertainty Quantification using Quantile Regression with LightGBM

In [5]:
# A 95% prediction interval spans the 2.5th to the 97.5th percentile,
# leaving 2.5% of the probability mass in each tail.
lower_alpha, upper_alpha = 0.025, 0.975

In [6]:
# Hyper-parameters shared by the lower- and upper-quantile LightGBM models.
params = dict(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1,   # use all available cores
    seed=42,     # fixed seed for reproducibility
)

In [7]:
# Lower-bound model: quantile objective at lower_alpha, fitted on the
# preprocessed training features against the log-transformed target.
lower_quantile_kwargs = {'objective': 'quantile', 'alpha': lower_alpha, **params}
lgb_lower = lgb.LGBMRegressor(**lower_quantile_kwargs)
lgb_lower.fit(X_train_transformed, y_train_log)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001605 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2838
[LightGBM] [Info] Number of data points in the train set: 5024, number of used features: 52
[LightGBM] [Info] Start training from score 0.246860


In [8]:
# Upper-bound model: quantile objective at upper_alpha, fitted on the
# preprocessed training features against the log-transformed target.
upper_quantile_kwargs = {'objective': 'quantile', 'alpha': upper_alpha, **params}
lgb_upper = lgb.LGBMRegressor(**upper_quantile_kwargs)
lgb_upper.fit(X_train_transformed, y_train_log)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2838
[LightGBM] [Info] Number of data points in the train set: 5024, number of used features: 52
[LightGBM] [Info] Start training from score 2.507257


In [9]:
# Score the preprocessed test features with each quantile model.
# The models were trained on the log-transformed target, so both outputs are in log space.
lower_bound_preds_log, upper_bound_preds_log = [
    quantile_model.predict(X_test_transformed)
    for quantile_model in (lgb_lower, lgb_upper)
]

In [10]:
# Undo the log transform with expm1 so the true prices and both
# quantile-regression bounds are expressed on the original price scale (Crores).
y_test_orig = np.expm1(y_test_log)
lower_bound_preds_orig_qr, upper_bound_preds_orig_qr = (
    np.expm1(log_bound)
    for log_bound in (lower_bound_preds_log, upper_bound_preds_log)
)

In [11]:
# Collect the true price and the quantile-regression interval bounds
# (all on the original price scale) into one table, one row per test sample.
qr_interval_columns = {
    'True Price (Crores)': y_test_orig,
    'Lower Bound (Crores)': lower_bound_preds_orig_qr,
    'Upper Bound (Crores)': upper_bound_preds_orig_qr,
}
results_df = pd.DataFrame(data=qr_interval_columns)

In [12]:
# Ensure the lower bound is never above the upper bound.
# The two quantile models are trained independently, so their predictions can cross.
# Both ordered columns are computed from the ORIGINAL values before either column is
# overwritten; assigning the lower column first would make np.maximum see the already
# modified values and collapse a crossed row to (upper, upper), losing its lower edge.
lower_col = 'Lower Bound (Crores)'
upper_col = 'Upper Bound (Crores)'
ordered_lower = np.minimum(results_df[lower_col], results_df[upper_col])
ordered_upper = np.maximum(results_df[lower_col], results_df[upper_col])
results_df[lower_col] = ordered_lower
results_df[upper_col] = ordered_upper

In [13]:
# Show ten randomly drawn rows of the quantile-regression interval table.
qr_sample_header = '---- Generated prediction intervals using Quantile Regression (sample) ----\n'
print(qr_sample_header)
results_df.sample(n=10)

---- Generated prediction intervals using Quantile Regression (sample) ----



Unnamed: 0,True Price (Crores),Lower Bound (Crores),Upper Bound (Crores)
31,1.25,0.908629,1.546983
627,1.35,1.055455,1.951975
95,0.8782,0.821503,1.322184
1095,10.5,5.66036,14.024847
1059,1.5,0.80379,1.988451
886,1.2,0.526368,1.041562
626,0.6,0.692393,1.700763
1029,2.8,1.842354,3.484663
796,1.83,1.461147,3.006211
740,0.68,0.414725,0.766561


#### Uncertainty quantification using Conformal Prediction

In [14]:
# Settings for split conformal prediction
alpha = 0.05                # Miscoverage level: 1 - 0.05 gives a 95% prediction interval
calibration_set_size = 0.2  # Fraction (20%) of the training data held out for calibration

In [15]:
# Data loading
# The splits are read from disk again here so the conformal-prediction section starts
# from freshly loaded frames. Forward slashes are used as separators so these relative
# paths resolve on Windows, Linux and macOS alike.

# Full training data on which the pipeline was fitted
X_train_full = pd.read_csv('Output Files/Price Prediction model files/X_train_for_CatBoost.csv')
y_train_full_log = pd.read_csv('Output Files/Price Prediction model files/y_train_for_CatBoost.csv').squeeze()  # Log-transformed space

# Testing data on which the pipeline performance was evaluated
X_test = pd.read_csv('Output Files/Price Prediction model files/X_test_for_CatBoost.csv')
y_test_log = pd.read_csv('Output Files/Price Prediction model files/y_test_for_CatBoost.csv').squeeze()  # Log-transformed space

In [16]:
# Hold out part of the full training data as a calibration set; the remainder
# ("proper" training set) is what the model is re-fitted on. The random_state is
# fixed so the partition is reproducible.
split_parts = train_test_split(
    X_train_full,
    y_train_full_log,
    test_size=calibration_set_size,
    random_state=42,
)
X_train_proper, X_cal, y_train_proper_log, y_cal_log = split_parts

print(f"New training set size: {len(X_train_proper)}")
print(f"Calibration set size: {len(X_cal)}")

New training set size: 4019
Calibration set size: 1005


In [17]:
# Model Re-training
# Reload the saved pipeline and fit it on the proper training subset only, so the
# calibration rows are not part of the data this fit sees.
# The path uses forward slashes so it resolves on Windows, Linux and macOS alike.
# NOTE: joblib.load unpickles arbitrary Python objects - only load artifacts from a trusted source.
model_pipeline = joblib.load('Output Files/Price Prediction model files/custom_ensemble_price_predict_model.joblib')
model_pipeline.fit(X_train_proper, y_train_proper_log)
print("Model re-trained successfully.")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   14.1s remaining:   21.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   20.8s remaining:   31.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000945 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2625
[LightGBM] [Info] Number of data points in the train set: 4019, number of used features: 51
[LightGBM] [Info] Start training from score 1.028711
Model re-trained successfully.


In [18]:
# Calibration
print("Calibrating the model to find the uncertainty margin")

# Get predictions on the calibration set (in log space)
cal_preds_log = model_pipeline.predict(X_cal)

# Calculate the non-conformity scores (absolute errors in log space)
non_conformity_scores = np.abs(y_cal_log - cal_preds_log)

# Calculate the uncertainty margin (q_hat): the finite-sample corrected
# ceil((1 - alpha) * (n + 1)) / n empirical quantile of the scores.
n_cal = len(X_cal)
# For small calibration sets the corrected level exceeds 1 and np.quantile would raise;
# cap it at 1.0 so q_hat falls back to the largest observed score. In that case the
# calibration set is too small for the nominal coverage level to be guaranteed.
quantile_level = min(1.0, np.ceil((1 - alpha) * (n_cal + 1)) / n_cal)
# `method` replaces the `interpolation` keyword, deprecated since NumPy 1.22.
# 'higher' picks an actual observed score at or above the requested level.
q_hat = np.quantile(non_conformity_scores, quantile_level, method='higher')
print(f"Calculated uncertainty margin (q_hat) in log space: {q_hat:.4f}")

Calibrating the model to find the uncertainty margin
Calculated uncertainty margin (q_hat) in log space: 0.2788


In [19]:
# Generate Intervals on Test Set

# Point predictions of the re-trained pipeline for the test set (log space)
test_preds_log = model_pipeline.predict(X_test)

# Symmetric interval in log space: point prediction minus / plus the calibrated margin q_hat
lower_bound_preds_log, upper_bound_preds_log = (
    test_preds_log - q_hat,
    test_preds_log + q_hat,
)

In [20]:
# Undo the log transform with expm1 so the true prices and both
# conformal bounds are expressed on the original price scale (Crores).
y_test_orig = np.expm1(y_test_log)
lower_bound_preds_orig_cp, upper_bound_preds_orig_cp = (
    np.expm1(log_bound)
    for log_bound in (lower_bound_preds_log, upper_bound_preds_log)
)

In [21]:
# Collect the true price and the conformal interval bounds
# (all on the original price scale) into one table, one row per test sample.
cp_interval_columns = {
    'True Price (Crores)': y_test_orig,
    'Lower Bound (Crores)': lower_bound_preds_orig_cp,
    'Upper Bound (Crores)': upper_bound_preds_orig_cp,
}
results_df_conformal = pd.DataFrame(data=cp_interval_columns)

print('--- Generated prediction intervals using Conformal Prediction (sample) ---')

# Show five randomly drawn rows
results_df_conformal.sample(n=5)

--- Generated prediction intervals using Conformal Prediction (sample) ---


Unnamed: 0,True Price (Crores),Lower Bound (Crores),Upper Bound (Crores)
645,1.85,1.011939,2.514001
1076,0.7977,0.433888,1.504392
775,3.5,1.717251,3.74588
204,0.8,0.291517,1.25573
1018,1.8,1.335115,3.078452


### Evaluate and compare the Quantile regression and Conformal Prediction method for Uncertainty quantification

Quantile regression metrics:

In [22]:
# The two quantile models are fitted independently, so a predicted "lower" bound can
# end up above the "upper" one. Order the bounds element-wise before scoring; otherwise
# a crossed interval can never cover the true price and contributes a negative width.
qr_lower = np.minimum(lower_bound_preds_orig_qr, upper_bound_preds_orig_qr)
qr_upper = np.maximum(lower_bound_preds_orig_qr, upper_bound_preds_orig_qr)

# Coverage: percentage of properties whose true price lies within the interval
covered_qr = (y_test_orig >= qr_lower) & (y_test_orig <= qr_upper)
coverage_qr = np.mean(covered_qr) * 100

# Width: mean interval width on the original price scale (Crores)
width_qr = np.mean(qr_upper - qr_lower)

Conformal prediction metrics:

In [23]:
# Coverage: percentage of properties whose true price lies within the conformal interval
above_lower_cp = y_test_orig >= lower_bound_preds_orig_cp
below_upper_cp = y_test_orig <= upper_bound_preds_orig_cp
covered_cp = above_lower_cp & below_upper_cp
coverage_cp = np.mean(covered_cp) * 100

# Width: mean interval width on the original price scale (Crores)
interval_widths_cp = upper_bound_preds_orig_cp - lower_bound_preds_orig_cp
width_cp = np.mean(interval_widths_cp)

Comparison of Uncertainty Quantification Methods:

In [24]:
# Side-by-side summary of both methods, indexed by method name.
# Values are pre-formatted as strings for display.
comparison_columns = {
    'Method': ['Quantile Regression', 'Conformal Prediction'],
    'Coverage (%)': [f"{coverage_qr:.2f}%", f"{coverage_cp:.2f}%"],
    'Average Interval Width (Crores)': [f"{width_qr:.2f}", f"{width_cp:.2f}"],
}
results_table = pd.DataFrame(comparison_columns).set_index('Method')

print(f"\nOur target coverage was { (1 - alpha) * 100 }%\n")
# Render the table as the cell output
results_table




Our target coverage was 95.0%



Unnamed: 0_level_0,Coverage (%),Average Interval Width (Crores)
Method,Unnamed: 1_level_1,Unnamed: 2_level_1
Quantile Regression,86.31%,1.75
Conformal Prediction,93.79%,1.91


-----------------

# Summary

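The results indicate that the Conformal Prediction method outperforms the Quantile Regression method on coverage. It achieved a coverage of 93.8%, which is very close to the 95% target, whereas Quantile Regression reached only 86.3%. This comes at the cost of slightly wider intervals on average (1.91 vs. 1.75 Crores).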

In [25]:
# Persist the two artifacts needed to produce conformal intervals later.
# Paths use forward slashes so they resolve on Windows, Linux and macOS alike.

# Save the re-trained model pipeline
joblib.dump(model_pipeline, 'Output Files/Price Prediction model files/conformal_predict_price_model_pipeline_custom_regressor.joblib')
print('Conformal price predictive model successfully saved !')

# Save the calculated uncertainty margin (q_hat); it is expressed in log space,
# so it must be applied to log-space predictions before inverse-transforming.
joblib.dump(q_hat, 'Output Files/Price Prediction model files/q_hat.joblib')
print('Uncertainty margin (q_hat) successfully saved !')

Conformal price predictive model successfully saved !
Uncertainty margin (q_hat) successfully saved !
