In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ensemble-of-ensemble-models-1-2/sr_upper_lightgbm_3.joblib
/kaggle/input/ensemble-of-ensemble-models-1-2/sr_lower_lightgbm_4.joblib
/kaggle/input/ensemble-of-ensemble-models-1-2/sr_upper_lightgbm_4.joblib
/kaggle/input/ensemble-of-ensemble-models-1-2/sr_lower_lightgbm_3.joblib
/kaggle/input/prediction-interval-competition-ii-house-price/sample_submission.csv
/kaggle/input/prediction-interval-competition-ii-house-price/test.csv
/kaggle/input/prediction-interval-competition-ii-house-price/dataset.csv


In [2]:
import numpy as np
import pandas as pd

import torch as torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.tensorboard as tb
from tensorboard import notebook
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR

from scipy.optimize import minimize
from scipy.interpolate import make_interp_spline


import matplotlib.pyplot as plt
%matplotlib inline


import xgboost as xgb
import lightgbm
import catboost

import joblib


2025-06-15 14:59:13.345339: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749999553.707138      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749999553.804065      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
train_data = pd.read_csv("/kaggle/input/prediction-interval-competition-ii-house-price/dataset.csv")

In [4]:
test_data = pd.read_csv("/kaggle/input/prediction-interval-competition-ii-house-price/test.csv")

In [5]:
print("=== STEP 2: DATA PREPROCESSING ===")

# Create a copy for processing
train_processed = train_data.copy()
test_processed = test_data.copy()

# 1. Handle missing values
print("Handling missing values...")

# For sale_nbr (numerical), fill with median
train_processed['sale_nbr'].fillna(train_processed['sale_nbr'].median(), inplace=True)
test_processed['sale_nbr'].fillna(train_processed['sale_nbr'].median(), inplace=True)

# For subdivision and submarket (categorical), fill with 'Unknown'
train_processed['subdivision'].fillna('Unknown', inplace=True)
test_processed['subdivision'].fillna('Unknown', inplace=True)
train_processed['submarket'].fillna('Unknown', inplace=True)
test_processed['submarket'].fillna('Unknown', inplace=True)

print("Missing values handled.")

# 2. Feature Engineering
print("Creating new features...")

# Date features
train_processed['sale_date'] = pd.to_datetime(train_processed['sale_date'])
test_processed['sale_date'] = pd.to_datetime(test_processed['sale_date'])

train_processed['sale_year'] = train_processed['sale_date'].dt.year
train_processed['sale_month'] = train_processed['sale_date'].dt.month
train_processed['sale_quarter'] = train_processed['sale_date'].dt.quarter

test_processed['sale_year'] = test_processed['sale_date'].dt.year
test_processed['sale_month'] = test_processed['sale_date'].dt.month
test_processed['sale_quarter'] = test_processed['sale_date'].dt.quarter

# Age of house at sale
train_processed['house_age'] = train_processed['sale_year'] - train_processed['year_built']
test_processed['house_age'] = test_processed['sale_year'] - test_processed['year_built']

# Years since renovation (0 if never renovated)
train_processed['years_since_reno'] = np.where(
    train_processed['year_reno'] > 0,
    train_processed['sale_year'] - train_processed['year_reno'],
    train_processed['house_age']
)
test_processed['years_since_reno'] = np.where(
    test_processed['year_reno'] > 0,
    test_processed['sale_year'] - test_processed['year_reno'],
    test_processed['house_age']
)

# Total bathrooms
train_processed['total_baths'] = (train_processed['bath_full'] + 
                                 train_processed['bath_3qtr'] * 0.75 + 
                                 train_processed['bath_half'] * 0.5)
test_processed['total_baths'] = (test_processed['bath_full'] + 
                                test_processed['bath_3qtr'] * 0.75 + 
                                test_processed['bath_half'] * 0.5)

# Price per sqft (only for training data)
train_processed['price_per_sqft'] = train_processed['sale_price'] / train_processed['sqft']

# Total value (land + improvement)
train_processed['total_val'] = train_processed['land_val'] + train_processed['imp_val']
test_processed['total_val'] = test_processed['land_val'] + test_processed['imp_val']

# Total view score (sum of all view features)
view_cols = [col for col in train_processed.columns if col.startswith('view_')]
train_processed['total_views'] = train_processed[view_cols].sum(axis=1)
test_processed['total_views'] = test_processed[view_cols].sum(axis=1)

# Garage + basement sqft
train_processed['total_extra_sqft'] = train_processed['garb_sqft'] + train_processed['gara_sqft']
test_processed['total_extra_sqft'] = test_processed['garb_sqft'] + test_processed['gara_sqft']

print("New features created.")

# 3. Encode categorical variables
print("Encoding categorical variables...")

categorical_cols = ['sale_warning', 'join_status', 'city', 'zoning', 'subdivision', 'submarket']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Fit on combined data to ensure consistent encoding
    combined_data = pd.concat([train_processed[col], test_processed[col]], axis=0)
    le.fit(combined_data.astype(str))
    
    train_processed[col + '_encoded'] = le.transform(train_processed[col].astype(str))
    test_processed[col + '_encoded'] = le.transform(test_processed[col].astype(str))
    
    label_encoders[col] = le

print("Categorical variables encoded.")

# 4. Select features for modeling
print("Selecting features...")

# Features to exclude
exclude_cols = ['id', 'sale_date', 'sale_price', 'price_per_sqft'] + categorical_cols

# Get all feature columns
feature_cols = [col for col in train_processed.columns if col not in exclude_cols]

print(f"Selected {len(feature_cols)} features for modeling:")
print(feature_cols[:10], "... (showing first 10)")

# Prepare final datasets
X_train = train_processed[feature_cols]
y_train = train_processed['sale_price']
X_test = test_processed[feature_cols]
test_ids = test_processed['id']

print(f"\nFinal shapes:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}")

# 5. Check for any remaining issues
print(f"\nData quality check:")
print(f"X_train missing values: {X_train.isnull().sum().sum()}")
print(f"X_test missing values: {X_test.isnull().sum().sum()}")
print(f"y_train missing values: {y_train.isnull().sum()}")

# Save processed data
print("\nSaving processed data...")
X_train.to_csv('X_train_processed.csv', index=False)
y_train.to_csv('y_train_processed.csv', index=False)
X_test.to_csv('X_test_processed.csv', index=False)
test_ids.to_csv('test_ids.csv', index=False)

print("Step 2 completed successfully!")
print("\nNext step will involve building prediction interval models.")

=== STEP 2: DATA PREPROCESSING ===
Handling missing values...
Missing values handled.
Creating new features...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_processed['sale_nbr'].fillna(train_processed['sale_nbr'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_processed['sale_nbr'].fillna(train_processed['sale_nbr'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method

New features created.
Encoding categorical variables...
Categorical variables encoded.
Selecting features...
Selected 53 features for modeling:
['sale_nbr', 'join_year', 'latitude', 'longitude', 'area', 'present_use', 'land_val', 'imp_val', 'year_built', 'year_reno'] ... (showing first 10)

Final shapes:
X_train: (200000, 53)
y_train: (200000,)
X_test: (200000, 53)

Data quality check:
X_train missing values: 0
X_test missing values: 0
y_train missing values: 0

Saving processed data...
Step 2 completed successfully!

Next step will involve building prediction interval models.


In [6]:
X_train = pd.read_csv('X_train_processed.csv').to_numpy()
y_train = pd.read_csv('y_train_processed.csv').to_numpy()
X_given_test = pd.read_csv('X_test_processed.csv').to_numpy()
id_given_test = pd.read_csv('test_ids.csv').to_numpy()

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_given_test.shape)
print(id_given_test.shape)

(200000, 53)
(200000, 1)
(200000, 53)
(200000, 1)


In [8]:
y_train = y_train.reshape(-1,)
id_given_test = id_given_test.reshape(-1,)

In [9]:
print(X_train.shape)
print(y_train.shape)
print(X_given_test.shape)
print(id_given_test.shape)

(200000, 53)
(200000,)
(200000, 53)
(200000,)


In [10]:
X_train, X_test, y_sale_price_train, y_sale_price_test = train_test_split(X_train, y_train, test_size = 0.1, random_state=42)

X_test, X_val, y_sale_price_test, y_sale_price_val = train_test_split(X_test, y_sale_price_test, test_size = 0.5, random_state=42)

X_train, X_calib, y_sale_price_train, y_calib = train_test_split(X_train, y_sale_price_train, test_size = 0.2, random_state=42)

In [11]:
# Model 1

In [12]:
# lower limit quantile alpha=0.05

In [13]:
# XGBoost

xgb_model_lower_1 = xgb.XGBRegressor(objective="reg:quantileerror", 
                               quantile_alpha = 0.05,
                        n_estimators=1500,
                        subsample=0.84,  
                        colsample_bytree=0.85,
                        #device='cuda', 
                        reg_alpha=0.023,
                        learning_rate=0.10,
                        random_state=42,
                        enable_categorical=True,
                        tree_method = 'hist',
                        n_jobs = -1)

In [14]:
# LightGBM

light_gbm_model_lower_1 = lightgbm.LGBMRegressor(n_estimators=1200,
                  random_state=42,                  
                  subsample=0.87,
                  colsample_bytree=0.9,
                  reg_alpha=0.1,                  
                  objective='quantile',
                  alpha=0.05,
                  verbose=-1)

In [15]:
# CatBoost

catboost_model_lower_1 = catboost.CatBoostRegressor(loss_function='Quantile:alpha=0.05',
                                           learning_rate=0.05, 
                                           colsample_bylevel=0.89, 
                                           subsample=0.8, 
                                           max_depth=6, 
                                           iterations=1500,
                                           random_state=42, 
                                           task_type='CPU', 
                                           boosting_type='Plain', 
                                           bootstrap_type='Bernoulli', 
                                           verbose=500
                                           #'cat_features': cat_cols
                                           )

In [16]:
# GradientBoostingRegressor

hgbr_model_lower_1 = HistGradientBoostingRegressor(loss='quantile',
                                    quantile=0.05,
                                    learning_rate=0.2,
                                    max_iter=1200,
                                    #max_depth=6,
                                    max_leaf_nodes=31,
                                    min_samples_leaf=20,
                                    #min_samples_split=10,
                                    #subsample=0.8,
                                    random_state=42)

In [17]:
# Hyperparameters for StackingRegressor lower

estimators_lower_1 = [
    ('xgb_lower', xgb_model_lower_1),
    ('lgbm_lower', light_gbm_model_lower_1),
    ('cat_lower', catboost_model_lower_1),
    ('hgbr_lower', hgbr_model_lower_1)
]

final_estimator_1_lower_1 = lightgbm.LGBMRegressor(n_estimators=1200,
                  random_state=42,                  
                  subsample=0.87,
                  colsample_bytree=0.9,
                  reg_alpha=0.1,                  
                  objective='quantile',
                  alpha=0.05,
                  verbose=-1)

# final_estimator_1_lower = RidgeCV(
#                                 alphas=np.logspace(-4, 4, 20),     
#                                 #store_cv_values=True,              
#                                 cv=5,                              
#                                 scoring='neg_mean_absolute_error'
#                                 )


In [18]:
# Ensemble StackingRegressor

sr_lower_1 = StackingRegressor(estimators = estimators_lower_1,
                             final_estimator = final_estimator_1_lower_1,
                            cv = 5,
                            n_jobs = -1,
                            passthrough = True,
                            )

In [19]:
# sr_lower_1 = joblib.load('/kaggle/input/ensemble-of-ensemble-models-1-2/sr_lower_lightgbm_4.joblib')

In [20]:
sr_lower_1.fit(X_train, y_sale_price_train)

In [21]:
joblib.dump(sr_lower_1, 'sr_lower_lightgbm_4.joblib')

['sr_lower_lightgbm_4.joblib']

In [22]:
# upper limit quantile alpha=0.95

In [23]:
# XGBoost

xgb_model_upper_1 = xgb.XGBRegressor(objective="reg:quantileerror", 
                               quantile_alpha = 0.95,
                        n_estimators=1500,
                        subsample=0.84,  
                        colsample_bytree=0.85,
                        #device='cuda', 
                        reg_alpha=0.023,
                        learning_rate=0.10,
                        random_state=42,
                        enable_categorical=True,
                        tree_method = 'hist',
                        n_jobs = -1)

In [24]:
# LightGBM

light_gbm_model_upper_1 = lightgbm.LGBMRegressor(n_estimators=1200,
                  random_state=42,                  
                  subsample=0.87,
                  colsample_bytree=0.9,
                  reg_alpha=0.1,                  
                  objective='quantile',
                  alpha=0.95,
                  verbose=-1)

In [25]:
# CatBoost

catboost_model_upper_1 = catboost.CatBoostRegressor(loss_function='Quantile:alpha=0.95',
                                           learning_rate=0.05, 
                                           colsample_bylevel=0.89, 
                                           subsample=0.8, 
                                           max_depth=6, 
                                           iterations=1500,
                                           random_state=42, 
                                           task_type='CPU', 
                                           boosting_type='Plain', 
                                           bootstrap_type='Bernoulli', 
                                           verbose=500
                                           #'cat_features': cat_cols
                                           )

In [26]:
# GradientBoostingRegressor

hgbr_model_upper_1 = HistGradientBoostingRegressor(loss='quantile',
                                    quantile=0.95,
                                    learning_rate=0.2,
                                    max_iter=1200,
                                    #max_depth=6,
                                    max_leaf_nodes=31,
                                    min_samples_leaf=20,
                                    #min_samples_split=10,
                                    #subsample=0.8,
                                    random_state=42)

In [27]:
# Hyperparameters for StackingRegressor upper

estimators_upper_1 = [
    ('xgb_upper', xgb_model_upper_1),
    ('lgbm_upper', light_gbm_model_upper_1),
    ('cat_upper', catboost_model_upper_1),
    ('hgbr_upper', hgbr_model_upper_1)
]

final_estimator_2_upper_1 = lightgbm.LGBMRegressor(n_estimators=1200,
                  random_state=42,                  
                  subsample=0.87,
                  colsample_bytree=0.9,
                  reg_alpha=0.1,                  
                  objective='quantile',
                  alpha=0.95,
                  verbose=-1)

# final_estimator_2_upper = RidgeCV(
#                                 alphas=np.logspace(-4, 4, 20),     
#                                 #store_cv_values=True,              
#                                 cv=5,                              
#                                 scoring='neg_mean_absolute_error'
#                                 )

In [28]:
sr_upper_1 = StackingRegressor(estimators = estimators_upper_1,
                             final_estimator = final_estimator_2_upper_1,
                            cv = 5,
                            n_jobs = -1,
                            passthrough = True,
                            )

In [29]:
# sr_upper_1 = joblib.load('/kaggle/input/ensemble-of-ensemble-models-1-2/sr_upper_lightgbm_4.joblib')

In [30]:
sr_upper_1.fit(X_train, y_sale_price_train)

0:	learn: 21189.6341658	total: 166ms	remaining: 4m 9s
500:	learn: 7740.3104023	total: 35.1s	remaining: 1m 9s
1000:	learn: 7208.8612891	total: 54.9s	remaining: 27.4s
1499:	learn: 6941.3586776	total: 1m 12s	remaining: 0us
0:	learn: 21216.5559272	total: 13.9s	remaining: 5h 47m
0:	learn: 21223.6169000	total: 13.5s	remaining: 5h 38m 3s
0:	learn: 21195.2874920	total: 369ms	remaining: 9m 13s
0:	learn: 21210.1301418	total: 451ms	remaining: 11m 15s
500:	learn: 7733.8735448	total: 1m 47s	remaining: 3m 34s
500:	learn: 7799.1832781	total: 1m 33s	remaining: 3m 7s
500:	learn: 7750.3894030	total: 1m 49s	remaining: 3m 37s
500:	learn: 7732.0901128	total: 1m 34s	remaining: 3m 8s
1000:	learn: 7194.7269476	total: 2m 55s	remaining: 1m 27s
1000:	learn: 7209.7557636	total: 2m 56s	remaining: 1m 27s
1000:	learn: 7229.3835818	total: 2m 41s	remaining: 1m 20s
1000:	learn: 7192.5775742	total: 2m 43s	remaining: 1m 21s
1499:	learn: 7011.2218461	total: 3m 52s	remaining: 0us
42, bin=147 score 1.37082287
20, bin=1 scor

In [31]:
joblib.dump(sr_upper_1, 'sr_upper_lightgbm_4.joblib')

['sr_upper_lightgbm_4.joblib']

In [32]:
# Model 2

In [33]:
# XGBoost

xgb_model_lower_2 = xgb.XGBRegressor(objective="reg:quantileerror", 
                               quantile_alpha = 0.05,
                        n_estimators=1500,
                        subsample=0.84,  
                        colsample_bytree=0.85,
                        #device='cuda', 
                        reg_alpha=0.023,
                        learning_rate=0.10,
                        random_state=42,
                        enable_categorical=True,
                        tree_method = 'hist',
                        n_jobs = -1)

In [34]:
# LightGBM

light_gbm_model_lower_2 = lightgbm.LGBMRegressor(objective='quantile',
            alpha=0.05,
            n_estimators=1000,
            learning_rate=0.05,
            num_leaves=64,
            subsample=0.9,
            colsample_bytree=0.5,
            random_state=42,
            n_jobs=1,
            verbosity=-1,)

In [35]:
# CatBoost

catboost_model_lower_2 = catboost.CatBoostRegressor(loss_function='Quantile:alpha=0.05',
                                           learning_rate=0.05, 
                                           colsample_bylevel=0.89, 
                                           subsample=0.8, 
                                           max_depth=6, 
                                           iterations=1500,
                                           random_state=42, 
                                           task_type='CPU', 
                                           boosting_type='Plain', 
                                           bootstrap_type='Bernoulli', 
                                           verbose=500
                                           #'cat_features': cat_cols
                                           )

In [36]:
# GradientBoostingRegressor

hgbr_model_lower_2 = HistGradientBoostingRegressor(loss='quantile',
                                    quantile=0.05,
                                    learning_rate=0.2,
                                    max_iter=1200,
                                    #max_depth=6,
                                    max_leaf_nodes=31,
                                    min_samples_leaf=20,
                                    #min_samples_split=10,
                                    #subsample=0.8,
                                    random_state=42)

In [37]:
# Hyperparameters for StackingRegressor lower

estimators_lower_2 = [
    ('xgb_lower', xgb_model_lower_2),
    ('lgbm_lower', light_gbm_model_lower_2),
    ('cat_lower', catboost_model_lower_2),
    ('hgbr_lower', hgbr_model_lower_2)
]

final_estimator_1_lower_2 = lightgbm.LGBMRegressor(n_estimators=1200,
                  random_state=42,                  
                  subsample=0.87,
                  colsample_bytree=0.9,
                  reg_alpha=0.1,                  
                  objective='quantile',
                  alpha=0.05,
                  verbose=-1)

# final_estimator_1_lower = RidgeCV(
#                                 alphas=np.logspace(-4, 4, 20),     
#                                 #store_cv_values=True,              
#                                 cv=5,                              
#                                 scoring='neg_mean_absolute_error'
#                                 )

In [38]:
# Ensemble StackingRegressor

sr_lower_2 = StackingRegressor(estimators = estimators_lower_2,
                             final_estimator = final_estimator_1_lower_2,
                            cv = 5,
                            n_jobs = -1,
                            passthrough = True,
                            )

In [39]:
# sr_lower_2 = joblib.load('/kaggle/input/ensemble-of-ensemble-models-1-2/sr_lower_lightgbm_3.joblib')

In [40]:
sr_lower_2.fit(X_train, y_sale_price_train)

0:	learn: 21189.6341658	total: 153ms	remaining: 3m 49s
500:	learn: 7740.3104023	total: 29.9s	remaining: 59.5s
1000:	learn: 7208.8612891	total: 55.8s	remaining: 27.8s
1499:	learn: 6941.3586776	total: 1m 16s	remaining: 0us
0:	learn: 21195.2874920	total: 36s	remaining: 14h 59m 58s
0:	learn: 21223.6169000	total: 11.4s	remaining: 4h 45m 33s
0:	learn: 21216.5559272	total: 287ms	remaining: 7m 9s
0:	learn: 21210.1301418	total: 225ms	remaining: 5m 37s
500:	learn: 7750.3894030	total: 1m 30s	remaining: 2m 59s
500:	learn: 7799.1832781	total: 2m 7s	remaining: 4m 14s
500:	learn: 7733.8735448	total: 1m 42s	remaining: 3m 24s
500:	learn: 7732.0901128	total: 1m 32s	remaining: 3m 4s
1000:	learn: 7209.7557636	total: 2m 41s	remaining: 1m 20s
1000:	learn: 7194.7269476	total: 2m 54s	remaining: 1m 26s
1000:	learn: 7229.3835818	total: 3m 19s	remaining: 1m 39s
1000:	learn: 7192.5775742	total: 2m 44s	remaining: 1m 22s
1499:	learn: 7054.6108320	total: 4m 18s	remaining: 0us
45, bin=20 score 1.068098072
47, bin=0 s

In [41]:
joblib.dump(sr_lower_2, 'sr_lower_lightgbm_5.joblib')

['sr_lower_lightgbm_5.joblib']

In [42]:
# upper limit quantile alpha=0.95

In [43]:
# XGBoost

xgb_model_upper_2 = xgb.XGBRegressor(objective="reg:quantileerror", 
                               quantile_alpha = 0.95,
                        n_estimators=1500,
                        subsample=0.84,  
                        colsample_bytree=0.85,
                        #device='cuda', 
                        reg_alpha=0.023,
                        learning_rate=0.10,
                        random_state=42,
                        enable_categorical=True,
                        tree_method = 'hist',
                        n_jobs = -1)

In [44]:
# LightGBM

light_gbm_model_upper_2 = lightgbm.LGBMRegressor(objective='quantile',
            alpha=0.95,
            n_estimators=1000,
            learning_rate=0.05,
            num_leaves=64,
            subsample=0.9,
            colsample_bytree=0.5,
            random_state=42,
            n_jobs=1,
            verbosity=-1,)

In [45]:
# CatBoost

catboost_model_upper_2 = catboost.CatBoostRegressor(loss_function='Quantile:alpha=0.95',
                                           learning_rate=0.05, 
                                           colsample_bylevel=0.89, 
                                           subsample=0.8, 
                                           max_depth=6, 
                                           iterations=1500,
                                           random_state=42, 
                                           task_type='CPU', 
                                           boosting_type='Plain', 
                                           bootstrap_type='Bernoulli', 
                                           verbose=500
                                           #'cat_features': cat_cols
                                           )

In [46]:
# GradientBoostingRegressor

hgbr_model_upper_2 = HistGradientBoostingRegressor(loss='quantile',
                                    quantile=0.95,
                                    learning_rate=0.2,
                                    max_iter=1200,
                                    #max_depth=6,
                                    max_leaf_nodes=31,
                                    min_samples_leaf=20,
                                    #min_samples_split=10,
                                    #subsample=0.8,
                                    random_state=42)

In [47]:
# Hyperparameters for StackingRegressor upper

estimators_upper_2 = [
    ('xgb_upper', xgb_model_upper_2),
    ('lgbm_upper', light_gbm_model_upper_2),
    ('cat_upper', catboost_model_upper_2),
    ('hgbr_upper', hgbr_model_upper_2)
]

final_estimator_2_upper_2 = lightgbm.LGBMRegressor(n_estimators=1200,
                  random_state=42,                  
                  subsample=0.87,
                  colsample_bytree=0.9,
                  reg_alpha=0.1,                  
                  objective='quantile',
                  alpha=0.95,
                  verbose=-1)

# final_estimator_2_upper = RidgeCV(
#                                 alphas=np.logspace(-4, 4, 20),     
#                                 #store_cv_values=True,              
#                                 cv=5,                              
#                                 scoring='neg_mean_absolute_error'
#                                 )

In [48]:
sr_upper_2 = StackingRegressor(estimators = estimators_upper_2,
                             final_estimator = final_estimator_2_upper_2,
                            cv = 5,
                            n_jobs = -1,
                            passthrough = True,
                            )

In [49]:
# sr_upper_2 = joblib.load('/kaggle/input/ensemble-of-ensemble-models-1-2/sr_upper_lightgbm_3.joblib')

In [50]:
sr_upper_2.fit(X_train, y_sale_price_train)

In [51]:
joblib.dump(sr_upper_2, 'sr_upper_lightgbm_5.joblib')

['sr_upper_lightgbm_5.joblib']

In [52]:
q5_cal_1 = sr_lower_1.predict(X_calib)
q95_cal_1 = sr_upper_1.predict(X_calib)

residuals_1 = np.maximum(y_calib - q95_cal_1, q5_cal_1 - y_calib)
conformal_quantile_1 = np.quantile(residuals_1,0.9)

In [53]:
q5_cal_2 = sr_lower_2.predict(X_calib)
q95_cal_2 = sr_upper_2.predict(X_calib)

residuals_2 = np.maximum(y_calib - q95_cal_2, q5_cal_2 - y_calib)
conformal_quantile_2 = np.quantile(residuals_2,0.9)

In [54]:
# prediction on test set

q5_test_1 = sr_lower_1.predict(X_test)
q95_test_1 = sr_upper_1.predict(X_test)

lower_bound_test_1 = q5_test_1 - conformal_quantile_1
upper_bound_test_1 = q95_test_1 + conformal_quantile_1

In [55]:
# prediction on test set

q5_test_2 = sr_lower_2.predict(X_test)
q95_test_2 = sr_upper_2.predict(X_test)

lower_bound_test_2 = q5_test_2 - conformal_quantile_2
upper_bound_test_2 = q95_test_2 + conformal_quantile_2

In [72]:
lower_bound_test = lower_bound_test_1*0.5897 + lower_bound_test_2*(1-0.5897)
upper_bound_test = upper_bound_test_1*0.5129+ upper_bound_test_2*(1-0.5129)

# margin
margin = (upper_bound_test - lower_bound_test)*0.0319
lower_bound_test = lower_bound_test - margin
upper_bound_test = upper_bound_test + margin

In [57]:
# Winkler Score Function

def winkler_score(y_true, y_lower, y_upper, alpha=0.1):
    width = y_upper - y_lower
    below = y_true < y_lower
    above = y_true > y_upper
    inside = ~below & ~above

    score = np.empty_like(y_true, dtype=float)
    score[inside] = width[inside]
    score[below] = width[below] + (2 / alpha) * (y_lower[below] - y_true[below])
    score[above] = width[above] + (2 / alpha) * (y_true[above] - y_upper[above])
    return score

In [58]:
def objective(weights):
    wl, wu, m = weights
    lower_bound_test = lower_bound_test_1*wl + lower_bound_test_2*(1-wl)
    upper_bound_test = upper_bound_test_1*wu + upper_bound_test_2*(1-wu)
    
    # margin
    margin = (upper_bound_test - lower_bound_test)*m
    lower_bound_test = lower_bound_test - margin
    upper_bound_test = upper_bound_test + margin

    scores = winkler_score(y_sale_price_test, lower_bound_test, upper_bound_test, alpha=0.1)
    score = np.mean(scores)

    return score

In [59]:
initial_guess = [0.5,0.5, 0.03]
bounds = [(0,1), (0,1), (0,1)]

In [60]:
result = minimize(objective, initial_guess, method='L-BFGS-B', bounds=bounds)

In [61]:
wl_opt, wu_opt, m_opt = result.x
min_winkler_score = result.fun

In [62]:
print(f"Optimal lower weight (wl): {wl_opt:.4f}")
print(f"Optimal upper weight (wu): {wu_opt:.4f}")
print(f"Optimal margin (m): {m_opt:.4f}")
print(f"Minimum Winkler Score: {min_winkler_score:.4f}")

Optimal lower weight (wl): 0.5897
Optimal upper weight (wu): 0.5129
Optimal margin (m): 0.0319
Minimum Winkler Score: 332124.3945


In [73]:
# Winkler Score on X_test

winkler_scores_test = winkler_score(y_sale_price_test, lower_bound_test, upper_bound_test, alpha=0.1)
mean_winkler_test = np.mean(winkler_scores_test)

coverage_test = np.mean((y_sale_price_test >= lower_bound_test) & (y_sale_price_test <= upper_bound_test))

print(f"Mean Winkler Score: {mean_winkler_test:.2f}")
print(f"Empirical Coverage: {coverage_test:.3f}")

Mean Winkler Score: 332124.45
Empirical Coverage: 0.916


In [64]:
# Prediction on X_given_test (test.csv) which is to be submitted

q5_given_test_1 = sr_lower_1.predict(X_given_test)
q95_given_test_1 = sr_upper_1.predict(X_given_test)

lower_bound_given_test_1 = q5_given_test_1 - conformal_quantile_1
upper_bound_given_test_1 = q95_given_test_1 + conformal_quantile_1

In [65]:
# Prediction on X_given_test (test.csv) which is to be submitted

q5_given_test_2 = sr_lower_2.predict(X_given_test)
q95_given_test_2 = sr_upper_2.predict(X_given_test)

lower_bound_given_test_2 = q5_given_test_2 - conformal_quantile_2
upper_bound_given_test_2 = q95_given_test_2 + conformal_quantile_2

0:	learn: 62708.7911076	total: 164ms	remaining: 4m 5s
500:	learn: 9504.9325684	total: 27.6s	remaining: 55.1s
1000:	learn: 8657.3040892	total: 53.9s	remaining: 26.9s
1499:	learn: 8194.2279879	total: 1m 14s	remaining: 0us


In [74]:
lower_bound_given_test = lower_bound_given_test_1*0.5897 + lower_bound_given_test_2*(1-0.5897)
upper_bound_given_test = upper_bound_given_test_1*0.5129 + upper_bound_given_test_2*(1-0.5129)

# margin
margin = (upper_bound_given_test - lower_bound_given_test)*0.0319
lower_bound_given_test = lower_bound_given_test - margin
upper_bound_given_test = upper_bound_given_test + margin

In [75]:
print(len(id_given_test))

200000


In [76]:
print(len(lower_bound_given_test))

200000


In [77]:
# saving File for Submission

result = pd.DataFrame({
    'id' : id_given_test,
    'pi_lower' : lower_bound_given_test,
    'pi_upper' : upper_bound_given_test
})

In [78]:
print(result)

            id       pi_lower      pi_upper
0       200000  804715.803549  1.148973e+06
1       200001  555066.411071  7.528674e+05
2       200002  416245.594386  6.882123e+05
3       200003  288119.099850  4.433915e+05
4       200004  370870.377368  6.319982e+05
...        ...            ...           ...
199995  399995  219830.242714  3.802775e+05
199996  399996  238485.846939  3.563710e+05
199997  399997  381367.462383  5.176807e+05
199998  399998  434791.643221  5.642576e+05
199999  399999  522583.952489  6.609900e+05

[200000 rows x 3 columns]


In [79]:
result.to_csv('submission_20_pi_house_price.csv', index=False)