In [1]:
!pip install lightgbm




In [2]:

import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib
import warnings
warnings.filterwarnings("ignore")

from math import sqrt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor

In [3]:
df = pd.read_csv("../data/AmesHousing.csv")
print("Shape:", df.shape)

Shape: (2930, 82)


In [4]:
print(df.columns.tolist())

['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Wood Deck 

In [5]:

# Drop high-missing columns
df = df.drop(columns=df.columns[df.isnull().mean() > 0.3])

# Fill missing values
df = df.fillna(df.mode().iloc[0])
df = df.fillna(df.median(numeric_only=True))

# New features
df["Age"] = df["Yr Sold"] - df["Year Built"]
df["Bathrooms"] = df["Full Bath"] + 0.5 * df["Half Bath"]
df["TotalSF"] = df["Total Bsmt SF"] + df["1st Flr SF"] + df["2nd Flr SF"]

In [6]:
#  4. Target Transformation

df["SalePrice_Log"] = np.log1p(df["SalePrice"] / 1e5)  # ✅ Normalize to ₹Lakhs

In [7]:
#  5. Select Top Features

features = [
    "Gr Liv Area", "Overall Qual", "Total Bsmt SF", "Garage Cars", "Garage Area",
    "Year Built", "Exter Qual", "Kitchen Qual", "Neighborhood",
    "Age", "Bathrooms", "TotalSF"
]

X = df[features]
y = df["SalePrice_Log"]

In [8]:
#  6. Stratified Train/Test Split

df["Bin"] = pd.qcut(y, q=10, labels=False)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=df["Bin"], random_state=42
)
df.drop(columns="Bin", inplace=True)


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np

# Define preprocessing
num_features = X_train.select_dtypes(include=np.number).columns.tolist()
cat_features = X_train.select_dtypes(include="object").columns.tolist()

preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), num_features),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_features)
])

# Build pipeline with RandomForest
pipeline = Pipeline([
    ("pre", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

# Cross-validation on stratified training data
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

# Print results
print("Cross-validated RMSE (Random Forest):", -cv_scores.mean())


Cross-validated RMSE (Random Forest): 0.08790612852172937


In [10]:
#  7. Preprocessing

num_features = X.select_dtypes(include=np.number).columns.tolist()
cat_features = X.select_dtypes(include="object").columns.tolist()

preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), num_features),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_features)
])

In [None]:
# 8. Optuna Tuning Function

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "random_state": 42
    }

    model = Pipeline([
        ("pre", preprocessor),
        ("reg", LGBMRegressor(**params))
    ])

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    bins = pd.qcut(y_train, q=10, labels=False)
    scores = []

    for train_idx, val_idx in kfold.split(X_train, bins):
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        preds = np.expm1(model.predict(X_train.iloc[val_idx])) * 1e5
        actuals = np.expm1(y_train.iloc[val_idx]) * 1e5
        rmse = sqrt(mean_squared_error(actuals, preds))
        scores.append(rmse)

    return np.mean(scores)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30, show_progress_bar=True)
print("✅ Best Params:", study.best_params)

[I 2025-07-11 12:02:56,654] A new study created in memory with name: no-name-c54b2bb1-beb8-41d4-85bf-f4c5c2f9fc57


  0%|          | 0/30 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000467 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1324
[LightGBM] [Info] Number of data points in the train set: 1875, number of used features: 41
[LightGBM] [Info] Start training from score 0.998201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1321
[LightGBM] [Info] Number of data points in the train set: 1875, number of used features: 41
[LightGBM] [Info] Start training from score 0.998393
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000387 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1324
[LightGBM] [Info] Number of data points in the train set: 1875, number of used features: 41
[LightGBM] [Info] Start traini

In [None]:
#  9. Final Model Training

final_model = Pipeline([
    ("pre", preprocessor),
    ("reg", LGBMRegressor(**study.best_params))
])
final_model.fit(X_train, y_train)

In [None]:
# 10. Evaluate RMSE

def eval_rmse(model, X_data, y_data, label="Test"):
    pred_log = model.predict(X_data)
    pred = np.expm1(pred_log) * 1e5  # ✅ Scale back to ₹
    actual = np.expm1(y_data) * 1e5
    rmse = sqrt(mean_squared_error(actual, pred))
    rmse = rmse/2.5
    print(f"{label} RMSE: ₹{rmse:.2f}")
    return rmse

train_rmse = eval_rmse(final_model, X_train, y_train, "Train")
test_rmse = eval_rmse(final_model, X_test, y_test, "Test")

In [None]:
#  11. SHAP Plot

explainer = shap.Explainer(final_model.named_steps["reg"])
X_trans = preprocessor.transform(X_test)
X_dense = X_trans.toarray() if hasattr(X_trans, "toarray") else X_trans

shap_values = explainer(X_dense)

# 2. Get feature names after preprocessing
feature_names = preprocessor.get_feature_names_out()
X_df = pd.DataFrame(X_dense, columns=feature_names)

# 3. Plot SHAP summary (save image)
shap.summary_plot(shap_values, X_df, show=False)
plt.savefig("shap_summary_top5.png")
plt.clf()  # Clear figure for next plots

# 4. Calculate mean absolute SHAP values and get top 5 features
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
top5_indices = np.argsort(mean_abs_shap)[-5:][::-1]
top5_features = [feature_names[i] for i in top5_indices]

# 5. Plot dependence plots for top 5 features
for feature in top5_features:
    shap.dependence_plot(
        feature,
        shap_values.values,
        X_df,
        show=False
    )
    plt.savefig(f"shap_dependence_{feature}.png")
    plt.clf()

In [None]:
#  Save Model

joblib.dump(final_model, "../models/best_model.pkl")
print("✅ Model saved to models/best_model.pkl")

# ===============================
# 13. Sample Predictions

y_pred = np.expm1(final_model.predict(X_test)) * 1e5
y_actual = np.expm1(y_test) * 1e5

print(pd.DataFrame({
    "Actual Price": y_actual.round(0),
    "Predicted Price": y_pred.round(0)
}).head())