# Model Improvement & Hyperparameter Optimization

Enhancing the baseline model through geospatial feature engineering and automated hyperparameter tuning using Optuna.

In [1]:
pip install lightgbm


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import re
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

In [3]:
# LOAD TEAM'S PROCESSED DATA

print("Loading team's processed data...")

# Artifacts created by teammates (feature matrix, target, test ids, test matrix).
# NOTE(review): joblib.load unpickles arbitrary objects - only load files from a trusted source.
X_team = joblib.load("../data/X.joblib")
y = joblib.load("../data/y.joblib")
test_ids = joblib.load("../data/test_ids.joblib")
X_test_team = joblib.load("../data/X_test.joblib")

# Raw data is needed for the geospatial calculations (latitude / longitude columns).
train_raw = pd.read_csv("../data/train.csv")
test_raw = pd.read_csv("../data/test.csv")

# Re-apply the row filtering so the raw rows line up with the processed rows.
# NOTE(review): presumably mirrors the team's preprocessing (drop missing prices,
# cut above the 99th percentile) - confirm against their notebook.
# A raw string is used for the regex: "\$" in a normal string is an invalid escape sequence.
train_raw["price"] = train_raw["price"].replace({r"\$": "", ",": ""}, regex=True).astype(float)
train_raw = train_raw.dropna(subset=["price"])
q_high = train_raw["price"].quantile(0.99)
train_filtered = train_raw[train_raw["price"] <= q_high].reset_index(drop=True)

# Ensure row counts match. A mismatch means the filtering above did not reproduce the
# processed data exactly, so row alignment is no longer guaranteed - say so loudly
# (print instead of warnings.warn, because warnings are globally silenced above).
n_filtered, n_team = train_filtered.shape[0], X_team.shape[0]
if n_filtered != n_team:
    print(
        f"WARNING: filtered raw data has {n_filtered} rows but processed data has {n_team}; "
        "truncating raw rows to the processed row count - geo features may be misaligned."
    )
    train_filtered = train_filtered.iloc[:n_team]

Loading team's processed data...


In [4]:
# GEOSPATIAL FEATURE ENGINEERING

print("Generating geospatial features...")

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two coordinates.

    All arguments are in decimal degrees. Scalars and array-likes that
    NumPy can broadcast (e.g. pandas Series against a scalar reference
    point) are accepted; the result has the broadcast shape.

    Returns:
        Distance(s) in km, computed with the haversine formula on a sphere
        of radius 6371 km.
    """
    R = 6371  # Earth radius in km
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
    # Rounding can push `a` a hair outside [0, 1] (near-antipodal points),
    # which would make sqrt(1 - a) NaN; clamp before taking roots.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Reference points in Istanbul used for distance features.
# Each value is a (latitude, longitude) pair in decimal degrees.
locations = dict(
    Taksim=(41.0370, 28.9851),
    Sultanahmet=(41.0054, 28.9768),
    Besiktas=(41.0422, 29.0060),
    Kadikoy=(40.9901, 29.0254),
    Airport=(41.2811, 28.7533),
)

def get_geo_features(df):
    """Build distance-based features from a frame with latitude/longitude columns.

    For every entry in the module-level ``locations`` mapping a column
    ``dist_<name>`` is produced via ``haversine_distance``. An extra column
    ``min_dist_center`` holds the row-wise minimum over all distance columns
    whose name does not contain "Airport".

    Returns:
        A new DataFrame containing only the generated feature columns.
    """
    distance_columns = {
        f"dist_{name}": haversine_distance(df["latitude"], df["longitude"], ref_lat, ref_lon)
        for name, (ref_lat, ref_lon) in locations.items()
    }
    features = pd.DataFrame(distance_columns)

    # Nearest "city center" distance: every reference point except the airport
    center_columns = [col for col in features.columns if "Airport" not in col]
    features["min_dist_center"] = features[center_columns].min(axis=1)
    return features
    
# Compute the geo feature frames: first for the filtered training rows, then for the raw test rows
X_geo_train, X_geo_test = (get_geo_features(frame) for frame in (train_filtered, test_raw))

Generating geospatial features...


In [5]:
# MERGE & CLEAN

print("Merging features...")
# Team features and the new geo features are joined column-wise; both sides get a
# fresh 0..n-1 index first so the concat pairs rows by position.
X_final = pd.concat(
    [part.reset_index(drop=True) for part in (X_team, X_geo_train)],
    axis=1,
)
X_test_final = pd.concat(
    [part.reset_index(drop=True) for part in (X_test_team, X_geo_test)],
    axis=1,
)

# Clean column names to prevent LightGBM errors (special characters/duplicates)
def clean_col_names(df):
    """Sanitise and de-duplicate the column names of ``df``.

    Every character outside ``[A-Za-z0-9_]`` is stripped from each column
    name. When a cleaned name repeats, later occurrences get a numeric
    suffix (``name_2``, ``name_3``, ...). Suffixed candidates are checked
    against every name already emitted, so a generated ``name_2`` can never
    collide with a column that is literally called ``name_2``.

    The frame is modified in place (its ``columns`` are reassigned) and the
    same object is returned, so callers may use either the return value or
    the original reference.
    """
    new_cols = []
    emitted = set()      # every final name handed out so far
    occurrences = {}     # cleaned base name -> highest occurrence number used
    for col in df.columns:
        base = re.sub(r'[^A-Za-z0-9_]+', '', str(col))
        count = occurrences.get(base, 0) + 1
        candidate = base if count == 1 else f"{base}_{count}"
        # Skip over suffixes that are already taken by another column
        while candidate in emitted:
            count += 1
            candidate = f"{base}_{count}"
        occurrences[base] = count
        emitted.add(candidate)
        new_cols.append(candidate)
    df.columns = new_cols
    return df

# Sanitise the column names of the train matrix first, then the test matrix
X_final, X_test_final = clean_col_names(X_final), clean_col_names(X_test_final)

Merging features...


In [6]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [7]:
# HYPERPARAMETER OPTIMIZATION (OPTUNA)
import optuna

print("Starting Optuna optimization...")

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable across runs
(X_train, X_valid, y_train, y_valid) = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: fit one LightGBM regressor and return its validation RMSE.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    module-level ``X_train`` / ``y_train`` with early stopping (patience 50)
    monitored on ``X_valid`` / ``y_valid``. The returned value is the RMSE of
    the predictions on the validation set, which the study minimises.
    """
    # Settings that stay the same for every trial
    fixed = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "random_state": 42,
        "n_jobs": -1,
    }
    # Search space (n_estimators acts as an upper bound because of early stopping).
    # NOTE(review): per the LightGBM docs, subsample (bagging_fraction) only takes
    # effect when subsample_freq (bagging_freq) is > 0; it is left at its default
    # here, so this dimension may be inert - confirm intent.
    searched = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 2500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    }

    regressor = lgb.LGBMRegressor(**fixed, **searched)
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )

    valid_pred = regressor.predict(X_valid)
    return np.sqrt(mean_squared_error(y_valid, valid_pred))

# Seed the sampler so the hyperparameter proposals are reproducible across runs
# (an unseeded sampler draws a different search trajectory every time).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20) # Keep trials low for quick demo, increase for better results

print(f"Best RMSE: {study.best_value:.5f}")
print("Best Params:", study.best_params)

[I 2025-12-02 02:38:11,510] A new study created in memory with name: no-name-d3b461c8-43d4-4bcc-8c78-ef865f1d636f


Starting Optuna optimization...


[I 2025-12-02 02:38:17,356] Trial 0 finished with value: 0.42467776374313393 and parameters: {'n_estimators': 691, 'learning_rate': 0.036459672832288126, 'num_leaves': 89, 'max_depth': 8, 'colsample_bytree': 0.858742793555864, 'subsample': 0.5343167316151838}. Best is trial 0 with value: 0.42467776374313393.
[I 2025-12-02 02:38:20,280] Trial 1 finished with value: 0.4224744719330957 and parameters: {'n_estimators': 1217, 'learning_rate': 0.0914301961837681, 'num_leaves': 25, 'max_depth': 8, 'colsample_bytree': 0.8565901402587315, 'subsample': 0.883577982435841}. Best is trial 1 with value: 0.4224744719330957.
[I 2025-12-02 02:38:23,787] Trial 2 finished with value: 0.418610255341611 and parameters: {'n_estimators': 1451, 'learning_rate': 0.08236359401967983, 'num_leaves': 24, 'max_depth': 7, 'colsample_bytree': 0.58728582603724, 'subsample': 0.8498208352647839}. Best is trial 2 with value: 0.418610255341611.
[I 2025-12-02 02:38:31,918] Trial 3 finished with value: 0.41759240089351135 a

Best RMSE: 0.41483
Best Params: {'n_estimators': 2059, 'learning_rate': 0.025343452969683825, 'num_leaves': 68, 'max_depth': 11, 'colsample_bytree': 0.6625893951602744, 'subsample': 0.6509179025430295}


In [8]:
# TRAIN & SAVE FINAL MODEL
import os

print("Training final model with best parameters...")
# study.best_params only contains the *searched* values. Re-add the fixed settings
# that every tuning trial used and train with the same early stopping, so the saved
# model corresponds to the configuration whose RMSE the study reported. Without
# this the model would be unseeded and would always grow the full n_estimators
# trees, which during tuning was only an upper bound.
final_params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "random_state": 42,
    "n_jobs": -1,
    **study.best_params,
}
best_model = lgb.LGBMRegressor(**final_params)
best_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(50, verbose=False)],
)


os.makedirs("../models", exist_ok=True)
os.makedirs("../data", exist_ok=True)
os.makedirs("../submissions", exist_ok=True)


# Save artifacts for the Evaluation Notebook
print("Saving artifacts...")
joblib.dump(best_model, "../models/best_lgbm_model.pkl")
joblib.dump(X_final, "../data/X_final_enhanced.joblib")
joblib.dump(y, "../data/y_final.joblib")
joblib.dump(X_valid, "../data/X_valid_for_eval.joblib")
joblib.dump(y_valid, "../data/y_valid_for_eval.joblib")

# Generate Submission
# NOTE(review): expm1 assumes the target y was log1p-transformed upstream - confirm.
final_preds = np.expm1(best_model.predict(X_test_final))
final_preds = np.maximum(final_preds, 0)  # clamp any negative prediction to zero

# Use original IDs
df_test_orig = pd.read_csv("../data/test.csv")
sub = pd.DataFrame({"ID": df_test_orig["id"], "TARGET": final_preds})
sub.to_csv(f"../submissions/submission_optuna_{study.best_value:.4f}.csv", index=False)

print("Model saved and submission created successfully.")

Training final model with best parameters...
Saving artifacts...
Model saved and submission created successfully.
