# Import Libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor  # Reuse your RF
import optuna
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


# Load data

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Preprocessing (same as RF)

In [6]:
train_id, test_id = train['id'], test['id']
train, test = train.drop('id', axis=1), test.drop('id', axis=1)
y = train['accident_risk']
train = train.drop('accident_risk', axis=1)

all_data = pd.concat([train, test], axis=0, ignore_index=True)
cat_cols = ['road_type', 'lighting', 'weather', 'time_of_day']
for col in cat_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col].astype(str))

bool_cols = ['road_signs_present', 'public_road', 'holiday', 'school_season']
for col in bool_cols:
    all_data[col] = all_data[col].astype(int)

train, test = all_data[:len(train)].reset_index(drop=True), all_data[len(train):].reset_index(drop=True)
num_cols = ['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents']
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

X, X_test = train, test


# Simple FE: Interaction example

In [7]:
X['curv_speed'] = X['curvature'] * X['speed_limit']
X_test['curv_speed'] = X_test['curvature'] * X_test['speed_limit']

# Optuna Objective for LGBM

In [8]:
def lgbm_objective(trial):
    params = {
        'objective': 'regression', 'metric': 'rmse', 'verbosity': -1,
        'boosting_type': 'gbdt', 'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
        'random_state': 42
    }
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in kf.split(X):
        X_tr, X_vl = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_vl = y.iloc[train_idx], y.iloc[val_idx]
        model = lgb.LGBMRegressor(**params)
        model.fit(X_tr, y_tr)
        pred = model.predict(X_vl)
        scores.append(np.sqrt(mean_squared_error(y_vl, pred)))
    return np.mean(scores)

# Tune LGBM

In [9]:
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(lgbm_objective, n_trials=50)
lgbm_params = study_lgbm.best_params
print(f'Best LGBM RMSE: {study_lgbm.best_value:.5f}, Params: {lgbm_params}')

[I 2025-10-04 22:04:41,156] A new study created in memory with name: no-name-58076813-5c6b-4876-b5e5-09f6e6a00287
[I 2025-10-04 22:05:43,553] Trial 0 finished with value: 0.05613478979704034 and parameters: {'n_estimators': 1189, 'learning_rate': 0.026983773709503063, 'max_depth': 5, 'num_leaves': 62, 'subsample': 0.8509211365138216, 'colsample_bytree': 0.9467071839626432}. Best is trial 0 with value: 0.05613478979704034.
[I 2025-10-04 22:07:06,325] Trial 1 finished with value: 0.0562929649139582 and parameters: {'n_estimators': 1603, 'learning_rate': 0.06852717334289402, 'max_depth': 7, 'num_leaves': 62, 'subsample': 0.9033669454860865, 'colsample_bytree': 0.9971773409729534}. Best is trial 0 with value: 0.05613478979704034.
[I 2025-10-04 22:09:11,484] Trial 2 finished with value: 0.05612008827068371 and parameters: {'n_estimators': 1272, 'learning_rate': 0.010855675632399444, 'max_depth': 9, 'num_leaves': 36, 'subsample': 0.8095824549128481, 'colsample_bytree': 0.8971063340526393}. B

Best LGBM RMSE: 0.05602, Params: {'n_estimators': 617, 'learning_rate': 0.015921938464615877, 'max_depth': 10, 'num_leaves': 100, 'subsample': 0.9727201188776715, 'colsample_bytree': 0.8567874321811487}


# Similar for CatBoost (simplified, no full Optuna for brevity)

In [10]:
cb_params = {'iterations': 1000, 'learning_rate': 0.05, 'depth': 6, 'random_seed': 42, 'verbose': False}
print('Using default-tuned CatBoost params.')


Using default-tuned CatBoost params.


# Train models with 5-fold CV

In [11]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lgbm_oof, cb_oof, rf_oof = np.zeros(len(X)), np.zeros(len(X)), np.zeros(len(X))
lgbm_preds, cb_preds, rf_preds = np.zeros(len(X_test)), np.zeros(len(X_test)), np.zeros(len(X_test))

for train_idx, val_idx in kf.split(X):
    X_tr, X_vl = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_vl = y.iloc[train_idx], y.iloc[val_idx]

# LGBM

In [12]:
    lgb_model = lgb.LGBMRegressor(**{**lgbm_params, 'verbosity': -1, 'random_state': 42})
    lgb_model.fit(X_tr, y_tr)
    lgbm_oof[val_idx] = lgb_model.predict(X_vl)
    lgbm_preds += lgb_model.predict(X_test) / 5

 # CatBoost

In [13]:
    cb_model = cb.CatBoostRegressor(**cb_params)
    cb_model.fit(X_tr, y_tr)
    cb_oof[val_idx] = cb_model.predict(X_vl)
    cb_preds += cb_model.predict(X_test) / 5

  # RF (your original)

In [14]:

    rf_model = RandomForestRegressor(n_estimators=1000, max_depth=None, random_state=42, n_jobs=-1)
    rf_model.fit(X_tr, y_tr)
    rf_oof[val_idx] = rf_model.predict(X_vl)
    rf_preds += rf_model.predict(X_test) / 5

# OOF RMSE

In [15]:
print(f'LGBM OOF RMSE: {np.sqrt(mean_squared_error(y, lgbm_oof)):.5f}')
print(f'CatBoost OOF RMSE: {np.sqrt(mean_squared_error(y, cb_oof)):.5f}')
print(f'RF OOF RMSE: {np.sqrt(mean_squared_error(y, rf_oof)):.5f}')

LGBM OOF RMSE: 0.34948
CatBoost OOF RMSE: 0.34949
RF OOF RMSE: 0.34958


# Ensemble: Weighted average (tune weights based on OOF)

In [16]:
w_lgbm, w_cb, w_rf = 0.4, 0.3, 0.3  # Example; optimize via another Optuna if needed
ensemble_pred = w_lgbm * lgbm_preds + w_cb * cb_preds + w_rf * rf_preds
ensemble_oof = w_lgbm * lgbm_oof + w_cb * cb_oof + w_rf * rf_oof
print(f'Ensemble OOF RMSE: {np.sqrt(mean_squared_error(y, ensemble_oof)):.5f}')

Ensemble OOF RMSE: 0.34949


# Test preds

In [17]:
y_pred_test = np.clip(ensemble_pred, 0, 1)

# Submission

In [18]:
submission = pd.DataFrame({'id': test_id, 'accident_risk': y_pred_test})
submission.to_csv('submission_lgbm.csv', index=False)
print('Submission saved as submission_lgbm.csv')
print(submission.head())

Submission saved as submission_lgbm.csv
       id  accident_risk
0  517754       0.060817
1  517755       0.024368
2  517756       0.035554
3  517757       0.063725
4  517758       0.080910
