## Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

# Load data

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Preprocessing (same as before, but NO new FE)

In [4]:
train_id, test_id = train['id'], test['id']
train, test = train.drop('id', axis=1), test.drop('id', axis=1)
y = train['accident_risk']
train = train.drop('accident_risk', axis=1)

all_data = pd.concat([train, test], axis=0, ignore_index=True)
cat_cols = ['road_type', 'lighting', 'weather', 'time_of_day']
for col in cat_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col].astype(str))

bool_cols = ['road_signs_present', 'public_road', 'holiday', 'school_season']
for col in bool_cols:
    all_data[col] = all_data[col].astype(int)

train, test = all_data[:len(train)].reset_index(drop=True), all_data[len(train):].reset_index(drop=True)
num_cols = ['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents']
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

X, X_test = train, test

print('Preprocessing complete.')

Preprocessing complete.


# Fixed LGBM params (tuned from top solutions)

In [5]:
lgbm_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'n_estimators': 1500,
    'learning_rate': 0.02,
    'max_depth': 7,
    'num_leaves': 50,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'random_state': 42,
    'verbosity': -1
}

# RF params (from your good RF run)

In [6]:
rf_params = {
    'n_estimators': 1000,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
    'n_jobs': -1
}


# 3-fold CV for speed

In [7]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)
lgbm_oof, rf_oof = np.zeros(len(X)), np.zeros(len(X))
lgbm_preds, rf_preds = np.zeros(len(X_test)), np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'Fold {fold+1}/3')
    X_tr, X_vl = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_vl = y.iloc[train_idx], y.iloc[val_idx]
    
    # LGBM
    lgb_model = lgb.LGBMRegressor(**lgbm_params)
    lgb_model.fit(X_tr, y_tr)
    lgbm_oof[val_idx] = lgb_model.predict(X_vl)
    lgbm_preds += lgb_model.predict(X_test) / 3
    
    # RF
    rf_model = RandomForestRegressor(**rf_params)
    rf_model.fit(X_tr, y_tr)
    rf_oof[val_idx] = rf_model.predict(X_vl)
    rf_preds += rf_model.predict(X_test) / 3

Fold 1/3
Fold 2/3
Fold 3/3


# OOF RMSE

In [8]:
lgbm_rmse = np.sqrt(mean_squared_error(y, lgbm_oof))
rf_rmse = np.sqrt(mean_squared_error(y, rf_oof))
print(f'LGBM OOF RMSE: {lgbm_rmse:.5f}')
print(f'RF OOF RMSE: {rf_rmse:.5f}')

LGBM OOF RMSE: 0.05610
RF OOF RMSE: 0.05911


# Ensemble: 50/50 weighted (simple, effective)

In [9]:
ensemble_oof = 0.5 * lgbm_oof + 0.5 * rf_oof
ensemble_rmse = np.sqrt(mean_squared_error(y, ensemble_oof))
print(f'Ensemble OOF RMSE: {ensemble_rmse:.5f}')

Ensemble OOF RMSE: 0.05682


# Test preds

In [10]:
y_pred_test = 0.5 * lgbm_preds + 0.5 * rf_preds
y_pred_test = np.clip(y_pred_test, 0, 1)

# Diagnostics

In [11]:
print(f'Test pred mean: {y_pred_test.mean():.3f}, std: {y_pred_test.std():.3f}')
print(f'Target mean: {y.mean():.3f}, std: {y.std():.3f}')

Test pred mean: 0.352, std: 0.157
Target mean: 0.352, std: 0.166


# Submission

In [12]:
submission = pd.DataFrame({'id': test_id, 'accident_risk': y_pred_test})
submission.to_csv('submission_fixed_lgbm.csv', index=False)
print('Submission saved as submission_fixed_lgbm.csv')
print(submission.head())

Submission saved as submission_fixed_lgbm.csv
       id  accident_risk
0  517754       0.312811
1  517755       0.122548
2  517756       0.174609
3  517757       0.316887
4  517758       0.406062
