In [17]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
import optuna


In [18]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()


Train shape: (517754, 14)
Test shape: (172585, 13)


Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [19]:
train['source'] = 'train'
test['source'] = 'test'
test['accident_risk'] = np.nan

data = pd.concat([train, test], ignore_index=True)
print("Combined shape:", data.shape)


Combined shape: (690339, 15)


In [20]:
cat_cols = ['road_type', 'lighting', 'weather', 'time_of_day']
bool_cols = ['road_signs_present', 'public_road', 'holiday', 'school_season']

for col in cat_cols + bool_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))


In [21]:
print("Available columns:", data.columns.tolist())

Available columns: ['id', 'road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season', 'num_reported_accidents', 'accident_risk', 'source']


In [22]:
# Basic features
data['lanes_per_speed'] = data['num_lanes'] / data['speed_limit']
data['curvature_x_speed'] = data['curvature'] * data['speed_limit']
data['accidents_per_lane'] = data['num_reported_accidents'] / (data['num_lanes'] + 1)
data['is_high_risk_road'] = ((data['road_type'] == 2) & (data['speed_limit'] > 60)).astype(int)
data['weather_lighting_combo'] = data['weather'] * 10 + data['lighting']

# Additional feature interactions
data['speed_per_accident'] = data['speed_limit'] / (data['num_reported_accidents'] + 1)
data['risk_score'] = data['speed_limit'] * data['curvature'] / (data['num_lanes'] + 1)
data['accident_density'] = data['num_reported_accidents'] / (data['num_lanes'])
data['speed_curvature_ratio'] = data['speed_limit'] / (data['curvature'] + 1)

# Time-based features
data['time_risk'] = np.where(data['time_of_day'].isin([0, 4]), 1, 0)  # High risk times (dawn/dusk)
data['holiday_school'] = data['holiday'].astype(int) * data['school_season'].astype(int)

# Polynomial features for important numerical columns
data['speed_limit_squared'] = data['speed_limit'] ** 2
data['curvature_squared'] = data['curvature'] ** 2
data['num_lanes_squared'] = data['num_lanes'] ** 2

# Interaction between categorical and numerical features
for cat in ['road_type', 'weather', 'lighting']:
    data[f'{cat}_speed'] = data[cat] * data['speed_limit']
    data[f'{cat}_accidents'] = data[cat] * data['num_reported_accidents']
    
# Safety indicator features
data['road_complexity'] = data['curvature'] * (data['num_lanes'] / (data['road_signs_present'].astype(int) + 1))
data['weather_risk'] = data['weather'] * (data['speed_limit'] / data['num_lanes'])
data['combined_safety_score'] = (data['road_signs_present'].astype(int) * data['public_road'].astype(int)) / (data['speed_limit'] + 1)

In [23]:
train = data[data['source'] == 'train'].drop('source', axis=1)
test = data[data['source'] == 'test'].drop(['source', 'accident_risk'], axis=1)

X = train.drop(['accident_risk', 'id'], axis=1)
y = train['accident_risk']
X_test = test.drop('id', axis=1)


In [24]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds_lgb = np.zeros(len(X))
oof_preds_xgb = np.zeros(len(X))
oof_preds_cat = np.zeros(len(X))
test_preds_lgb = np.zeros(len(X_test))
test_preds_xgb = np.zeros(len(X_test))
test_preds_cat = np.zeros(len(X_test))

fold = 1
for train_idx, val_idx in kf.split(X):
    print(f"\n===== Fold {fold} =====")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # LightGBM
    lgb_model = lgb.LGBMRegressor(
        n_estimators=3000,
        learning_rate=0.005,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_samples=20,
        random_state=42,
        verbose=200
    )

    early_stopping_callback = lgb.early_stopping(
    stopping_rounds=100,  # Number of rounds without improvement before stopping
    verbose=True         # Whether to print early stopping messages
    )

    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', callbacks=[early_stopping_callback])
    lgb_pred = lgb_model.predict(X_val)
    oof_preds_lgb[val_idx] = lgb_pred
    test_preds_lgb += lgb_model.predict(X_test) / kf.n_splits

    # XGBoost
    xgb_model = xgb.XGBRegressor(
        n_estimators=3000,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=3,
        objective='reg:squarederror',
        random_state=42,
        eval_metric='rmse',
        early_stopping_rounds=100,
        verbose=200
    )
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    xgb_pred = xgb_model.predict(X_val)
    oof_preds_xgb[val_idx] = xgb_pred
    test_preds_xgb += xgb_model.predict(X_test) / kf.n_splits

    # CatBoost
    cat_model = CatBoostRegressor(
        iterations=3000,
        learning_rate=0.005,
        depth=6,
        l2_leaf_reg=3,
        loss_function='RMSE',
        random_seed=42,
        verbose=200
    )
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))
    cat_pred = cat_model.predict(X_val)
    oof_preds_cat[val_idx] = cat_pred
    test_preds_cat += cat_model.predict(X_test) / kf.n_splits

    # Print individual model metrics
    fold_rmse_lgb = np.sqrt(mean_squared_error(y_val, lgb_pred))
    fold_rmse_xgb = np.sqrt(mean_squared_error(y_val, xgb_pred))
    fold_rmse_cat = np.sqrt(mean_squared_error(y_val, cat_pred))
    print(f"Fold {fold} RMSE - LGB: {fold_rmse_lgb:.5f}, XGB: {fold_rmse_xgb:.5f}, CAT: {fold_rmse_cat:.5f}")
    fold += 1

print("\nOverall OOF RMSEs:")
print("LGB:", np.sqrt(mean_squared_error(y, oof_preds_lgb)))
print("XGB:", np.sqrt(mean_squared_error(y, oof_preds_xgb)))
print("CAT:", np.sqrt(mean_squared_error(y, oof_preds_cat)))


===== Fold 1 =====
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.812932
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.270068
[LightGBM] [Debug] init for col-wise cost 0.009543 seconds, init for row-wise cost 0.049070 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013677 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1502
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 35
[LightGBM] [Info] Start training from score 0.352605
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [D

In [25]:
# Create stacking features using OOF predictions
stack_train = np.vstack([oof_preds_lgb, oof_preds_xgb, oof_preds_cat]).T
stack_test = np.vstack([test_preds_lgb, test_preds_xgb, test_preds_cat]).T

# Add weighted version of predictions to the stacking features
weights = np.array([0.4, 0.3, 0.3])  # Initial weights based on typical performance
weighted_oof = (oof_preds_lgb * weights[0] + oof_preds_xgb * weights[1] + oof_preds_cat * weights[2])
weighted_test = (test_preds_lgb * weights[0] + test_preds_xgb * weights[1] + test_preds_cat * weights[2])

stack_train = np.hstack([stack_train, weighted_oof.reshape(-1, 1)])
stack_test = np.hstack([stack_test, weighted_test.reshape(-1, 1)])

# Fit meta model with regularization
meta = Ridge(alpha=0.5)
meta.fit(stack_train, y)
final_test_pred = meta.predict(stack_test)
final_test_pred = np.clip(final_test_pred, 0, 1)

print("Meta ensemble complete.")
print("Meta model coefficients:", meta.coef_)

Meta ensemble complete.
Meta model coefficients: [ 0.07499567  0.7820162  -0.09322291  0.23663625]


In [26]:
submission = pd.DataFrame({
    'id': test['id'],
    'accident_risk': final_test_pred
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,id,accident_risk
517754,517754,0.291179
517755,517755,0.123618
517756,517756,0.182553
517757,517757,0.318115
517758,517758,0.392975
