In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

import optuna
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_slice,
    plot_parallel_coordinate
)


import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_10k.csv
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_2k.csv
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_100k.csv


In [30]:
# Load the competition train/test tables, using the `id` column as the index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/playground-series-s5e10/train.csv',
        '/kaggle/input/playground-series-s5e10/test.csv',
    )
)

In [31]:
# Remove fully duplicated training rows, keeping the first occurrence of each.
train = train.drop_duplicates(keep='first')

In [32]:
# Keep only the first MAX_TRAIN_ROWS training rows
# (presumably to bound tuning time — TODO confirm this cap is intentional).
# `.copy()` detaches the result from the frame it was sliced from, so the
# column assignments in later cells write to an independent frame rather than
# to a slice (which triggers SettingWithCopyWarning in classic pandas).
MAX_TRAIN_ROWS = 200_000
train = train.head(MAX_TRAIN_ROWS).copy()

In [33]:
# Group the columns of `test` by their raw dtype: 64-bit numerics vs. strings.
raw_numeric_dtypes = ['float64', 'int64']
num_cols = list(test.select_dtypes(include=raw_numeric_dtypes).columns)
cat_cols = list(test.select_dtypes(include='object').columns)

In [34]:
# Downcast numeric columns to 32-bit and turn string columns into categoricals.
for col in num_cols:
    # The width decision is read from `test` and applied to both frames.
    narrow_dtype = 'float32' if test[col].dtype == 'float64' else 'int32'
    train[col] = train[col].astype(narrow_dtype)
    test[col] = test[col].astype(narrow_dtype)

for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [35]:
# Names of the input columns and of the prediction target.
# NOTE(review): 'curv_bin' is not created anywhere in the visible notebook
# (other cells read 'curvature'), and `features` is not referenced by any later
# cell — confirm whether this list is stale.
features = [
    'road_type',
    'num_lanes',
    'curv_bin',
    'speed_limit',
    'lighting',
    'weather',
    'road_signs_present',
    'public_road',
    'time_of_day',
    'holiday',
    'school_season',
    'num_reported_accidents',
]

target = 'accident_risk'

In [37]:
# Recompute the column groups now that dtypes are 32-bit numerics / categoricals.
downcast_numeric_dtypes = ['float32', 'int32']
num_cols = list(test.select_dtypes(include=downcast_numeric_dtypes).columns)
cat_cols = list(test.select_dtypes(include='category').columns)

# Feature Engineering

## Creating a base risk feature from domain knowledge

In [38]:
def risk(df, random_state=2):
    """Add a noisy, rule-based ``simulated_risk`` column to ``df``.

    The score is a weighted sum of hand-picked factors plus Gaussian noise
    (mean 0, standard deviation 0.05), clipped to ``[0, 1]`` and rounded to
    two decimals:

    * ``0.4 * curvature``
    * ``+ 0.2`` when ``lighting == 'night'``
    * ``+ 0.1`` when ``weather != 'clear'``
    * ``+ 0.2`` when ``speed_limit >= 60``
    * ``+ 0.1`` when ``num_reported_accidents > 4``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns used above. It is modified in place.
    random_state : int or None, default 2
        Seed for the noise generator, so repeated runs yield the same feature.
        Pass ``None`` to draw fresh, non-reproducible noise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``simulated_risk`` column added.
    """
    base_risk = (
        0.4 * df['curvature']
        + 0.2 * (df['lighting'] == 'night').astype(int)
        + 0.1 * (df['weather'] != 'clear').astype(int)
        + 0.2 * (df['speed_limit'] >= 60).astype(int)
        + 0.1 * (df['num_reported_accidents'] > 4).astype(int)
    )

    # A local generator keeps the noise reproducible without touching (or
    # depending on) NumPy's global random state.
    rng = np.random.default_rng(random_state)
    noise = rng.normal(0, 0.05, len(df))

    risk_score = np.clip(base_risk + noise, 0, 1)
    df['simulated_risk'] = np.round(risk_score, 2)

    return df

In [39]:
# Add the simulated risk feature to both frames (train first, then test).
train, test = risk(train), risk(test)

In [40]:
# Mean target encoding: for every categorical column `c`, add a `TE_<c>` column
# holding the mean of `target` observed in `train` for that category.
# NOTE(review): the means are computed on all of `train`, including rows that
# later act as validation folds, so cross-validation scores may be optimistic —
# consider out-of-fold encoding.
TE = []

# Fallback for categories that have no mean in `train`; identical for every
# column, so it is computed once.
global_mean = train[target].mean()

for c in cat_cols:
    te_map = train.groupby(c, observed=True)[target].mean()
    n = f"TE_{c}"
    print(f"{n}, ",end="")

    # Mapping a categorical column can produce a categorical result, so cast to
    # float before filling. Assigning the filled result (instead of calling
    # fillna(inplace=True) on a column selection) guarantees the fill is stored.
    train[n] = train[c].map(te_map).astype('float64').fillna(global_mean)
    test[n] = test[c].map(te_map).astype('float64').fillna(global_mean)

    TE.append(n)

TE_road_type, TE_lighting, TE_weather, TE_time_of_day, 

In [41]:
def feature_engineering(df):
    """Add hand-crafted interaction and transform features to ``df``.

    Columns added, in this order: ``speed_visibility``, ``curv_speed``,
    ``curv_sq``, ``speed_sq``, ``curv_per_lane``, ``risk_density``,
    ``curve_night_risk``, ``curv_log``, ``speed_log``, ``meta_curvature``,
    ``meta_night``, ``meta_speed``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``speed_limit``, ``lighting``, ``curvature`` and
        ``num_lanes`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns appended.
    """
    # Multiplier per lighting level; any level not listed falls back to 1.0.
    # The explicit float cast matters when `lighting` is categorical: mapping a
    # categorical can return a categorical, which does not support arithmetic.
    visibility_factor = (
        df['lighting']
        .map({'daylight': 0.5, 'dim': 1.0, 'night': 1.5})
        .astype('float64')
        .fillna(1.0)
    )
    is_night = (df['lighting'] == 'night').astype(int)
    lanes_plus_one = df['num_lanes'] + 1  # +1 avoids dividing by zero lanes

    df['speed_visibility'] = df['speed_limit'] * visibility_factor
    df['curv_speed'] = df['curvature'] * df['speed_limit']
    df['curv_sq'] = df['curvature'] ** 2
    df['speed_sq'] = df['speed_limit'] ** 2
    df['curv_per_lane'] = df['curvature'] / lanes_plus_one
    df['risk_density'] = df['curv_speed'] / lanes_plus_one
    df['curve_night_risk'] = df['curvature'] * is_night
    df['curv_log'] = np.log1p(df['curvature'])
    df['speed_log'] = np.log1p(df['speed_limit'])
    df['meta_curvature'] = 0.3 * df['curvature']
    df['meta_night'] = 0.2 * is_night
    df['meta_speed'] = 0.2 * (df['speed_limit'] >= 60).astype(int)

    return df

# Append the engineered features to both frames (train first, then test).
train, test = feature_engineering(train), feature_engineering(test)

In [42]:
# Replace each categorical column with integer codes.
# The category list is taken from `train` and reused for `test`, so a given
# label always receives the same code in both frames; a label that appears only
# in `test` is encoded as -1 (the code pandas uses for "not in categories").
for c in cat_cols:
    train_categorical = pd.Categorical(train[c])
    train[c] = train_categorical.codes
    test[c] = pd.Categorical(test[c], categories=train_categorical.categories).codes

# Separating features from the target and setting up cross-validation

In [43]:
# Feature matrix: every training column except the target; `y` is the target.
X = train.drop(columns='accident_risk')
y = train['accident_risk'].copy()

In [44]:
# Shuffled K-fold splitter with a fixed seed; `n_folds` is the number of splits.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=2)

In [45]:
def objective(trial):
    """Optuna objective: mean validation RMSE of LightGBM over the folds of ``kf``.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``LGBMRegressor`` on each training split of ``X``/``y`` (with the held-out
    split passed as ``eval_set`` and ``early_stopping_rounds`` set to 100), and
    returns the average of the per-fold RMSE values.
    """
    # Sampling order is kept fixed: Optuna draws parameters in call order.
    search_space = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 5, 150),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "n_estimators": trial.suggest_int('n_estimators', 300, 1200),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 100),
        "device_type": "gpu",
        "random_state": 2,
        'verbose': -1,
        'n_jobs': -1,
        'early_stopping_rounds': 100,
    }

    def _fold_rmse(fit_rows, holdout_rows):
        """Fit on ``fit_rows`` and return the RMSE on ``holdout_rows``."""
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        model = LGBMRegressor(**search_space)
        model.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)])
        holdout_pred = model.predict(X_holdout)
        return np.sqrt(mean_squared_error(y_holdout, holdout_pred))

    fold_scores = [
        _fold_rmse(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in kf.split(X, y)
    ]
    return np.mean(fold_scores)
    
# Run the hyperparameter search. The sampler is seeded so that the sequence of
# suggested configurations is the same on every "Restart & Run All".
# NOTE(review): the trial scores themselves may still vary slightly if GPU
# training is not bit-for-bit deterministic — confirm if exact repeatability matters.
study = optuna.create_study(
    direction='minimize',
    study_name='LGBM-RMSE-Optimization',
    sampler=optuna.samplers.TPESampler(seed=2),
)
study.optimize(objective, n_trials=12, show_progress_bar=True)

print(f'Best cross-validation RMSE: {study.best_value:,.5f}')
print(f'Best parameters: {study.best_params}')

[I 2025-10-31 06:54:02,487] A new study created in memory with name: LGBM-RMSE-Optimization


  0%|          | 0/12 [00:00<?, ?it/s]

[I 2025-10-31 06:55:24,585] Trial 0 finished with value: 0.05701873690650533 and parameters: {'learning_rate': 0.024962198118111283, 'num_leaves': 114, 'feature_fraction': 0.8650481892816195, 'bagging_fraction': 0.8363466626330924, 'bagging_freq': 3, 'reg_alpha': 7.9175633796918605, 'reg_lambda': 4.254837763785412, 'max_depth': 4, 'n_estimators': 1151, 'min_data_in_leaf': 57}. Best is trial 0 with value: 0.05701873690650533.
[I 2025-10-31 06:56:10,547] Trial 1 finished with value: 0.07053869779581493 and parameters: {'learning_rate': 0.004012825662606918, 'num_leaves': 52, 'feature_fraction': 0.9425360298930034, 'bagging_fraction': 0.8493385674202169, 'bagging_freq': 3, 'reg_alpha': 5.1775588594773065, 'reg_lambda': 5.00510781034894, 'max_depth': 9, 'n_estimators': 385, 'min_data_in_leaf': 27}. Best is trial 0 with value: 0.05701873690650533.
[I 2025-10-31 06:57:02,302] Trial 2 finished with value: 0.07023661202702332 and parameters: {'learning_rate': 0.004190288241462563, 'num_leaves'

In [47]:
# Plot the relative importance of each tuned hyperparameter for the study's objective.
param_importance_fig = plot_param_importances(study)
param_importance_fig

In [50]:
# Settings that were fixed (not tuned) during the search; they are appended to
# the best sampled hyperparameters before refitting on all of X / y.
fixed_params = {
    'verbose': -1,
    'random_state': 2,
    "objective": "regression",
    "metric": "rmse",
    "device_type": "gpu",
}
best_params = {**study.best_params, **fixed_params}

best_model = LGBMRegressor(**best_params)
best_model.fit(X, y)

In [51]:
# Select the test columns by name, in exactly the order the model was fitted
# on, instead of relying on `test` happening to share the column order of `X`.
# A missing column raises KeyError here rather than producing wrong predictions.
X_test = test[X.columns]
test_preds = best_model.predict(X_test)

In [52]:
# Fill the sample submission's `accident_risk` column with the predictions,
# write it to disk without the index, and preview the first rows.
sub = (
    pd.read_csv('/kaggle/input/playground-series-s5e10/sample_submission.csv')
    .assign(accident_risk=test_preds)
)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,accident_risk
0,517754,0.292851
1,517755,0.125354
2,517756,0.186493
3,517757,0.324
4,517758,0.404756
