In [50]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from catboost import CatBoostRegressor

import optuna
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_slice,
    plot_parallel_coordinate
)
%matplotlib inline
plt.style.use('seaborn-v0_8')
plt.rc('figure', figsize=(10,6), dpi=180)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=15, titlepad=10)
plt.rc('animation', html='html5')

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_10k.csv
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_2k.csv
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_100k.csv
/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv


In [64]:
train = pd.read_csv('/kaggle/input/playground-series-s5e10/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv', index_col='id')

In [67]:
train = train.drop_duplicates()

In [68]:
num_cols = test.select_dtypes(include=['float64', 'int64']).columns.to_list()
cat_cols = test.select_dtypes(include='object').columns.to_list()

In [69]:
for col in num_cols:
    if test[col].dtype == 'float64':
        train[col] = train[col].astype('float32')
        test[col] = test[col].astype('float32')
        
    else:
        train[col] = train[col].astype('int32')
        test[col] = test[col].astype('int32')

for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [70]:
features = ['road_type', 'num_lanes', 'curv_bin', 'speed_limit', 'lighting',
       'weather', 'road_signs_present', 'public_road', 'time_of_day',
       'holiday', 'school_season', 'num_reported_accidents']

target = 'accident_risk'

# Feature Engineering

In [71]:
num_cols = test.select_dtypes(include=['float32', 'int32']).columns.to_list()
cat_cols = test.select_dtypes(include='category').columns.to_list()

## Creating base risk feature based on domain knowledge

In [72]:
def risk(df):
    base_risk = (0.4 * df['curvature'] +
                 0.2 * (df['lighting'] == 'night').astype(int) +
                 0.1 * (df["weather"] != "clear").astype(int) +
                 0.2 * (df["speed_limit"] >= 60).astype(int) +
                 0.1 * (np.array(df["num_reported_accidents"] > 4).astype(int)
    ))
                 
    noise = np.random.normal(0, 0.05, df.shape[0])
    risk_score = np.clip(base_risk + noise, 0, 1)
    df["simulated_risk"] = np.round(risk_score, 2).astype('float32')

    return df

In [73]:
train = risk(train)
test = risk(test)

In [75]:
TE = []
for c in cat_cols:
    te_map = train.groupby(c)[target].mean()
    n = f"TE_{c}"
    print(f"{n}, ",end="")
 
    train[n] = train[c].map(te_map)
    test[n] = test[c].map(te_map)

    global_mean = train[target].mean()
    train[n].fillna(global_mean, inplace=True)
    test[n].fillna(global_mean, inplace=True)
    
    TE.append(n)

TE_road_type, TE_lighting, TE_weather, TE_time_of_day, 

In [76]:
def feature_engineering(df):
    
    df['speed_visibility'] = df['speed_limit'] * df['lighting'].map({'daylight': 0.5, 'dim': 1.0, 'night': 1.5}).fillna(1.0)
    df['curv_speed'] = df['curvature'] * df['speed_limit'] 
    df['curv_sq'] = df['curvature']**2
    df['speed_sq'] = df['speed_limit']**2
    df['curv_per_lane'] = df['curvature'] / (df['num_lanes'] + 1)
    df['risk_density'] = df['curv_speed'] / (df['num_lanes'] + 1)
    df['curve_night_risk'] = df['curvature'] * (df['lighting'] == 'night').astype('int32')
    df['curv_log'] = np.log1p(df['curvature'])
    df['speed_log'] = np.log1p(df['speed_limit'])
    df['meta_curvature'] = 0.3 * df['curvature']
    df['meta_night'] = 0.2 * (df['lighting'] == 'night').astype('int32')
    df['meta_weather'] = 0.1 * (df['weather'] != 'clear').astype('int32')
    df['meta_speed'] = 0.2 * (df['speed_limit'] >= 60).astype('int32')
    df['meta_accidents'] = 0.1 * (df['num_reported_accidents'] > 2).astype('int32')

    df['tight_lane'] = (df['num_lanes'] <= 2).astype(int)
   
    return df

train = feature_engineering(train)
test = feature_engineering(test)

In [77]:
for c in cat_cols:
    train[c] = pd.Categorical(train[c]).codes
    test[c] = pd.Categorical(test[c]).codes

In [59]:
train.head()

Unnamed: 0_level_0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk,simulated_risk,TE_road_type,TE_lighting,TE_weather,TE_time_of_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,2,2,0.06,35,0,2,False,True,0,False,True,1,0.13,0.2,0.357456,0.302923,0.361494,0.351428
1,2,4,0.99,35,0,0,True,False,1,True,True,0,0.35,0.42,0.357456,0.302923,0.31006,0.354736
2,1,4,0.63,70,1,0,False,True,2,True,False,2,0.3,0.48,0.349997,0.300109,0.31006,0.350966
3,0,4,0.07,35,1,2,True,True,2,False,False,1,0.21,0.14,0.349734,0.300109,0.361494,0.350966
4,1,1,0.58,60,0,1,False,False,1,True,False,1,0.56,0.53,0.349997,0.302923,0.386305,0.354736


In [78]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 517098 entries, 0 to 517753
Data columns (total 33 columns):
 #   Column                  Non-Null Count   Dtype
---  ------                  --------------   -----
 0   road_type               517098 non-null  int8
 1   num_lanes               517098 non-null  int32
 2   curvature               517098 non-null  float32
 3   speed_limit             517098 non-null  int32
 4   lighting                517098 non-null  int8
 5   weather                 517098 non-null  int8
 6   road_signs_present      517098 non-null  bool
 7   public_road             517098 non-null  bool
 8   time_of_day             517098 non-null  int8
 9   holiday                 517098 non-null  bool
 10  school_season           517098 non-null  bool
 11  num_reported_accidents  517098 non-null  int32
 12  accident_risk           517098 non-null  float64
 13  simulated_risk          517098 non-null  float32
 14  TE_road_type            517098 non-null  float64
 15  TE_

# Splitting the data into training and testing sets

In [79]:
X = train.copy()
y = X.pop('accident_risk')

# Conducting optuna study


In [80]:
kf = KFold(n_splits=5, shuffle=True, random_state=2)
n_folds = 5

In [81]:
def objective(trial):
    params = {
        'iterations': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        "loss_function": "RMSE",
        'eval_metric': 'RMSE',
        'task_type':'GPU', 
        'random_state': 2,
        'early_stopping_rounds': 100,
        'verbose': 0,
    }

    scores= []

    for fold, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        cat = CatBoostRegressor(**params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], use_best_model=True)
        pred_valid = cat.predict(X_valid)
        fold_rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
        scores.append(fold_rmse)
    
    return np.mean(scores)
    
study = optuna.create_study(direction='minimize', study_name='CAT-RMSE-Optimization')
study.optimize(objective, n_trials=10, show_progress_bar=True)

print(f'Best cross-validation RMSE: {study.best_value:,.5f}')
print(f'Best parameters: {study.best_params}')

[I 2025-10-31 06:20:33,933] A new study created in memory with name: CAT-RMSE-Optimization


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-10-31 06:21:48,747] Trial 0 finished with value: 0.056162047998806156 and parameters: {'learning_rate': 0.06144782844338708, 'depth': 6, 'l2_leaf_reg': 5.892286724095839}. Best is trial 0 with value: 0.056162047998806156.
[I 2025-10-31 06:23:46,816] Trial 1 finished with value: 0.05641534904776191 and parameters: {'learning_rate': 0.005442700223862539, 'depth': 9, 'l2_leaf_reg': 3.6180028694246236}. Best is trial 0 with value: 0.056162047998806156.
[I 2025-10-31 06:24:51,064] Trial 2 finished with value: 0.057130054168822234 and parameters: {'learning_rate': 0.00842720466076636, 'depth': 5, 'l2_leaf_reg': 1.508738135118222}. Best is trial 0 with value: 0.056162047998806156.
[I 2025-10-31 06:25:48,159] Trial 3 finished with value: 0.05808350911886466 and parameters: {'learning_rate': 0.008111437667376807, 'depth': 4, 'l2_leaf_reg': 2.541973350445111}. Best is trial 0 with value: 0.056162047998806156.
[I 2025-10-31 06:26:33,583] Trial 4 finished with value: 0.056212507381464584 a

In [83]:
# Parameter importance plot (which hyperparameters matter most)
plot_param_importances(study)

In [87]:
best_params = study.best_params
best_model = CatBoostRegressor(**best_params, verbose=0)
best_model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x7dc06706b090>

In [88]:
X_test = test.copy()
test_preds = best_model.predict(X_test)

In [89]:
sub = pd.read_csv('/kaggle/input/playground-series-s5e10/sample_submission.csv')
sub['accident_risk'] = test_preds
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,accident_risk
0,517754,0.292915
1,517755,0.119169
2,517756,0.181223
3,517757,0.313537
4,517758,0.396086
