## Imports

In [19]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score

## Settings

In [2]:
N_SPLITS = 5
RANDOM_STATE = 42
N_ESTIMATORS = 1000

## Loading Data

In [3]:
train_1 = pd.read_csv("/kaggle/input/playground-series-s5e10/train.csv")
train_2 = pd.read_csv("/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_10k.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e10/test.csv")

In [4]:
train_1.shape

(517754, 14)

In [5]:
train_2.shape

(10000, 13)

In [6]:
# train_df = pd.concat([train_1, train_2], axis=0)
train_df = train_1

In [7]:
train_df.shape

(517754, 14)

In [8]:
train_df.dtypes

id                          int64
road_type                  object
num_lanes                   int64
curvature                 float64
speed_limit                 int64
lighting                   object
weather                    object
road_signs_present           bool
public_road                  bool
time_of_day                object
holiday                      bool
school_season                bool
num_reported_accidents      int64
accident_risk             float64
dtype: object

In [9]:
categorical_columns = ["road_type", "lighting", "weather", "time_of_day"]

In [10]:
df_all = pd.concat([train_df, test_df], axis=0)

In [11]:
df_all.shape

(690339, 14)

In [12]:
df_all = pd.get_dummies(df_all, columns=categorical_columns)

In [13]:
df_all.shape

(690339, 22)

In [14]:
train_df = df_all[:train_df.shape[0]]
test_df = df_all[train_df.shape[0]:]

In [15]:
train_df.drop(columns=["id"], inplace=True)
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(columns=["id"], inplace=True)


Unnamed: 0,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,accident_risk,road_type_highway,...,road_type_urban,lighting_daylight,lighting_dim,lighting_night,weather_clear,weather_foggy,weather_rainy,time_of_day_afternoon,time_of_day_evening,time_of_day_morning
0,2,0.06,35,False,True,False,True,1,0.13,False,...,True,True,False,False,False,False,True,True,False,False
1,4,0.99,35,True,False,True,True,0,0.35,False,...,True,True,False,False,True,False,False,False,True,False
2,4,0.63,70,False,True,True,False,2,0.3,False,...,False,False,True,False,True,False,False,False,False,True
3,4,0.07,35,True,True,False,False,1,0.21,True,...,False,False,True,False,False,False,True,False,False,True
4,1,0.58,60,False,False,True,False,1,0.56,False,...,False,True,False,False,False,True,False,False,True,False


In [16]:
test_df.drop(columns=["id", "accident_risk"], inplace=True)
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(columns=["id", "accident_risk"], inplace=True)


Unnamed: 0,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,road_type_highway,road_type_rural,road_type_urban,lighting_daylight,lighting_dim,lighting_night,weather_clear,weather_foggy,weather_rainy,time_of_day_afternoon,time_of_day_evening,time_of_day_morning
0,2,0.34,45,True,True,True,True,1,True,False,False,False,False,True,True,False,False,True,False,False
1,3,0.04,45,True,False,True,False,0,False,False,True,False,True,False,False,True,False,True,False,False
2,2,0.59,35,True,False,True,True,1,False,False,True,False,True,False,True,False,False,True,False,False
3,4,0.95,35,False,False,False,False,2,False,True,False,True,False,False,False,False,True,True,False,False
4,2,0.86,35,True,False,False,True,3,True,False,False,True,False,False,True,False,False,False,True,False


In [17]:
y = train_df["accident_risk"]
X = train_df.drop(columns=["accident_risk"])

## K-Fold Cross Validation

In [21]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

oof_preds = np.zeros(len(X))
y_pred = np.zeros(len(test_df))

In [22]:
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model_lgb = LGBMRegressor()

    model_xgb = XGBRegressor()

    model_cat = CatBoostRegressor()

    model_lgb.fit(X_train, y_train)
    model_xgb.fit(X_train, y_train)
    model_cat.fit(X_train, y_train)

    pred_lgb = model_lgb.predict(X_valid)
    pred_xgb = model_xgb.predict(X_valid)
    pred_cat = model_cat.predict(X_valid)

    pred_ensemble = (pred_lgb + pred_xgb + pred_cat) / 3
    oof_preds[valid_idx] = pred_ensemble

    rmse = mean_squared_error(y_valid, pred_ensemble, squared=False)
    print(f"Fold {fold + 1} RMSE: {rmse:.5f}")

    y_pred += (model_lgb.predict(test_df) + model_xgb.predict(test_df) + model_cat.predict(test_df)) / 3 / N_SPLITS

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 20
[LightGBM] [Info] Start training from score 0.352605
Learning rate set to 0.106096
0:	learn: 0.1515898	total: 101ms	remaining: 1m 41s
1:	learn: 0.1385197	total: 143ms	remaining: 1m 11s
2:	learn: 0.1269968	total: 180ms	remaining: 59.9s
3:	learn: 0.1169191	total: 218ms	remaining: 54.4s
4:	learn: 0.1081307	total: 258ms	remaining: 51.2s
5:	learn: 0.1005085	total: 296ms	remaining: 49s
6:	learn: 0.0936512	total: 332ms	remaining: 47.1s
7:	learn: 0.0879685	total: 370ms	remaining: 45.9s
8:	learn: 0.0830702	total: 408ms	remaining: 44.9s
9:	learn: 0.0787441	total: 446ms	remaining: 44.1s
10:	learn: 0.0751316	total: 483ms	remaining: 43.4s
11:	learn: 0.

In [23]:
ssub = pd.read_csv("/kaggle/input/playground-series-s5e10/sample_submission.csv")
ssub

Unnamed: 0,id,accident_risk
0,517754,0.352
1,517755,0.352
2,517756,0.352
3,517757,0.352
4,517758,0.352
...,...,...
172580,690334,0.352
172581,690335,0.352
172582,690336,0.352
172583,690337,0.352


In [24]:
ssub["accident_risk"] = y_pred

In [25]:
ssub.to_csv("ssub.csv", index=False)