### Changes made in this notebook:
- Added the original dataset
- Added a simulated accident-risk feature
- Added target encoding
- Reduced the memory footprint of the dataset

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer

from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from scipy.special import logit, expit

%matplotlib inline
# Global matplotlib defaults applied to every figure drawn in this notebook:
# seaborn-style theme, 10x6 inch figures at 180 dpi, bold axis labels and titles.
plt.style.use('seaborn-v0_8')
plt.rc('figure', figsize=(10,6), dpi=180)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=15, titlepad=10)
plt.rc('animation', html='html5')

import warnings

# Silence every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas and other libraries,
# so API changes that affect later cells will not be reported.
warnings.filterwarnings("ignore", category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_10k.csv
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_2k.csv
/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_100k.csv


In [2]:
# Read the competition train and test tables; the `id` column becomes the index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/playground-series-s5e10/train.csv',
        '/kaggle/input/playground-series-s5e10/test.csv',
    )
)

In [3]:
# Read the three synthetic source files (suffixed 2k, 10k and 100k) and stack them
# row-wise into one frame. Each file keeps its own row labels, so the combined
# index contains repeats.
org = pd.concat(
    [
        pd.read_csv(f'/kaggle/input/simulated-roads-accident-data/synthetic_road_accidents_{n}k.csv')
        for n in [2, 10, 100]
    ],
    axis=0,
)
org.head()

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,rural,2,0.72,60,daylight,clear,True,False,afternoon,False,False,2,0.37
1,highway,4,0.95,45,daylight,foggy,False,True,evening,False,True,1,0.4
2,rural,1,0.72,25,night,rainy,False,False,evening,True,False,1,0.55
3,rural,4,0.86,70,dim,foggy,True,False,morning,True,True,1,0.56
4,highway,1,0.0,60,night,rainy,True,True,morning,True,True,3,0.54


In [4]:
# Append the original rows to the competition training rows and renumber the index
# from 0. NOTE(review): this rebinds `train`, so running the cell a second time
# appends `org` again; re-run the loading cell first.
train = pd.concat([train, org], ignore_index=True)

In [5]:
# Column groups are taken from `test`, so the target (present only in `train`) is
# excluded. Boolean columns match neither selector and are left out of both lists.
num_cols = list(test.select_dtypes(include=['float64', 'int64']).columns)
cat_cols = list(test.select_dtypes(include='object').columns)

In [6]:
# Column dtypes, null counts and memory usage of the combined frame before downcasting.
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 629754 entries, 0 to 629753
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype
---  ------                  --------------   -----
 0   road_type               629754 non-null  object
 1   num_lanes               629754 non-null  int64
 2   curvature               629754 non-null  float64
 3   speed_limit             629754 non-null  int64
 4   lighting                629754 non-null  object
 5   weather                 629754 non-null  object
 6   road_signs_present      629754 non-null  bool
 7   public_road             629754 non-null  bool
 8   time_of_day             629754 non-null  object
 9   holiday                 629754 non-null  bool
 10  school_season           629754 non-null  bool
 11  num_reported_accidents  629754 non-null  int64
 12  accident_risk           629754 non-null  float64
dtypes: bool(4), float64(2), int64(3), object(4)
memory usage: 50.2+ MB


In [7]:
# Shrink the memory footprint: downcast numeric feature columns to 32 bits and store
# the string columns as categoricals. Only columns listed in `num_cols` / `cat_cols`
# (both derived from `test`) are touched, so the target column in `train` is unchanged.
for col in num_cols:
    # Decide on the dtype *kind* rather than the exact name 'float64': once this cell
    # has run, the column is float32, and an exact-name check would send it down the
    # integer branch on a re-run and truncate its fractional values.
    narrow_dtype = 'float32' if pd.api.types.is_float_dtype(test[col]) else 'int32'
    train[col] = train[col].astype(narrow_dtype)
    test[col] = test[col].astype(narrow_dtype)

for col in cat_cols:
    # Build one categorical dtype from the levels seen in either frame, so a given
    # value gets the same integer code in `train` and in `test` even if one frame
    # lacks a level. When both frames hold the same levels this matches
    # astype('category'), which also sorts the categories.
    levels = sorted(set(train[col].dropna().unique()) | set(test[col].dropna().unique()))
    shared_dtype = pd.CategoricalDtype(categories=levels)
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

In [8]:
# Same summary after downcasting, to confirm the new dtypes and the reduced memory usage.
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 629754 entries, 0 to 629753
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype
---  ------                  --------------   -----
 0   road_type               629754 non-null  category
 1   num_lanes               629754 non-null  int32
 2   curvature               629754 non-null  float32
 3   speed_limit             629754 non-null  int32
 4   lighting                629754 non-null  category
 5   weather                 629754 non-null  category
 6   road_signs_present      629754 non-null  bool
 7   public_road             629754 non-null  bool
 8   time_of_day             629754 non-null  category
 9   holiday                 629754 non-null  bool
 10  school_season           629754 non-null  bool
 11  num_reported_accidents  629754 non-null  int32
 12  accident_risk           629754 non-null  float64
dtypes: bool(4), category(4), float32(1), float64(1), int32(3)
memory usage: 19.2 MB


# Feature Engineering

In [9]:
# Rebuild the column groups now that the numeric columns are 32-bit and the string
# columns are categoricals; the selection is again made on `test`.
num_cols = list(test.select_dtypes(include=['float32', 'int32']).columns)
cat_cols = list(test.select_dtypes(include='category').columns)

In [10]:
def risk(df, seed=None):
    """Add a noisy, rule-based `simulated_risk` column to `df` and return `df`.

    The score is a weighted sum of five terms:

    * 0.4 * curvature
    * 0.2 if lighting is 'night'
    * 0.1 if weather is anything other than 'clear'
    * 0.2 if speed_limit is at least 60
    * 0.1 if num_reported_accidents is greater than 4

    Gaussian noise (mean 0, standard deviation 0.05) is added, the result is clipped
    to [0, 1], rounded to two decimals and stored as float32.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns `curvature`, `lighting`, `weather`, `speed_limit`
        and `num_reported_accidents`. It is modified in place.
    seed : int or None, default None
        When given, the noise comes from a dedicated generator seeded with this
        value, so the same input always yields the same column. When None, the
        noise is drawn from NumPy's global random state, as before.

    Returns
    -------
    DataFrame
        The same `df` object, with the `simulated_risk` column added or overwritten.
    """
    base_risk = (
        0.4 * df["curvature"]
        + 0.2 * (df["lighting"] == "night").astype(int)
        + 0.1 * (df["weather"] != "clear").astype(int)
        + 0.2 * (df["speed_limit"] >= 60).astype(int)
        + 0.1 * (df["num_reported_accidents"] > 4).astype(int)
    )

    # `np.random` and a Generator both expose `.normal`, so a single call covers
    # the legacy global-state path and the seeded path.
    rng = np.random if seed is None else np.random.default_rng(seed)
    noise = rng.normal(0, 0.05, len(df))

    risk_score = np.clip(base_risk + noise, 0, 1)
    df["simulated_risk"] = np.round(risk_score, 2).astype('float32')

    return df

# Seed NumPy's global random state before generating the noisy feature, so the
# `simulated_risk` column (and every score that depends on it) is identical on each
# Restart & Run All. The value 2 matches the random_state used for KFold and XGBoost.
np.random.seed(2)
train = risk(train)
test = risk(test)

In [11]:
# Check that the new `simulated_risk` column is present and stored as float32.
train.dtypes

road_type                 category
num_lanes                    int32
curvature                  float32
speed_limit                  int32
lighting                  category
weather                   category
road_signs_present            bool
public_road                   bool
time_of_day               category
holiday                       bool
school_season                 bool
num_reported_accidents       int32
accident_risk              float64
simulated_risk             float32
dtype: object

In [12]:
# List the column names of the training frame after feature engineering so far.
train.columns

Index(['road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting',
       'weather', 'road_signs_present', 'public_road', 'time_of_day',
       'holiday', 'school_season', 'num_reported_accidents', 'accident_risk',
       'simulated_risk'],
      dtype='object')

In [13]:
# Numeric and boolean input columns.
# NOTE(review): `features` is not referenced by any later cell visible here; the
# model is fitted on every column of `train` except the target.
features = [
    'num_lanes',
    'curvature',
    'speed_limit',
    'road_signs_present',
    'public_road',
    'holiday',
    'school_season',
    'num_reported_accidents',
]

# Name of the column to predict.
target = 'accident_risk'

In [14]:
# Show which columns were selected as numeric features.
print(num_cols)

['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents']


In [15]:
# Mean-target encoding: for every categorical column `c`, add a numeric column
# `TE_<c>` holding the average target of the row's category, computed on `train`.
# NOTE(review): the means use all training rows, including those that later act as
# validation folds, so the out-of-fold metrics see some target information;
# computing the encoding inside each fold would avoid that.

# Fallback for categories that have no mean in `train`; it is the same for every
# column, so compute it once.
global_mean = train[target].mean()

TE = []  # names of the encoded columns that were added
for c in cat_cols:
    # observed=True keeps only categories that occur in `train`; any other category
    # is absent from the map and falls through to the global mean below.
    te_map = train.groupby(c, observed=True)[target].mean()
    n = f"TE_{c}"
    print(f"{n}, ",end="")

    # Mapping a categorical column can return a categorical result, which cannot be
    # filled with a value outside its categories, so cast to float first. The filled
    # series is assigned back instead of calling fillna(inplace=True) on the selected
    # column, which may act on a temporary copy and leave the frame unchanged.
    train[n] = train[c].map(te_map).astype('float64').fillna(global_mean)
    test[n] = test[c].map(te_map).astype('float64').fillna(global_mean)

    TE.append(n)

TE_road_type, TE_lighting, TE_weather, TE_time_of_day, 

In [16]:
# Preview the first rows, including the new `simulated_risk` and `TE_*` columns.
train.head()

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk,simulated_risk,TE_road_type,TE_lighting,TE_weather,TE_time_of_day
0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13,0.14,0.362206,0.305396,0.372135,0.357131
1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35,0.39,0.362206,0.305396,0.311183,0.359882
2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3,0.4,0.355762,0.302921,0.311183,0.356445
3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21,0.15,0.355524,0.302921,0.372135,0.356445
4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56,0.53,0.355762,0.305396,0.391359,0.359882


# Splitting the data into training and testing sets
- Planned: use a log transformation on the target variable (not applied in the code below, which trains on the raw `accident_risk` values)

In [17]:
# Separate the target from the training features; work on copies so `train` and
# `test` themselves stay untouched by later steps.
y = train['accident_risk'].copy()
X = train.drop(columns='accident_risk')
X_test = test.copy()

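From the previous notebook, I noticed that the XGBoost model was conservative at high target values (around 1.0). The intended remedy is to give higher sample weights to higher target values, so that the model pays extra attention to them. Note that the training loop below does not pass `sample_weight`, so this weighting is not applied in the current version.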

In [18]:
# 5-fold shuffled cross-validation with a fixed seed, so the fold assignment is repeatable.
n_folds = 5
kf = KFold(random_state=2, shuffle=True, n_splits=n_folds)

# Zero-initialised holders: one out-of-fold prediction per training row, and the
# running sum of the per-fold predictions for every test row.
oof_xgb = np.zeros(y.shape[0])
test_xgb = np.zeros(X_test.shape[0])

In [19]:
# XGBoost settings passed unchanged to every fold's model.
params = {
    'n_estimators': 1100,
    'learning_rate': 0.05036086563157658,
    'max_depth': 5,
    'reg_alpha': 1.7078790750979551,
    'reg_lambda': 0.04303317633957419,
    'subsample': 0.6010146163838317,
    'colsample_bytree': 0.8088494830103302,
    'eval_metric': 'rmse',
    'random_state': 2,
    'enable_categorical': True,
    'early_stopping_rounds': 100,
    'device': 'cuda',
    'objective': 'reg:squarederror',
}

# Start from clean accumulators. `test_xgb` is summed across folds and then divided,
# so without this reset a second run of this cell would add new predictions on top
# of the previous run's averages.
oof_xgb = np.zeros(len(y))
test_xgb = np.zeros(len(X_test))

for fold, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # NOTE(review): the validation fold is also the eval_set that drives early
    # stopping, so the fold scores below are presumably slightly optimistic.
    xgb = XGBRegressor(**params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)

    valid_pred = xgb.predict(X_valid)
    oof_xgb[valid_index] = valid_pred   # each training row is predicted by the model that did not see it
    test_xgb += xgb.predict(X_test)     # summed here, averaged after the loop

    # The metric is plain RMSE on the raw target (no log transform involved).
    fold_rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(f"Fold {fold} RMSE: {fold_rmse:.6f}")

# Average the test predictions over the folds and score the out-of-fold predictions.
test_xgb /= n_folds
overall_rmse = np.sqrt(mean_squared_error(y, oof_xgb))
overall_r2score = r2_score(y, oof_xgb)
overall_explained_var = explained_variance_score(y, oof_xgb)

print(f"\nOverall OOF RMSE: {overall_rmse:.6f}")
print(f"\nOverall OOF R2Score: {overall_r2score:.6f}")
print(f"\nOverall OOF Explained Variance: {overall_explained_var:.6f}")

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fold 1 RMSE: 0.056485
Fold 2 RMSE: 0.055708
Fold 3 RMSE: 0.055903
Fold 4 RMSE: 0.055908
Fold 5 RMSE: 0.056042

Overall OOF RMSE: 0.056010

Overall OOF R2Score: 0.890333

Overall OOF Explained Variance: 0.890333


In [20]:
# Fill the sample submission with the fold-averaged test predictions and save it.
# Assumes the rows of sample_submission.csv are in the same order as test.csv — TODO confirm.
sub = pd.read_csv('/kaggle/input/playground-series-s5e10/sample_submission.csv').assign(
    accident_risk=test_xgb
)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,accident_risk
0,517754,0.295082
1,517755,0.118949
2,517756,0.182556
3,517757,0.344391
4,517758,0.388306
