# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import warnings
# Suppress every warning raised from this point on.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above — confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')

# For reproducibility
# Seeds NumPy's global RNG only; the train/validation split and the LightGBM
# model further down are each given their own explicit random_state.
np.random.seed(42)

# Load data

In [None]:
# Read the three input CSVs from the current working directory.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Print (rows, columns) of each frame as a quick sanity check.
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')
print(f'Sample shape: {sample.shape}')

# Preprocessing

In [None]:
# Set the row identifiers aside (test_id is needed for the submission file)
# and remove them from both frames so they are not used as features.
train_id, test_id = train['id'], test['id']
train, test = train.drop(columns='id'), test.drop(columns='id')

# Pull the regression target out of the training frame.
y = train['accident_risk']
train = train.drop(columns='accident_risk')

# Stack train on top of test (fresh 0..N-1 index) so that feature engineering
# and encoding are applied identically to both.
all_data = pd.concat([train, test], axis=0, ignore_index=True)

# Feature engineering
# Numeric interaction: product of speed limit and curvature.
all_data['speed_curv_interact'] = all_data['speed_limit'] * all_data['curvature']

# Binary indicator columns: new column -> (source column, levels mapped to 1).
# NOTE(review): the level names are assumed to match the raw CSV values —
# a level that never occurs simply yields an all-zero contribution.
indicator_specs = {
    'bad_weather': ('weather', ['rainy', 'foggy']),
    'poor_lighting': ('lighting', ['dim', 'night']),
    'rush_hour': ('time_of_day', ['morning', 'evening']),
}
for indicator, (source_col, levels) in indicator_specs.items():
    all_data[indicator] = np.where(all_data[source_col].isin(levels), 1, 0)

# Encode categoricals
# Each categorical column gets its own LabelEncoder, fitted on train and test
# together; the fitted encoders are kept in `label_encoders`, keyed by column.
cat_cols = ['road_type', 'lighting', 'weather', 'time_of_day']
label_encoders = {name: LabelEncoder() for name in cat_cols}
for name, encoder in label_encoders.items():
    # Values are cast to str first, so every entry is encoded as a string label.
    as_text = all_data[name].astype(str)
    all_data[name] = encoder.fit_transform(as_text)

# Boolean cols to int
# Cast all flag columns in a single astype call using a column -> dtype mapping.
bool_cols = ['road_signs_present', 'public_road', 'holiday', 'school_season']
all_data = all_data.astype(dict.fromkeys(bool_cols, int))

# No scaling needed for tree-based models

# Split back
# all_data was built train-first, so the leading n_train rows are the training
# set and the remainder is the test set. The length is captured once, before
# `train` is rebound, so both slices use the same boundary.
n_train = len(train)
train = all_data.iloc[:n_train].reset_index(drop=True)
test = all_data.iloc[n_train:].reset_index(drop=True)

# Prepare X
X, X_test = train, test

print('Preprocessing complete.')

Train shape: (517754, 14)
Test shape: (172585, 13)
Sample shape: (172585, 2)
Preprocessing complete.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[793]	valid_0's rmse: 0.0562534
Validation RMSE: 0.05625
Submission saved as submission_lgbm_fe.csv
       id  accident_risk
0  517754       0.297474
1  517755       0.122516
2  517756       0.181681
3  517757       0.310421
4  517758       0.399756


# Model Training
# Split for validation

In [None]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the shuffle, and therefore the split, repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM parameters

In [None]:
# Hyperparameters forwarded as keyword arguments to lgb.LGBMRegressor.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'n_jobs': -1,
    'random_state': 42,
    'learning_rate': 0.03,
    # Upper bound on boosting rounds; the early-stopping callback below
    # normally ends training well before this.
    'n_estimators': 2000,
    'num_leaves': 64,
    'min_data_in_leaf': 50,
    'max_depth': 10,
    'subsample': 0.8,
    # LightGBM only applies row subsampling when bagging is enabled through a
    # non-zero frequency. The default is 0, which silently ignores the
    # `subsample` value above, so resample on every iteration.
    'subsample_freq': 1,
    'colsample_bytree': 0.8
}

model = lgb.LGBMRegressor(**params)

# Evaluate RMSE on the hold-out split after every round and stop once it has
# failed to improve for 100 consecutive rounds.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True)]
)

# Validation

In [None]:
# Score the fitted model on the hold-out split.
y_pred_val = model.predict(X_val)

# RMSE is the square root of the mean squared error.
val_mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(val_mse)
print(f'Validation RMSE: {rmse:.5f}')

# Retrain on full data

In [None]:
# Refit on all labelled rows (train + validation). This fit has no eval set
# and no early stopping, so without a cap it would grow every one of the
# configured n_estimators trees. Reuse the round count that early stopping
# selected on the hold-out split instead.
best_iter = model.best_iteration_
# best_iteration_ is expected to be falsy when the previous fit ran without
# early stopping (e.g. this cell is re-run); n_estimators is then left as is.
if best_iter:
    model.set_params(n_estimators=best_iter)
model.fit(X, y)

# Prediction and Submission

In [None]:
# Predict the target for every row of the preprocessed test frame.
y_pred_test = model.predict(X_test)

# Ensure predictions are between 0 and 1

In [None]:
# Values below 0 become 0 and values above 1 become 1; predictions already
# inside [0, 1] are left unchanged.
y_pred_test = np.clip(y_pred_test, 0, 1)

# Submission

In [None]:
# Build the two-column submission frame: the test ids saved before
# preprocessing, alongside the clipped predictions in the same row order.
submission = test_id.to_frame(name='id').assign(accident_risk=y_pred_test)

# Write without the index so the file contains only `id` and `accident_risk`.
submission.to_csv('submission_lgbm_fe.csv', index=False)
print('Submission saved as submission_lgbm_fe.csv')
print(submission.head())