# Import required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
# Seed NumPy's global RNG so any np.random draws repeat across runs
np.random.seed(42)
# Install a blanket 'ignore' filter so warnings are not shown in cell output
warnings.simplefilter('ignore')

# Load data

In [2]:
# Read the three competition files from the working directory
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

# Report (rows, columns) for each frame as a quick sanity check
for label, frame in (('Train', train), ('Test', test), ('Sample', sample)):
    print(f'{label} shape: {frame.shape}')

Train shape: (517754, 14)
Test shape: (172585, 13)
Sample shape: (172585, 2)


# Preprocessing

In [3]:
# Build the model-ready matrices X / X_test and the target y.
# The raw `train` / `test` frames are only read here, never rebound or mutated,
# so this cell can be re-run without reloading the CSVs.

# Keep the identifiers (test ids are needed for the submission) and the target
train_id = train['id']
test_id = test['id']
y = train['accident_risk']

# Feature-only copies of the raw frames
train_features = train.drop(columns=['id', 'accident_risk'])
test_features = test.drop(columns=['id'])

# Remember where the train rows end so the combined frame can be split back
n_train = len(train_features)

# Combine for preprocessing
all_data = pd.concat([train_features, test_features], axis=0, ignore_index=True)

# Encode categoricals: each encoder is fit on the combined train+test column
# (cast to str first) and kept in `label_encoders` for later inspection
cat_cols = ['road_type', 'lighting', 'weather', 'time_of_day']
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col].astype(str))
    label_encoders[col] = le

# Boolean cols to int
bool_cols = ['road_signs_present', 'public_road', 'holiday', 'school_season']
for col in bool_cols:
    all_data[col] = all_data[col].astype(int)

# Split back into train / test feature matrices using the saved row count
X = all_data.iloc[:n_train].reset_index(drop=True)
X_test = all_data.iloc[n_train:].reset_index(drop=True)

# Standardize numerical features; the scaler is fit on the train rows only
# and then applied unchanged to the test rows
num_cols = ['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Guard against row misalignment between features, target and ids
assert len(X) == len(y), 'feature rows and target rows differ'
assert len(X_test) == len(test_id), 'test feature rows and test ids differ'

print('Preprocessing complete.')

Preprocessing complete.


# Model Training

In [4]:
# Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate hyperparameters for the forest (2 * 3 * 2 * 2 = 24 combinations)
param_grid = dict(
    n_estimators=[500, 1000],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

model = RandomForestRegressor(random_state=42)

# Exhaustive search with 3-fold CV, scored by (negated) RMSE, on all cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator exposed by the search as the best one
best_model = grid_search.best_estimator_
print('Best params: {}'.format(grid_search.best_params_))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}


# Validation

In [5]:
# Score the tuned forest on the held-out validation split
y_pred_val = best_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(val_mse)
print('Validation RMSE: {:.5f}'.format(rmse))

# Same metric over every labelled row (X includes the validation rows too),
# shown only as a reference point
y_pred_full = best_model.predict(X)
full_mse = mean_squared_error(y, y_pred_full)
rmse_full = np.sqrt(full_mse)
print('Full Train RMSE: {:.5f}'.format(rmse_full))

Validation RMSE: 0.05629
Full Train RMSE: 0.05569


# Prediction and Submission

In [6]:
# Score the test features and clamp the predictions to the [0, 1] range
y_pred_test = np.clip(best_model.predict(X_test), 0, 1)

# Assemble the two-column submission (id, accident_risk) and write it out
submission = pd.DataFrame({'id': test_id}).assign(accident_risk=y_pred_test)
submission.to_csv('submission_rf.csv', index=False)

print('Submission saved as submission_rf.csv')
print(submission.head())

Submission saved as submission_rf.csv
       id  accident_risk
0  517754       0.302155
1  517755       0.122313
2  517756       0.177293
3  517757       0.324067
4  517758       0.407808
