# Install required libraries

In [2]:
# Install required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# For reproducibility
np.random.seed(42)

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')
print(f'Sample shape: {sample.shape}')
print(f'Train columns: {train.columns.tolist()}')  # Print column names to verify

# Preprocessing
# Drop id
train_id = train['id']
test_id = test['id']
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

# Target
y = train['accident_risk']
train = train.drop('accident_risk', axis=1)

# Combine for preprocessing
all_data = pd.concat([train, test], axis=0, ignore_index=True)

# Feature engineering
all_data['speed_curv_interact'] = all_data['speed_limit'] * all_data['curvature']
all_data['bad_weather'] = np.where(all_data['weather'].isin(['rainy', 'foggy']), 1, 0)
all_data['poor_lighting'] = np.where(all_data['lighting'].isin(['dim', 'night']), 1, 0)
all_data['rush_hour'] = np.where(all_data['time_of_day'].isin(['morning', 'evening']), 1, 0)
# New features
all_data['road_condition'] = all_data['curvature'] * all_data['bad_weather']  # Interaction for risky conditions
all_data['holiday_rush'] = all_data['holiday'] * all_data['rush_hour']  # Holiday during rush hour

# Encode categoricals
cat_cols = ['road_type', 'lighting', 'weather', 'time_of_day']
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col].astype(str))
    label_encoders[col] = le

# Boolean cols to int
bool_cols = ['road_signs_present', 'public_road', 'holiday', 'school_season']
for col in bool_cols:
    all_data[col] = all_data[col].astype(int)

# Handle potential outliers in numerical columns
num_cols = ['speed_limit', 'curvature', 'speed_curv_interact', 'road_condition']
for col in num_cols:
    upper_limit = all_data[col].quantile(0.99)
    all_data[col] = np.clip(all_data[col], None, upper_limit)

# Split back
train = all_data[:len(train)].reset_index(drop=True)
test = all_data[len(train):].reset_index(drop=True)

# Prepare X
X = train
X_test = test

print('Preprocessing complete.')

# Model Training with Cross-Validation
# LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'n_jobs': -1,
    'random_state': 42,
    'learning_rate': 0.02,
    'n_estimators': 3000,
    'num_leaves': 50,
    'min_data_in_leaf': 30,
    'max_depth': 8,
    'subsample': 0.75,
    'colsample_bytree': 0.75
}

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
val_rmse = []
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'Training fold {fold + 1}...')
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    
    # Validation score
    y_pred_val = model.predict(X_val)
    fold_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    val_rmse.append(fold_rmse)
    print(f'Fold {fold + 1} RMSE: {fold_rmse:.5f}')
    
    # Predict on test set
    test_preds += model.predict(X_test) / kf.n_splits

print(f'Mean CV RMSE: {np.mean(val_rmse):.5f} ± {np.std(val_rmse):.5f}')

# Ensure predictions are between 0 and 1
test_preds = np.clip(test_preds, 0.01, 0.99)

# Submission
submission = pd.DataFrame({'id': test_id, 'accident_risk': test_preds})
submission.to_csv('submission_lgbm_cv_fe2.csv', index=False)
print('Submission saved as submission_lgbm_cv_fe2.csv')
print(submission.head())

Train shape: (517754, 14)
Test shape: (172585, 13)
Sample shape: (172585, 2)
Train columns: ['id', 'road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season', 'num_reported_accidents', 'accident_risk']
Preprocessing complete.
Training fold 1...
Fold 1 RMSE: 0.05624
Training fold 2...
Fold 2 RMSE: 0.05609
Training fold 3...
Fold 3 RMSE: 0.05615
Training fold 4...
Fold 4 RMSE: 0.05597
Training fold 5...
Fold 5 RMSE: 0.05594
Mean CV RMSE: 0.05608 ± 0.00011
Submission saved as submission_lgbm_cv_fe2.csv
       id  accident_risk
0  517754       0.297491
1  517755       0.124464
2  517756       0.180660
3  517757       0.311601
4  517758       0.403815
