In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import optuna
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# Set random seed for reproducibility
np.random.seed(42)

# Load data (replace with your actual file paths)
train_data = pd.read_csv('train.csv')  # Shape: (517754, 14)
test_data = pd.read_csv('test.csv')    # Shape: (172585, 13)

# Define features and target
features = [
    'road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting',
    'weather', 'road_signs_present', 'public_road', 'time_of_day',
    'holiday', 'school_season', 'num_reported_accidents'
]
target = 'accident_risk'

# Separate features and target
X = train_data[features]
y = train_data[target]
X_test = test_data[features]
ids = test_data['id']

# Preprocess categorical and numerical features
categorical_features = ['road_type', 'lighting', 'weather', 'road_signs_present',
                        'public_road', 'time_of_day', 'holiday', 'school_season']
numerical_features = ['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents']

# Convert categorical features to category type for LightGBM
for col in categorical_features:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# Scale numerical features
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Initialize base models
lgbm1 = LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
lgbm2 = LGBMRegressor(n_estimators=150, random_state=42, verbose=-1)
xgb = XGBRegressor(n_estimators=100, random_state=42)

# Define Optuna objective for hyperparameter tuning
def objective(trial, model_name, X_train, y_train, X_val, y_val):
    if model_name == 'lgbm':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
            'random_state': 42,
            'verbose': -1
        }
        model = LGBMRegressor(**params)
    elif model_name == 'xgb':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'random_state': 42
        }
        model = XGBRegressor(**params)
    
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))  # Compute RMSE manually
    return rmse

# Initialize arrays for meta-features
meta_X = np.zeros((len(X), 3))  # 3 base models
meta_test = np.zeros((len(X_test), 3))  # Same number of features for test

# 5-fold cross-validation for stacking
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Optimize hyperparameters for each model
    study_lgbm1 = optuna.create_study(direction='minimize')
    study_lgbm1.optimize(lambda trial: objective(trial, 'lgbm', X_train, y_train, X_val, y_val), n_trials=10)
    lgbm1.set_params(**study_lgbm1.best_params)
    
    study_lgbm2 = optuna.create_study(direction='minimize')
    study_lgbm2.optimize(lambda trial: objective(trial, 'lgbm', X_train, y_train, X_val, y_val), n_trials=10)
    lgbm2.set_params(**study_lgbm2.best_params)
    
    study_xgb = optuna.create_study(direction='minimize')
    study_xgb.optimize(lambda trial: objective(trial, 'xgb', X_train, y_train, X_val, y_val), n_trials=10)
    xgb.set_params(**study_xgb.best_params)
    
    # Train base models and get validation predictions
    lgbm1.fit(X_train, y_train)
    lgbm2.fit(X_train, y_train)
    xgb.fit(X_train, y_train)
    
    meta_X[val_idx, 0] = lgbm1.predict(X_val)
    meta_X[val_idx, 1] = lgbm2.predict(X_val)
    meta_X[val_idx, 2] = xgb.predict(X_val)
    
    # Get test predictions (average across folds)
    meta_test[:, 0] += lgbm1.predict(X_test) / kf.n_splits
    meta_test[:, 1] += lgbm2.predict(X_test) / kf.n_splits
    meta_test[:, 2] += xgb.predict(X_test) / kf.n_splits

# Verify feature consistency
print("meta_X shape:", meta_X.shape)  # Expected: (517754, 3)
print("meta_test shape:", meta_test.shape)  # Expected: (172585, 3)

# Train meta-model
meta_model = LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
meta_model.fit(meta_X, y)

# Make final predictions
stacked_test_preds = meta_model.predict(meta_test)
stacked_test_preds = np.clip(stacked_test_preds, 0.01, 0.99)

# Create submission
submission = pd.DataFrame({'id': ids, 'accident_risk': stacked_test_preds})
submission.to_csv('submission.csv', index=False)

print("Submission saved to submission.csv")

[I 2025-10-15 18:13:31,798] A new study created in memory with name: no-name-1673b1fb-53e1-489c-8b57-e77604ce6546


Training fold 1


[I 2025-10-15 18:13:32,828] Trial 0 finished with value: 0.056397852929404566 and parameters: {'n_estimators': 51, 'learning_rate': 0.15223372957852144, 'num_leaves': 32, 'min_child_samples': 41}. Best is trial 0 with value: 0.056397852929404566.
[I 2025-10-15 18:13:34,265] Trial 1 finished with value: 0.05627923757748418 and parameters: {'n_estimators': 82, 'learning_rate': 0.2371495859090775, 'num_leaves': 87, 'min_child_samples': 48}. Best is trial 1 with value: 0.05627923757748418.
[I 2025-10-15 18:13:36,331] Trial 2 finished with value: 0.056244306809098255 and parameters: {'n_estimators': 136, 'learning_rate': 0.15298706027063697, 'num_leaves': 88, 'min_child_samples': 30}. Best is trial 2 with value: 0.056244306809098255.
[I 2025-10-15 18:13:38,984] Trial 3 finished with value: 0.05926544021112753 and parameters: {'n_estimators': 127, 'learning_rate': 0.017034758498500713, 'num_leaves': 90, 'min_child_samples': 20}. Best is trial 2 with value: 0.056244306809098255.
[I 2025-10-15

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:road_type: category, lighting: category, weather: category, road_signs_present: category, public_road: category, time_of_day: category, holiday: category, school_season: category