In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv


In [2]:
# --- 1. Setup and Data Preparation ---
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')
print("Libraries imported and setup complete.")

# Load data
train_df = pd.read_csv('/kaggle/input/playground-series-s5e10/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv')
print("Data loaded.")

# Feature Engineering & Preprocessing
combined_df = pd.concat([train_df.drop('accident_risk', axis=1), test_df], ignore_index=True)
combined_df['speed_x_curvature'] = combined_df['speed_limit'] * combined_df['curvature']
combined_df['lanes_x_accidents'] = combined_df['num_lanes'] * combined_df['num_reported_accidents']
categorical_features = combined_df.select_dtypes(include=['object', 'boolean']).columns
for col in categorical_features:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col])
train_processed = combined_df.iloc[:len(train_df)]
test_processed = combined_df.iloc[len(train_df):]
y_train = train_df['accident_risk']
print("Feature processing complete.")

Libraries imported and setup complete.
Data loaded.
Feature processing complete.


In [3]:
# --- 2. Train LightGBM Model ---
print("Starting LightGBM model training...")

# Solid baseline parameters for LightGBM
lgbm_params = {
    'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 2000,
    'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
    'bagging_freq': 1, 'lambda_l1': 0.1, 'lambda_l2': 0.1, 'num_leaves': 31,
    'verbose': -1, 'seed': 42, 'boosting_type': 'gbdt', 'device': 'gpu'
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
lgbm_test_preds = np.zeros(len(test_processed))
lgbm_oof_preds = np.zeros(len(train_processed))

for fold, (train_idx, val_idx) in enumerate(kf.split(train_processed, y_train)):
    print(f"--- LightGBM Fold {fold+1}/5 ---")
    X_train, y_train_fold = train_processed.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val_fold = train_processed.iloc[val_idx], y_train.iloc[val_idx]

    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(X_train, y_train_fold,
              eval_set=[(X_val, y_val_fold)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(100, verbose=False)])
    
    lgbm_oof_preds[val_idx] = model.predict(X_val)
    lgbm_test_preds += model.predict(test_processed) / kf.n_splits

lgbm_oof_rmse = np.sqrt(mean_squared_error(y_train, lgbm_oof_preds))
print(f"\nLightGBM Training Complete! ✅ OOF RMSE: {lgbm_oof_rmse}")

Starting LightGBM model training...
--- LightGBM Fold 1/5 ---




--- LightGBM Fold 2/5 ---
--- LightGBM Fold 3/5 ---
--- LightGBM Fold 4/5 ---
--- LightGBM Fold 5/5 ---

LightGBM Training Complete! ✅ OOF RMSE: 0.05635442949360848


In [4]:
# --- 3. Train XGBoost Model ---
print("\nStarting XGBoost model training...")

# Solid baseline parameters for XGBoost
xgb_params = {
    'objective': 'reg:squarederror', 'eval_metric': 'rmse',
    'eta': 0.02, 'max_depth': 7, 'subsample': 0.8, 'colsample_bytree': 0.7,
    'seed': 42, 'tree_method': 'gpu_hist'
}

xgb_test_preds = np.zeros(len(test_processed))
xgb_oof_preds = np.zeros(len(train_processed))

for fold, (train_idx, val_idx) in enumerate(kf.split(train_processed, y_train)):
    print(f"--- XGBoost Fold {fold+1}/5 ---")
    X_train, y_train_fold = train_processed.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val_fold = train_processed.iloc[val_idx], y_train.iloc[val_idx]

    model = xgb.XGBRegressor(**xgb_params, n_estimators=5000, early_stopping_rounds=100, verbose=False)
    model.fit(X_train, y_train_fold, eval_set=[(X_val, y_val_fold)])
    
    xgb_oof_preds[val_idx] = model.predict(X_val)
    xgb_test_preds += model.predict(test_processed) / kf.n_splits

xgb_oof_rmse = np.sqrt(mean_squared_error(y_train, xgb_oof_preds))
print(f"\nXGBoost Training Complete! ✅ OOF RMSE: {xgb_oof_rmse}")


Starting XGBoost model training...
--- XGBoost Fold 1/5 ---
[0]	validation_0-rmse:0.16408
[1]	validation_0-rmse:0.16141
[2]	validation_0-rmse:0.15892
[3]	validation_0-rmse:0.15624
[4]	validation_0-rmse:0.15446
[5]	validation_0-rmse:0.15246
[6]	validation_0-rmse:0.15069
[7]	validation_0-rmse:0.14840
[8]	validation_0-rmse:0.14663
[9]	validation_0-rmse:0.14419
[10]	validation_0-rmse:0.14194
[11]	validation_0-rmse:0.14033
[12]	validation_0-rmse:0.13826
[13]	validation_0-rmse:0.13601
[14]	validation_0-rmse:0.13378
[15]	validation_0-rmse:0.13164
[16]	validation_0-rmse:0.13070
[17]	validation_0-rmse:0.12944
[18]	validation_0-rmse:0.12738
[19]	validation_0-rmse:0.12557
[20]	validation_0-rmse:0.12426
[21]	validation_0-rmse:0.12254
[22]	validation_0-rmse:0.12069
[23]	validation_0-rmse:0.11885
[24]	validation_0-rmse:0.11718
[25]	validation_0-rmse:0.11592
[26]	validation_0-rmse:0.11417
[27]	validation_0-rmse:0.11261
[28]	validation_0-rmse:0.11116
[29]	validation_0-rmse:0.10969
[30]	validation_0-r

In [6]:
# --- 4. Ensemble and Create Submission ---
print("\nAveraging model predictions...")

# Simple 50/50 average of the two models' predictions
final_predictions = (lgbm_test_preds * 0.5) + (xgb_test_preds * 0.5)

# You can also check the OOF score of the ensemble
final_oof_preds = (lgbm_oof_preds * 0.5) + (xgb_oof_preds * 0.5)
ensemble_oof_rmse = np.sqrt(mean_squared_error(y_train, final_oof_preds))
print(f"Ensemble OOF RMSE: {ensemble_oof_rmse}")

# Create the submission file
print("\nCreating final submission file...")
submission_df = pd.DataFrame({'id': test_df['id'], 'accident_risk': final_predictions})
submission_df.to_csv('submission.csv', index=False)

print("submission.csv created successfully!")
submission_df.head()


Averaging model predictions...
Ensemble OOF RMSE: 0.056137029308187286

Creating final submission file...
submission.csv created successfully!


Unnamed: 0,id,accident_risk
0,517754,0.292744
1,517755,0.127936
2,517756,0.187514
3,517757,0.324328
4,517758,0.398234


In [7]:
# --- 5. Find Best Ensemble Weights and Create Final Submission ---
import numpy as np
from sklearn.metrics import mean_squared_error

print("Finding the best weights for the ensemble...")

best_rmse = 0.05579 # Your current best score
best_weight = 0.5
lgbm_weight = 0

# We will test weights from 0 to 1 in small steps (e.g., 0.01)
for w in np.arange(0, 1.01, 0.01):
    # Calculate the weighted average for the OOF predictions
    weighted_oof = (lgbm_oof_preds * w) + (xgb_oof_preds * (1-w))
    
    # Check the RMSE score for this weight combination
    current_rmse = np.sqrt(mean_squared_error(y_train, weighted_oof))
    
    # If this is the best score we've seen, save the weight
    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_weight = w

# The best weight for XGBoost will be 1 minus the best weight for LightGBM
xgb_weight = 1 - best_weight

print("--------------------------------------------------")
print(f"Search Complete! ✅")
print(f"New Best OOF RMSE: {best_rmse}")
print(f"Optimal Weights -> LGBM: {best_weight:.2f}, XGBoost: {xgb_weight:.2f}")
print("--------------------------------------------------")


# --- Create the final submission with the optimal weights ---
print("\nCreating final submission file with the best weights...")
final_predictions = (lgbm_test_preds * best_weight) + (xgb_test_preds * xgb_weight)

submission_df = pd.DataFrame({'id': test_df['id'], 'accident_risk': final_predictions})
submission_df.to_csv('submission.csv', index=False)

print("submission.csv created successfully!")
submission_df.head()

Finding the best weights for the ensemble...
--------------------------------------------------
Search Complete! ✅
New Best OOF RMSE: 0.05579
Optimal Weights -> LGBM: 0.50, XGBoost: 0.50
--------------------------------------------------

Creating final submission file with the best weights...
submission.csv created successfully!


Unnamed: 0,id,accident_risk
0,517754,0.292744
1,517755,0.127936
2,517756,0.187514
3,517757,0.324328
4,517758,0.398234
