In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [4]:
# Load clean dataset
df = pd.read_csv(
    "../data/ml_ready_dataset_clean.csv",
    parse_dates=["from_date"]
)

df = df.sort_values("from_date").reset_index(drop=True)

TARGET = "PM2.5"
DROP_COLS = ["from_date", "station_id"]

X = df.drop(columns=DROP_COLS + [TARGET])
y = df[TARGET]

# Time-aware split (same as before)
split_idx = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]


In [5]:
# Load trained models
rf_model = joblib.load("../models/pm25_rf_model.pkl")
xgb_model = joblib.load("../models/pm25_xgb_model.pkl")


In [6]:
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)


In [7]:
y_pred_ensemble = 0.5 * y_pred_rf + 0.5 * y_pred_xgb


In [8]:
mae = mean_absolute_error(y_test, y_pred_ensemble)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_ensemble))
r2 = r2_score(y_test, y_pred_ensemble)

print("ðŸ“Š Ensemble Model Performance")
print("MAE :", round(mae, 3))
print("RMSE:", round(rmse, 3))
print("RÂ²  :", round(r2, 4))


ðŸ“Š Ensemble Model Performance
MAE : 3.541
RMSE: 5.578
RÂ²  : 0.8525


In [9]:
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

w_rf = 1 / rmse_rf
w_xgb = 1 / rmse_xgb

# Normalize
w_rf /= (w_rf + w_xgb)
w_xgb /= (w_rf + w_xgb)

print("RF weight :", round(w_rf, 3))
print("XGB weight:", round(w_xgb, 3))


RF weight : 0.497
XGB weight: 0.264


In [10]:
y_pred_ensemble_weighted = w_rf * y_pred_rf + w_xgb * y_pred_xgb


In [11]:
mae_w = mean_absolute_error(y_test, y_pred_ensemble_weighted)
rmse_w = np.sqrt(mean_squared_error(y_test, y_pred_ensemble_weighted))
r2_w = r2_score(y_test, y_pred_ensemble_weighted)

print("ðŸ“Š Weighted Ensemble Performance")
print("MAE :", round(mae_w, 3))
print("RMSE:", round(rmse_w, 3))
print("RÂ²  :", round(r2_w, 4))


ðŸ“Š Weighted Ensemble Performance
MAE : 6.001
RMSE: 8.146
RÂ²  : 0.6854


In [12]:
print("RF R2:", r2_score(y_test, y_pred_rf))
print("XGB R2:", r2_score(y_test, y_pred_xgb))
print("Ensemble R2:", r2_score(y_test, y_pred_ensemble))
print("Weighted Ensemble R2:", r2_score(y_test, y_pred_ensemble_weighted))


RF R2: 0.8467218478921139
XGB R2: 0.8501695714861207
Ensemble R2: 0.852464386459724
Weighted Ensemble R2: 0.6853676514257088


In [13]:
import joblib

joblib.dump(rf_model, "../models/pm25_rf_model.pkl")
joblib.dump(xgb_model, "../models/pm25_xgb_model.pkl")


['../models/pm25_xgb_model.pkl']

In [14]:
ensemble_config = {
    "type": "simple_average",
    "weights": {
        "random_forest": 0.5,
        "xgboost": 0.5
    },
    "target": "PM2.5",
    "description": "50-50 ensemble of RF and XGBoost"
}

joblib.dump(ensemble_config, "../models/ensemble_config.pkl")

print("âœ… 50â€“50 ensemble configuration saved")


âœ… 50â€“50 ensemble configuration saved
