In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

# Load the dataset and drop every column whose name starts with "Unnamed"
# (presumably index columns written by an earlier to_csv -- confirm).
df = pd.read_csv("delhi_aqi_preprocessed.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

X = df.drop(columns=["computed_aqi"])
y = df["computed_aqi"]

# Split BEFORE scaling so the scaler never sees the held-out rows.
# Fitting StandardScaler on the full frame first would leak the test rows'
# mean/variance into the training features.
# NOTE(review): test_size=0.01 keeps only 1% of the rows for testing, so the
# reported test metrics are a high-variance estimate -- consider a larger split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.01, random_state=42
)

# Feature scaling: statistics come from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any later cell reading X_scaled still finds it; it is now
# produced with the train-only statistics.
X_scaled = scaler.transform(X)

# Define base models
base_models = [
    ("dt", DecisionTreeRegressor(random_state=42)),
    ("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("xgb", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)),
    ("svr", SVR(kernel='linear', C=1.0))
]

# Meta-model fitted on the base models' predictions
meta_model = LinearRegression()

# One column per base model:
#   S_train holds out-of-fold predictions for the training rows,
#   S_test  holds the fold-averaged predictions for the test rows.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
S_train = np.zeros((X_train.shape[0], len(base_models)))
S_test = np.zeros((X_test.shape[0], len(base_models)))

# Train base models and collect out-of-fold predictions
for i, (name, model) in enumerate(base_models):
    # Test-set predictions of this base model, one column per fold
    S_test_i = np.zeros((X_test.shape[0], n_folds))
    for j, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        # Each training row is predicted by the fold model that did not see it
        S_train[val_idx, i] = model.predict(X_val)
        S_test_i[:, j] = model.predict(X_test)
    # Average the per-fold test predictions into a single meta-feature
    S_test[:, i] = S_test_i.mean(axis=1)

# Train meta-model on stacked base predictions
meta_model.fit(S_train, y_train)

# Final predictions (the training predictions are based on the out-of-fold
# meta-features in S_train, not on refitted base models)
y_train_pred = meta_model.predict(S_train)
y_test_pred = meta_model.predict(S_test)

# Metric reporting helper
def evaluate_model(y_true, y_pred, label):
    """Print R², MSE and RMSE of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Reference target values.
    y_pred : array-like
        Predicted target values.
    label : str
        Name shown in the heading line (e.g. "Training").

    Returns
    -------
    None
        The metrics are only written to stdout.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_squared_error = np.sqrt(squared_error)
    determination = r2_score(y_true, y_pred)

    # One print call, one report line per argument
    print(
        f"\n{label} Performance:",
        f"R² Score: {determination:.4f}",
        f"MSE: {squared_error:.2f}",
        f"RMSE: {root_squared_error:.2f}",
        sep="\n",
    )

# Report metrics for both splits, training first.
# The training predictions come from the meta-model applied to the
# out-of-fold base predictions in S_train.
for split_label, split_truth, split_pred in (
    ("Training", y_train, y_train_pred),
    ("Testing", y_test, y_test_pred),
):
    evaluate_model(split_truth, split_pred, split_label)



Training Performance:
R² Score: 0.9803
MSE: 351.48
RMSE: 18.75

Testing Performance:
R² Score: 0.9969
MSE: 45.12
RMSE: 6.72


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Load the dataset and drop every column whose name starts with "Unnamed"
# (presumably index columns written by an earlier to_csv -- confirm).
df = pd.read_csv("delhi_aqi_preprocessed.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

X = df.drop(columns=["computed_aqi"])
y = df["computed_aqi"]

# Base models
base_models = [
    ("dt", DecisionTreeRegressor(random_state=42)),
    ("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("xgb", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)),
    ("svr", SVR(kernel='linear', C=1.0))
]

# Meta-model fitted on the base models' predictions
meta_model = LinearRegression()

# Cross-validation setup: the outer folds estimate performance, the inner
# folds produce the out-of-fold meta-features for the stack.
n_outer_folds = 10
n_inner_folds = 5
kf = KFold(n_splits=n_outer_folds, shuffle=True, random_state=42)

train_r2_list, test_r2_list = [], []
train_rmse_list, test_rmse_list = [], []
train_mse_list, test_mse_list = [], []

# Cross-validation loop
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Fit the scaler on this fold's training rows only. Scaling the whole
    # dataset once before the loop would leak every held-out fold's
    # mean/variance into the features the models are trained on.
    # NOTE(review): the inner folds below still share this scaler; wrap each
    # base model in a Pipeline if fully nested scaling is wanted.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X.iloc[train_idx])
    X_test = scaler.transform(X.iloc[test_idx])
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # One meta-feature column per base model
    S_train = np.zeros((X_train.shape[0], len(base_models)))
    S_test = np.zeros((X_test.shape[0], len(base_models)))

    inner_kf = KFold(n_splits=n_inner_folds, shuffle=True, random_state=42)

    for i, (name, model) in enumerate(base_models):
        # Held-out predictions of this base model, one column per inner fold
        S_test_i = np.zeros((X_test.shape[0], n_inner_folds))
        for j, (inner_train_idx, inner_val_idx) in enumerate(inner_kf.split(X_train)):
            X_tr, X_val = X_train[inner_train_idx], X_train[inner_val_idx]
            y_tr, y_val = y_train.iloc[inner_train_idx], y_train.iloc[inner_val_idx]

            model.fit(X_tr, y_tr)
            # Each training row is predicted by the inner model that did not see it
            S_train[inner_val_idx, i] = model.predict(X_val)
            S_test_i[:, j] = model.predict(X_test)
        # Average the per-inner-fold predictions into a single meta-feature
        S_test[:, i] = S_test_i.mean(axis=1)

    # Train meta-model
    meta_model.fit(S_train, y_train)

    y_train_pred = meta_model.predict(S_train)
    y_test_pred = meta_model.predict(S_test)

    # Evaluate
    train_r2_list.append(r2_score(y_train, y_train_pred))
    test_r2_list.append(r2_score(y_test, y_test_pred))

    train_mse_list.append(mean_squared_error(y_train, y_train_pred))
    test_mse_list.append(mean_squared_error(y_test, y_test_pred))

    # RMSE is derived from the MSE value just appended for this fold
    train_rmse_list.append(np.sqrt(train_mse_list[-1]))
    test_rmse_list.append(np.sqrt(test_mse_list[-1]))

    print(f"\nFold {fold} Results:")
    print(f"Training -> R²: {train_r2_list[-1]:.4f}, MSE: {train_mse_list[-1]:.2f}, RMSE: {train_rmse_list[-1]:.2f}")
    print(f"Testing  -> R²: {test_r2_list[-1]:.4f}, MSE: {test_mse_list[-1]:.2f}, RMSE: {test_rmse_list[-1]:.2f}")

# Average performance across folds
print(f"\n=== Average Performance Over {n_outer_folds} Folds ===")
print(f"Train -> R²: {np.mean(train_r2_list):.4f}, MSE: {np.mean(train_mse_list):.2f}, RMSE: {np.mean(train_rmse_list):.2f}")
print(f"Test  -> R²: {np.mean(test_r2_list):.4f}, MSE: {np.mean(test_mse_list):.2f}, RMSE: {np.mean(test_rmse_list):.2f}")



Fold 1 Results:
Training -> R²: 0.9809, MSE: 343.72, RMSE: 18.54
Testing  -> R²: 0.9888, MSE: 175.34, RMSE: 13.24

Fold 2 Results:
Training -> R²: 0.9796, MSE: 358.23, RMSE: 18.93
Testing  -> R²: 0.9949, MSE: 96.03, RMSE: 9.80

Fold 3 Results:
Training -> R²: 0.9832, MSE: 299.03, RMSE: 17.29
Testing  -> R²: 0.9947, MSE: 95.88, RMSE: 9.79

Fold 4 Results:
Training -> R²: 0.9799, MSE: 358.27, RMSE: 18.93
Testing  -> R²: 0.9937, MSE: 112.31, RMSE: 10.60

Fold 5 Results:
Training -> R²: 0.9826, MSE: 316.75, RMSE: 17.80
Testing  -> R²: 0.9868, MSE: 193.19, RMSE: 13.90

Fold 6 Results:
Training -> R²: 0.9747, MSE: 447.96, RMSE: 21.17
Testing  -> R²: 0.9892, MSE: 203.76, RMSE: 14.27

Fold 7 Results:
Training -> R²: 0.9843, MSE: 278.22, RMSE: 16.68
Testing  -> R²: 0.9805, MSE: 343.77, RMSE: 18.54

Fold 8 Results:
Training -> R²: 0.9836, MSE: 288.94, RMSE: 17.00
Testing  -> R²: 0.9971, MSE: 51.47, RMSE: 7.17

Fold 9 Results:
Training -> R²: 0.9897, MSE: 184.04, RMSE: 13.57
Testing  -> R²: 0.93