In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error

# Read the pre-processed train and test tables from disk (train first).
train, test = map(
    pd.read_csv,
    (
        "processed_dataset/train_data_processed.csv",
        "processed_dataset/test_data_processed.csv",
    ),
)

# Feature engineering: each interaction term is the element-wise product of
# two existing columns. Add more entries to this mapping to extend the set.
interaction_terms = {
    'temp_irradiance': ('temperature', 'irradiance'),
    'humidity_wind': ('humidity', 'wind_speed'),
    'age_maintenance': ('panel_age', 'maintenance_count'),
}

# Apply the same definitions to both frames so that train and test end up
# with identical engineered columns in the same order.
for frame in (train, test):
    for new_col, (left_col, right_col) in interaction_terms.items():
        frame[new_col] = frame[left_col] * frame[right_col]

# Split the training frame into the target vector and the feature matrix;
# the test frame only has its identifier column removed.
target_col = "target"
drop_cols = ["id", target_col]

y = train[target_col]
X = train.drop(drop_cols, axis=1)
X_test = test.drop(["id"], axis=1)

# Candidate regressors to benchmark, each with fixed hyperparameters.
# Settings shared by several estimators are written once and unpacked below.
seed_kw = {"random_state": 42}
forest_kw = {"n_estimators": 200, "n_jobs": -1, **seed_kw}
boost_kw = {"n_estimators": 300, "learning_rate": 0.05, **seed_kw}

models = {
    # Regularised linear models
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01),
    "ElasticNet": ElasticNet(alpha=0.01, l1_ratio=0.5),
    # Bagged tree ensembles
    "RandomForest": RandomForestRegressor(**forest_kw),
    "ExtraTrees": ExtraTreesRegressor(**forest_kw),
    # Boosted ensembles
    "GradientBoosting": GradientBoostingRegressor(**boost_kw),
    "AdaBoost": AdaBoostRegressor(**boost_kw),
    "XGBoost": xgb.XGBRegressor(max_depth=5, n_jobs=-1, **boost_kw),
    "LightGBM": lgb.LGBMRegressor(n_jobs=-1, **boost_kw),
    "CatBoost": CatBoostRegressor(verbose=0, **boost_kw),
    # Kernel method (library defaults) and a small feed-forward network
    "SVR": SVR(),
    "MLP": MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=1000, **seed_kw),
}

# Shared 5-fold splitter, shuffled with a fixed seed so that every model is
# evaluated on exactly the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("Training and cross-validating individual models...")
for name, model in models.items():
    print(f"Training and CV for {name}...")
    # Out-of-fold predictions: each row is predicted by a fit that excluded it.
    preds = cross_val_predict(model, X, y, cv=kf, n_jobs=-1)
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # where it raises TypeError. This form works on every version.
    rmse = np.sqrt(mean_squared_error(y, preds))
    # Reported score is 100 * (1 - RMSE), so higher is better.
    score = 100 * (1 - rmse)
    print(f"{name} CV Score: {score:.4f}")

# Stacking ensemble: four boosted regressors as base learners, blended by a
# Ridge meta-learner. The boosters share the same core settings.
stack_boost_kw = {"n_estimators": 300, "learning_rate": 0.05, "random_state": 42}

base_models = [
    ("lgbm", lgb.LGBMRegressor(n_jobs=-1, **stack_boost_kw)),
    ("catboost", CatBoostRegressor(verbose=0, **stack_boost_kw)),
    ("xgb", xgb.XGBRegressor(max_depth=5, n_jobs=-1, **stack_boost_kw)),
    ("ada", AdaBoostRegressor(**stack_boost_kw)),
]

meta_learner = Ridge(alpha=1.0)

# passthrough=True feeds the original features to the meta-learner alongside
# the base-model predictions; cv=kf reuses the shared fold splitter.
stack_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=kf,
    passthrough=True,
    n_jobs=-1,
)

# Score the stacking ensemble with out-of-fold predictions on the same
# splitter used for the individual models.
print("Training stacking ensemble...")
stack_preds = cross_val_predict(stack_model, X, y, cv=kf, n_jobs=-1)
# Take the square root explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises TypeError. This form works on every version.
stack_rmse = np.sqrt(mean_squared_error(y, stack_preds))
# Reported score is 100 * (1 - RMSE), so higher is better.
stack_score = 100 * (1 - stack_rmse)
print(f"Stacking Ensemble CV Score: {stack_score:.4f}")

# Refit the stacking ensemble on every training row, then predict the
# held-out test features. fit() returns the estimator, so the calls chain.
print("Fitting stacking model on full training data...")
test_preds = stack_model.fit(X, y).predict(X_test)

# Build the two-column submission (row id + predicted value) and write it
# out without the DataFrame index.
submission = test[["id"]].assign(efficiency=test_preds)
submission.to_csv("/kaggle/working/submission.csv", index=False)
print("Submission saved to /kaggle/working/submission.csv")
