In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
import lightgbm as lgb
from scipy.optimize import minimize
from scipy.stats import randint as sp_randint
import time
import joblib

# Load your training dataset
train_df = pd.read_csv('/kaggle/input/solar-panels-performance/dataset/cleaned_data.csv')

# Define features and target variable
X = train_df.drop('efficiency', axis=1)
y = train_df['efficiency']

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)

print(f"Dataset shape: {X.shape}")
print(f"Target range: {y.min():.3f} to {y.max():.3f}")
print(f"Target mean: {y.mean():.3f}, std: {y.std():.3f}")

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define base models with Randomized Search for hyperparameter tuning
def get_base_models():
    # Define parameter distributions for Randomized Search
    param_dist_xgb = {
        'n_estimators': sp_randint(100, 300),
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': sp_randint(3, 10),
        'subsample': [0.6, 0.7, 0.8, 0.9],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9]
    }

    param_dist_rf = {
        'n_estimators': sp_randint(100, 300),
        'max_depth': sp_randint(5, 20),
        'min_samples_split': sp_randint(2, 11),
        'min_samples_leaf': sp_randint(1, 11)
    }

    param_dist_mlp = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01]
    }

    # Initialize models
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42, tree_method='hist', n_jobs=-1)
    rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
    mlp_model = MLPRegressor(random_state=42, max_iter=1000)

    # Perform Randomized Search for XGBRegressor
    random_search_xgb = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist_xgb,
                                            n_iter=10, cv=3, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
    random_search_xgb.fit(X, y)

    # Perform Randomized Search for RandomForestRegressor
    random_search_rf = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist_rf,
                                          n_iter=10, cv=3, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
    random_search_rf.fit(X, y)

    # Perform Randomized Search for MLPRegressor
    random_search_mlp = RandomizedSearchCV(estimator=mlp_model, param_distributions=param_dist_mlp,
                                           n_iter=10, cv=3, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
    random_search_mlp.fit(X_scaled, y)

    return [
        random_search_xgb.best_estimator_,
        random_search_rf.best_estimator_,
        RandomForestRegressor(n_estimators=80, max_depth=15, random_state=123, n_jobs=-1),
        SVR(kernel='rbf', C=1.0, epsilon=0.1),
        CatBoostRegressor(iterations=200, learning_rate=0.05, depth=8, l2_leaf_reg=10, border_count=100,
                          bootstrap_type='Bernoulli', random_strength=5, verbose=False, random_state=42, thread_count=-1),
        lgb.LGBMRegressor(verbose=-1, n_estimators=300, max_bin=10000, random_state=42, n_jobs=-1),
        random_search_mlp.best_estimator_
    ]

# Custom scoring function
def custom_score(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    return 100 * (1 - np.sqrt(mse))

# Hill climbing for optimal weights
def hill_climbing_weights(oof_preds, y_true, n_iter=1000):
    n_models = oof_preds.shape[1]

    def objective(weights):
        weights = np.array(weights)
        weights = np.abs(weights)
        weights = weights / np.sum(weights)
        pred = np.dot(oof_preds, weights)
        return -custom_score(y_true, pred)

    best_weights = np.ones(n_models) / n_models
    best_score = -objective(best_weights)

    print(f"Initial score: {best_score:.4f}")

    for i in range(n_iter):
        perturbation = np.random.normal(0, 0.1, n_models)
        new_weights = best_weights + perturbation
        new_weights = np.abs(new_weights)
        new_weights = new_weights / np.sum(new_weights)

        new_score = -objective(new_weights)

        if new_score > best_score:
            best_weights = new_weights
            best_score = new_score
            if i % 100 == 0:
                print(f"Iteration {i}: New best score: {best_score:.4f}")

    return best_weights, best_score

class OptimizedStackingRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_folds=5, random_seeds=[42, 123, 456]):
        self.n_folds = n_folds
        self.random_seeds = random_seeds
        self.models_dict = {}
        self.weights = None
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='mean')

    def fit(self, X, y):
        print("=== Step 1: Generating Out-of-Fold Predictions ===")
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        X_imputed = self.imputer.fit_transform(X)
        X_scaled = self.scaler.fit_transform(X_imputed)

        base_models = get_base_models()
        n_models = len(base_models)

        oof_preds = np.zeros((X.shape[0], n_models))

        for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
            print(f"Processing fold {fold + 1}/{self.n_folds}")

            for i, model in enumerate(base_models):
                if i in [2, 6]:  # Indices for models that need scaled data
                    X_train_fold = X_scaled[train_idx]
                    X_val_fold = X_scaled[val_idx]
                else:
                    X_train_fold = X_imputed[train_idx]
                    X_val_fold = X_imputed[val_idx]

                y_train_fold = y.iloc[train_idx] if hasattr(y, 'iloc') else y[train_idx]

                model.fit(X_train_fold, y_train_fold)
                oof_preds[val_idx, i] = model.predict(X_val_fold)

        print("\n=== Individual Model OOF Scores ===")
        for i in range(n_models):
            score = custom_score(y, oof_preds[:, i])
            print(f"Model {i+1} ({type(base_models[i]).__name__}): {score:.4f}")

        print("\n=== Step 2: Optimizing Weights with Hill Climbing ===")
        self.weights, best_score = hill_climbing_weights(oof_preds, y)

        print(f"\nOptimal weights: {self.weights}")
        print(f"Best OOF score: {best_score:.4f}")

        print("\n=== Step 3: Training Final Models on 100% Data ===")
        for seed_idx, seed in enumerate(self.random_seeds):
            print(f"Training with seed {seed} ({seed_idx + 1}/{len(self.random_seeds)})")

            models_for_seed = []
            for i, base_model in enumerate(base_models):
                model_params = base_model.get_params().copy()

                if hasattr(base_model, 'random_state') and not isinstance(base_model, SVR):
                    model_params['random_state'] = seed

                if hasattr(base_model, 'n_estimators'):
                    original_estimators = model_params.get('n_estimators', 100)
                    model_params['n_estimators'] = int(original_estimators * 1.25)
                elif hasattr(base_model, 'iterations'):
                    original_iterations = model_params.get('iterations', 100)
                    model_params['iterations'] = int(original_iterations * 1.25)

                final_model = type(base_model)(**model_params)

                if i in [2, 6]:  # Indices for models that need scaled data
                    final_model.fit(X_scaled, y)
                else:
                    final_model.fit(X_imputed, y)

                models_for_seed.append(final_model)

            self.models_dict[seed] = models_for_seed

        return self

    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        X_scaled = self.scaler.transform(X_imputed)

        all_seed_preds = []

        for seed in self.random_seeds:
            models = self.models_dict[seed]
            seed_preds = np.zeros((X.shape[0], len(models)))

            for i, model in enumerate(models):
                if i in [2, 6]:  # Indices for models that need scaled data
                    seed_preds[:, i] = model.predict(X_scaled)
                else:
                    seed_preds[:, i] = model.predict(X_imputed)

            weighted_pred = np.dot(seed_preds, self.weights)
            all_seed_preds.append(weighted_pred)

        final_pred = np.mean(all_seed_preds, axis=0)
        return final_pred

# Train the optimized model
print("=== Training CPU-Optimized Stacking Ensemble ===")
start_time = time.time()

stacking_model = OptimizedStackingRegressor(n_folds=5, random_seeds=[42, 123, 456])
stacking_model.fit(X, y)

training_time = time.time() - start_time
print(f"\nTotal training time: {training_time/60:.2f} minutes")

# Save the model
model_path = '/kaggle/working/cpu_optimized_ensemble_model.pkl'
joblib.dump(stacking_model, model_path)
print(f"Model saved as '{model_path}'")

# Load test data and make predictions
print("\n=== Making Test Predictions ===")
test_data_path = '/kaggle/input/solar-panels-performance/dataset/test_data_processed.csv'
test_df = pd.read_csv(test_data_path)

feature_columns = X.columns
test_df_aligned = test_df[feature_columns]

predictions = stacking_model.predict(test_df_aligned)

# Create submission
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'efficiency': predictions
})

submission_path = '/kaggle/working/submission_cpu_optimized.csv'
submission_df.to_csv(submission_path, index=False)

print(f"Submission saved as {submission_path}")
print(f"Prediction range: {predictions.min():.3f} to {predictions.max():.3f}")
print(f"Prediction mean: {predictions.mean():.3f}")
print(f"Final model uses {len(stacking_model.random_seeds)} different seeds with hill-climbing optimized weights")

# Optional: Quick validation check
print("\n=== Quick Validation Check ===")
scorer = make_scorer(custom_score, greater_is_better=True)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(stacking_model, X, y, cv=kfold, scoring=scorer)
print(f"5-Fold CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")


Dataset shape: (18513, 11)
Target range: 0.000 to 0.951
Target mean: 0.505, std: 0.138
=== Training CPU-Optimized Stacking Ensemble ===
=== Step 1: Generating Out-of-Fold Predictions ===
Processing fold 1/5
Processing fold 2/5
Processing fold 3/5
Processing fold 4/5
Processing fold 5/5

=== Individual Model OOF Scores ===
Model 1 (XGBRegressor): 89.5176
Model 2 (RandomForestRegressor): 89.5027
Model 3 (RandomForestRegressor): 89.2553
Model 4 (SVR): 88.9334
Model 5 (CatBoostRegressor): 89.4515
Model 6 (LGBMRegressor): 89.3689
Model 7 (MLPRegressor): 88.9127

=== Step 2: Optimizing Weights with Hill Climbing ===
Initial score: 89.5187

Optimal weights: [0.33603501 0.36354131 0.00294165 0.0048744  0.0648906  0.18021186
 0.04750517]
Best OOF score: 89.5653

=== Step 3: Training Final Models on 100% Data ===
Training with seed 42 (1/3)
Training with seed 123 (2/3)
Training with seed 456 (3/3)

Total training time: 5.44 minutes
Model saved as '/kaggle/working/cpu_optimized_ensemble_model.pkl