## Requires an NVIDIA GPU (uses cuML and XGBoost with `device='cuda'`)

In [None]:
import pandas as pd
import numpy as np
import os
from cuml.ensemble import RandomForestRegressor as cuRandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import joblib

# Read the cleaned training set from the Kaggle input directory
train_df = pd.read_csv('/kaggle/input/solar-panels-performance/dataset/cleaned_data.csv')

# Separate the regression target from the remaining feature columns
y = train_df['efficiency']
X = train_df.drop(columns=['efficiency'])

# Quick summary of the feature matrix and the target distribution
print(f"Dataset shape: {X.shape}")
print(f"Target range: {y.min():.3f} to {y.max():.3f}")
print(f"Target mean: {y.mean():.3f}, std: {y.std():.3f}")

# Standardised copy of the features for models that are evaluated on scaled
# inputs; the scaler is fitted on the full training set.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Define the ImprovedStackingRegressor class
class ImprovedStackingRegressor(BaseEstimator, RegressorMixin):
    """Two-level stacking regressor trained on out-of-fold predictions.

    Every base model is fitted inside a shuffled K-fold loop to produce
    out-of-fold predictions, the meta-model is fitted on those predictions,
    and each base model is then refitted on the complete training data.

    Parameters
    ----------
    base_models : list of estimators
        Level-0 regressors exposing ``fit`` and ``predict``. They are fitted
        in place; no copies are made.
    meta_model : estimator
        Level-1 regressor fitted on the matrix of out-of-fold predictions
        (one column per base model).
    use_scaling : bool, default=True
        If True, the first ``n_scaled_models`` base models receive
        standardised features; the remaining models always see raw features.
    n_scaled_models : int, default=1
        Number of leading base models that are fed standardised features
        when ``use_scaling`` is True.
    n_splits : int, default=5
        Number of folds used to build the out-of-fold predictions.
    random_state : int or None, default=42
        Seed passed to the shuffled ``KFold`` splitter.
    """

    def __init__(self, base_models, meta_model, use_scaling=True,
                 n_scaled_models=1, n_splits=5, random_state=42):
        self.base_models = base_models
        self.meta_model = meta_model
        self.use_scaling = use_scaling
        self.n_scaled_models = n_scaled_models
        self.n_splits = n_splits
        self.random_state = random_state
        # Kept so that the ``scaler`` attribute exists right after construction.
        if use_scaling:
            self.scaler = StandardScaler()

    @staticmethod
    def _take_rows(data, indices):
        """Select rows by position from a pandas object or an array."""
        return data.iloc[indices] if hasattr(data, 'iloc') else data[indices]

    def _uses_scaled_input(self, index):
        """Return True if the base model at ``index`` gets standardised features."""
        return bool(self.use_scaling) and index < self.n_scaled_models

    def fit(self, X, y):
        """Fit the base models and the meta-model.

        Parameters
        ----------
        X : pandas.DataFrame or array of shape (n_samples, n_features)
            Training features.
        y : pandas.Series or array of shape (n_samples,)
            Training targets.

        Returns
        -------
        self : ImprovedStackingRegressor
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``base_models`` is empty.
        """
        if not self.base_models:
            raise ValueError("base_models must contain at least one estimator.")

        if self.use_scaling:
            # Create the scaler here as well so that enabling ``use_scaling``
            # through ``set_params`` after construction still works.
            self.scaler = StandardScaler()
            X_full_scaled = self.scaler.fit_transform(X)
        else:
            X_full_scaled = None

        kf = KFold(n_splits=self.n_splits, shuffle=True,
                   random_state=self.random_state)
        # Materialise the splits once so every base model sees the same folds.
        folds = list(kf.split(X))
        base_predictions = np.zeros((X.shape[0], len(self.base_models)))

        for i, model in enumerate(self.base_models):
            scaled = self._uses_scaled_input(i)

            for train_idx, val_idx in folds:
                # Positional row selection works for DataFrames and arrays
                # alike, whether or not scaling is enabled.
                X_train_fold = self._take_rows(X, train_idx)
                X_val_fold = self._take_rows(X, val_idx)

                if scaled:
                    # Fit the scaler on the training rows of this fold only,
                    # so validation rows do not influence their own
                    # out-of-fold predictions.
                    fold_scaler = StandardScaler()
                    X_train_fold = fold_scaler.fit_transform(X_train_fold)
                    X_val_fold = fold_scaler.transform(X_val_fold)

                y_train_fold = self._take_rows(y, train_idx)

                model.fit(X_train_fold, y_train_fold)
                base_predictions[val_idx, i] = model.predict(X_val_fold)

        # The meta-model learns how to combine the out-of-fold predictions.
        self.meta_model.fit(base_predictions, y)

        # Refit every base model on all training rows for use in predict().
        for i, model in enumerate(self.base_models):
            model.fit(X_full_scaled if self._uses_scaled_input(i) else X, y)

        return self

    def predict(self, X):
        """Predict targets by feeding base-model predictions to the meta-model.

        Parameters
        ----------
        X : pandas.DataFrame or array of shape (n_samples, n_features)
            Features to predict on.

        Returns
        -------
        The output of ``meta_model.predict`` on the stacked base predictions.
        """
        X_scaled = self.scaler.transform(X) if self.use_scaling else None

        base_predictions = np.zeros((X.shape[0], len(self.base_models)))

        for i, model in enumerate(self.base_models):
            inputs = X_scaled if self._uses_scaled_input(i) else X
            base_predictions[:, i] = model.predict(inputs)

        return self.meta_model.predict(base_predictions)

# Level-0 learners: one XGBoost regressor and two cuML random forests
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist',  # histogram-based tree construction
    device='cuda',  # train on the CUDA device
)

# Two forests that differ in size, depth and seed
rf_model_a = cuRandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42,
    n_streams=1,  # single stream, chosen for reproducibility
)
rf_model_b = cuRandomForestRegressor(
    n_estimators=150,
    max_depth=20,
    random_state=123,
    n_streams=1,  # single stream, chosen for reproducibility
)

# Order matters: the stacking code treats the first model differently
base_models = [xgb_model, rf_model_a, rf_model_b]

# Level-1 learner: ridge regression over the base-model predictions
meta_model = Ridge(alpha=1.0)

# Assemble the stacking ensemble
stacking_model = ImprovedStackingRegressor(
    base_models=base_models,
    meta_model=meta_model,
    use_scaling=True,
)

# Scoring function (same formula as the original code): 100 * (1 - RMSE)
def custom_score(y_true, y_pred):
    """Return ``100 * (1 - RMSE)`` for the given targets and predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return 100 * (1 - rmse)

# Expose the metric as a scikit-learn scorer where larger values are better
scorer = make_scorer(custom_score, greater_is_better=True)

# Shuffled 5-fold splitter shared by every evaluation below
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Score each base model on its own before looking at the ensemble
print("\n=== Individual Model Performance ===")
for i, model in enumerate(base_models):
    # Only the first model is evaluated on the standardised features
    features = X_scaled if i < 1 else X
    cv_scores = cross_val_score(model, features, y, cv=kfold, scoring=scorer)

    print(f"Model {i+1} CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")

# Score the stacking ensemble with the same splitter and scorer
print("\n=== Stacking Ensemble Performance ===")
cv_scores = cross_val_score(stacking_model, X, y, cv=kfold, scoring=scorer)

formatted_scores = [f"{score:.3f}" for score in cv_scores]
print("Cross-Validation Scores:", formatted_scores)
print(f"Mean CV Score: {cv_scores.mean():.3f}")
print(f"Std CV Score: {cv_scores.std():.3f}")

# Fit the ensemble on the complete training set
print("\n=== Training Final Model ===")
stacking_model.fit(X, y)

# Persist the fitted ensemble to the Kaggle working directory
model_path = '/kaggle/working/optimized_ensemble_model.pkl'
joblib.dump(value=stacking_model, filename=model_path)
print(f"Model saved as '{model_path}'")

# Read the processed test set
test_data_path = '/kaggle/input/solar-panels-performance/dataset/test_data_processed.csv'
test_df = pd.read_csv(test_data_path)

# Select the training feature columns, in the training column order
feature_columns = X.columns
test_df_aligned = test_df.loc[:, feature_columns]

# Predict the target for every test row
predictions = stacking_model.predict(test_df_aligned)

# Pair each test id with its predicted value
submission_columns = {
    'id': test_df['id'],
    'efficiency': predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write the submission to the /kaggle/working/ directory
submission_path = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_path, index=False)

# Report whether the file is on disk and, if so, how large it is
if not os.path.exists(submission_path):
    print(f"Failed to save the submission file at {submission_path}.")
else:
    file_size = os.path.getsize(submission_path)
    print(f"Submission file saved as {submission_path} with size {file_size} bytes.")
