# In [3]:
import joblib
import numpy as np
import pandas as pd
from cuml.ensemble import RandomForestRegressor as cuRandomForestRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Read the cleaned training data from disk
train_df = pd.read_csv('dataset/cleaned_data.csv')

# Separate the regression target from the feature columns
y = train_df['efficiency']
X = train_df.drop(columns='efficiency')

# Report the size of the feature matrix and basic statistics of the target
print(
    f"Dataset shape: {X.shape}",
    f"Target range: {y.min():.3f} to {y.max():.3f}",
    f"Target mean: {y.mean():.3f}, std: {y.std():.3f}",
    sep="\n",
)

# Standardize the features for the models that are evaluated on scaled inputs
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Define the ImprovedStackingRegressor class
class ImprovedStackingRegressor(BaseEstimator, RegressorMixin):
    """Stacking regressor whose meta-model learns from out-of-fold predictions.

    During ``fit`` every base model is trained on a fixed 5-fold split
    (shuffled, ``random_state=42``). Its out-of-fold predictions form one
    input column of ``meta_model``. Once the meta-model is fitted, each base
    model is refitted on the full training data so that ``predict`` can use it.

    Only the first base model receives standardized features (and only when
    ``use_scaling`` is true); every other base model receives the input
    exactly as passed in.

    Parameters
    ----------
    base_models : list
        Estimators exposing ``fit`` and ``predict``. They are fitted in place
        (this class does not clone them).
    meta_model : estimator
        Estimator fitted on the matrix of out-of-fold base predictions.
    use_scaling : bool, default=True
        Whether to standardize the features handed to the first base model.
    """

    # Number of leading base models that are fed the scaled features.
    _N_SCALED_MODELS = 1

    def __init__(self, base_models, meta_model, use_scaling=True):
        self.base_models = base_models
        self.meta_model = meta_model
        self.use_scaling = use_scaling
        if use_scaling:
            self.scaler = StandardScaler()

    @staticmethod
    def _rows(data, idx):
        """Select rows by position from a pandas object or an array."""
        return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]

    def _features_for(self, position, X, X_processed):
        """Return the feature matrix that base model ``position`` is fed."""
        return X_processed if position < self._N_SCALED_MODELS else X

    def fit(self, X, y):
        """Fit the base models and the meta-model.

        Parameters
        ----------
        X : DataFrame or ndarray of shape (n_samples, n_features)
        y : Series or ndarray of shape (n_samples,)

        Returns
        -------
        self
        """
        if self.use_scaling:
            # The scaler is only created in __init__ when use_scaling was true
            # at construction; create it here if the flag was enabled later
            # (e.g. through set_params).
            if not hasattr(self, 'scaler'):
                self.scaler = StandardScaler()
            # NOTE(review): the scaler is fitted on all rows before the fold
            # split, so validation folds influence the scaling statistics.
            X_processed = self.scaler.fit_transform(X)
        else:
            X_processed = X

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        # The split is deterministic, so compute it once and reuse it for
        # every base model.
        folds = list(kf.split(X))
        base_predictions = np.zeros((X.shape[0], len(self.base_models)))

        for i, model in enumerate(self.base_models):
            X_model = self._features_for(i, X, X_processed)

            for train_idx, val_idx in folds:
                # _rows handles both DataFrames and arrays; plain ``[]``
                # indexing on a DataFrame would look up columns, not rows.
                model.fit(self._rows(X_model, train_idx), self._rows(y, train_idx))
                base_predictions[val_idx, i] = model.predict(self._rows(X_model, val_idx))

        self.meta_model.fit(base_predictions, y)

        # Refit every base model on the full data for use at predict time.
        for i, model in enumerate(self.base_models):
            model.fit(self._features_for(i, X, X_processed), y)

        return self

    def predict(self, X):
        """Predict by feeding the base models' predictions to the meta-model.

        Parameters
        ----------
        X : DataFrame or ndarray of shape (n_samples, n_features)

        Returns
        -------
        The output of ``meta_model.predict`` on the base predictions.
        """
        X_processed = self.scaler.transform(X) if self.use_scaling else X

        base_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            base_predictions[:, i] = model.predict(self._features_for(i, X, X_processed))

        return self.meta_model.predict(base_predictions)

# Base learners: one GPU XGBoost model followed by two cuML random forests
base_models = [
    XGBRegressor(
        objective='reg:squarederror',
        tree_method='hist',  # histogram-based tree construction
        device='cuda',  # train on the GPU
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
] + [
    # n_streams=1 is set for reproducibility
    cuRandomForestRegressor(
        n_estimators=n_trees,
        max_depth=depth,
        random_state=seed,
        n_streams=1,
    )
    for n_trees, depth, seed in ((200, 15, 42), (150, 20, 123))
]

# Ridge regression combines the base-model predictions
meta_model = Ridge(alpha=1.0)

# Assemble the stacking ensemble
stacking_model = ImprovedStackingRegressor(base_models, meta_model, use_scaling=True)

# Scoring method: 100 * (1 - RMSE), so larger values are better
def custom_score(y_true, y_pred):
    """Return ``100 * (1 - RMSE)`` for the given targets and predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return 100 * (1 - rmse)


# Wrap the metric so cross_val_score can use it
scorer = make_scorer(custom_score, greater_is_better=True)

# Cross-validation splitter shared by every evaluation below
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Score each base model on its own before evaluating the ensemble
print("\n=== Individual Model Performance ===")
for model_number, model in enumerate(base_models, start=1):
    # Only the first model is evaluated on the standardized features
    cv_scores = cross_val_score(
        model,
        X_scaled if model_number == 1 else X,
        y,
        scoring=scorer,
        cv=kfold,
    )
    print(f"Model {model_number} CV Score: {cv_scores.mean():.3f} (+/- {2 * cv_scores.std():.3f})")

# Cross-validate the stacking ensemble on the unscaled features
print("\n=== Stacking Ensemble Performance ===")
cv_scores = cross_val_score(stacking_model, X, y, scoring=scorer, cv=kfold)

print("Cross-Validation Scores:", ["{:.3f}".format(score) for score in cv_scores])
print(
    f"Mean CV Score: {cv_scores.mean():.3f}",
    f"Std CV Score: {cv_scores.std():.3f}",
    sep="\n",
)

# Fit the ensemble on the complete training set
print("\n=== Training Final Model ===")
stacking_model.fit(X, y)

# Persist the fitted ensemble
joblib.dump(stacking_model, 'optimized_ensemble_model.pkl')
print("Model saved as 'optimized_ensemble_model.pkl'")

# Read the test set
test_data_path = 'dataset/test.csv'
test_df = pd.read_csv(test_data_path)

# Preprocess the test data
def _fill_maintenance_count(df):
    """Return ``maintenance_count`` with NaNs filled from group medians.

    Medians are tried from the most to the least specific grouping:
    (string_id, error_code), then error_code, then string_id, then the
    overall median. A level whose median is itself NaN is skipped, so the
    next, coarser level gets a chance to fill the value.
    """
    observed = df['maintenance_count']
    filled = observed
    for keys in (['string_id', 'error_code'], ['error_code'], ['string_id']):
        # Medians are always computed from the originally observed values.
        group_median = df.groupby(keys)['maintenance_count'].transform('median')
        filled = filled.fillna(group_median)
    return filled.fillna(observed.median())


def _fill_soiling_ratio(df):
    """Return ``soiling_ratio`` with NaNs filled by the median of its maintenance bin.

    ``maintenance_count`` is cut into 5 equal-width bins; a missing soiling
    ratio takes the median soiling ratio of the rows in the same bin. Rows
    with a missing ``maintenance_count`` are left untouched.
    """
    bins = pd.cut(df['maintenance_count'], bins=5, include_lowest=True)
    bin_median = df.groupby(bins, observed=False)['soiling_ratio'].transform('median')
    return df['soiling_ratio'].fillna(bin_median)


def _knn_fill(target, predictors, n_neighbors=5):
    """Return *target* with NaNs filled by KNN imputation over *predictors*.

    *target* is returned unchanged when it has no missing values (nothing to
    impute) or no observed values (the imputer would drop the column).
    """
    missing = target.isna()
    if not missing.any() or missing.all():
        return target
    # The target is placed first so its position cannot shift if the imputer
    # drops an all-NaN predictor column.
    frame = pd.concat([target.rename('__target__'), predictors], axis=1)
    imputed = KNNImputer(n_neighbors=n_neighbors).fit_transform(frame)
    return pd.Series(imputed[:, 0], index=target.index, name=target.name)


def _fill_module_temperature(df):
    """Return a numeric, imputed ``module_temperature`` column.

    Missing values are first estimated by a linear regression on
    ``temperature`` (when that column exists and more than 10 rows have both
    values), then any remaining gaps are filled by KNN imputation over the
    other numeric columns.
    """
    module_temp = pd.to_numeric(df['module_temperature'], errors='coerce').copy()

    if 'temperature' in df.columns:
        ambient = pd.to_numeric(df['temperature'], errors='coerce')
        both_known = module_temp.notna() & ambient.notna()
        needs_estimate = module_temp.isna() & ambient.notna()
        # Predicting on zero rows raises, so only fit when there is work to do.
        if both_known.sum() > 10 and needs_estimate.any():
            lr = LinearRegression()
            lr.fit(ambient[both_known].to_numpy().reshape(-1, 1), module_temp[both_known])
            module_temp[needs_estimate] = lr.predict(
                ambient[needs_estimate].to_numpy().reshape(-1, 1)
            )

    numeric_features = df.select_dtypes(include=['number']).columns
    predictors = [f for f in numeric_features if f != 'module_temperature']
    if predictors:
        module_temp = _knn_fill(module_temp, df[predictors])
    return module_temp


def _impute_bounded(df, values, predictor_names, lower, upper):
    """KNN-impute *values*, clip to [lower, upper], then median-fill leftovers."""
    available = [f for f in predictor_names if f in df.columns]
    if available:
        values = _knn_fill(values, df[available].apply(pd.to_numeric, errors='coerce'))
    values = values.clip(lower, upper)
    return values.fillna(values.median())


def preprocess_test_data(test_df, scaler=None):
    """Clean, impute, encode and optionally scale the raw test frame.

    Steps, in order: coerce ``humidity`` / ``wind_speed`` / ``pressure`` to
    numeric; impute ``panel_age``, ``maintenance_count``, ``soiling_ratio``
    and ``module_temperature``; derive ``power = current * voltage``; impute
    ``irradiance`` (clipped to 0..1500) and ``humidity`` (clipped to 0..100),
    treating zeros as missing; one-hot encode ``error_code``; scale the
    numerical columns with ``scaler`` if one is given.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Raw test data. It is copied; the caller's frame is not modified.
    scaler : fitted transformer or None
        Object exposing ``transform``. It is applied to module_temperature,
        irradiance, power, panel_age, maintenance_count, soiling_ratio and
        humidity, in that order. ``None`` skips scaling.

    Returns
    -------
    pandas.DataFrame
        The processed frame, including the one-hot ``error_code_*`` columns.
    """
    test_df = test_df.copy()

    # Convert columns to appropriate data types
    for column in ('humidity', 'wind_speed', 'pressure'):
        test_df[column] = pd.to_numeric(test_df[column], errors='coerce')

    # Fill missing panel_age with the median age of the same string_id.
    # fillna keeps observed ages even for rows whose string_id is missing.
    test_df['panel_age'] = test_df['panel_age'].fillna(
        test_df.groupby('string_id')['panel_age'].transform('median')
    )

    test_df['maintenance_count'] = _fill_maintenance_count(test_df)
    test_df['soiling_ratio'] = _fill_soiling_ratio(test_df)
    test_df['module_temperature'] = _fill_module_temperature(test_df)

    # Create the 'power' feature
    test_df['power'] = test_df['current'] * test_df['voltage']

    # Impute irradiance. Zeros are treated as missing; they must be turned
    # into NaN explicitly because the KNN imputer only fills NaN.
    # NOTE(review): confirm the training-set cleaning treats zeros the same way.
    irradiance = pd.to_numeric(test_df['irradiance'], errors='coerce')
    irradiance = irradiance.mask(irradiance == 0)
    if 'efficiency' in test_df.columns:
        # Estimate irradiance from power / efficiency where the result is plausible.
        estimated = pd.to_numeric(test_df['power'], errors='coerce') / (
            pd.to_numeric(test_df['efficiency'], errors='coerce') + 0.0001
        )
        plausible = (estimated > 0) & (estimated < 1500)
        irradiance = irradiance.fillna(estimated.where(plausible))
    test_df['irradiance'] = _impute_bounded(
        test_df,
        irradiance,
        ['temperature', 'module_temperature', 'humidity', 'cloud_coverage'],
        0,
        1500,
    )

    # Impute humidity, again treating zeros as missing
    humidity = test_df['humidity'].mask(test_df['humidity'] == 0)
    test_df['humidity'] = _impute_bounded(
        test_df,
        humidity,
        ['temperature', 'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure'],
        0,
        100,
    )

    # Handle categorical variables: one-hot encode 'error_code'
    # NOTE(review): rows with a missing error_code get all-zero dummies here;
    # 'error_code_Unknown' below suggests training used an "Unknown" label — confirm.
    test_df = pd.get_dummies(test_df, columns=['error_code'])

    # Ensure all one-hot encoded columns from training are present
    expected_columns = ['error_code_E00', 'error_code_E01', 'error_code_E02', 'error_code_Unknown']
    for column in expected_columns:
        if column not in test_df.columns:
            test_df[column] = 0

    # Scale numerical features using the same scaler as the training data
    if scaler is not None:
        numerical_features = ['module_temperature', 'irradiance', 'power', 'panel_age', 'maintenance_count', 'soiling_ratio', 'humidity']
        test_df[numerical_features] = scaler.transform(test_df[numerical_features])

    return test_df

# Load the scaler used for training data.
# This rebinds the module-level name `scaler`.
# NOTE: joblib.load unpickles the file; only load artifacts from a trusted source.
scaler = joblib.load('scaler.pkl')  # Make sure to replace 'scaler.pkl' with the actual path to your saved scaler

# Preprocess the test data
test_df_processed = preprocess_test_data(test_df, scaler)

# Ensure the test data has the same features as the training data.
# Fail early with an explicit message instead of an opaque indexing error
# when a training feature is absent from the processed test data.
training_feature_columns = X.columns.tolist()
missing_columns = [
    col for col in training_feature_columns if col not in test_df_processed.columns
]
if missing_columns:
    raise KeyError(
        f"Processed test data is missing training feature columns: {missing_columns}"
    )

# Keep 'id' for the submission; skip it in the feature list so the column is
# not duplicated if 'id' is also a training feature.
test_df_aligned = test_df_processed[
    ['id'] + [col for col in training_feature_columns if col != 'id']
]

# Make predictions on the test data, with columns in training order
predictions = stacking_model.predict(test_df_aligned[training_feature_columns])

# Create a submission DataFrame
submission_df = pd.DataFrame({
    'id': test_df_aligned['id'],
    'efficiency': predictions
})

# Save the submission file
submission_path = 'submission.csv'
submission_df.to_csv(submission_path, index=False)

print(f"Submission file saved as {submission_path}")


# Cell output: ModuleNotFoundError: No module named 'cuml'