In [4]:
!pip install pandas numpy scikit-learn xgboost catboost lightgbm joblib


Collecting catboost
  Using cached catboost-1.2.8-cp312-cp312-macosx_11_0_universal2.whl.metadata (1.4 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Using cached plotly-6.1.2-py3-none-any.whl.metadata (6.9 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Using cached narwhals-1.41.0-py3-none-any.whl.metadata (11 kB)
Using cached catboost-1.2.8-cp312-cp312-macosx_11_0_universal2.whl (27.8 MB)
Using cached graphviz-0.20.3-py3-none-any.whl (47 kB)
Using cached plotly-6.1.2-py3-none-any.whl (16.3 MB)
Using cached narwhals-1.41.0-py3-none-any.whl (357 kB)
Installing collected packages: narwhals, graphviz, plotly, catboost
Successfully installed catboost-1.2.8 graphviz-0.20.3 narwhals-1.41.0 plotly-6.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39

In [5]:
import warnings

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

# Read the processed training table from disk
print("=== Loading Processed Data ===")
train_df = pd.read_csv('processed_dataset/train.csv')

print(f"Dataset shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")

# Build the feature matrix and the target vector.
# Every column is a feature except 'id' and 'efficiency' (the target).
non_feature_cols = {'id', 'efficiency'}
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]
X = train_df[feature_cols]
y = train_df['efficiency']

target_low, target_high = y.min(), y.max()
print(f"\nFeatures shape: {X.shape}")
print(f"Features: {feature_cols}")
print(f"Target range: {target_low:.4f} to {target_high:.4f}")

# Custom scoring function (same as competition)
def custom_score(y_true, y_pred):
    """Score predictions as ``100 * (1 - RMSE)``.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, in the same order as ``y_true``.

    Returns:
        The score: 100 when the predictions are exact, decreasing as the
        root-mean-squared error grows.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return 100 * (1 - rmse)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
for split_label, split_features in (("\nTrain set", X_train), ("Validation set", X_val)):
    print(f"{split_label}: {split_features.shape}")

# Candidate regressors, built one at a time and then collected by display name.
# The dict's insertion order is the order in which the models are trained later.
print("\n=== Defining Models ===")

xgb_regressor = XGBRegressor(
    n_estimators=300, learning_rate=0.1, max_depth=6,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1,
)

forest_regressor = RandomForestRegressor(
    n_estimators=200, max_depth=12,
    min_samples_split=5, min_samples_leaf=2,
    random_state=42, n_jobs=-1,
)

catboost_regressor = CatBoostRegressor(
    iterations=300, learning_rate=0.1, depth=6,
    l2_leaf_reg=3,
    verbose=False,
    random_state=42, thread_count=-1,
)

lgbm_regressor = lgb.LGBMRegressor(
    n_estimators=300, learning_rate=0.1, max_depth=6,
    num_leaves=31,
    subsample=0.8, colsample_bytree=0.8,
    verbose=-1,
    random_state=42, n_jobs=-1,
)

ridge_regressor = Ridge(alpha=1.0)

models = {
    'XGBoost': xgb_regressor,
    'RandomForest': forest_regressor,
    'CatBoost': catboost_regressor,
    'LightGBM': lgbm_regressor,
    'Ridge': ridge_regressor,
}

# Fit every candidate on the training split and score it on the hold-out split
print("\n=== Training Individual Models ===")
individual_scores, trained_models = {}, {}

for model_name, estimator in models.items():
    print(f"\nTraining {model_name}...")

    # The estimator is fitted in place; keep a handle to the fitted object
    estimator.fit(X_train, y_train)
    trained_models[model_name] = estimator

    # Hold-out score under the custom metric
    holdout_score = custom_score(y_val, estimator.predict(X_val))
    individual_scores[model_name] = holdout_score

    print(f"{model_name} validation score: {holdout_score:.4f}")

# Combine all five candidates in a VotingRegressor
print("\n=== Creating Ensemble Model ===")
ensemble_members = (
    ('xgb', 'XGBoost'),
    ('rf', 'RandomForest'),
    ('cat', 'CatBoost'),
    ('lgb', 'LightGBM'),
    ('ridge', 'Ridge'),
)
ensemble_models = [(alias, models[key]) for alias, key in ensemble_members]

ensemble = VotingRegressor(estimators=ensemble_models)
ensemble.fit(X_train, y_train)

# Score the ensemble on the same hold-out split used for the individual models
ensemble_pred = ensemble.predict(X_val)
ensemble_score = custom_score(y_val, ensemble_pred)
print(f"Ensemble validation score: {ensemble_score:.4f}")

# Cross-validation on full dataset: compare the best single model against the
# ensemble on identical shuffled folds.
print("\n=== Cross-Validation on Full Dataset ===")
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
n_folds = kfold.get_n_splits()  # keeps the progress message in sync with the splitter

# Pick the model with the highest hold-out score. Work on an unfitted clone so
# that refitting on each fold does not overwrite the fitted estimator that is
# still referenced from `models` and `trained_models`.
best_model_name = max(individual_scores, key=individual_scores.get)
best_model = clone(models[best_model_name])

cv_scores_individual = []
cv_scores_ensemble = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
    print(f"Fold {fold + 1}/{n_folds}...")

    # kfold.split yields positional indices, hence .iloc
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Individual model
    best_model.fit(X_train_fold, y_train_fold)
    pred_individual = best_model.predict(X_val_fold)
    score_individual = custom_score(y_val_fold, pred_individual)
    cv_scores_individual.append(score_individual)

    # Ensemble model
    ensemble.fit(X_train_fold, y_train_fold)
    pred_ensemble = ensemble.predict(X_val_fold)
    score_ensemble = custom_score(y_val_fold, pred_ensemble)
    cv_scores_ensemble.append(score_ensemble)

# Report mean score with a +/- 2 standard deviation spread across folds
print(f"\n=== Final Results ===")
print(f"Best individual model: {best_model_name}")
print(f"Individual model CV score: {np.mean(cv_scores_individual):.4f} (+/- {np.std(cv_scores_individual)*2:.4f})")
print(f"Ensemble model CV score: {np.mean(cv_scores_ensemble):.4f} (+/- {np.std(cv_scores_ensemble)*2:.4f})")

# Keep whichever candidate cross-validated better (a tie goes to the single
# model), then refit it on every available row.
print(f"\n=== Training Final Model on Full Dataset ===")
ensemble_wins = np.mean(cv_scores_ensemble) > np.mean(cv_scores_individual)
final_model, final_model_name = (
    (ensemble, "Ensemble") if ensemble_wins else (best_model, best_model_name)
)
print(f"Using {final_model_name} as final model")

final_model.fit(X, y)

# Persist the fitted model with joblib
model_filename = 'solar_panel_' + final_model_name.lower() + '_model.pkl'
joblib.dump(final_model, model_filename)
print(f"\nFinal model saved as: {model_filename}")

# Feature importance (if available)
print(f"\n=== Feature Importance ===")
if hasattr(final_model, 'feature_importances_'):
    # Single model: report its native importances unchanged
    importance = final_model.feature_importances_
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': importance
    }).sort_values('importance', ascending=False)

    print("Top 10 most important features:")
    print(feature_importance.head(10))
elif final_model_name == "Ensemble":
    # The ensemble exposes no importances of its own, so average those of the
    # fitted members that do. The libraries report importances on different
    # scales (e.g. LightGBM's default is raw split counts, while scikit-learn
    # forests return fractions summing to 1), so each vector is rescaled to sum
    # to 1 first. Otherwise the member with the largest raw scale would
    # dominate the average.
    importances = []
    for name, model in ensemble.named_estimators_.items():
        if not hasattr(model, 'feature_importances_'):
            continue  # e.g. linear models have no feature_importances_
        raw_importance = np.asarray(model.feature_importances_, dtype=float)
        total = raw_importance.sum()
        if total > 0:  # skip degenerate all-zero vectors instead of dividing by zero
            importances.append(raw_importance / total)

    if importances:
        avg_importance = np.mean(importances, axis=0)
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': avg_importance
        }).sort_values('importance', ascending=False)

        print("Top 10 most important features (ensemble average):")
        print(feature_importance.head(10))

# Recap of the run: report the CV score that belongs to the chosen final model
print(f"\n=== Summary ===")
selected_cv_scores = cv_scores_ensemble if final_model_name == 'Ensemble' else cv_scores_individual
summary_lines = [
    f"✅ Used {len(feature_cols)} features for training",
    f"✅ Final model: {final_model_name}",
    f"✅ Cross-validation score: {np.mean(selected_cv_scores):.4f}",
    f"✅ Model saved as: {model_filename}",
    f"✅ Ready for test predictions!",
]
print("\n".join(summary_lines))

# Quick prediction test
# Smoke test only: the first five rows of X are compared with their known targets
print(f"\n=== Quick Prediction Test ===")
sample_rows = X.head(5)
sample_pred = final_model.predict(sample_rows)
print(f"Sample predictions: {sample_pred}")
print(f"Actual values: {y.head(5).values}")
print(f"Prediction range: {sample_pred.min():.4f} to {sample_pred.max():.4f}")

=== Loading Processed Data ===
Dataset shape: (20000, 19)
Columns: ['id', 'temperature', 'irradiance', 'humidity', 'panel_age', 'maintenance_count', 'soiling_ratio', 'voltage', 'current', 'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure', 'power', 'adjusted_irradiance', 'string_id_encoded', 'error_code_encoded', 'installation_type_encoded', 'efficiency']

Features shape: (20000, 17)
Features: ['temperature', 'irradiance', 'humidity', 'panel_age', 'maintenance_count', 'soiling_ratio', 'voltage', 'current', 'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure', 'power', 'adjusted_irradiance', 'string_id_encoded', 'error_code_encoded', 'installation_type_encoded']
Target range: 0.0000 to 0.9871

Train set: (16000, 17)
Validation set: (4000, 17)

=== Defining Models ===

=== Training Individual Models ===

Training XGBoost...
XGBoost validation score: 88.9981

Training RandomForest...
RandomForest validation score: 89.1203

Training CatBoost...
CatBoost validation