<a href="https://colab.research.google.com/github/sana-f-shah/Solar-Panel-Analytics/blob/main/notebooks/3a_regression_model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime at /content/drive.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the dataset stored as original.parquet on the mounted Drive.
parquet_path = '/content/drive/MyDrive/Portfolio/Solar Panel Analytics/data_versions/original.parquet'
df = pd.read_parquet(parquet_path)

In [None]:
# List every column label of the loaded frame.
column_labels = df.columns
print(column_labels)

Index(['Day', 'Month', 'Year', 'TempOut', 'OutHum', 'WindSpeed', 'Bar', 'Rain',
       'RainRate', 'SolarEnergy', 'CoolD-D', 'InHum', 'ET', 'WindSamp',
       'WindTx', 'ArcInt', 'WindDir_E', 'WindDir_ENE', 'WindDir_ESE',
       'WindDir_N', 'WindDir_NE', 'WindDir_NNE', 'WindDir_NNW', 'WindDir_NW',
       'WindDir_S', 'WindDir_SE', 'WindDir_SSE', 'WindDir_SSW', 'WindDir_SW',
       'WindDir_Unknown', 'WindDir_W', 'WindDir_WNW', 'WindDir_WSW',
       'Site_Easthill Road', 'Site_Elm Crescent', 'Site_Forest Road',
       'Site_Maple Drive East', 'Site_YMCA'],
      dtype='object')


In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the remaining columns, which serve as predictors.
target_column = 'SolarEnergy'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 30% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Column-oriented accumulator: one independent empty list per result column.
# NOTE(review): the model-comparison cell further down rebinds `results` with a
# different key set ('model', 'mbe') before this dict is ever read — confirm
# whether this cell is still needed.
result_columns = (
    'dataset',
    'rmse',
    'mae',
    'mape (%)',
    'median_ae',
    'r2',
    'mbe (bias)',
)
results = {column: [] for column in result_columns}

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import BayesianRidge, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Regressors to compare. The dict key is the display name used in the results
# table and (lower-cased, spaces -> underscores) in the saved file name.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'K Neighbour': KNeighborsRegressor(n_neighbors=5),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'MLP': MLPRegressor(random_state=42),
    'Bayesian Ridge': BayesianRidge(),
    'Elastic Net': ElasticNet(random_state=42)
}

# Column-oriented accumulator; each loop iteration appends one value per column.
results = {
    'model': [],
    'r2': [],
    'rmse': [],
    'mae': [],
    'mape (%)': [],
    'median_ae': [],
    'mbe': []
}

# MAPE divides by the true value, so any zero target makes the metric infinite.
# Every model is scored against the same y_test, so the mask of usable
# (non-zero) targets is built once, outside the loop.
y_test_values = np.asarray(y_test)
nonzero_target = y_test_values != 0

# Folder that receives one pickle per fitted model; created up front so
# joblib.dump does not fail when the folder does not exist yet.
output_folder = '/content/drive/MyDrive/Portfolio/Solar Panel Analytics/models/model_comparison'
os.makedirs(output_folder, exist_ok=True)

for name, model in models.items():
    # Fit on the training split and score on the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    # MAPE is computed over the rows whose true value is non-zero only; rows
    # with a zero target are excluded instead of producing inf. NaN is
    # reported when no row qualifies.
    if nonzero_target.any():
        relative_error = (
            (y_test_values[nonzero_target] - y_pred[nonzero_target])
            / y_test_values[nonzero_target]
        )
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = np.nan
    median_ae = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Mean bias error: positive means the model over-predicts on average.
    mbe = np.mean(y_pred - y_test)

    results['model'].append(name)
    results['rmse'].append(rmse)
    results['mae'].append(mae)
    results['mape (%)'].append(mape)
    results['median_ae'].append(median_ae)
    results['r2'].append(r2)
    results['mbe'].append(mbe)

    # Persist the fitted estimator, e.g. 'Random Forest' -> random_forest_model.pkl.
    filename = f'{name.replace(" ", "_").lower()}_model.pkl'
    output_path = os.path.join(output_folder, filename)
    joblib.dump(model, output_path)
    print(f'Saved {name} model to {filename}')

# One row per model, columns in the order declared in `results`.
results_df = pd.DataFrame(results)
print(results_df)

Saved Random Forest model to random_forest_model.pkl
Saved Linear Regression model to linear_regression_model.pkl
Saved Gradient Boosting model to gradient_boosting_model.pkl
Saved K Neighbour model to k_neighbour_model.pkl
Saved XGBoost model to xgboost_model.pkl
Saved MLP model to mlp_model.pkl
Saved Bayesian Ridge model to bayesian_ridge_model.pkl
Saved Elastic Net model to elastic_net_model.pkl
               model        r2      rmse       mae  mape (%)  median_ae  \
0      Random Forest  0.800203  3.418244  1.803683       inf   0.604600   
1  Linear Regression  0.587770  4.909963  3.252818       inf   2.073636   
2  Gradient Boosting  0.727770  3.990030  2.351932       inf   1.102145   
3        K Neighbour  0.722682  4.027146  2.148610       inf   0.726000   
4            XGBoost  0.805963  3.368609  1.932080       inf   0.865673   
5                MLP  0.440663  5.719332  4.003838       inf   2.949761   
6     Bayesian Ridge  0.587774  4.909943  3.251973       inf   2.071868  