In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.svm import SVR

import warnings

# Render floats with two decimal places in DataFrame/Series output.
two_decimal_format = '{:.2f}'.format
pd.options.display.float_format = two_decimal_format

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning raised by the
# estimators fitted later in the notebook is hidden as well.
warnings.filterwarnings(action='ignore')

In [25]:
# Data source (Kaggle):
# https://www.kaggle.com/datasets/stucom/solar-energy-power-generation-dataset/data
# The CSV is read from the notebook's working directory.
csv_path = 'spg.csv'
solar_df = pd.read_csv(csv_path)

In [26]:
# Print the column names, non-null counts and dtypes of the loaded frame.
solar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4213 entries, 0 to 4212
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   temperature_2_m_above_gnd          4213 non-null   float64
 1   relative_humidity_2_m_above_gnd    4213 non-null   int64  
 2   mean_sea_level_pressure_MSL        4213 non-null   float64
 3   total_precipitation_sfc            4213 non-null   float64
 4   snowfall_amount_sfc                4213 non-null   float64
 5   total_cloud_cover_sfc              4213 non-null   float64
 6   high_cloud_cover_high_cld_lay      4213 non-null   int64  
 7   medium_cloud_cover_mid_cld_lay     4213 non-null   int64  
 8   low_cloud_cover_low_cld_lay        4213 non-null   int64  
 9   shortwave_radiation_backwards_sfc  4213 non-null   float64
 10  wind_speed_10_m_above_gnd          4213 non-null   float64
 11  wind_direction_10_m_above_gnd      4213 non-null   float

In [27]:
# Count missing values per column.
solar_df.isna().sum()

temperature_2_m_above_gnd            0
relative_humidity_2_m_above_gnd      0
mean_sea_level_pressure_MSL          0
total_precipitation_sfc              0
snowfall_amount_sfc                  0
total_cloud_cover_sfc                0
high_cloud_cover_high_cld_lay        0
medium_cloud_cover_mid_cld_lay       0
low_cloud_cover_low_cld_lay          0
shortwave_radiation_backwards_sfc    0
wind_speed_10_m_above_gnd            0
wind_direction_10_m_above_gnd        0
wind_speed_80_m_above_gnd            0
wind_direction_80_m_above_gnd        0
wind_speed_900_mb                    0
wind_direction_900_mb                0
wind_gust_10_m_above_gnd             0
angle_of_incidence                   0
zenith                               0
azimuth                              0
generated_power_kw                   0
dtype: int64

## Observations
- This is an extremely straightforward, numeric-only dataset with no missing values. 
- `generated_power_kw` is the target (y); every other column is an input feature.

## Linear Regression

In [28]:
# Separate the target column from the predictor columns.
target_name = 'generated_power_kw'
y = solar_df[target_name]
X = solar_df.drop(columns=[target_name])

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [29]:
def RunRegressionGrid(pipe_, params_, modelLabel_, results_df_,
                      X_train_=None, y_train_=None, X_test_=None, y_test_=None,
                      cv_=None, n_jobs_=None):
    """Grid-search a regression pipeline and append its train/test MSE to a results frame.

    Parameters
    ----------
    pipe_ : estimator or Pipeline
        Model passed to ``GridSearchCV``.
    params_ : dict or list of dict
        Parameter grid passed as ``param_grid``.
    modelLabel_ : str
        Value stored in the 'Model' column of the appended row.
    results_df_ : pandas.DataFrame
        Frame with 'Model', 'Train MSE', 'Test MSE' and 'Best Params' columns.
        It is modified in place: one row is added at index ``len(results_df_)``.
    X_train_, y_train_ : optional
        Training data. When both are omitted, the notebook-level ``X_train`` /
        ``y_train`` are used, so existing four-argument calls are unchanged.
    X_test_, y_test_ : optional
        Held-out data. When both are omitted, the notebook-level ``X_test`` /
        ``y_test`` are used.
    cv_ : optional
        Forwarded to ``GridSearchCV(cv=...)``; ``None`` keeps its default.
    n_jobs_ : optional
        Forwarded to ``GridSearchCV(n_jobs=...)``; ``None`` keeps its default.

    Returns
    -------
    None
        The best parameters and both errors are printed, and the row is
        written into ``results_df_``.

    Raises
    ------
    ValueError
        If only one half of a data pair (X without y, or y without X) is given.
    """
    # Refuse half-specified pairs: mixing a caller-supplied X with the
    # notebook-level y (or vice versa) would silently fit mismatched data.
    if (X_train_ is None) != (y_train_ is None):
        raise ValueError('X_train_ and y_train_ must be passed together')
    if (X_test_ is None) != (y_test_ is None):
        raise ValueError('X_test_ and y_test_ must be passed together')

    # Fall back to the notebook-level split when no data is passed explicitly.
    if X_train_ is None:
        X_train_, y_train_ = X_train, y_train
    if X_test_ is None:
        X_test_, y_test_ = X_test, y_test

    grid = GridSearchCV(pipe_, param_grid = params_, scoring = 'neg_mean_squared_error',
                        cv = cv_, n_jobs = n_jobs_)

    grid.fit(X_train_, y_train_)

    best_model = grid.best_estimator_

    train_mse = mean_squared_error(y_train_, best_model.predict(X_train_))
    test_mse = mean_squared_error(y_test_, best_model.predict(X_test_))

    print(grid.best_params_)

    # The values labelled "score" below are mean squared errors (lower is better).
    print(f'Train score: {train_mse}')
    print(f'Test score: {test_mse}')

    results_df_.loc[len(results_df_)] = {'Model' : modelLabel_, 'Train MSE' : train_mse, 'Test MSE' : test_mse,
                                       'Best Params' : grid.best_params_}

In [30]:
# Column layout of the model-comparison table; one row is added per model run.
results_columns = [
    'Model',
    'Train MSE',
    'Test MSE',
    'Best Params',
]

# Start from an empty frame so re-running this cell resets the comparison.
results_df = pd.DataFrame(columns=results_columns)

In [31]:
# Polynomial expansion -> standardisation -> ordinary least squares.
# include_bias=False: the default constant column would become all zeros after
# StandardScaler, and LinearRegression already fits its own intercept.
lr_pipe = Pipeline([('poly', PolynomialFeatures(include_bias = False)),
                    ('scaler', StandardScaler()),
                    ('lr', LinearRegression())])

# Search polynomial degrees 1 through 3.
grid_params = {'poly__degree': range(1, 4)}

In [32]:
# Uses the grid_params assigned in the cell directly above; the name is
# reassigned in each model section, so run the two cells together.
RunRegressionGrid(lr_pipe, grid_params, 'Linear Regression', results_df)

{'poly__degree': 1}
Train score: 254053.90655879152
Test score: 269926.5103518734


## Ridge Regression

In [33]:
# Polynomial expansion -> standardisation -> ridge regression.
# include_bias=False: the default constant column would become all zeros after
# StandardScaler, and Ridge already fits its own intercept.
ridge_pipe = Pipeline([('poly', PolynomialFeatures(include_bias = False)),
                       ('scaler', StandardScaler()),
                       ('ridge', Ridge())])

# Degrees 1 through 3 crossed with 11 log-spaced alphas from 1e-5 to 1e5.
grid_params = {'poly__degree': range(1, 4),
               'ridge__alpha' : np.logspace(-5, 5, 11)}

In [34]:
# Uses the grid_params assigned in the cell directly above; the name is
# reassigned in each model section, so run the two cells together.
RunRegressionGrid(ridge_pipe, grid_params, 'Ridge Regression', results_df)

{'poly__degree': 3, 'ridge__alpha': 1000.0}
Train score: 204490.610876858
Test score: 454566.78656544426


## Lasso Regression

In [35]:
# Polynomial expansion -> standardisation -> lasso regression.
# include_bias=False: the default constant column would become all zeros after
# StandardScaler, and Lasso already fits its own intercept.
# NOTE(review): Lasso runs with its default iteration limit and warnings are
# ignored notebook-wide, so a non-converged fit would go unnoticed; confirm
# convergence for the smallest alphas.
lasso_pipe = Pipeline([('poly', PolynomialFeatures(include_bias = False)),
                       ('scaler', StandardScaler()),
                       ('lasso', Lasso())])

# Degrees 1 through 3 crossed with 11 log-spaced alphas from 1e-5 to 1e5.
grid_params = {'poly__degree': range(1, 4),
               'lasso__alpha' : np.logspace(-5, 5, 11)}

In [36]:
# Uses the grid_params assigned in the cell directly above; the name is
# reassigned in each model section, so run the two cells together.
RunRegressionGrid(lasso_pipe, grid_params, 'Lasso Regression', results_df)

{'lasso__alpha': 10.0, 'poly__degree': 3}
Train score: 218645.90495787017
Test score: 254651.87198827224


## Linear Regression w/ Forward Feature Selection (FFS)

In [37]:
# Polynomial expansion -> standardisation -> sequential feature selection
# (scored by negative MSE) -> ordinary least squares on the kept features.
# include_bias=False: the default constant column would become all zeros after
# StandardScaler, leaving the selector a candidate that carries no information.
ffs_pipe = Pipeline([('poly', PolynomialFeatures(include_bias = False)),
                     ('scaler', StandardScaler()),
                     ('sfs', SequentialFeatureSelector(LinearRegression(), scoring = 'neg_mean_squared_error')),
                     ('lr', LinearRegression())])

# Degrees 1 through 3; keep 2, 4, ... features, stopping below the number of
# raw input columns.
grid_params = {'poly__degree': range(1, 4),
               'sfs__n_features_to_select': range(2, X.shape[1], 2)}

In [38]:
# Uses the grid_params assigned in the cell directly above; the name is
# reassigned in each model section, so run the two cells together.
RunRegressionGrid(ffs_pipe, grid_params, 'Linear Regression w/ FFS', results_df)

{'poly__degree': 3, 'sfs__n_features_to_select': 18}
Train score: 207985.29062868247
Test score: 234314.75724009232


## Linear Regression w/ Lasso feature selection

In [41]:
# Polynomial expansion -> standardisation -> keep the features a default
# Lasso selects -> ordinary least squares on the kept features.
# include_bias=False: the default constant column would become all zeros after
# StandardScaler, and LinearRegression already fits its own intercept.
lr_lasso_pipe = Pipeline([('poly', PolynomialFeatures(include_bias = False)),
                          ('scaler', StandardScaler()),
                          ('selector', SelectFromModel(Lasso())),
                          ('lr', LinearRegression())])

# Search polynomial degrees 1 through 3.
grid_params = {'poly__degree': range(1, 4)}

In [42]:
# Uses the grid_params assigned in the cell directly above; the name is
# reassigned in each model section, so run the two cells together.
RunRegressionGrid(lr_lasso_pipe, grid_params, 'Linear Regression w/ Lasso', results_df)

{'poly__degree': 1}
Train score: 254305.4264483617
Test score: 269975.1034053896


## Support Vector Regression (SVR) — not yet evaluated

In [43]:
# Standardise the inputs, then fit a support vector regressor.
svr_pipe = Pipeline([('scaler', StandardScaler()), ('svr', SVR())])

# gamma only affects the 'rbf', 'poly' and 'sigmoid' kernels. Crossing it with
# 'linear' would refit the same linear model once per gamma value, so the
# linear kernel gets its own sub-grid with no gamma entry.
svr_gammas = [0.1, 1.0, 10.0, 100.0]
grid_params = [{'svr__kernel': ['rbf', 'poly', 'sigmoid'],
                'svr__gamma': svr_gammas},
               {'svr__kernel': ['linear']}]

In [44]:
# TODO: the SVR grid search takes too long to run; revisit later

#RunRegressionGrid(svr_pipe, grid_params, 'SVR', results_df)

In [53]:
# Remove the column-width limit so long cell values (such as the
# 'Best Params' dicts) are displayed in full.
pd.options.display.max_colwidth = None

# Bare last expression: rendered as the cell's output.
results_df

Unnamed: 0,Model,Train MSE,Test MSE,Best Params
0,Linear Regression,254053.91,269926.51,{'poly__degree': 1}
1,Ridge Regression,204490.61,454566.79,"{'poly__degree': 3, 'ridge__alpha': 1000.0}"
2,Lasso Regression,218645.9,254651.87,"{'lasso__alpha': 10.0, 'poly__degree': 3}"
3,Linear Regression w/ FFS,207985.29,234314.76,"{'poly__degree': 3, 'sfs__n_features_to_select': 18}"
4,Linear Regression w/ Lasso,254305.43,269975.1,{'poly__degree': 1}


In [54]:
# TODO : examine feature importances
# TODO : create charts