## Data preparation for regression models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , label_binarize
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load the pre-cleaned steel-industry energy dataset (one row per reading)
# and display it to get a first look at the available columns.
csv_path = 'Steel_industry_cleaned_data.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0.1,Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
0,0,2018-01-01 00:15:00,3.17,2.95,0.00,0.0,73.21,900,Weekday,Monday,Light_Load
1,1,2018-01-01 00:30:00,4.00,4.46,0.00,0.0,66.77,1800,Weekday,Monday,Light_Load
2,2,2018-01-01 00:45:00,3.24,3.28,0.00,0.0,70.28,2700,Weekday,Monday,Light_Load
3,3,2018-01-01 01:00:00,3.31,3.56,0.00,0.0,68.09,3600,Weekday,Monday,Light_Load
4,4,2018-01-01 01:15:00,3.82,4.50,0.00,0.0,64.72,4500,Weekday,Monday,Light_Load
...,...,...,...,...,...,...,...,...,...,...,...
35035,35035,2018-12-31 23:00:00,3.85,4.86,0.00,0.0,62.10,82800,Weekday,Monday,Light_Load
35036,35036,2018-12-31 23:15:00,3.74,3.74,0.00,0.0,70.71,83700,Weekday,Monday,Light_Load
35037,35037,2018-12-31 23:30:00,3.78,3.17,0.07,0.0,76.62,84600,Weekday,Monday,Light_Load
35038,35038,2018-12-31 23:45:00,3.78,3.06,0.11,0.0,77.72,85500,Weekday,Monday,Light_Load


In [3]:
# Remove the columns that are not wanted as model inputs:
#   - 'Unnamed: 0' : appears to be a leftover copy of the row index
#   - 'WeekStatus' : dropped in favour of Day_of_week, because Saturday and
#                    Sunday seem to differ a lot from each other, so a single
#                    weekday/weekend flag would hide that difference.
columns_to_drop = ['Unnamed: 0', 'WeekStatus']
df = df.drop(columns=columns_to_drop)
df

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,NSM,Day_of_week,Load_Type
0,2018-01-01 00:15:00,3.17,2.95,0.00,0.0,73.21,900,Monday,Light_Load
1,2018-01-01 00:30:00,4.00,4.46,0.00,0.0,66.77,1800,Monday,Light_Load
2,2018-01-01 00:45:00,3.24,3.28,0.00,0.0,70.28,2700,Monday,Light_Load
3,2018-01-01 01:00:00,3.31,3.56,0.00,0.0,68.09,3600,Monday,Light_Load
4,2018-01-01 01:15:00,3.82,4.50,0.00,0.0,64.72,4500,Monday,Light_Load
...,...,...,...,...,...,...,...,...,...
35035,2018-12-31 23:00:00,3.85,4.86,0.00,0.0,62.10,82800,Monday,Light_Load
35036,2018-12-31 23:15:00,3.74,3.74,0.00,0.0,70.71,83700,Monday,Light_Load
35037,2018-12-31 23:30:00,3.78,3.17,0.07,0.0,76.62,84600,Monday,Light_Load
35038,2018-12-31 23:45:00,3.78,3.06,0.11,0.0,77.72,85500,Monday,Light_Load


In [4]:
# Check column dtypes and non-null counts after the unused columns were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35040 entries, 0 to 35039
Data columns (total 9 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   date                                  35040 non-null  object 
 1   Usage_kWh                             35040 non-null  float64
 2   Lagging_Current_Reactive.Power_kVarh  35040 non-null  float64
 3   Leading_Current_Reactive_Power_kVarh  35040 non-null  float64
 4   CO2(tCO2)                             35040 non-null  float64
 5   Lagging_Current_Power_Factor          35040 non-null  float64
 6   NSM                                   35040 non-null  int64  
 7   Day_of_week                           35040 non-null  object 
 8   Load_Type                             35040 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 2.4+ MB


In [5]:
# Usage_kWh is the regression target; every other column except the raw
# 'date' string is kept as a feature.
target_col = 'Usage_kWh'
y = df[target_col]
X = df.drop(columns=['date', target_col])

X

Unnamed: 0,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,NSM,Day_of_week,Load_Type
0,2.95,0.00,0.0,73.21,900,Monday,Light_Load
1,4.46,0.00,0.0,66.77,1800,Monday,Light_Load
2,3.28,0.00,0.0,70.28,2700,Monday,Light_Load
3,3.56,0.00,0.0,68.09,3600,Monday,Light_Load
4,4.50,0.00,0.0,64.72,4500,Monday,Light_Load
...,...,...,...,...,...,...,...
35035,4.86,0.00,0.0,62.10,82800,Monday,Light_Load
35036,3.74,0.00,0.0,70.71,83700,Monday,Light_Load
35037,3.17,0.07,0.0,76.62,84600,Monday,Light_Load
35038,3.06,0.11,0.0,77.72,85500,Monday,Light_Load


In [6]:
# Preview the target series (Usage_kWh) extracted from the dataset.
y

0        3.17
1        4.00
2        3.24
3        3.31
4        3.82
         ... 
35035    3.85
35036    3.74
35037    3.78
35038    3.78
35039    3.67
Name: Usage_kWh, Length: 35040, dtype: float64

In [7]:
## encode categorical data

# Select the categorical columns by name instead of by position so the
# encoding cannot silently hit the wrong columns if the feature order changes.
categorical_cols = ['Day_of_week', 'Load_Type']

# sparse_threshold=0 forces a dense result. Without it ColumnTransformer may
# return a SciPy sparse matrix, and np.array() on a sparse matrix yields a
# 0-d object array instead of a 2-D numeric feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Resulting layout: one-hot encoded columns first, followed by the remaining
# (numeric) columns passed through unchanged.
X = np.array(ct.fit_transform(X))



In [8]:
## split the dataset

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train

array([[0.000e+00, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+02,
        6.210e+04],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 5.096e+01,
        9.900e+03],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 6.595e+01,
        6.300e+03],
       ...,
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 1.000e-02, 9.999e+01,
        4.770e+04],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 7.319e+01,
        8.370e+04],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+02,
        4.680e+04]])

## Regression model training with grid search to find the best model and parameters

In [None]:
# Exhaustive comparison: run a 10-fold cross-validated grid search for every
# candidate regressor, then score each winner on the held-out test set.
# This cell is expensive (hundreds of fits per model).

# Seed shared by all stochastic estimators so repeated runs give the same
# best parameters and scores.
RANDOM_STATE = 42

# Each entry: (display name, estimator instance, parameter grid to search).
# NOTE(review): KNN and SVR are sensitive to feature scale and the features are
# used unscaled here - consider wrapping them in a StandardScaler pipeline.
models = [
    ("Linear Regression", LinearRegression(), {}),
    ("Ridge Regression", Ridge(), {
        'alpha': [0.1, 1, 10, 100]  # Regularization strength
    }),
    ("Decision Tree Regression", DecisionTreeRegressor(random_state=RANDOM_STATE), {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }),
    ("Random Forest Regression", RandomForestRegressor(random_state=RANDOM_STATE), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    }),
    ("Gradient Boosting Regression", GradientBoostingRegressor(random_state=RANDOM_STATE), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10]
    }),
    ("KNN Regression", KNeighborsRegressor(), {
        'n_neighbors': [3, 5, 7, 10],
        'metric': ['euclidean', 'manhattan']
    }),
    ("Bayesian Ridge Regression", BayesianRidge(), {
        'alpha_1': [1e-6, 1e-5, 1e-4],
        'alpha_2': [1e-6, 1e-5, 1e-4],
        'lambda_1': [1e-6, 1e-5, 1e-4],
        'lambda_2': [1e-6, 1e-5, 1e-4]
    }),
    ("XGBoost Regression", XGBRegressor(random_state=RANDOM_STATE), {
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 10]
    }),
    ("SVR", SVR(), {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10],
        'epsilon': [0.01, 0.1, 0.2]
    })
]

# One summary record per model; turned into a DataFrame once at the end.
results = []

for name, model, param_grid in models:
    # No `scoring` is passed, so candidates are ranked by the estimator's own
    # score method.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the winning configuration refit on all of X_train.
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)

    # R2 and MSE metrics for regression, computed on the held-out test set
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    results.append({
        'Model': name,
        'Best Params': grid_search.best_params_,
        'R2 Score': r2,
        'MSE': mse
    })
    # Print the short display name rather than the full estimator repr.
    print(f'{name} is trained')

results_df = pd.DataFrame(results)

results_df


### Compare models using a single parameter set each, then fine-tune the best one with Grid Search

In [9]:
# Quick baseline comparison: fit every regressor once with a single
# hand-picked parameter set and score it on the held-out test set. The best
# performer is then tuned properly with a grid search.

# Seed shared by all stochastic estimators so the comparison is reproducible.
RANDOM_STATE = 42

# Each entry: (display name, estimator instance, parameters applied via set_params).
models = [
    ("Linear Regression", LinearRegression(), {}),
    ("Ridge Regression", Ridge(), {'alpha': 1}),
    ("Decision Tree Regression", DecisionTreeRegressor(random_state=RANDOM_STATE), {'max_depth': 10}),
    ("Random Forest Regression", RandomForestRegressor(random_state=RANDOM_STATE), {'n_estimators': 100, 'max_depth': 10}),
    ("Gradient Boosting Regression", GradientBoostingRegressor(random_state=RANDOM_STATE), {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3}),
    ("KNN Regression", KNeighborsRegressor(), {'n_neighbors': 5, 'metric': 'euclidean'}),
    ("Bayesian Ridge Regression", BayesianRidge(), {'alpha_1': 1e-6, 'alpha_2': 1e-6}),
    ("XGBoost Regression", XGBRegressor(random_state=RANDOM_STATE), {'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 3}),
    ("SVR", SVR(), {'C': 1, 'kernel': 'rbf', 'epsilon': 0.1})
]

# One summary record per model; turned into a DataFrame once at the end.
results = []

for name, model, params in models:
    model.set_params(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Calculate R2 and MSE metrics for regression
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    results.append({
        'Model': name,
        'Best Params': params,
        'R2 Score': r2,
        'MSE': mse
    })
    print(f'{name} is trained')

results_df = pd.DataFrame(results)

# Bare last expression -> rich table rendering instead of wrapped plain text.
results_df


Linear Regression is trained
Ridge Regression is trained
Decision Tree Regression is trained
Random Forest Regression is trained
Gradient Boosting Regression is trained
KNN Regression is trained
Bayesian Ridge Regression is trained
XGBoost Regression is trained
SVR is trained
                          Model  \
0             Linear Regression   
1              Ridge Regression   
2      Decision Tree Regression   
3      Random Forest Regression   
4  Gradient Boosting Regression   
5                KNN Regression   
6     Bayesian Ridge Regression   
7            XGBoost Regression   
8                           SVR   

                                         Best Params  R2 Score         MSE  
0                                                 {}  0.984180   17.982585  
1                                       {'alpha': 1}  0.957944   47.805818  
2                                  {'max_depth': 10}  0.994949    5.741412  
3             {'n_estimators': 100, 'max_depth': 10}  0.996739  

In [10]:
# Bar chart of test-set R2 per model, best (highest) score first.
# A dedicated name is used so the dataset frame `df` is not overwritten by the
# results table.
results_by_r2 = results_df.sort_values(by=['R2 Score'], ascending=False)

fig = px.bar(results_by_r2, x='Model', y='R2 Score',
             labels={'Model': 'Regressor', 'R2 Score': 'R2 Score'},
             title='Regressor R2 Scores')

fig.show()

In [11]:
# Bar chart of test-set mean squared error per model, largest error first
# (lower is better, so the best models end up on the right).
# A dedicated name is used so the dataset frame `df` is not overwritten by the
# results table.
results_by_mse = results_df.sort_values(by=['MSE'], ascending=False)

fig = px.bar(results_by_mse, x='Model', y='MSE',
             labels={'Model': 'Regressor', 'MSE': 'MSE'},
             title='Regressor Mean Squared Error')

fig.show()

## Use grid search for Random Forest

In [13]:
# Fine-tune the random forest with a 10-fold cross-validated grid search, then
# report the tuned model's R2 and MSE on the held-out test set.

# A fixed random_state makes the forest - and therefore the selected
# parameters and the reported scores - reproducible between runs.
rf_estimator = RandomForestRegressor(random_state=42)

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Kept as a (name, estimator, grid) tuple so code that indexes `model` still works.
model = ("Random Forest Regression", rf_estimator, rf_param_grid)

# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method.
grid_search = GridSearchCV(estimator=rf_estimator, param_grid=rf_param_grid, cv=10, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on all of X_train.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"R2 Score: {r2}")
print(f"Mean Squared Error: {mse}")

Fitting 10 folds for each of 72 candidates, totalling 720 fits
Best Parameters: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
R2 Score: 0.9987374763000003
Mean Squared Error: 1.435138891793182
