In [152]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

In [270]:
data = pd.read_csv(r"C:\\Users\Reza\Downloads\energy_efficiency_data.csv")

In [271]:
data.shape

(768, 10)

In [272]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Relative_Compactness       768 non-null    float64
 1   Surface_Area               768 non-null    float64
 2   Wall_Area                  768 non-null    float64
 3   Roof_Area                  768 non-null    float64
 4   Overall_Height             768 non-null    float64
 5   Orientation                768 non-null    int64  
 6   Glazing_Area               768 non-null    float64
 7   Glazing_Area_Distribution  768 non-null    int64  
 8   Heating_Load               768 non-null    float64
 9   Cooling_Load               768 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 60.1 KB


In [273]:
data.head()

Unnamed: 0,Relative_Compactness,Surface_Area,Wall_Area,Roof_Area,Overall_Height,Orientation,Glazing_Area,Glazing_Area_Distribution,Heating_Load,Cooling_Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [274]:
data.isnull().sum()

Relative_Compactness         0
Surface_Area                 0
Wall_Area                    0
Roof_Area                    0
Overall_Height               0
Orientation                  0
Glazing_Area                 0
Glazing_Area_Distribution    0
Heating_Load                 0
Cooling_Load                 0
dtype: int64

In [275]:
 data.mean()

Relative_Compactness           0.764167
Surface_Area                 671.708333
Wall_Area                    318.500000
Roof_Area                    176.604167
Overall_Height                 5.250000
Orientation                    3.500000
Glazing_Area                   0.234375
Glazing_Area_Distribution      2.812500
Heating_Load                  22.307201
Cooling_Load                  24.587760
dtype: float64

In [276]:
total_energy = data["Cooling_Load"]+data["Heating_Load"]
data.insert(10, "total_energy", total_energy, True)

In [277]:
y_col = ["total_energy"]
drops = ["Cooling_Load", "total_energy", "Heating_Load"]
X = data.drop(drops, axis=1)
y = data[y_col]

In [258]:
from sklearn.model_selection import GridSearchCV

kf = KFold(shuffle=True, random_state=72018, n_splits=3)

# Same estimator as before
estimator = Pipeline([("scaler", StandardScaler()),
        ("polynomial_features", PolynomialFeatures()),
        ("ridge_regression", Ridge())])

params = {
    'polynomial_features__degree': [2, 3, 4, 5],
    'ridge_regression__alpha': np.geomspace(0.01, 0.25, 1, 2)
}

grid = GridSearchCV(estimator, params, cv=kf)

In [259]:
grid.fit(X, y)

In [260]:
grid.best_score_, grid.best_params_

(0.9946144548556868,
 {'polynomial_features__degree': 4, 'ridge_regression__alpha': 0.01})

In [261]:
y_predict = grid.predict(X)
r2_score(y, y_predict)

0.997124322761064

In [199]:
from sklearn.model_selection import GridSearchCV

kf = KFold(shuffle=True, random_state=72018, n_splits=3)

# Same estimator as before
estimator = Pipeline([("scaler", StandardScaler()),
        ("polynomial_features", PolynomialFeatures()),
        ("ridge_regression", Ridge())])

params = {
    'polynomial_features__degree': [1, 2],
    'ridge_regression__alpha': np.geomspace(0.01, 0.25, 1, 2)
}

grid = GridSearchCV(estimator, params, cv=kf)

In [200]:
grid.fit(X, y)

In [201]:
grid.best_score_, grid.best_params_

(0.9841554939030605,
 {'polynomial_features__degree': 2, 'ridge_regression__alpha': 0.01})

In [202]:
y_predict = grid.predict(X)

In [203]:
r2_score(y, y_predict)

0.9856773448018925

In [192]:
col_names_dict = dict(zip(list(range(len(X.columns.values))), X.columns.values))
col_names_dict

{0: 'Relative_Compactness',
 1: 'Surface_Area',
 2: 'Wall_Area',
 3: 'Roof_Area',
 4: 'Overall_Height',
 5: 'Orientation',
 6: 'Glazing_Area',
 7: 'Glazing_Area_Distribution'}

In [247]:
coefficients = pd.concat([pd.DataFrame(grid.best_estimator_.named_steps["polynomial_features"].get_feature_names_out(), columns=['a']), pd.DataFrame(np.transpose(grid.best_estimator_.named_steps['ridge_regression'].coef_), columns=['Coef'])], axis = 1)
coefficients.sort_values(by="Coef")

Unnamed: 0,a,Coef
30,x3^2,-74.42054
11,x0 x2,-65.570097
19,x1 x3,-57.371919
5,x4,-48.846458
17,x1^2,-46.358476
31,x3 x4,-44.853707
9,x0^2,-38.027967
10,x0 x1,-34.504177
24,x2^2,-28.345151
12,x0 x3,-1.978773


In [120]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [123]:
from sklearn.model_selection import GridSearchCV

kf = KFold(shuffle=True, random_state=72018, n_splits=3)

# Same estimator as before
estimator = Pipeline([("scaler", StandardScaler()),
        ("polynomial_features", PolynomialFeatures()),
        ("lasso_regression", Lasso())])

params = {
    'polynomial_features__degree': [1, 2, 5, 6],
    'lasso_regression__alpha': np.geomspace(0.01, 0.25, 1, 2)
}

grid = GridSearchCV(estimator, params, cv=kf)

In [124]:
grid.fit(X, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [125]:
grid.best_score_, grid.best_params_

(0.9952112688483646,
 {'lasso_regression__alpha': 0.01, 'polynomial_features__degree': 5})

In [126]:
y_predict = grid.predict(X)

In [127]:
r2_score(y, y_predict)

0.9969451273785815

In [148]:
kf = KFold(shuffle=True, random_state=72018, n_splits=3)
kf.split(X)

<generator object _BaseKFold.split at 0x0000021181D7C900>

In [267]:
for train_index, test_index in kf.split(X):
    print("Train index:", train_index[:10], len(train_index))
    print("Test index:", test_index[:10], len(test_index))
    print('')

Train index: [ 2  3  5  7  8  9 10 11 13 14] 512
Test index: [ 0  1  4  6 12 15 16 17 23 25] 256

Train index: [ 0  1  3  4  5  6  8 10 11 12] 512
Test index: [ 2  7  9 19 21 24 27 29 30 34] 256

Train index: [ 0  1  2  4  6  7  9 12 15 16] 512
Test index: [ 3  5  8 10 11 13 14 18 20 22] 256

