In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [28]:
df = pd.read_csv("energy_consumption.csv")
df.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,7063.0,76,10,29.84,Weekday,2713.95
1,Commercial,44372.0,66,45,16.72,Weekday,5744.99
2,Industrial,19255.0,37,17,14.3,Weekend,4101.24
3,Residential,13265.0,14,41,32.82,Weekday,3009.14
4,Commercial,13375.0,26,18,11.92,Weekday,3279.17


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       999 non-null    float64
 2   Number of Occupants  1000 non-null   object 
 3   Appliances Used      1000 non-null   object 
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


# Data preprocessing

In [30]:

df['Number of Occupants'] = pd.to_numeric(df['Number of Occupants'], errors='coerce')
print(df['Number of Occupants'])

0      76.0
1      66.0
2      37.0
3      14.0
4      26.0
       ... 
995    68.0
996     7.0
997    88.0
998    67.0
999    57.0
Name: Number of Occupants, Length: 1000, dtype: float64


In [31]:
df['Appliances Used'] = pd.to_numeric(df['Appliances Used'], errors='coerce')
df['Appliances Used']

0      10.0
1      45.0
2      17.0
3      41.0
4      18.0
       ... 
995    44.0
996    22.0
997    20.0
998    37.0
999    11.0
Name: Appliances Used, Length: 1000, dtype: float64

# missing values

In [32]:
df.isnull().sum()

Building Type          0
Square Footage         1
Number of Occupants    1
Appliances Used        1
Average Temperature    0
Day of Week            0
Energy Consumption     0
dtype: int64

In [33]:
for col in df.columns:
  if df[col].dtype == 'object':
    df[col].fillna(df[col].mode()[0], inplace=True)
  else:
    df[col].fillna(df[col].mean(), inplace=True)
df.isnull().sum()

Building Type          0
Square Footage         0
Number of Occupants    0
Appliances Used        0
Average Temperature    0
Day of Week            0
Energy Consumption     0
dtype: int64

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       1000 non-null   float64
 2   Number of Occupants  1000 non-null   float64
 3   Appliances Used      1000 non-null   float64
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(5), object(2)
memory usage: 54.8+ KB


# Encoding

In [35]:
cat_col = df.select_dtypes(include='object').columns

In [36]:
encoder = LabelEncoder()
for col in cat_col:
    cardinality = df[col].nunique()
    if cardinality <= 5:
        # one-hot encoding
        dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
        # drop the original column and merge the dummy columns
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    else:
        # Apply label encoding
        df[col] = encoder.fit_transform(df[col])

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Square Footage             1000 non-null   float64
 1   Number of Occupants        1000 non-null   float64
 2   Appliances Used            1000 non-null   float64
 3   Average Temperature        1000 non-null   float64
 4   Energy Consumption         1000 non-null   float64
 5   Building Type_Commercial   1000 non-null   int64  
 6   Building Type_Industrial   1000 non-null   int64  
 7   Building Type_Residential  1000 non-null   int64  
 8   Day of Week_Weekday        1000 non-null   int64  
 9   Day of Week_Weekend        1000 non-null   int64  
dtypes: float64(5), int64(5)
memory usage: 78.3 KB


In [38]:
df.head()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption,Building Type_Commercial,Building Type_Industrial,Building Type_Residential,Day of Week_Weekday,Day of Week_Weekend
0,7063.0,76.0,10.0,29.84,2713.95,0,0,1,1,0
1,44372.0,66.0,45.0,16.72,5744.99,1,0,0,1,0
2,19255.0,37.0,17.0,14.3,4101.24,0,1,0,0,1
3,13265.0,14.0,41.0,32.82,3009.14,0,0,1,1,0
4,13375.0,26.0,18.0,11.92,3279.17,1,0,0,1,0


# Scaling

In [39]:
num_col = df.select_dtypes(include='number').columns
scaler = StandardScaler()
df[num_col] = scaler.fit_transform(df[num_col])
df.head()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption,Building Type_Commercial,Building Type_Industrial,Building Type_Residential,Day of Week_Weekday,Day of Week_Weekend
0,-1.287427,0.950716,-1.109245,1.012925,-1.556851,-0.711354,-0.68127,1.371803,0.986097,-0.986097
1,1.324974,0.606048,1.375524,-0.825544,1.692388,1.40577,-0.68127,-0.728967,0.986097,-0.986097
2,-0.433735,-0.393486,-0.612291,-1.164652,-0.069693,-0.711354,1.467847,-0.728967,-1.014099,1.014099
3,-0.853159,-1.186221,1.09155,1.430504,-1.240411,-0.711354,-0.68127,1.371803,0.986097,-0.986097
4,-0.845457,-0.77262,-0.541298,-1.498155,-0.950942,1.40577,-0.68127,-0.728967,0.986097,-0.986097


# Model training

In [40]:
x = df.drop(columns=['Energy Consumption'])
y = df['Energy Consumption']

x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.2, random_state=42)


In [43]:
x_train.shape, x_val.shape

((800, 9), (40, 9))

In [44]:
y_train.shape, y_val.shape

((800,), (40,))

# linear regression

In [45]:
lr = LinearRegression()
lr.fit(x_train, y_train)

In [58]:
y_pred = lr.predict(x_test)
mse_lr=mean_squared_error(y_test,y_pred)
r2_lr=r2_score(y_test,y_pred)
print('MSE: ',mse_lr)
print('R2: ',r2_lr)

MSE:  0.0009994099524036106
R2:  0.9989408927674522


In [55]:
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(lr, x, y,cv=kf, scoring='neg_mean_squared_error')
cv_scores = np.sqrt(-cv_scores)

print('Cross-Validation-Scores:', cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation-Scores: [0.02831293 0.00418904 0.03355098 0.04929396 0.00379398]
Mean CV Score: 0.023828178020482566


# Decision Tree

In [52]:
dt = DecisionTreeRegressor()
dt.fit(x_train, y_train)

In [59]:
y_pred = dt.predict(x_test)
mse_dt=mean_squared_error(y_test,y_pred)
r2_dt=r2_score(y_test,y_pred)
print('MSE: ',mse_dt)
print('R2: ',r2_dt)

MSE:  0.05448235628787956
R2:  0.9422632749934491


In [56]:

from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(dt, x, y,cv=kf, scoring='neg_mean_squared_error')
cv_scores = np.sqrt(-cv_scores)

print('Cross-Validation-Scores:', cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation-Scores: [0.25736895 0.28431421 0.25522961 0.31546261 0.26843816]
Mean CV Score: 0.2761627076968633


# Model Comparison

In [60]:
headers=['Model','MSE','R2 Score']
data = [
  ['Linear Regression', mse_lr, r2_lr],
  ['Decision Tree Regressor', mse_dt, r2_dt],
]
table = tabulate(data, headers, tablefmt='grid')
print(table)

+-------------------------+------------+------------+
| Model                   |        MSE |   R2 Score |
| Linear Regression       | 0.00099941 |   0.998941 |
+-------------------------+------------+------------+
| Decision Tree Regressor | 0.0544824  |   0.942263 |
+-------------------------+------------+------------+
