In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"C:\Users\Saidabrorkhon\ML_Lectures\ML_Lectures2\energy_consumption.csv")

Data exploration

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       999 non-null    float64
 2   Number of Occupants  1000 non-null   object 
 3   Appliances Used      1000 non-null   object 
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


In [8]:
df.head(3)

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,7063.0,76,10,29.84,Weekday,2713.95
1,Commercial,44372.0,66,45,16.72,Weekday,5744.99
2,Industrial,19255.0,37,17,14.3,Weekend,4101.24


Data preprocessing

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       1000 non-null   float64
 2   Number of Occupants  1000 non-null   object 
 3   Appliances Used      1000 non-null   object 
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


Handling missing values

In [14]:
df['Number of Occupants'].str.isdigit()
print(df['Number of Occupants'])

0      76
1      66
2      37
3      14
4      26
       ..
995    68
996     7
997    88
998    67
999    57
Name: Number of Occupants, Length: 1000, dtype: object


In [3]:
df['Number of Occupants'] = pd.to_numeric(df['Number of Occupants'], errors='coerce')

In [17]:
df['Number of Occupants']

0      76.0
1      66.0
2      37.0
3      14.0
4      26.0
       ... 
995    68.0
996     7.0
997    88.0
998    67.0
999    57.0
Name: Number of Occupants, Length: 1000, dtype: float64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       1000 non-null   float64
 2   Number of Occupants  999 non-null    float64
 3   Appliances Used      1000 non-null   object 
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(4), object(3)
memory usage: 54.8+ KB


In [4]:
df['Appliances Used'] = pd.to_numeric(df['Appliances Used'], errors='coerce')

In [20]:
df['Appliances Used']

0      10.0
1      45.0
2      17.0
3      41.0
4      18.0
       ... 
995    44.0
996    22.0
997    20.0
998    37.0
999    11.0
Name: Appliances Used, Length: 1000, dtype: float64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       1000 non-null   float64
 2   Number of Occupants  999 non-null    float64
 3   Appliances Used      999 non-null    float64
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(5), object(2)
memory usage: 54.8+ KB


In [5]:
for col in df.columns:
  if df[col].dtype == 'object':
    df[col].fillna(df[col].mode()[0], inplace=True)
  else:
    df[col].fillna(df[col].mean(), inplace=True)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1000 non-null   object 
 1   Square Footage       1000 non-null   float64
 2   Number of Occupants  1000 non-null   float64
 3   Appliances Used      1000 non-null   float64
 4   Average Temperature  1000 non-null   float64
 5   Day of Week          1000 non-null   object 
 6   Energy Consumption   1000 non-null   float64
dtypes: float64(5), object(2)
memory usage: 54.8+ KB


Encoding

In [25]:
df['Energy Consumption']

0      2713.95
1      5744.99
2      4101.24
3      3009.14
4      3279.17
        ...   
995    3661.21
996    3546.34
997    5147.21
998    3244.98
999    3423.63
Name: Energy Consumption, Length: 1000, dtype: float64

In [6]:
high_cardinality_cols = []
low_cardinality_cols = []

for col in df.select_dtypes(include='object').columns:
    cardinality = df[col].nunique()
    if cardinality >= 4:
        high_cardinality_cols.append(col)
    else:
        low_cardinality_cols.append(col)

le = LabelEncoder()
for col in high_cardinality_cols:
    df[col] = le.fit_transform(df[col])

df = pd.get_dummies(df, columns=low_cardinality_cols, dtype=int, drop_first=True)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Square Footage             1000 non-null   float64
 1   Number of Occupants        1000 non-null   float64
 2   Appliances Used            1000 non-null   float64
 3   Average Temperature        1000 non-null   float64
 4   Energy Consumption         1000 non-null   float64
 5   Building Type_Industrial   1000 non-null   int64  
 6   Building Type_Residential  1000 non-null   int64  
 7   Day of Week_Weekend        1000 non-null   int64  
dtypes: float64(5), int64(3)
memory usage: 62.6 KB


In [32]:
df.head()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption,Building Type_Industrial,Building Type_Residential,Day of Week_Weekend
0,7063.0,76.0,10.0,29.84,2713.95,0,1,0
1,44372.0,66.0,45.0,16.72,5744.99,0,0,0
2,19255.0,37.0,17.0,14.3,4101.24,1,0,1
3,13265.0,14.0,41.0,32.82,3009.14,0,1,0
4,13375.0,26.0,18.0,11.92,3279.17,0,0,0


Scaling

In [7]:
num_col = df.select_dtypes(include='number').columns
scaler = StandardScaler()
df[num_col] = scaler.fit_transform(df[num_col])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Square Footage             1000 non-null   float64
 1   Number of Occupants        1000 non-null   float64
 2   Appliances Used            1000 non-null   float64
 3   Average Temperature        1000 non-null   float64
 4   Energy Consumption         1000 non-null   float64
 5   Building Type_Industrial   1000 non-null   float64
 6   Building Type_Residential  1000 non-null   float64
 7   Day of Week_Weekend        1000 non-null   float64
dtypes: float64(8)
memory usage: 62.6 KB


In [8]:
df.head()

Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption,Building Type_Industrial,Building Type_Residential,Day of Week_Weekend
0,-1.287427,0.950716,-1.109245,1.012925,-1.556851,-0.68127,1.371803,-0.986097
1,1.324974,0.606048,1.375524,-0.825544,1.692388,-0.68127,-0.728967,-0.986097
2,-0.433735,-0.393486,-0.612291,-1.164652,-0.069693,1.467847,-0.728967,1.014099
3,-0.853159,-1.186221,1.09155,1.430504,-1.240411,-0.68127,1.371803,-0.986097
4,-0.845457,-0.77262,-0.541298,-1.498155,-0.950942,-0.68127,-0.728967,-0.986097


Model training

In [9]:
x = df.drop(columns=['Energy Consumption'])
y = df['Energy Consumption']

x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.2, random_state=42)


In [38]:
x_train.shape, x_val.shape

((800, 7), (40, 7))

In [40]:
y_train.shape, y_val.shape

((800,), (40,))

In [20]:
rf_model = RandomForestRegressor()
rf_model.fit(x_train, y_train)

In [21]:
y_pred = rf_model.predict(x_val)

In [22]:
y_pred

array([ 0.20135561, -0.67922923,  0.59408504, -0.3253495 , -0.91191505,
       -0.28912188, -1.28299858, -0.37748027, -0.28380332,  0.15198499,
       -0.69089107,  0.0156845 ,  1.47830692, -1.63407078,  0.79405593,
       -0.0570248 ,  0.07044529, -0.68519774, -0.48996043,  1.98727116,
        0.4161046 , -0.69023437, -1.29146043, -0.40292037, -0.58322306,
       -1.75204767, -0.5439912 ,  0.38936611, -0.0525113 , -1.87854836,
        0.01870108,  0.49202965, -0.04859565,  0.76752166,  0.80377972,
       -0.21605164,  1.29151763,  0.27918783, -1.288245  ,  0.25911325])

In [23]:
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(r2)
print(mse)

0.9741641167350821
0.021688712171704753


In [None]:
# 1 Using Linear Regression
# 0.999987529367477
# 1.0468848949940286e-05

# 2 Using Decision Tree Regressor
# 0.9223911838256532
# 0.06515106368659686

# 3 Using Random Forest Regressor
# 0.9741641167350821
# 0.021688712171704753

In [26]:
from tabulate import tabulate

In [28]:
headers=['Model','MSE','R2 Score']
data = [
  ['Linear Regression', 0.999987529367477, 1.0468848949940286],
  ['Decision Tree Regressor', 0.9223911838256532, 0.06515106368659686],
  ['Random Forest Regressor', 0.9741641167350821, 0.021688712171704753],
]
table = tabulate(data, headers, tablefmt='grid')
print(table)

+-------------------------+----------+------------+
| Model                   |      MSE |   R2 Score |
| Linear Regression       | 0.999988 |  1.04688   |
+-------------------------+----------+------------+
| Decision Tree Regressor | 0.922391 |  0.0651511 |
+-------------------------+----------+------------+
| Random Forest Regressor | 0.974164 |  0.0216887 |
+-------------------------+----------+------------+
