In [1]:
#importing different libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the household power-consumption measurements from the CSV file.
df = pd.read_csv(filepath_or_buffer="Household_power_consumption.csv")

In [4]:
# Preview the first rows to check the column layout and value ranges.
df.head()

Unnamed: 0,datetime,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Sub_metering_4
0,2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0,52.266667
1,2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0,72.333333
2,2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0,70.566667
3,2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0,71.8
4,2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0,43.1


In [5]:
# (rows, columns) of the dataset as loaded, before any trimming.
df.shape

(2075259, 9)

In [6]:
# The full frame is very large, so keep only its first 2000 rows
# for the rest of the notebook and confirm the new shape.
df = df.head(2000)
df.shape

(2000, 9)

In [9]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [10]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   datetime               2000 non-null   object 
 1   Global_active_power    2000 non-null   float64
 2   Global_reactive_power  2000 non-null   float64
 3   Voltage                2000 non-null   float64
 4   Global_intensity       2000 non-null   float64
 5   Sub_metering_1         2000 non-null   float64
 6   Sub_metering_2         2000 non-null   float64
 7   Sub_metering_3         2000 non-null   float64
 8   Sub_metering_4         2000 non-null   float64
dtypes: float64(8), object(1)
memory usage: 140.8+ KB


In [11]:
# Drop the timestamp column: it is stored as a string (object dtype) and is not
# used as a model feature.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns="datetime", errors="ignore")

In [12]:
# Independent (X) and dependent (y) features.
# The target is selected by name rather than by position, so a change in column
# order cannot silently swap in a different target column.
# NOTE(review): in the previewed rows, Sub_metering_4 equals
# Global_active_power * 1000 / 60 - Sub_metering_1 - Sub_metering_2 - Sub_metering_3,
# i.e. the target appears to be an exact linear function of columns kept in X.
# That would explain the R^2 of 1.0 reported for the linear model further down.
# Confirm whether this leakage is intended before trusting the model comparison.
TARGET = "Sub_metering_4"
X = df.drop(columns=TARGET)
y = df[TARGET]

In [13]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Linear Regression

In [15]:
# Linear-regression baseline trained on the standardised training features.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X=X_train_scaled, y=y_train)

In [16]:
# Linear-model predictions for the held-out rows.
y_pred = reg.predict(X=X_test_scaled)

In [17]:
# Coefficient of determination (R^2) of the linear model on the held-out rows.
from sklearn.metrics import r2_score
score = r2_score(y_true=y_test, y_pred=y_pred)
score

1.0

### SVR

In [18]:
# Support-vector regressor with the library's default settings.
from sklearn.svm import SVR
svr = SVR()
svr.fit(X=X_train_scaled, y=y_train)

In [19]:
# Evaluate the SVR on the held-out rows and display its R^2.
y_pred_svr = svr.predict(X=X_test_scaled)
score = r2_score(y_true=y_test, y_pred=y_pred_svr)
score

0.8883162264339386

### Decision Tree Regressor

In [20]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree explicitly. scikit-learn permutes the candidate features at each
# split, so without a fixed random_state the fitted tree (and its R^2) can change
# from run to run. 42 matches the seed already used for the split and the bagging model.
DTR = DecisionTreeRegressor(random_state=42)
DTR.fit(X_train_scaled, y_train)

In [22]:
# Evaluate the decision tree on the held-out rows and display its R^2.
y_pred_DTR = DTR.predict(X=X_test_scaled)
score = r2_score(y_true=y_test, y_pred=y_pred_DTR)
score

0.9559608045157246

### Ensemble Techniques

#### 1. Bagging Regressor

In [24]:
from sklearn.ensemble import BaggingRegressor
# Bagging ensemble of 10 SVRs, trained in parallel on all available cores.
# The base model is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` (deprecated in scikit-learn 1.2, removed in 1.4),
# while the first positional slot is the estimator in both old and new releases.
model_bagging_svr = BaggingRegressor(svr, n_estimators=10, random_state=42, n_jobs=-1)
# fit() returns the fitted ensemble itself, so `classifier` is only a second
# reference to `model_bagging_svr` (despite its name, this is a regressor).
classifier = model_bagging_svr.fit(X_train_scaled, y_train)



In [25]:
# Score the bagged SVR on the held-out rows (the value is displayed by the next cell).
y_pred_br = model_bagging_svr.predict(X=X_test_scaled)
score = r2_score(y_true=y_test, y_pred=y_pred_br)

In [26]:
# R^2 of the bagged SVR, as computed in the previous cell.
score

0.8907301386967924

#### 2. Extra Trees Regressor

In [30]:
# Extra-trees ensemble: 100 trees, squared-error split criterion, fixed seed.
from sklearn.ensemble import ExtraTreesRegressor
ETR = ExtraTreesRegressor(
    n_estimators=100,
    criterion='squared_error',
    random_state=0,
)
ETR.fit(X=X_train_scaled, y=y_train)

In [32]:
# Evaluate the extra-trees ensemble on the held-out rows and display its R^2.
y_pred_ETR = ETR.predict(X=X_test_scaled)
score = r2_score(y_true=y_test, y_pred=y_pred_ETR)
score

0.9796750298658378

#### 3. Voting Regressor

In [33]:
# Voting ensemble that combines the linear, SVR and decision-tree regressors
# created earlier in the notebook.
from sklearn.ensemble import VotingRegressor
VC = VotingRegressor(
    estimators=[
        ('lg', reg),
        ('svr', svr),
        ('dt', DTR),
    ]
)

In [34]:
# Train the voting ensemble on the standardised training split.
VC.fit(X=X_train_scaled, y=y_train)

In [35]:
# Evaluate the voting ensemble on the held-out rows and display its R^2.
y_pred_VR = VC.predict(X=X_test_scaled)
score = r2_score(y_true=y_test, y_pred=y_pred_VR)
score

0.9791900898493021