**Loading the Required Libraries and Modules**

In [11]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import TimeSeriesSplit 

In [12]:
from google.colab import drive 
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


**Reading the Data and Performing Basic Data Checks**

In [13]:
df = pd.read_csv("/content/drive/MyDrive/energydata_complete.csv", parse_dates=["date"])
print(df.shape)
df.describe()

(19735, 29)


Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,50.949283,7.910939,54.609083,20.267106,35.3882,22.029107,42.936165,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,9.022034,6.090347,31.149806,2.109993,5.114208,1.956162,5.224361,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,29.815,-6.065,1.0,15.39,23.2,16.306667,29.6,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,45.4,3.626667,30.025,18.7,31.5,20.79,39.066667,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,49.09,7.3,55.29,20.033333,34.863333,22.1,42.375,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,53.663333,11.256,83.226667,21.6,39.0,23.39,46.536,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,25.795,96.321667,28.29,99.9,26.0,51.4,27.23,58.78,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 19735 entries, 2016-01-11 17:00:00 to 2016-05-27 18:00:00
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Appliances   19735 non-null  int64  
 1   lights       19735 non-null  float64
 2   T1           19735 non-null  float64
 3   RH_1         19735 non-null  float64
 4   T2           19735 non-null  float64
 5   RH_2         19735 non-null  float64
 6   T3           19735 non-null  float64
 7   RH_3         19735 non-null  float64
 8   T4           19735 non-null  float64
 9   RH_4         19735 non-null  float64
 10  T5           19735 non-null  float64
 11  RH_5         19735 non-null  float64
 12  T6           19735 non-null  float64
 13  RH_6         19735 non-null  float64
 14  T7           19735 non-null  float64
 15  RH_7         19735 non-null  float64
 16  T8           19735 non-null  float64
 17  RH_8         19735 non-null  float64
 18  T9         

In [14]:
df.columns

Index(['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

In [16]:
df = df.set_index("date")

In [17]:
df.head()

Unnamed: 0_level_0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


**Creating Arrays for the Features and the Response Variable**

In [18]:
target_column = ['Appliances'] 
predictors = list(set(list(df.columns))-set(target_column))
df[predictors] = df[predictors]/df[predictors].max()
df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,0.054312,0.82584,0.635413,0.681296,0.72145,0.76165,0.782295,0.796005,0.763885,0.759531,0.528949,0.279637,0.546637,0.779504,0.688486,0.809001,0.730455,0.79534,0.779205,0.283972,0.978276,0.797504,0.288554,0.58077,0.242626,0.499795,0.499795
std,102.524891,0.113371,0.06116,0.062805,0.07345,0.072641,0.068618,0.06488,0.077973,0.084974,0.071511,0.093666,0.215283,0.31181,0.081154,0.099498,0.071838,0.08888,0.082233,0.07785,0.203732,0.009581,0.149011,0.175087,0.178708,0.270622,0.289953,0.289953
min,10.0,0.0,0.639375,0.426505,0.539243,0.365243,0.588316,0.57346,0.576336,0.541398,0.594301,0.309536,-0.214387,0.01001,0.591923,0.451362,0.598849,0.503573,0.607755,0.546943,-0.191571,0.944322,0.24,0.0,0.015152,-0.425806,0.000106,0.000106
25%,50.0,0.0,0.790556,0.589226,0.62934,0.676464,0.71111,0.735597,0.74542,0.695439,0.708568,0.471337,0.128196,0.300551,0.719231,0.61284,0.763496,0.664625,0.734694,0.721965,0.140485,0.972334,0.703333,0.142857,0.439394,0.058065,0.249975,0.249975
50%,60.0,0.0,0.822544,0.625894,0.669867,0.72287,0.755917,0.768091,0.788804,0.751615,0.751696,0.509646,0.258042,0.553453,0.770513,0.678275,0.811605,0.720908,0.791429,0.766971,0.265006,0.979024,0.836667,0.261905,0.606061,0.221505,0.497988,0.497988
75%,100.0,0.0,0.860625,0.679714,0.720107,0.772132,0.796621,0.832481,0.843511,0.825145,0.799366,0.557126,0.397879,0.8331,0.830769,0.758755,0.858979,0.791698,0.840816,0.831443,0.398787,0.985282,0.916667,0.392857,0.606061,0.423656,0.751728,0.751728
max,1080.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


**Creating the Training and Test Datasets**

In [37]:
X = df[predictors].values
y = df[target_column].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
print(X_train.shape); print(X_test.shape)

(13814, 27)
(5921, 27)


**Build, Predict and Evaluate the Regression Model**

**Linear Regression**

In [20]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [21]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

LinearRegression(n_jobs=1, normalize=False)

In [34]:
pred_train_lr= lr.predict(X_train)
print("RMSE_train: ",np.sqrt(mean_squared_error(y_train,pred_train_lr)))
print("R2_train: ",r2_score(y_train, pred_train_lr))

pred_test_lr= lr.predict(X_test)
print("RMSE_test: ",np.sqrt(mean_squared_error(y_test,pred_test_lr))) 
print("R2_test: ",r2_score(y_test, pred_test_lr))
importance = lr.coef_
print(importance)

RMSE_train:  93.70606242285344
R2_train:  0.1695005666728967
RMSE_test:  93.79279580239442
R2_test:  0.15133743343176564
[[ 203.41721079 -369.50931455   -1.01132354   27.28281201  141.10898875
   271.84545237 -798.63276608  -11.18891074  -44.57632681  -39.67388784
   -70.06586452  179.70293942   -1.01132354  -93.95869483    2.60383454
    12.79567574 -569.06412705  755.100433    279.57091885   59.41594884
   951.43945327   21.49880352  -49.55570638   12.80095115 -284.42011928
  -248.223047      1.00586652]]


In [39]:
from sklearn.metrics import mean_absolute_error
MSE = mean_absolute_error(y_train, pred_train_lr)
print(MSE)

68.13042617143768


**Regularized Regression**

**Ridge Regression**

In [27]:
rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train) 
pred_train_rr= rr.predict(X_train)
print("RMSE_train: ", np.sqrt(mean_squared_error(y_train,pred_train_rr)))
print("R2_train: ",r2_score(y_train, pred_train_rr))

pred_test_rr= rr.predict(X_test)
print("RMSE_test: ",np.sqrt(mean_squared_error(y_test,pred_test_rr))) 
print("R2_test: ",r2_score(y_test, pred_test_rr))

RMSE_train:  93.70612056887904
R2_train:  0.16949953599771828
RMSE_test:  93.79078234977312
R2_test:  0.1513738695676935


In [32]:
rr = Ridge(alpha=0.4)
rr.fit(X_train, y_train) 
pred_train_rr= rr.predict(X_train)
print("RMSE_train: ", np.sqrt(mean_squared_error(y_train,pred_train_rr)))
print("R2_train: ",r2_score(y_train, pred_train_rr))

pred_test_rr= rr.predict(X_test)
print("RMSE_test: ",np.sqrt(mean_squared_error(y_test,pred_test_rr))) 
print("R2_test: ",r2_score(y_test, pred_test_rr))

RMSE_train:  93.7628278006487
R2_train:  0.1684940598956075
RMSE_test:  93.78447973951235
R2_test:  0.15148791872935596


**Lasso Regression**

In [28]:
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(X_train, y_train) 
pred_train_lasso= model_lasso.predict(X_train)
print("RMSE_train: ",np.sqrt(mean_squared_error(y_train,pred_train_lasso)))
print("R2_train: ",r2_score(y_train, pred_train_lasso))

pred_test_lasso= model_lasso.predict(X_test)
print("RMSE_test: ",np.sqrt(mean_squared_error(y_test,pred_test_lasso))) 
print("R2_test: ",r2_score(y_test, pred_test_lasso))

RMSE_train:  93.74586228566724
R2_train:  0.168794939264459
RMSE_test:  93.7705388406747
R2_test:  0.1517401596532002


**ElasticNet Regression**

In [29]:
#Elastic Net
model_enet = ElasticNet(alpha = 0.01)
model_enet.fit(X_train, y_train) 
pred_train_enet= model_enet.predict(X_train)
print("RMSE_train: ",np.sqrt(mean_squared_error(y_train,pred_train_enet)))
print("R2_train: ",r2_score(y_train, pred_train_enet))

pred_test_enet= model_enet.predict(X_test)
print("RMSE_test: ",np.sqrt(mean_squared_error(y_test,pred_test_enet)))
print("R2_test: ",r2_score(y_test, pred_test_enet))

RMSE_train:  97.69042679636749
R2_train:  0.09737371463329803
RMSE_test:  97.50510535218429
R2_test:  0.08282798502472066
