In [51]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path
import IPython.display as ipd

In [26]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
HOME_PATH = Path.cwd()

LOCAL_DATA_URI = HOME_PATH / 'data' / 'energydata_complete.csv' # as pathlib.Path object
LOCAL_DATA_FILE_PATH = str(LOCAL_DATA_URI) # As string representation

In [5]:
energy_data = pd.read_csv(LOCAL_DATA_FILE_PATH,encoding='latin')

In [6]:
energy_data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


## Question 12

In [8]:
X = energy_data[['T2']]
y = energy_data['T6']

In [10]:
linear_reg = LinearRegression()
linear_reg.fit(X,y)

LinearRegression()

In [14]:
y_pred = linear_reg.predict(X)
score = r2_score(y,y_pred)
print(score)

0.6418990830855493


## Question 13

In [19]:
cols = [el for el in energy_data.columns if el not in ['date','lights']]
scaler = MinMaxScaler()
energy_data_scaled = scaler.fit_transform(energy_data[cols])

In [20]:
X_data, y_data = energy_data_scaled[:,1:], energy_data_scaled[:,0]

In [21]:
X_train, X_test, ytrain, ytest = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

In [23]:
multi_lin_reg = LinearRegression()
multi_lin_reg.fit(X_train,ytrain)

LinearRegression()

In [24]:
ypred = multi_lin_reg.predict(X_test)

In [25]:
mae = mean_absolute_error(ytest, ypred)
print('{:.2f}'.format(mae))

0.05


## Question 14

In [29]:
rss = (len(ypred) * mean_squared_error(ytest, ypred))
print('{:.2f}'.format(rss))

45.35


In [30]:
## Question 15

In [33]:
rmse = np.sqrt(mean_squared_error(ytest, ypred))
print("{:.3f}".format(rmse))

0.088


## Question 16

In [35]:
coeff_determ = r2_score(ytest, ypred)
print('{:.2f}'.format(coeff_determ))

0.15


## Question 17

In [37]:
multi_lin_reg.coef_

array([-0.00328105,  0.5535466 , -0.23617792, -0.45669795,  0.29062714,
        0.09604827,  0.028981  ,  0.02638578, -0.01565684,  0.01600579,
        0.23642491,  0.03804865,  0.01031878, -0.04461364,  0.10199505,
       -0.15759548, -0.18994077, -0.03980032, -0.32185967,  0.00683933,
       -0.07767065,  0.02918313,  0.01230661,  0.11775773,  0.0007701 ,
        0.0007701 ])

In [43]:
model_weights = pd.DataFrame({'feature':energy_data.columns.tolist()[3:],'weight':multi_lin_reg.coef_}).sort_values(by='weight')
model_weights

Unnamed: 0,feature,weight
3,RH_2,-0.456698
18,T_out,-0.32186
2,T2,-0.236178
16,T9,-0.189941
15,RH_8,-0.157595
20,RH_out,-0.077671
13,RH_7,-0.044614
17,RH_9,-0.0398
8,T5,-0.015657
0,T1,-0.003281


## Question 18

In [46]:
multi_ridge = Ridge(alpha=0.4)

In [47]:
multi_ridge.fit(X_train, ytrain)

Ridge(alpha=0.4)

In [48]:
ypred_ridge = multi_ridge.predict(X_test)

In [49]:
rmse_ridge = np.sqrt(mean_squared_error(ytest, ypred_ridge))
rmse == rmse_ridge

False

In [50]:
rmse_ridge

0.08753385704628003

## Question 19

In [57]:
lasso = Lasso(alpha=0.001)

In [58]:
lasso.fit(X_train, ytrain)

Lasso(alpha=0.001)

In [65]:
non_zero_mask = [lasso.coef_ > 0]


In [66]:
num_non_zero = np.sum(non_zero_mask)
print("{} features are non-zero".format(num_non_zero))

2 features are non-zero


## Question 20

In [59]:
ypred_lasso = lasso.predict(X_test)

In [60]:
rmse_lasso = np.sqrt(mean_squared_error(ytest, ypred_lasso))
rmse_lasso

0.09358170467245137