In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('energydata_complete.csv')

**Q12.** From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value in two d.p.?

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [4]:
x = df[['T2']]
y = df[['T6']]

# Splitting the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# creating the linear model
model = LinearRegression()
# fitting th linear model
model.fit(x_train, y_train)
pred = model.predict(x_test)

In [5]:
r2 = r2_score(y_test, pred)
print("R-Squared:", round(r2,2))

R-Squared: 0.64


**Q13.** Normalize the dataset using the MinMaxScaler after removing the following columns: [“date”, “lights”]. The target variable is “Appliances”. Use a 70-30 train-test set split with a random state of 42 (for reproducibility). Run a multiple linear regression using the training set and evaluate your model on the test set. Answer the following questions:

What is the Mean Absolute Error (in two decimal places)?

In [6]:
from sklearn.preprocessing import MinMaxScaler

In [7]:
df_1 = df.drop(columns=['date', 'lights'])
scaler = MinMaxScaler()

In [8]:
df_1_norm = pd.DataFrame(scaler.fit_transform(df_1), columns=df_1.columns)

In [9]:
x_1 = df_1_norm.drop(columns=['Appliances'])
y_1 = df_1_norm['Appliances']

x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(x_1, y_1, test_size=0.3, random_state=42)
model_1 = LinearRegression(normalize=True)
model_1.fit(x_train_1, y_train_1)
pred_1 = model_1.predict(x_test_1)

In [10]:
from sklearn.metrics import mean_absolute_error

In [11]:
mae = mean_absolute_error(y_test_1, pred_1)
print("Mean Absolute Error:", round(mae,2))


Mean Absolute Error: 0.05


**Q14.** What is the Residual Sum of Squares (in two decimal places)?

In [12]:
rss = np.sum(np.square(y_test_1 - pred_1))
print("Residual Sum of Squares:", round(rss,2))

Residual Sum of Squares: 45.35


**Q15.** What is the Root Mean Squared Error (in three decimal places)?

In [13]:
from sklearn.metrics import mean_squared_error

In [14]:
rmse = round(np.sqrt(mean_squared_error(y_test_1, pred_1)), 3)
print("Root Mean squared Error:", rmse)

Root Mean squared Error: 0.088


**Q16.** What is the Coefficient of Determination (in two decimal places)?

In [15]:
r2_1 = r2_score(y_test_1, pred_1)
print("Coefficient of Determination:", round(r2_1,2))

Coefficient of Determination: 0.15


**Q17.** Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [16]:
weight = pd.Series(model_1.coef_, x_train_1.columns).sort_values()
weight

RH_2          -0.456698
T_out         -0.321860
T2            -0.236178
T9            -0.189941
RH_8          -0.157595
RH_out        -0.077671
RH_7          -0.044614
RH_9          -0.039800
T5            -0.015657
T1            -0.003281
rv2            0.000770
rv1            0.000770
Press_mm_hg    0.006839
T7             0.010319
Visibility     0.012307
RH_5           0.016006
RH_4           0.026386
T4             0.028981
Windspeed      0.029183
RH_6           0.038049
RH_3           0.096048
T8             0.101995
Tdewpoint      0.117758
T6             0.236425
T3             0.290627
RH_1           0.553547
dtype: float64

In [17]:
print("Minimum and maximum weights are of {} and {} respectively".format(weight.idxmin(), weight.idxmax()))

Minimum and maximum weights are of RH_2 and RH_1 respectively


**Q18.** Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [18]:
from sklearn.linear_model import Ridge

In [19]:
model_r = Ridge(alpha=0.4)
model_r.fit(x_train_1, y_train_1)
pred_r = model_r.predict(x_test_1)

In [20]:
rmse_r = np.sqrt(mean_squared_error(y_test_1, pred_r))
print("Root Mean Squared Error of Ridge model with \u03B1 = 0.4 is",round(rmse_r,3))

Root Mean Squared Error of Ridge model with α = 0.4 is 0.088


Since there is no difference in RMSE, hence there `No` change in Root Mean Squared Error.

**Q19.** Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [21]:
from sklearn.linear_model import Lasso

In [22]:
model_l = Lasso(alpha=0.001)
model_l.fit(x_train_1, y_train_1)
pred_l = model_l.predict(x_test_1)

In [23]:
weight_l = pd.Series(model_1.coef_, x_train_1.columns).sort_values()
non_zero = weight_l[weight_l!=0.0].count()
print(non_zero)

26


**Q20.** What is the new RMSE with the lasso regression? (Answer should be in three (3) decimal places)?

In [24]:
rmse_l = round(np.sqrt(mean_squared_error(y_test_1, pred_l)), 3)
print("Root Mean Squared Error of Lasso model with \u03B1 = 0.001 is", rmse_l)

Root Mean Squared Error of Lasso model with α = 0.001 is 0.094
