In [94]:
import pandas as pd
import numpy as np


In [95]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
energy_efficiency = fetch_ucirepo(id=242) 
  
# data (as pandas dataframes) 
X = energy_efficiency.data.features 
y = energy_efficiency.data.targets 
  
# metadata 
print(energy_efficiency.metadata) 
  
# variable information 
print(energy_efficiency.variables) 

{'uci_id': 242, 'name': 'Energy Efficiency', 'repository_url': 'https://archive.ics.uci.edu/dataset/242/energy+efficiency', 'data_url': 'https://archive.ics.uci.edu/static/public/242/data.csv', 'abstract': 'This study looked into assessing the heating load and cooling load requirements of buildings (that is, energy efficiency) as a function of building parameters.', 'area': 'Computer Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 768, 'num_features': 8, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Y1', 'Y2'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2012, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C51307', 'creators': ['Athanasios Tsanas', 'Angeliki Xifara'], 'intro_paper': {'ID': 379, 'type': 'NATIVE', 'title': 'Accurate quantitative estimation of energy performance of residential buildings using statistical machine 

In [96]:
def train_valid_test_split(X, y):
    ''' Split the data into train, validation and test datasets using pandas'''
    train=int(X.shape[0]*0.7)
    valid=int(X.shape[0]*0.2)
    test=int(X.shape[0]*0.1)
    print("Data amounts to training data {}, validation data {} and testing data {}. ".format(train, valid, test))

    X_df=pd.DataFrame(X)
    X_df.insert(0, 'W0', 1)
    X_train=X_df.iloc[0:train,:].values
    X_valid=X_df.iloc[train:train+valid,:].values
    X_test=X_df.iloc[train+valid:,:].values
    y_df=pd.DataFrame(y)
    y_train=y_df.iloc[0:train,:].values
    y_valid=y_df.iloc[train:train+valid,:].values
    y_test=y_df.iloc[train+valid:,:].values
    
    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [97]:
def w_hat_lin_calc(X_train, y_train):
    w_hat=np.linalg.inv(X_train.T @ X_train) @ X_train.T @ y_train
    return w_hat

def w_hat_ridge_calc(X_train, y_train,lamb):
    n_features=X_train.shape[1]
    w_hat=np.linalg.inv(X_train.T @ X_train + lamb*np.eye(n_features)) @ X_train.T @ y_train
    return w_hat

def predict_linear(X, w_hat):
    y_predict_lin=X @ w_hat
    return y_predict_lin

def predict_ridge(X, w_hat):
    y_predict_rid=X @ w_hat
    return y_predict_rid

def compare(y_test, y_predict):
    for i in range(y_test.shape[1]):
        mae_i = np.mean(np.abs(y_test[:, i] - y_predict[:,i]))
        mse_i = np.mean((y_test[:, i] - y_predict[:, i]) ** 2)
        rmse_i = np.sqrt(mse_i)
        print("For the Y{}, the MAE is {:.3f}, MSE is {:.3f} and RMSE is {:.3f}."
            .format(i+1, mae_i, mse_i, rmse_i))
    return



In [98]:
X_train, X_valid, X_test, y_train, y_valid, y_test=train_valid_test_split(X,y)
w_hat_lin=w_hat_lin_calc(X_train, y_train)
w_hat_rid=w_hat_ridge_calc(X_train, y_train, 1)
y_predict_linear=predict_linear(X_test, w_hat_lin)
y_predict_ridge=predict_ridge(X_test, w_hat_rid)
print("-------------------------")
print("Linear Regression Results:")
print("-------------------------")
compare(y_test, y_predict_linear)
print("-------------------------")
print("Ridge Regression Results:")
print("-------------------------")
compare(y_test, y_predict_ridge)
print("-------------------------")


Data amounts to training data 537, validation data 153 and testing data 76. 
-------------------------
Linear Regression Results:
-------------------------
For the Y1, the MAE is 17.156, MSE is 443.113 and RMSE is 21.050.
For the Y2, the MAE is 19.660, MSE is 605.008 and RMSE is 24.597.
-------------------------
Ridge Regression Results:
-------------------------
For the Y1, the MAE is 2.228, MSE is 8.208 and RMSE is 2.865.
For the Y2, the MAE is 2.111, MSE is 8.735 and RMSE is 2.956.
-------------------------


In [99]:

from sklearn.linear_model import LinearRegression, Ridge
X_df=pd.DataFrame(X)
train=int(X.shape[0]*0.7)
valid=int(X.shape[0]*0.2)
test=int(X.shape[0]*0.1)
X_train=X_df.iloc[0:train,:].values
X_valid=X_df.iloc[train:train+valid,:].values
X_test=X_df.iloc[train+valid:,:].values
y_df=pd.DataFrame(y)
y_train=y_df.iloc[0:train,:].values
y_valid=y_df.iloc[train:train+valid,:].values
y_test=y_df.iloc[train+valid:,:].values
LR_model=LinearRegression()
LR_model.fit(X_train, y_train)
y_predict_sklearn=LR_model.predict(X_test)
RR_model=Ridge(alpha=1)
RR_model.fit(X_train, y_train)
y_predict_ridge_sklearn=RR_model.predict(X_test)

print("--------------")
print("Sklearn Linear Regression Results:")
print("--------------")
compare(y_test, y_predict_sklearn)

print("--------------")
print("Sklearn Ridge Regression Results:")
print("--------------")
compare(y_test, y_predict_ridge_sklearn)
print("-------------------------")

--------------
Sklearn Linear Regression Results:
--------------
For the Y1, the MAE is 2.844, MSE is 10.969 and RMSE is 3.312.
For the Y2, the MAE is 2.424, MSE is 9.462 and RMSE is 3.076.
--------------
Sklearn Ridge Regression Results:
--------------
For the Y1, the MAE is 2.303, MSE is 8.344 and RMSE is 2.889.
For the Y2, the MAE is 2.151, MSE is 8.847 and RMSE is 2.974.
-------------------------
