In [1]:
import pandas as pd
import numpy as np


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
energy_efficiency = fetch_ucirepo(id=242) 
  
# data (as pandas dataframes) 
X = energy_efficiency.data.features 
y = energy_efficiency.data.targets 
  
# metadata 
print(energy_efficiency.metadata) 
  
# variable information 
print(energy_efficiency.variables) 

{'uci_id': 242, 'name': 'Energy Efficiency', 'repository_url': 'https://archive.ics.uci.edu/dataset/242/energy+efficiency', 'data_url': 'https://archive.ics.uci.edu/static/public/242/data.csv', 'abstract': 'This study looked into assessing the heating load and cooling load requirements of buildings (that is, energy efficiency) as a function of building parameters.', 'area': 'Computer Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 768, 'num_features': 8, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Y1', 'Y2'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2012, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C51307', 'creators': ['Athanasios Tsanas', 'Angeliki Xifara'], 'intro_paper': {'ID': 379, 'type': 'NATIVE', 'title': 'Accurate quantitative estimation of energy performance of residential buildings using statistical machine 

In [3]:
def train_valid_test_split(X, y):
    ''' Split the data into train, validation and test datasets using pandas'''
    train=int(X.shape[0]*0.7)
    valid=int(X.shape[0]*0.2)
    test=int(X.shape[0]*0.1)
    print("Data amounts to training data {}, validation data {} and testing data {}. ".format(train, valid, test))

    X_df=pd.DataFrame(X, dtype=float)
    X_df.insert(0, 'W0', 1)
    X_train=X_df.iloc[0:train,:].values
    X_valid=X_df.iloc[train:train+valid,:].values
    X_test=X_df.iloc[train+valid:,:].values
    y_df=pd.DataFrame(y, dtype=float)
    y_train=y_df.iloc[0:train,:].values
    y_valid=y_df.iloc[train:train+valid,:].values
    y_test=y_df.iloc[train+valid:,:].values
    
    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [4]:
def w_hat_lin_calc(X_train, y_train):
    w_hat=np.linalg.inv(X_train.T @ X_train) @ X_train.T @ y_train
    return w_hat

def w_hat_lin_calc_pseudo(X_train, y_train):
    w_hat=np.linalg.pinv(X_train.T @ X_train) @ X_train.T @ y_train
    return w_hat

def w_hat_ridge_calc(X_train, y_train,lamb):
    n_features=X_train.shape[1]
    w_hat=np.linalg.inv(X_train.T @ X_train + lamb*np.eye(n_features)) @ X_train.T @ y_train
    return w_hat

def predict(X, w_hat):
    y_predict=X @ w_hat
    return y_predict

def compare(y_test, y_predict,n):
    mae_i = np.mean(np.abs(y_test - y_predict))
    mse_i = np.mean((y_test - y_predict) ** 2)
    rmse_i = np.sqrt(mse_i)
    print("For the Y{}, the MAE is {:.3f}, MSE is {:.3f} and RMSE is {:.3f}."
        .format(n, mae_i, mse_i, rmse_i))
    return

def standardize(X):
    X_std=(X - np.mean(X)) / np.std(X)
    return X_std


In [5]:
# validation dataset calculation and results
X_train, X_valid, X_test, y_train, y_valid, y_test=train_valid_test_split(X,y)
w_hat_lin1=w_hat_lin_calc(X_train, y_train[:, 0])
w_hat_lin2=w_hat_lin_calc(X_train, y_train[:, 1])
y_predict_linear1=predict(X_valid, w_hat_lin1)
y_predict_linear2=predict(X_valid, w_hat_lin2)
print("-------------------------")
print("Linear Regression Results using np.linalg.inv:")
print("-------------------------")
compare(y_valid[:, 0], y_predict_linear1,1)
compare(y_valid[:, 1], y_predict_linear2,2)
print("-------------------------")

X_train, X_valid, X_test, y_train, y_valid, y_test=train_valid_test_split(X,y)
w_hat_lin_std1=w_hat_lin_calc(standardize(X_train), y_train[:, 0])
w_hat_lin_std2=w_hat_lin_calc(standardize(X_train), y_train[:, 1])
y_predict_linear1=predict(standardize(X_valid), w_hat_lin_std1)
y_predict_linear2=predict(standardize(X_valid), w_hat_lin_std2)
print("-------------------------")
print("Linear Regression Results using standardized data:")
print("-------------------------")
compare(y_valid[:, 0], y_predict_linear1,1)
compare(y_valid[:, 1], y_predict_linear2,2)
print("-------------------------")

w_hat_lin_calc_pseudo1=w_hat_lin_calc_pseudo(X_train, y_train[:, 0])
w_hat_lin_calc_pseudo2=w_hat_lin_calc_pseudo(X_train, y_train[:, 1])
y_predict_linear1=predict(X_valid, w_hat_lin_calc_pseudo1)
y_predict_linear2=predict(X_valid, w_hat_lin_calc_pseudo2)
print("-------------------------")
print("Linear Regression Results using np.linalg.pinv:")
print("-------------------------")
compare(y_valid[:, 0], y_predict_linear1,1)
compare(y_valid[:, 1], y_predict_linear2,2)
print("-------------------------") 

w_hat_rid1=w_hat_ridge_calc(X_train, y_train[:, 0],1)
w_hat_rid2=w_hat_ridge_calc(X_train, y_train[:, 1],1)
y_predict_ridge1=predict(X_valid, w_hat_rid1)
y_predict_ridge2=predict(X_valid, w_hat_rid2)

print("Ridge Regression Results:")
print("-------------------------")
compare(y_valid[:, 0], y_predict_ridge1,1)
compare(y_valid[:, 1], y_predict_ridge2,2)
print("-------------------------")

Data amounts to training data 537, validation data 153 and testing data 76. 
-------------------------
Linear Regression Results using np.linalg.inv:
-------------------------
For the Y1, the MAE is 3458967698418.256, MSE is 12095596415070139030962176.000 and RMSE is 3477872397755.579.
For the Y2, the MAE is 3885442615520.533, MSE is 15260163431430971910520832.000 and RMSE is 3906425915262.053.
-------------------------
Data amounts to training data 537, validation data 153 and testing data 76. 
-------------------------
Linear Regression Results using standardized data:
-------------------------
For the Y1, the MAE is 12.788, MSE is 223.997 and RMSE is 14.967.
For the Y2, the MAE is 14.674, MSE is 290.574 and RMSE is 17.046.
-------------------------
-------------------------
Linear Regression Results using np.linalg.pinv:
-------------------------
For the Y1, the MAE is 2.594, MSE is 10.629 and RMSE is 3.260.
For the Y2, the MAE is 2.656, MSE is 12.134 and RMSE is 3.483.
------------

In [6]:
# testing dataset calculation and results   
y_predict_linear1=predict(X_test, w_hat_lin1)
y_predict_linear2=predict(X_test, w_hat_lin2)
print("-------------------------")
print("Linear Regression Results using np.linalg.inv for test dataset:")
print("-------------------------")
compare(y_test[:, 0], y_predict_linear1,1)
compare(y_test[:, 1], y_predict_linear2,2)
print("-------------------------")
y_predict_linear1=predict(standardize(X_test), w_hat_lin_std1)
y_predict_linear2=predict(standardize(X_test), w_hat_lin_std2)
print("-------------------------")
print("Linear Regression Results using standardized data for test dataset:")
print("-------------------------")
compare(y_test[:, 0], y_predict_linear1,1)
compare(y_test[:, 1], y_predict_linear2,2)
print("-------------------------")
y_predict_linear1=predict(X_test, w_hat_lin_calc_pseudo1)
y_predict_linear2=predict(X_test, w_hat_lin_calc_pseudo2)
print("-------------------------")
print("Linear Regression Results using np.linalg.pinv for test dataset:")
print("-------------------------")
compare(y_test[:, 0], y_predict_linear1,1)
compare(y_test[:, 1], y_predict_linear2,2)
print("-------------------------")
print("-------------------------")
y_predict_ridge1=predict(X_test, w_hat_rid1)
y_predict_ridge2=predict(X_test, w_hat_rid2)
print("Ridge Regression Results for test dataset:")
print("-------------------------")
compare(y_test[:, 0], y_predict_ridge1,1)
compare(y_test[:, 1], y_predict_ridge2,2)
print("-------------------------")


-------------------------
Linear Regression Results using np.linalg.inv for test dataset:
-------------------------
For the Y1, the MAE is 3557101781710.507, MSE is 12754970807425642300178432.000 and RMSE is 3571410198706.618.
For the Y2, the MAE is 3995489290913.481, MSE is 16091482501846399178506240.000 and RMSE is 4011419013497.144.
-------------------------
-------------------------
Linear Regression Results using standardized data for test dataset:
-------------------------
For the Y1, the MAE is 22.039, MSE is 613.727 and RMSE is 24.774.
For the Y2, the MAE is 26.220, MSE is 848.866 and RMSE is 29.135.
-------------------------
-------------------------
Linear Regression Results using np.linalg.pinv for test dataset:
-------------------------
For the Y1, the MAE is 2.844, MSE is 10.969 and RMSE is 3.312.
For the Y2, the MAE is 2.424, MSE is 9.462 and RMSE is 3.076.
-------------------------
-------------------------
Ridge Regression Results for test dataset:
---------------------

In [7]:
from sklearn.linear_model import LinearRegression, Ridge
X_df=pd.DataFrame(X)
train=int(X.shape[0]*0.7)
valid=int(X.shape[0]*0.2)
test=int(X.shape[0]*0.1)
X_train=X_df.iloc[0:train,:].values
X_valid=X_df.iloc[train:train+valid,:].values
X_test=X_df.iloc[train+valid:,:].values
y_df=pd.DataFrame(y)
y_train=y_df.iloc[0:train,:].values
y_valid=y_df.iloc[train:train+valid,:].values
y_test=y_df.iloc[train+valid:,:].values
LR_model=LinearRegression()
LR_model.fit(X_train, y_train)
y_predict_sklearn=LR_model.predict(X_test)
RR_model=Ridge(alpha=1)
RR_model.fit(X_train, y_train)
y_predict_ridge_sklearn=RR_model.predict(X_test)

print("--------------")
print("Sklearn Linear Regression Results:")
print("--------------")
compare(y_test, y_predict_sklearn,1)

print("--------------")
print("Sklearn Ridge Regression Results:")
print("--------------")
compare(y_test, y_predict_ridge_sklearn,1)
print("-------------------------")

--------------
Sklearn Linear Regression Results:
--------------
For the Y1, the MAE is 2.634, MSE is 10.215 and RMSE is 3.196.
--------------
Sklearn Ridge Regression Results:
--------------
For the Y1, the MAE is 2.227, MSE is 8.596 and RMSE is 2.932.
-------------------------
