In [17]:
import pandas as pd
import numpy as np


In [18]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
energy_efficiency = fetch_ucirepo(id=242) 
  
# data (as pandas dataframes) 
X = energy_efficiency.data.features 
y = energy_efficiency.data.targets 
  
# metadata 
print(energy_efficiency.metadata) 
  
# variable information 
print(energy_efficiency.variables) 

{'uci_id': 242, 'name': 'Energy Efficiency', 'repository_url': 'https://archive.ics.uci.edu/dataset/242/energy+efficiency', 'data_url': 'https://archive.ics.uci.edu/static/public/242/data.csv', 'abstract': 'This study looked into assessing the heating load and cooling load requirements of buildings (that is, energy efficiency) as a function of building parameters.', 'area': 'Computer Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 768, 'num_features': 8, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Y1', 'Y2'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2012, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C51307', 'creators': ['Athanasios Tsanas', 'Angeliki Xifara'], 'intro_paper': {'ID': 379, 'type': 'NATIVE', 'title': 'Accurate quantitative estimation of energy performance of residential buildings using statistical machine 

In [19]:
def train_valid_test_split(X, y):
    ''' Split the data into train, validation and test datasets using pandas'''
    train=int(X.shape[0]*0.7)
    valid=int(X.shape[0]*0.2)
    test=int(X.shape[0]*0.1)
    print("Data amounts to training data {}, validation data {} and testing data {}. ".format(train, valid, test))

    X_df=pd.DataFrame(X)
    X_df.insert(0, 'W0', 1)
    X_train=X_df.iloc[0:train,:].values
    X_valid=X_df.iloc[train:train+valid,:].values
    X_test=X_df.iloc[train+valid:,:].values
    y_df=pd.DataFrame(y)
    y_train=y_df.iloc[0:train,:].values
    y_valid=y_df.iloc[train:train+valid,:].values
    y_test=y_df.iloc[train+valid:,:].values
    
    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [24]:
def w_hat_lin_calc(X_train, y_train):
    w_hat=np.linalg.inv(X_train.T @ X_train) @ X_train.T @ y_train
    return w_hat

def w_hat_ridge_calc(X_train, y_train,lamb):
    n_features=X_train.shape[1]
    w_hat=np.linalg.inv(X_train.T @ X_train + lamb*np.eye(n_features)) @ X_train.T @ y_train
    return w_hat

def predict_linear(X, w_hat):
    y_predict_lin=X @ w_hat
    return y_predict_lin

def predict_ridge(X, w_hat):
    y_predict_rid=X @ w_hat
    return y_predict_rid

In [None]:
X_train, X_valid, X_test, y_train, y_valid, y_test=train_valid_test_split(X,y)
w_hat_lin=w_hat_lin_calc(X_train, y_train)
w_hat_rid=w_hat_ridge_calc(X_train, y_train, 1)
y_predict_linear=predict_linear(X_test, w_hat_lin)
y_predict_ridge=predict_ridge(X_test, w_hat_rid)



Data amounts to training data 537, validation data 153 and testing data 76. 


array([[ 58.91284425,  63.17095524],
       [ 58.88782517,  63.26103684],
       [ 65.19505295,  68.57438616],
       [ 65.17003387,  68.66446776],
       [ 65.14501479,  68.75454937],
       [ 65.11999571,  68.84463097],
       [ 29.0157948 ,  32.44810655],
       [ 28.99077572,  32.53818815],
       [ 28.96575664,  32.62826976],
       [ 28.94073756,  32.71835136],
       [ 33.19151475,  36.8942636 ],
       [ 33.16649567,  36.9843452 ],
       [ 33.14147659,  37.0744268 ],
       [ 33.11645751,  37.1645084 ],
       [ 20.18149316,  21.06206307],
       [ 20.15647408,  21.15214467],
       [ 20.131455  ,  21.24222628],
       [ 20.10643592,  21.33230788],
       [ 24.35721311,  25.50822012],
       [ 24.33219403,  25.59830172],
       [ 24.30717494,  25.68838332],
       [ 24.28215586,  25.77846492],
       [ 11.34719152,   9.67601959],
       [ 11.32217244,   9.76610119],
       [ 11.29715335,   9.8561828 ],
       [ 11.27213427,   9.9462644 ],
       [ -1.66283007,  -6.15618093],
 