# Linear Regression with real data
##### <div style='text-align:right'>made by Wonbin Kim</div>

In [1]:
import numpy as np
from os import makedirs, path
from util import printProgress, file_download, handling_xlsx, _shuffle

data_path = './datasets'
if not path.exists(data_path):
    makedirs(data_path)

# Dataset 2. Energy-efficiency Data set

https://archive.ics.uci.edu/ml/datasets/energy+efficiency

In [2]:
data1_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx'

In [3]:
file_path = path.join(data_path, 'ENB2012_data.xlsx')
# file_path = file_download(data1_url, data_path)

In [4]:
raw = handling_xlsx(file_path, True)

In [5]:
print(raw[0].keys())

dict_keys(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'Y1', 'Y2'])


## Sub-work (1)
### Splitting raw data into Covariate X and Response Y

In [6]:
# Implement 1
keys = list(raw[0].keys())
raw_data = np.array([[raw[i][k] for k in keys] for i in range(len(raw))]).astype(float)
X = raw_data[:, :8]
Y = raw_data[:, 8:]

## Sub-work (2)
### adding 1 vector to data for representing bias

### <center>\\(y = \mathbf{w}^T\phi(x)+b = [w_1, w_2, ... , w_d] \cdot \left[\begin{matrix}{\phi_1(x) \\ \phi_2(x) \\ ... \\ \phi_d(x)}\end{matrix}\right] + b \\)</center>

### <center>\\(= [w_1, w_2, ... , w_d, b] \cdot \left[\begin{matrix}{\phi_1(x) \\ \phi_2(x) \\ ... \\ \phi_d(x) \\ "1"}\end{matrix}\right]\\)</center>

In [7]:
def add_bias_term(X):
    return np.concatenate([X, np.ones([len(X), 1])], axis=1)

## Sub-work (3)
### For estimation, Seperate X and Y into training set and test set.
### Train : Test  = 0.7 : 0.3

In [8]:
whole_X, whole_Y = _shuffle(X,Y)
no_data = len(whole_X)
no_training = int(no_data*0.7)
train_X, train_Y = whole_X[:no_training], whole_Y[:no_training]
test_X, test_Y = whole_X[no_training:], whole_Y[no_training:]

print(train_X.shape, test_X.shape, whole_X.shape)

(537, 8) (231, 8) (768, 8)


# 1. Linear Squares Method

### <center>\\(\mathcal{J}_{LS}(w)=\frac{1}{2N}\sum_{n=1}^{N}\left(y_n-\mathbf{w}^T\phi(x_n)\right)^2 = \frac{1}{2N}||y-\phi^T\mathbf{w}||^2_2\\)</center> <br>
### Find <center> \\( \hat{\mathbf{w}}_{LS} = \arg \min\limits_{\mathbf{W}} \frac{1}{2}||y-\phi^T\mathbf{w}||^2_2\\) </center>

where $\phi(x) \in \mathbb{R}^{d\times N}$ , $y \in \mathbb{R}^{k \times N}$, and $\mathbf{w} \in \mathbb{R}^{k\times d}$. N, d, k denote the number of instances, the dimensionality of covariate X, and the dimentionality of response Y, respectively.

### Caution!

For implementational simplicity, X is transposed in practice. i.e. $ x \in \mathbb{R}^{N\times D}$, a row represents a instance.

In [9]:
# Objective function
def ls(x, y, w, b = 0., l=0.):
    out = np.square(y-np.dot(x, w)-b)
    if len(out.shape) == 2: 
        out = np.sum(out, axis=1)
    return 0.5*np.mean(out)+0.5*l*np.sum(np.square(w))

In [10]:
def find_w(x, y, l = 0.):
    x_mtx = np.dot(x.T, x)
    shape = x_mtx.shape
    inv_x_mtx = np.linalg.inv(x_mtx+ l*np.eye(shape[0]))
    w = np.dot(inv_x_mtx, np.dot(x.T, y))
    return w

## Objective function & Generalization

<img src='image/pic11.png'>

## (1) Linear Squared Method - identity basis function

### <center> \\( \phi(\mathbf{x}) = \mathbf{x}\\) </center>

In [11]:
new_train_X = add_bias_term(train_X)
new_test_X = add_bias_term(test_X)
new_whole_X = add_bias_term(whole_X)

w_ls = find_w(new_train_X, train_Y)
pred_y = np.dot(new_whole_X, w_ls)

In [12]:
print(w_ls)

[[ 1.31118585e+02  1.39477244e+02]
 [-4.87614914e+01 -5.83990410e+01]
 [ 5.03448451e+01  5.74108745e+01]
 [ 9.40861452e+01  1.09360049e+02]
 [ 3.92859159e+00  3.97491966e+00]
 [ 5.64965453e-02  2.55347186e-01]
 [ 1.95422434e+01  1.45637574e+01]
 [ 2.31384944e-01  7.93877066e-02]
 [ 9.72667507e+01  1.20182011e+02]]


### Result : Loss

In [13]:
print(ls(new_train_X, train_Y, w_ls)) # [4.17002527 5.33017311]
print(ls(new_test_X, test_Y, w_ls))   # [4.51320678 4.55826841]

1066582.4125371163
1051092.6525813031


## By sklearn
### <center> \\( \phi(\mathbf{x}) = \mathbf{x}\\) </center>

In [14]:
from sklearn import linear_model as lm

lr = lm.LinearRegression()
lr.fit(train_X, train_Y)
pred_y = lr.predict(whole_X)
w_ls = lr.coef_
b_ls = lr.intercept_

In [15]:
print(w_ls)

[[-7.18811217e+01 -5.17007033e+11  5.17007033e+11  1.03401407e+12
   3.92884991e+00  5.61136669e-02  1.95420426e+01  2.31226032e-01]
 [-8.37192772e+01 -3.16342859e+11  3.16342859e+11  6.32685719e+11
   3.97518820e+00  2.55287484e-01  1.45636390e+01  7.94318249e-02]]


### Result : Loss

In [16]:
print(ls(train_X, train_Y, w_ls.T, b_ls)) # [4.17089996 5.33201875]
print(ls(test_X, test_Y, w_ls.T, b_ls))   # [4.52692975 4.57075813]

9.502918709299651
9.097687881237237
