# Linear Regression with real data
##### <div style='text-align:right'>made by Wonbin Kim</div>

In [None]:
import numpy as np
from os import makedirs, path
from util import printProgress, file_download, handling_xlsx, _shuffle

# Directory that holds the dataset files used by this notebook.
data_path = './datasets'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate path.exists() test.
makedirs(data_path, exist_ok=True)

# Dataset 2. Energy-efficiency Data set

https://archive.ics.uci.edu/ml/datasets/energy+efficiency

In [None]:
# Location of the ENB2012_data.xlsx workbook on the UCI archive.
data1_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx'

In [None]:
# Path of the workbook inside data_path; this line does not check that the
# file is actually there.
file_path = path.join(data_path, 'ENB2012_data.xlsx')
# Disabled alternative: presumably downloads the workbook from data1_url into
# data_path and returns its local path (confirm in util.file_download).
# file_path = file_download(data1_url, data_path)

In [None]:
# Read the workbook through the project helper.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible in this notebook; confirm against util.handling_xlsx.
raw = handling_xlsx(file_path, True)

In [None]:
# Show the field names of the first record; raw[0] is used as a mapping here.
print(raw[0].keys())

## Sub-work (1)
### Splitting raw data into Covariate X and Response Y

In [None]:
# Implement 1
# Take the field names from the first record and reuse them for every record,
# so that all rows share the same column order.
keys = list(raw[0].keys())
# Build a 2-D float array: one row per record in raw, one column per key.
raw_data = np.array([[raw[i][k] for k in keys] for i in range(len(raw))]).astype(float)
# The first 8 columns become the covariates X, the remaining columns the
# responses Y.
X = raw_data[:, :8]
Y = raw_data[:, 8:]

## Sub-work (2)
### Adding a vector of ones to the data to represent the bias

### <center>\\(y = \mathbf{w}^T\phi(x)+b = [w_1, w_2, ... , w_d] \cdot \left[\begin{matrix}{\phi_1(x) \\ \phi_2(x) \\ ... \\ \phi_d(x)}\end{matrix}\right] + b \\)</center>

### <center>\\(= [w_1, w_2, ... , w_d, b] \cdot \left[\begin{matrix}{\phi_1(x) \\ \phi_2(x) \\ ... \\ \phi_d(x) \\ "1"}\end{matrix}\right]\\)</center>

In [None]:
def add_bias_term(X):
    """Append a constant column of ones to X so the bias acts as a weight.

    With the extra feature fixed at 1, ``w^T x + b`` can be written as a
    single dot product ``[w, b] . [x, 1]``.

    Args:
        X: 2-D array-like of shape (N, d); each row is one instance.

    Returns:
        A new float ndarray of shape (N, d + 1) whose last column is all ones.
        The input is not modified.

    Raises:
        ValueError: if X is not 2-dimensional.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(
            "X must be 2-dimensional (instances x features), got ndim=%d" % X.ndim
        )
    # The ones go last, matching the [w_1, ..., w_d, b] ordering of the weights.
    ones = np.ones((X.shape[0], 1), dtype=X.dtype)
    new_X = np.hstack([X, ones])
    return new_X

## Sub-work (3)
### For estimation, separate X and Y into a training set and a test set.
### Train : Test  = 0.7 : 0.3

In [None]:
# Shuffle covariates and responses with the project helper before splitting.
whole_X, whole_Y = _shuffle(X, Y)

# The first 70% of the shuffled rows (rounded down) form the training set.
no_data = len(whole_X)
no_training = int(no_data * 0.7)

# Cut each array at the same index so rows of X and Y stay paired.
train_X, test_X = whole_X[:no_training], whole_X[no_training:]
train_Y, test_Y = whole_Y[:no_training], whole_Y[no_training:]

print(train_X.shape, test_X.shape, whole_X.shape)

# 1. Least Squares Method

### <center>\\(\mathcal{J}_{LS}(w)=\frac{1}{2N}\sum_{n=1}^{N}\left(y_n-\mathbf{w}^T\phi(x_n)\right)^2 = \frac{1}{2N}||y-\phi^T\mathbf{w}||^2_2\\)</center> <br>
### Find <center> \\( \hat{\mathbf{w}}_{LS} = \arg \min\limits_{\mathbf{w}} \frac{1}{2}||y-\phi^T\mathbf{w}||^2_2\\) </center>

where $\phi(x) \in \mathbb{R}^{d\times N}$ , $y \in \mathbb{R}^{k \times N}$, and $\mathbf{w} \in \mathbb{R}^{k\times d}$. N, d, k denote the number of instances, the dimensionality of covariate X, and the dimensionality of response Y, respectively.

### Caution!

For implementation simplicity, X is transposed in practice, i.e. $x \in \mathbb{R}^{N\times d}$, where each row represents one instance.

In [None]:
# Objective function
def ls(x, y, w, b = 0., l=0.):
    """Evaluate the (optionally ridge-regularised) least-squares objective.

    Computes ``(||y - (x w^T + b)||^2 + l * ||w||^2) / (2N)``. With ``l = 0``
    this is the mean squared error divided by two.

    Args:
        x: array-like of shape (N, d); each row is one instance.
        y: array-like of shape (N,) or (N, k) holding the targets.
        w: array-like of shape (d,) or (k, d) holding the weights, one row per
            response dimension.
        b: scalar or array-like of shape (k,) added to every prediction.
        l: ridge coefficient. The penalty is scaled by 1/(2N) like the data
            term, so the minimiser matches that of
            ``1/2 ||y - x w^T||^2 + l/2 ||w||^2``.

    Returns:
        The objective value as a float scalar.

    Raises:
        ValueError: if x contains no instances (the 1/N factor is undefined).
    """
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)
    n = x.shape[0]
    if n == 0:
        raise ValueError("x must contain at least one instance")
    pred = x @ w.T + b
    # Bring targets and predictions to (N, k) so that a 1-D y never broadcasts
    # against a 2-D prediction into an (N, N) matrix.
    residual = np.asarray(y, dtype=float).reshape(n, -1) - pred.reshape(n, -1)
    loss = (np.sum(residual ** 2) + l * np.sum(w ** 2)) / (2.0 * n)
    return loss

In [None]:
def find_w(x, y, l = 0.):
    """Return the closed-form (ridge) least-squares weights.

    Solves the normal equations ``(x^T x + l I) W = x^T y``, i.e. minimises
    ``1/2 ||y - x W||^2 + l/2 ||W||^2``. No separate bias is estimated: to fit
    an intercept, pass an x that already contains a constant column of ones
    (that column is then penalised like any other when ``l > 0``).

    Args:
        x: array-like of shape (N, d); each row is one instance.
        y: array-like of shape (N,) or (N, k) holding the targets.
        l: ridge coefficient; 0 gives ordinary least squares.

    Returns:
        ndarray of shape (k, d) (or (d,) for 1-D y), so predictions are
        ``x @ w_ls.T``.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    gram = x.T @ x + l * np.eye(x.shape[1])
    # lstsq rather than inv/solve: with collinear features and l = 0 the Gram
    # matrix is singular, and lstsq then returns the minimum-norm solution
    # instead of raising.
    solution = np.linalg.lstsq(gram, x.T @ y, rcond=None)[0]
    w_ls = solution.T
    return w_ls

## Objective function & Generalization

<img src='image/pic11.png'>

## (1) Least Squares Method - identity basis function

### <center> \\( \phi(\mathbf{x}) = \mathbf{x}\\) </center>

In [None]:
# Placeholder cell: the identity-basis least-squares fit is not written yet.
pass

### Result : Loss

In [None]:
# Placeholder cell: the loss report for the identity-basis fit is not written yet.
pass

## By sklearn
### <center> \\( \phi(\mathbf{x}) = \mathbf{x}\\) </center>

In [None]:
from sklearn import linear_model as lm

# Fit ordinary least squares on the training split. fit() returns the
# estimator itself, so construction and fitting are chained.
lr = lm.LinearRegression().fit(train_X, train_Y)
# Predictions cover the whole shuffled dataset, not only the test rows.
pred_y = lr.predict(whole_X)
# Keep the fitted coefficients and intercept for later inspection.
w_ls, b_ls = lr.coef_, lr.intercept_

### Result : Loss

In [None]:
# Placeholder cell: the loss report for the sklearn fit is not written yet.
pass