In [None]:
# Basic setting
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Optimization of Linear Regression

In [None]:
# Load the dataset: the pickle payload is unpacked into the inputs X and targets y.
# NOTE(review): pickle.load can run arbitrary code - only open files from a trusted source.
with open('./data/linear_regression.pickle', 'rb') as dataset_file:
    X, y = pickle.load(dataset_file)

In [None]:
# Check the dataset distribution: one marker per (X, y) sample
plt.scatter(x=X, y=y)
plt.show()

## [P.1] 데이터를 학습 데이터와 평가 데이터로 나누기

In [None]:
# Use sklearn library
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import ???

In [None]:
# Exercise blanks: call the splitting helper imported in the previous cell.
# Per the scikit-learn docs linked there, test_size sets the share of samples
# held out for evaluation and random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = ???(X, y, test_size=???, random_state=???)

In [None]:
# Visualize the training split
plt.scatter(x=X_train, y=y_train)
plt.show()

In [None]:
# Visualize test dataset (the held-out split, not the training split)
plt.scatter(X_test, y_test)
plt.show()

## [P.2] <font color=red>numpy</font> 라이브러리를 활용해, 최적의 선형 회귀 모델 찾기

- 최적의 모델은 모든 데이터에 대해 실제값과 예측값의 차이가 제일 작은 모델
- 선형 회귀 모델은 일반적으로 <font color=red>MSE(Mean Squared Error)</font> 손실을 최소화하도록 학습
- 손실 함수와 모델 파라미터의 gradient에 관한 빈칸 부분 채워넣기

**MSE 손실 함수**
\begin{equation*}
L(w, b) = \frac{1}{n} \sum_{i=1}^n (y_i - (wX_i + b))^2
\end{equation*}

**Gradients 계산**
\begin{equation*}
\frac{\partial L}{\partial w} = -2 * \frac{1}{n} \sum_{i=1}^n (y_i - (wX_i + b)) * X_i
\end{equation*}

\begin{equation*}
\frac{\partial L}{\partial b} = -2 * \frac{1}{n} \sum_{i=1}^n (y_i - (wX_i + b))
\end{equation*}

In [None]:
def gd_numpy(X, y, epochs, lr):
    """Exercise template: fit the line y = w * X + b by gradient descent on the MSE loss.

    The `???` blanks are left for the student to complete, using the loss and
    gradient formulas given in the markdown cell above this function.

    Args:
        X: input values (presumably a numeric array - confirm against the dataset).
        y: target values matching X.
        epochs: presumably the number of gradient-descent iterations (loop bound blank).
        lr: presumably the step size applied in the parameter updates (update blanks).

    Returns:
        A tuple (w, b, w_list, b_list, loss_list): the final parameters followed by
        the per-iteration history of w, b and the loss.
    """
    # Model weights and bias parameters
    w = ???
    b = ???

    # Store model parameters and loss for visualization
    w_list, b_list, loss_list = [], [], []

    # Perform Gradient Descent
    for i in range(???):


    #################################################
    ######## Hint: use +, -, *, /, **, np.mean() ########
        loss = ???   # MSE loss

        dw = ???   # derivative of the loss w.r.t. w
        db = ???   # derivative of the loss w.r.t. b
    #################################################


        w = ???   # update w
        b = ???   # update b

        # History is recorded after the update: w_list[i] / b_list[i] are the
        # parameters leaving iteration i, while loss_list[i] was computed before
        # the update (i.e. presumably from the parameters entering iteration i).
        w_list.append(w)
        b_list.append(b)
        loss_list.append(loss)

    # '%.4f' formatting expects w and b to be scalars at this point.
    print('Trained model weights : %.4f' % w)
    print('Trained model bias : %.4f' % b)
    
    return w, b, w_list, b_list, loss_list

In [None]:
# Training setting
# Exercise blanks: the epoch count and step size handed to gd_numpy below.
epochs = ???
learning_rate = ???

# Fit on the training split only; the returned lists hold the per-iteration
# history that the visualization cells below read.
w, b, w_list, b_list, loss_list = gd_numpy(X_train, y_train, epochs, learning_rate)

In [None]:
# get train loss
# Exercise blank: predictions of the trained line on the training inputs
# (the model form is w * X + b, as in the MSE formula above).
y_pred = ???

# Mean squared error between training targets and predictions
train_loss = np.mean((y_train - y_pred)**2)
print('Train Loss for LinearRegression model : %.4f' % train_loss)

In [None]:
# Visualize the intermediate trained model
# Draw the fitted line at `nums` evenly spaced points of training, from the first
# recorded epoch to the last one.
nums = 6
# Derive the epoch count from the recorded history instead of the global `epochs`,
# which a later cell reassigns; this keeps the indices valid for w_list / b_list.
n_epochs = len(w_list)
epochs_list = [round(n_epochs / (nums - 1) * n) for n in range(nums)]

for epoch_no in epochs_list:
    # Entry k of w_list / b_list holds the parameters after epoch k + 1, so epoch e
    # lives at index e - 1. Clamp at 0: otherwise the first checkpoint (e == 0) would
    # become index -1 and silently show the *final* model instead of the first one.
    snapshot = max(epoch_no - 1, 0)
    w_snap = w_list[snapshot]
    b_snap = b_list[snapshot]

    plt.scatter(X_train, y_train)   # scatter the original data

    # Recompute the prediction from this snapshot's parameters; use local names so
    # the trained globals w, b and y_pred from earlier cells are left untouched.
    plt.plot(X_train, w_snap * X_train + b_snap, color='red')
    plt.title('Model after epoch %d' % (snapshot + 1))
    plt.show()

In [None]:
# Visualize the change of loss: recorded loss value against its iteration index
plt.plot(range(len(loss_list)), loss_list)
plt.show()

### 평가 데이터에 대한 예측 결과는 다음과 같음

In [None]:
# Visualize the trained linear regression model
# scatter the original data
plt.scatter(X_test, y_test)   

# plot prediction results
# Exercise blank: predictions of the trained line on the test inputs.
# y_pred is overwritten here; the test-loss cell below reads this value.
y_pred = ???

plt.plot(X_test, y_pred, color='red')
plt.show()

### 학습된 선형 회귀 모델의 성능은 평가 데이터셋에 대한 손실임

In [None]:
# Test MSE: average squared gap between held-out targets and the current y_pred
test_loss = np.square(y_test - y_pred).mean()
print('Test Loss for LinearRegression model : %.4f' % test_loss)

## [P.3] <font color=red>sklearn</font> 라이브러리의 `LinearRegression` 모델을 활용해, 최적의 선형 회귀 모델 찾기

- 이 모듈은 경사하강법 대신 최소제곱법(Least Squares Method)을 사용해 모델 학습

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
from sklearn.linear_model import LinearRegression

In [None]:
def lr_sklearn(X, y):
    """Exercise template: fit scikit-learn's LinearRegression on (X, y).

    The `???` blanks are left for the student. Prints the fitted weight and bias
    and returns the estimator object bound to `reg`.

    Args:
        X: input values; converted to `X_2d` before fitting.
        y: target values.

    Returns:
        The object assigned to `reg` (presumably the fitted estimator).
    """
    # Blank: presumably reshapes X into the 2-D (n_samples, n_features) layout
    # that scikit-learn estimators take - confirm against the dataset shape.
    X_2d = X.???
    
    # Blank: create the LinearRegression model and fit it on X_2d and y.
    reg = ???
    
    # Blanks: read the learned slope and intercept from the fitted model.
    w = ???
    b = ???

    # '%.4f' needs scalars; if a fitted attribute is array-valued, index out the
    # single element before formatting.
    print('Trained model weights : %.4f' % w)
    print('Trained model bias : %.4f' % b)
    
    return reg

In [None]:
# Fit the least-squares model on the training split only
model = lr_sklearn(X=X_train, y=y_train)

In [None]:
# get train loss
# Exercise blank: ask the fitted model for predictions on the training inputs
# (presumably in the same 2-D layout that was used for fitting).
y_pred = model.???

# Mean squared error between training targets and predictions
train_loss = np.mean((y_train - y_pred)**2)
print('Train Loss for LinearRegression model : %.4f' % train_loss)

### 평가 데이터에 대한 예측 결과는 다음과 같이 시각화

In [None]:
# Visualize the trained linear regression model
# scatter the original data
plt.scatter(X_test, y_test)   

# plot prediction results
# Exercise blank: predictions of the fitted model on the test inputs.
# y_pred is overwritten here; the test-loss cell below reads this value.
y_pred = model.???

plt.plot(X_test, y_pred, color='red')
plt.show()

In [None]:
# get test loss: average squared gap between held-out targets and y_pred
test_loss = np.square(y_test - y_pred).mean()
print('Test Loss for LinearRegression model : %.4f' % test_loss)

## [P.4] <font color=red>sklearn</font> 라이브러리의 `SGDRegressor` 모델을 활용해, 최적의 선형 회귀 모델 찾기

- 이 모듈은 경사하강법을 사용해 모델 학습

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
from sklearn.linear_model import SGDRegressor

In [None]:
def gd_sklearn(X, y, epochs, lr, alpha):
    """Exercise template: fit scikit-learn's SGDRegressor on (X, y).

    The `???` blanks are left for the student. Prints the fitted weight and bias
    and returns the estimator.

    Args:
        X: input values; converted to `X_2d` before fitting.
        y: target values.
        epochs: passed to the estimator as `max_iter`.
        lr: passed to the estimator as `eta0` (the initial learning rate).
        alpha: passed to the estimator as `alpha`, used with the 'l2' penalty.

    Returns:
        The SGDRegressor instance `reg`.
    """
    # Blank: presumably reshapes X into the 2-D (n_samples, n_features) layout
    # that scikit-learn estimators take - confirm against the dataset shape.
    X_2d = X.???
    
    # Per the scikit-learn docs: 'invscaling' shrinks the step size from eta0 as
    # training proceeds, and tol lets training stop before max_iter is reached.
    # random_state is fixed so repeated runs give the same result.
    reg = SGDRegressor(penalty='l2', 
                       alpha=alpha,
                       max_iter=epochs,
                       tol=1e-3,
                       learning_rate='invscaling',
                       eta0=lr,
                       random_state=42)
    
    # Blank: train the estimator on X_2d and y.
    reg.???
    
    # Blanks: read the learned slope and intercept from the fitted estimator.
    w = reg.???
    b = reg.???

    # '%.4f' needs scalars; if a fitted attribute is array-valued, index out the
    # single element before formatting.
    print('Trained model weights : %.4f' % w)
    print('Trained model bias : %.4f' % b)
    
    return reg

In [None]:
# training setting for the SGDRegressor run
# NOTE: `epochs` reuses the name set in the numpy gradient-descent section, so
# re-running earlier cells after this one will see this value instead.
epochs, lr, alpha = 1000, 1e-5, 0.001

model = gd_sklearn(X_train, y_train, epochs=epochs, lr=lr, alpha=alpha)

In [None]:
# get train loss
# Exercise blank: ask the fitted model for predictions on the training inputs
# (presumably in the same 2-D layout that was used for fitting).
y_pred = model.???

# Mean squared error between training targets and predictions
train_loss = np.mean((y_train - y_pred)**2)
# NOTE(review): the label below is the same text the LinearRegression section
# prints, although `model` here comes from gd_sklearn - confirm this is intended.
print('Train Loss for LinearRegression model : %.4f' % train_loss)

In [None]:
# Visualize the trained linear regression model
# scatter the original data
plt.scatter(X_test, y_test)   

# plot prediction results
# Exercise blank: predictions of the fitted model on the test inputs.
# y_pred is overwritten here; the test-loss cell below reads this value.
y_pred = model.???

plt.plot(X_test, y_pred, color='red')
plt.show()

In [None]:
# get test loss: average squared gap between held-out targets and y_pred
test_loss = np.square(y_test - y_pred).mean()
print('Test Loss for LinearRegression model : %.4f' % test_loss)

`sklearn` 라이브러리를 사용하면, 별도의 gradient 계산 없이 모델을 쉽게 학습시킬 수 있음