# SGD를 사용하여 2차 함수 모델 fitting하기

In [None]:
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## 데이터 생성하기

In [None]:
def f(x):
    """Evaluate the ground-truth quadratic ``x**2 + x/3 + 5``.

    Only arithmetic operators are used, so ``x`` may be a scalar or a NumPy
    array (evaluated element-wise).
    """
    return x**2 + 1.0/3.0 * x + 5.0

In [None]:
# 50 evenly spaced inputs on [-20, 60] and the noise-free curve at those points.
x_train = np.linspace(start=-20, stop=60, num=50)
fx = f(x_train)

In [None]:
# Seed NumPy's global RNG so the generated noise is the same on every run.
np.random.seed(313)
# Targets = clean curve plus one np.random.rand draw per sample, scaled by 500.
y_train = fx + 500 * np.random.rand(len(x_train))

In [None]:
# Show the noisy training samples using the 'o' marker format.
plt.plot(x_train,y_train, 'o')
plt.grid()
plt.show()

\begin{equation}
loss(w) = \frac{1}{N}\sum_{i=1}^N |w_0 x_i^2 + w_1x_i + w_2 - y_i|^2
\end{equation}

In [None]:
def loss(w, x_set, y_set):
    """Return the mean squared error of the quadratic model on a data set.

    The model is ``w[0] * x**2 + w[1] * x + w[2]``.

    Args:
        w: Sequence holding the three coefficients ``(w0, w1, w2)``.
        x_set: 1-D sequence of inputs.
        y_set: 1-D sequence of targets with the same length as ``x_set``.

    Returns:
        The average squared residual over all samples.

    Raises:
        ValueError: If the data set is empty or the two lengths differ.
    """
    x = np.asarray(x_set, dtype=float)
    y = np.asarray(y_set, dtype=float)
    if x.size == 0:
        raise ValueError('loss() needs at least one sample')
    if x.shape != y.shape:
        raise ValueError('x_set and y_set must have the same shape')
    # One vectorised pass instead of a Python-level loop over the samples.
    residual = w[0] * x**2 + w[1] * x + w[2] - y
    return np.mean(residual**2)

## 1. Gradient Descent 사용하기
1. Define gradient
1. Tune Parameters

\begin{equation}
loss(w) = \frac{1}{N}\sum_{i=1}^N |w_0 x_i^2 + w_1x_i + w_2 - y_i|^2
\end{equation}

\begin{equation}
\nabla loss(w) =
\frac{2}{N}\sum_{i=1}^N
(w_0 x_i^2 + w_1x_i + w_2 - y_i)
\begin{bmatrix}
x_i^2\\
x_i\\
1
\end{bmatrix}
\end{equation}

In [None]:
def grad_loss(w, x_set, y_set):
    """Return the gradient of the mean squared error with respect to ``w``.

    For the model ``w[0] * x**2 + w[1] * x + w[2]`` the gradient is
    ``(2 / N) * sum_i residual_i * [x_i**2, x_i, 1]``.

    Args:
        w: Sequence holding the three coefficients ``(w0, w1, w2)``.
        x_set: 1-D sequence of inputs.
        y_set: 1-D sequence of targets with the same length as ``x_set``.

    Returns:
        A length-3 NumPy array ``[d/dw0, d/dw1, d/dw2]``.

    Raises:
        ValueError: If the data set is empty or the two lengths differ.
    """
    x = np.asarray(x_set, dtype=float)
    y = np.asarray(y_set, dtype=float)
    if x.size == 0:
        raise ValueError('grad_loss() needs at least one sample')
    if x.shape != y.shape:
        raise ValueError('x_set and y_set must have the same shape')
    residual = w[0] * x**2 + w[1] * x + w[2] - y
    # Rows are the partial derivatives of the model w.r.t. w0, w1 and w2.
    basis = np.stack([x**2, x, np.ones_like(x)])
    return 2.0 * (basis @ residual) / x.size

In [None]:
def steepest_descent_3d(loss, grad_func, w0, x_set, y_set, learning_rate=0.01, MaxIter=10):
    """Run fixed-step gradient descent and return the final weights.

    Args:
        loss: Objective function; it is accepted but never called in here.
        grad_func: Callable ``grad_func(w, x_set, y_set)`` returning the gradient.
        w0: Starting weights.
        x_set: Inputs forwarded unchanged to ``grad_func``.
        y_set: Targets forwarded unchanged to ``grad_func``.
        learning_rate: Step size multiplied with every gradient.
        MaxIter: Number of update steps to perform.

    Returns:
        The weights after ``MaxIter`` updates (``w0`` itself when no update runs).
    """
    weights = w0
    for _ in range(MaxIter):
        step = learning_rate * grad_func(weights, x_set, y_set)
        # Build a new array each time so the caller's w0 is never modified.
        weights = weights - step
    return weights

In [None]:
# Gradient descent on the raw (unscaled) inputs, starting from all-ones weights.
w0 = np.ones(3)
w_gd = steepest_descent_3d(
    loss, grad_loss, w0, x_train, y_train,
    learning_rate=2e-7, MaxIter=2500,
)
print(w_gd)

In [None]:
# Evaluate the fitted quadratic on the training inputs.
y_pred = w_gd[0] * x_train ** 2 + w_gd[1] * x_train + w_gd[2]
plt.plot(x_train, y_train, 'o')
plt.plot(x_train, y_pred, 'r-')
plt.grid()
plt.xlabel('x')
# Label the vertical axis; a second xlabel() call would overwrite 'x' instead.
plt.ylabel('y')
plt.show()

## 2. Normalization 하기
1. min/max normalization
```python
scaled_x = (x - np.min(x)) / (np.max(x) - np.min(x))
```
1. mean/variance normalization
```python
scaled_x = (x - np.mean(x)) / np.sqrt(np.var(x))
```

### 2-1. min/max normalization
```python
scaled_x = (x - np.min(x)) / (np.max(x) - np.min(x))
```

In [None]:
# Min/max scaling: subtract the minimum, then divide by the range (max - min).
scaled_x_train1 = (x_train - x_train.min()) / (x_train.max() - x_train.min())
print(x_train, scaled_x_train1, sep='\n')

In [None]:
# Gradient descent on the min/max-scaled inputs, starting from all-ones weights.
w0 = np.ones(3)
w_gd_sc1 = steepest_descent_3d(
    loss, grad_loss, w0, scaled_x_train1, y_train,
    learning_rate=0.2, MaxIter=2500,
)
print(w_gd_sc1)

In [None]:
# Evaluate the fitted quadratic on the min/max-scaled inputs.
y_pred = w_gd_sc1[0] * scaled_x_train1 ** 2 + w_gd_sc1[1] * scaled_x_train1 + w_gd_sc1[2]
plt.plot(scaled_x_train1, y_train, 'o')
plt.plot(scaled_x_train1, y_pred, 'r-')
plt.grid()
plt.xlabel('scaled x')
# Label the vertical axis; a second xlabel() call would overwrite 'scaled x' instead.
plt.ylabel('y')
plt.show()

### 2-2. mean/variance normalization
```python
scaled_x = (x - np.mean(x)) / np.sqrt(np.var(x))
```

In [None]:
# Standardise: subtract the mean, then divide by the square root of the variance.
scaled_x_train2 = (x_train - x_train.mean()) / np.sqrt(x_train.var())
print(x_train, scaled_x_train2, sep='\n')

In [None]:
# Gradient descent on the mean/variance-scaled inputs, starting from all-ones weights.
w0 = np.ones(3)
w_gd_sc2 = steepest_descent_3d(
    loss, grad_loss, w0, scaled_x_train2, y_train,
    learning_rate=0.2, MaxIter=2500,
)
print(w_gd_sc2)

In [None]:
# Evaluate the fitted quadratic on the mean/variance-scaled inputs.
y_pred = w_gd_sc2[0] * scaled_x_train2 ** 2 + w_gd_sc2[1] * scaled_x_train2 + w_gd_sc2[2]
plt.plot(scaled_x_train2, y_train, 'o')
plt.plot(scaled_x_train2, y_pred, 'r-')
plt.grid()
plt.xlabel('scaled x')
# Label the vertical axis; a second xlabel() call would overwrite 'scaled x' instead.
plt.ylabel('y')
plt.show()

## 3. Stochastic Gradient Descent
1. `np.random.shuffle()`을 사용하여 `x_train`을 섞는다.
1. `generate_batches()`를 사용하여 batch들을 만든다.
1. Stochastic Gradient Method를 적용한다.

In [None]:
def generate_batches(batch_size, features, labels):
    """Split features and labels into consecutive, aligned mini-batches.

    Args:
        batch_size: Positive number of samples per batch.
        features: Sliceable sequence of inputs.
        labels: Sliceable sequence of targets with the same length as ``features``.

    Returns:
        A list of ``[feature_slice, label_slice]`` pairs in input order. The
        last batch is shorter when the sample count is not a multiple of
        ``batch_size``; empty input gives an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive or the lengths differ.
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    # A negative batch_size would otherwise silently produce no batches at all.
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got {0}'.format(batch_size))
    sample_size = len(features)
    if sample_size != len(labels):
        raise ValueError('features and labels must have the same length')

    return [
        [features[start_i:start_i + batch_size], labels[start_i:start_i + batch_size]]
        for start_i in range(0, sample_size, batch_size)
    ]

### 3-1. Shuffle하기
```python
np.random.shuffle(x)
```

In [None]:
# Shuffle an index array rather than the data, then index with it; show before/after.
a = np.arange(len(x_train))
print(a, x_train[a], sep='\n')
np.random.shuffle(a)
print(a, x_train[a], sep='\n')

### 3-2. Batch 만들기

In [None]:
# Print every mini-batch of size 5, followed by a blank separator line.
batch_size = 5
for x_train_batch, y_train_batch in generate_batches(batch_size, scaled_x_train1, y_train):
    print('x_batch = {0}'.format(x_train_batch),
          'y_batch = {0}'.format(y_train_batch),
          '',
          sep='\n')

### 3-3. Stochastic Gradient Descent 적용
1. min/max normalization을 적용
1. `np.random.shuffle()` 이용하여 데이터 골고루 섞기
1. 다음과 같이 Parameter를 설정
    1. `batch_size=10`
    1. `learning_rate=0.2`
    1. `w0=np.array([1.0, 1.0, 1.0])`
    1. `MaxEpochs = 2500`
1. 아래 for loop 안에 SGD를 구현하시면 됩니다.
    ```python
    for epoch in range(MaxEpochs):
        for x_batch, y_batch in generate_batches(_, _, _):
            grad = grad_loss(w0, x_batch, y_batch)
            # do gradient descent with x_batch and y_batch
    ```
1. SGD 구현을 올바르게 하고 위의 parameter로 설정하셨다면, 다음과 비슷한 그림이 나와야 합니다.
![week3_project_result.png](week3_project_result.png)

In [None]:
# Exercise skeleton: every `None` below is a placeholder to be filled in.
# TODO 1: apply min/max normalization to x_train.
scaled_x_train = None
# TODO 2: shuffle the index array, then index inputs and targets with it so
# that each (x, y) pair stays aligned.
idx = np.arange(len(x_train))
np.random.shuffle(None)

sh_scaled_x_train = scaled_x_train[None]
sh_y_train = y_train[None]
# TODO 3: set the hyper-parameters (batch_size=10, MaxEpochs=2500, learning_rate=0.2).
batch_size = None
MaxEpochs = None
learning_rate = None
# Float start weights, as in the gradient-descent cells; with an integer array
# an in-place update such as `w0 -= learning_rate * grad` would fail to cast.
w0 = np.array([1.0, 1.0, 1.0])
for epoch in range(MaxEpochs):
    for x_batch, y_batch in generate_batches(batch_size, sh_scaled_x_train, sh_y_train):
        # TODO 4: compute the mini-batch gradient and take one descent step from w0.
        grad = grad_loss(None, None, None)
        w1 = None
        w0 = w1
w_sgd = w0

In [None]:
# TODO 5(Just run, Don't modify below)
# Evaluate the SGD-fitted quadratic on the scaled inputs and report its loss in the title.
y_pred = w_sgd[0] * scaled_x_train ** 2 + w_sgd[1] * scaled_x_train + w_sgd[2]
plt.plot(scaled_x_train, y_train, 'o')
plt.plot(scaled_x_train, y_pred, 'r-')
plt.grid()
plt.xlabel('scaled x')
# Label the vertical axis; a second xlabel() call would overwrite 'scaled x' instead.
plt.ylabel('y')
plt.title('SGD : loss = {0} '.format(loss(w_sgd, scaled_x_train, y_train)))
plt.show()