# Tobig's 14기 2주차 Optimization 과제
### Made by 이지용

# Gradient Descent 구현하기

### 1) "..." 표시되어 있는 빈 칸을 채워주세요  
### 2) 강의내용과 코드에 대해 공부한 내용을 적어서 과제를 채워주세요

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Load the assignment dataset from the CSV file in the working directory.
data = pd.read_csv('assignment_2.csv')
# Preview the first rows (columns shown below: Label, bias, experience, salary).
data.head()

Unnamed: 0,Label,bias,experience,salary
0,1,1,0.7,48000
1,0,1,1.9,48000
2,1,1,2.5,60000
3,0,1,4.2,63000
4,0,1,6.0,76000


## Train Test 데이터 나누기
### 데이터셋을 train/test로 나눠주는 메소드  
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Features are every column after the first; the first column is the target.
# 25% of the rows are held out for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0], test_size=0.25, random_state = 0)

In [5]:
# Sanity check: shapes of the four splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((150, 3), (50, 3), (150,), (50,))

## Scaling  

experience와 salary의 단위, 평균, 분산이 크게 차이나므로 scaler를 사용해 단위를 맞춰줍니다. 

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Remember the raw bias column, re-indexed from 0 so it lines up with the
# freshly built (0..n-1 indexed) scaled frame below.
bias_train = X_train["bias"].reset_index(drop=True)
# Fit the scaler on the training features and rebuild a DataFrame from the result.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
# Put the original bias values back in place of the scaled bias column.
X_train["bias"] = bias_train
X_train.head()

Unnamed: 0,bias,experience,salary
0,1,0.187893,-1.143335
1,1,1.185555,0.043974
2,1,-0.310938,-0.351795
3,1,-1.629277,-1.34122
4,1,-1.3086,0.043974


이때 scaler는 X_train에 fit 해주시고, fit한 scaler를 X_test에 적용시켜줍니다.  
똑같이 X_test에다 fit하면 안돼요!

In [7]:
# Remember the raw bias column, re-indexed from 0 to match the scaled frame.
bias_test = X_test["bias"].reset_index(drop=True)
# Apply the scaler that was already fitted on the training data (no re-fit here).
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
# Put the original bias values back in place of the scaled bias column.
X_test["bias"] = bias_test
X_test.head()

Unnamed: 0,bias,experience,salary
0,1,-1.344231,-0.615642
1,1,0.50857,0.307821
2,1,-0.310938,0.571667
3,1,1.363709,1.956862
4,1,-0.987923,-0.747565


In [8]:
# Number of parameters: the length of the first training row (one per feature column).
N = len(X_train.loc[0])

In [9]:
# Initialise the parameters with arbitrary values (random.random() per parameter).
parameters = np.array([random.random() for i in range(N)])
# Keep an independent copy of the initial values; it is used later as an
# untrained baseline when predicting labels.
random_parameters = parameters.copy()
parameters

array([0.12859637, 0.04914215, 0.67824026])

### * LaTeX   

Jupyter Notebook은 LaTeX 문법으로 수식 입력을 지원하고 있습니다.  
http://triki.net/apps/3466  
https://jjycjnmath.tistory.com/117

## Logistic Function

## $p = \frac{1}{1 + \exp(-X_{i}\theta)}$

In [10]:
def logistic(X, parameters):
    """Return the logistic (sigmoid) probability for a single sample.

    Computes ``1 / (1 + exp(-z))`` where ``z`` is the dot product of the
    sample's feature values and ``parameters``.

    Args:
        X: Feature values of one sample (pandas Series, list or array).
            Values are read by position, never by label; only the first
            ``len(parameters)`` entries are used.
        parameters: Sequence of weights, one per feature.

    Returns:
        The probability ``p`` as a NumPy float.
    """
    weights = np.asarray(parameters, dtype=float)
    # Convert to a plain array first: integer keys on a label-indexed Series
    # (e.g. index "bias", "experience", ...) are treated as labels by newer
    # pandas versions, so X[i] is not a reliable positional lookup.
    features = np.asarray(X, dtype=float)[:weights.size]
    z = np.dot(features, weights)
    return 1 / (1 + np.exp(-z))

In [11]:
# Example: probability for the second training row under the initial parameters.
logistic(X_train.iloc[1], parameters)

0.5539596258792737

## Objective Function

Objective Function : 목적함수는 Gradient Descent를 통해 최적화하고자 하는 함수입니다.  
로지스틱 회귀의 목적함수를 작성해주세요
## $l(p) = -\sum_{i} \{y_{i}\log p(X_{i}) + (1-y_{i})\log(1-p(X_{i}))\}$

In [12]:
def minus_log_cross_entropy_i(X, y, parameters) :
    """Return the negative log-likelihood (cross entropy) of one sample.

    Args:
        X: Feature values of a single sample, passed on to ``logistic``.
        y: True label of the sample (0 or 1).
        parameters: Model weights, passed on to ``logistic``.

    Returns:
        ``-(y*log(p) + (1-y)*log(1-p))`` where ``p = logistic(X, parameters)``.
    """
    # Keep p strictly inside (0, 1): when the sigmoid saturates to exactly
    # 0 or 1, log(0) would yield inf (or nan via 0 * -inf) and poison the
    # summed loss.
    eps = 1e-15
    p = np.clip(logistic(X, parameters), eps, 1 - eps)
    loss = -(y*np.log(p) + (1-y)*np.log(1-p))
    return loss

In [13]:
def minus_log_cross_entropy(X_set, y_set, parameters) :
    """Return the total negative log-likelihood over a whole data set.

    Args:
        X_set: DataFrame of samples, one row per sample.
        y_set: Series of labels, positionally aligned with ``X_set``.
        parameters: Model weights.

    Returns:
        The sum of ``minus_log_cross_entropy_i`` over all rows (0 when
        ``X_set`` has no rows).
    """
    # Rows and labels are paired by position, not by index label.
    return sum(
        minus_log_cross_entropy_i(X_set.iloc[row, :], y_set.iloc[row], parameters)
        for row in range(X_set.shape[0])
    )

In [14]:
# Loss on the test set under the initial (untrained) parameters.
minus_log_cross_entropy(X_test, y_test, parameters)

46.1564553557467

## Gradient of Minus Log Cross Entropy

## $\frac{\partial}{\partial \theta_j}l(p) = -\sum_{i}(y_{i}-p_{i})x_{ij}$

In [15]:
# Partial derivative of the cross entropy with respect to theta_j for one sample
def get_gradient_ij_minus_log_cross_entropy(X, y, parameters, j):
    """Return the gradient of one sample's loss with respect to parameter ``j``.

    Args:
        X: Feature values of a single sample (pandas Series, list or array).
        y: True label of the sample (0 or 1).
        parameters: Model weights.
        j: Position of the parameter / feature to differentiate by.

    Returns:
        ``-(y - p) * x_j`` where ``p = logistic(X, parameters)`` and ``x_j``
        is the j-th feature value of the sample.
    """
    p = logistic(X, parameters)
    # Read the feature by position: an integer key on a label-indexed Series
    # is treated as a label by newer pandas versions.
    x_j = np.asarray(X)[j]
    gradient = -((y-p)*x_j)
    return gradient

In [16]:
# Example: gradient of the first training sample with respect to parameter 1.
get_gradient_ij_minus_log_cross_entropy(X_train.iloc[0, :], y_train.iloc[0], parameters, 1)

-0.12292274774446302

## Batch Gradient Descent  

Batch Gradient Descent : 학습 한 번에 모든 데이터셋에 대해 기울기를 구한다

In [17]:
def get_gradients_bgd(X_train, y_train, parameters) :
    """Return the batch gradient: per-parameter gradients summed over all samples.

    Args:
        X_train: DataFrame of training samples, one row per sample.
        y_train: Series of labels, positionally aligned with ``X_train``.
        parameters: Current model weights.

    Returns:
        A list with one accumulated gradient per parameter.
    """
    n_params = len(parameters)
    gradients = [0] * n_params

    for row in range(len(y_train)):
        features, label = X_train.iloc[row, :], y_train.iloc[row]
        # Add this sample's contribution to every parameter's gradient.
        for j in range(n_params):
            gradients[j] += get_gradient_ij_minus_log_cross_entropy(features, label, parameters, j)

    return gradients

In [18]:
# Batch gradients over the full training set under the initial parameters.
gradients_bgd = get_gradients_bgd(X_train, y_train, parameters)
gradients_bgd

[37.082517734342275, 0.6410018944907581, 38.08844876964129]

## Stochastic Gradient Descent  

Stochastic Gradient Descent : 학습 한 번에 임의로 뽑은 데이터 하나에 대해서만 기울기를 구한다

In [19]:
def get_gradients_sgd(X_train, y_train, parameters) :
    """Return the stochastic gradient computed from one randomly chosen sample.

    Args:
        X_train: DataFrame of training samples, one row per sample.
        y_train: Series of labels, positionally aligned with ``X_train``.
        parameters: Current model weights.

    Returns:
        A list with one gradient per parameter, all from the same sample.
    """
    # Draw a row position from [0, len(y_train)) using the random module.
    pick = int(random.random()*len(y_train))
    sample, label = X_train.iloc[pick, :], y_train.iloc[pick]

    return [
        get_gradient_ij_minus_log_cross_entropy(sample, label, parameters, j)
        for j in range(len(parameters))
    ]

In [20]:
# Stochastic gradients from one randomly drawn training sample.
gradients_sgd = get_gradients_sgd(X_train, y_train, parameters)
gradients_sgd

[0.6064896098352108, 0.07073576728428496, 0.2667002965447674]

## Update Parameters  

In [21]:
def update_parameters(parameters, gradients, learning_rate) :
    """Return the parameters after one gradient-descent step.

    Computes ``parameters - learning_rate * gradients`` element-wise.  The
    inputs are left untouched: neither ``parameters`` nor ``gradients`` is
    modified, so callers can keep using (or printing) them afterwards.

    Args:
        parameters: Current weights (array-like of numbers).
        gradients: Gradient per weight (array-like, same length).
        learning_rate: Step size multiplied into every gradient.

    Returns:
        A new NumPy float array holding the updated weights.
    """
    step = learning_rate * np.asarray(gradients, dtype=float)
    return np.asarray(parameters, dtype=float) - step

In [22]:
# Example: one update step with the batch gradients and a learning rate of 0.01.
update_parameters(parameters, gradients_bgd, 0.01)

array([-0.2422288 ,  0.04273214,  0.29735577])

## Gradient Descent  

위에서 작성한 함수들을 조합해서 Gradient Descent를 진행하는 함수를 완성해주세요

learning_rate = 학습률  
max_iter = 최대 반복 횟수  
tolerance = Step이 너무 작아서 더 이상의 학습이 무의미할 때 학습을 멈추는 조건

In [23]:
def gradient_descent(X_train, y_train, learning_rate=0.01, max_iter=100000, tolerance=0.0001, optimizer="bgd") :
    """Fit logistic-regression parameters by gradient descent.

    Starts from random parameters and repeatedly applies
    ``update_parameters`` using gradients from ``get_gradients_bgd`` or
    ``get_gradients_sgd``.  At every checkpoint (each 100 iterations for
    "bgd", each 10000 for "sgd") the training loss is printed and compared
    with the previous checkpoint's loss.

    Args:
        X_train: DataFrame of training samples, one row per sample.
        y_train: Series of labels, positionally aligned with ``X_train``.
        learning_rate: Step size for each update.
        max_iter: Upper bound on the iteration counter; the counter starts
            at 1, so at most ``max_iter - 1`` updates are performed.
        tolerance: Per-sample loss change below which training stops; the
            checkpoint loss difference is compared with
            ``tolerance * len(y_train)``.
        optimizer: "bgd" (batch) or "sgd" (stochastic).

    Returns:
        The fitted parameters as a NumPy array.

    Raises:
        ValueError: If ``optimizer`` is neither "bgd" nor "sgd".
    """
    # Fail fast: an unknown optimizer would otherwise loop with all-zero
    # gradients and never learn anything.
    if optimizer not in ("bgd", "sgd") :
        raise ValueError("optimizer must be 'bgd' or 'sgd', got %r" % (optimizer,))

    count = 1
    point = 100 if optimizer == "bgd" else 10000
    N = len(X_train.iloc[0])
    parameters = np.array([random.random() for i in range(N)])
    loss = minus_log_cross_entropy(X_train, y_train, parameters)

    while count < max_iter :

        if optimizer == "bgd" :
            gradients = get_gradients_bgd(X_train, y_train, parameters)
        else :
            gradients = get_gradients_sgd(X_train, y_train, parameters)

        # Checkpoint: report the loss and test the stopping condition.
        if count%point == 0 :
            new_loss = minus_log_cross_entropy(X_train, y_train, parameters)
            print(count, "loss: ",new_loss, "params: ", parameters, "gradients: ", gradients)

            # Stop once the loss has changed by less than the tolerance
            # since the previous checkpoint.
            if abs(new_loss-loss) < tolerance*len(y_train) :
                break
            loss = new_loss

        parameters = update_parameters(parameters, gradients, learning_rate)
        count += 1
    return parameters

In [24]:
# Train with the default settings (batch gradient descent).
new_param_bgd = gradient_descent(X_train, y_train)
new_param_bgd

100 loss:  45.39510953451567 params:  [-1.62199404  3.46815405 -3.29486231] gradients:  [0.2797622339833672, -0.9207802809546142, 0.8608057722063359]
200 loss:  44.80147112175683 params:  [-1.78137296  3.98915958 -3.77931606] gradients:  [0.084007035713743, -0.2729938881536049, 0.25247880025620884]
300 loss:  44.74037782612954 params:  [-1.83386937  4.15948719 -3.93658311] gradients:  [0.030223619647486336, -0.09791098040490197, 0.09025180241442368]
400 loss:  44.732145112016134 params:  [-1.85328106  4.222339   -3.99448372] gradients:  [0.01148562708639779, -0.037168658209264634, 0.034219870016600214]


array([-1.85328106,  4.222339  , -3.99448372])

## Hyper Parameter Tuning

Hyper Parameter들을 매번 다르게 해서 학습을 진행해 보세요. 다른 점들을 발견할 수 있습니다.

In [25]:
# Train again with stochastic gradient descent, all hyper-parameters spelled out.
new_param_sgd = gradient_descent(X_train, y_train, learning_rate=0.01, max_iter=100000, tolerance=0.0001, optimizer="sgd")
new_param_sgd

10000 loss:  46.35439864930777 params:  [-1.45358711  3.115251   -2.92113746] gradients:  [-0.9501962635300689, 0.7694395949164543, 0.33427464695140363]
20000 loss:  45.15061170663292 params:  [-1.74566501  3.65518045 -3.59817143] gradients:  [-0.283427933803996, -0.16434051429833382, 0.04362252785385278]
30000 loss:  44.810726118195696 params:  [-1.74451066  4.01098815 -3.78569248] gradients:  [0.3751163904650117, 0.3645270255229834, 0.2639284194247067]
40000 loss:  44.849110239531065 params:  [-1.88471099  4.0705781  -3.96400513] gradients:  [-0.6594565985522298, -0.7348282814042123, -0.5509852243571572]
50000 loss:  44.737810001815724 params:  [-1.86010768  4.20311472 -3.96105539] gradients:  [-0.014393060670048308, -0.026807654736459297, -0.006329265145913558]
60000 loss:  44.75203021771132 params:  [-1.84586003  4.30243532 -4.01689808] gradients:  [0.7528316741982555, 0.40969171715816255, -0.1158686803950903]


array([-1.84586003,  4.30243532, -4.01689808])

## Predict Label

In [26]:
# Predictions from the BGD-trained parameters: label 1 when p exceeds 0.5.
y_predict = []
for i in range(len(y_test)):
    p = logistic(X_test.iloc[i,:], new_param_bgd)
    y_predict.append(1 if p > 0.5 else 0)

# Baseline predictions from the untrained random parameters, same threshold.
y_predict_random = []
for i in range(len(y_test)):
    p = logistic(X_test.iloc[i,:], random_parameters)
    y_predict_random.append(1 if p > 0.5 else 0)

## Confusion Matrix

In [27]:
from sklearn.metrics import *
# Confusion matrix of the random-parameter baseline, flattened into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_predict_random).ravel()
confusion_matrix(y_test, y_predict_random)

array([[12, 28],
       [ 4,  6]], dtype=int64)

In [28]:
# Accuracy = correct predictions / all predictions, for the baseline.
random_accuracy = (tp+tn) / (tp+fn+fp+tn)
print("random_accuracy:",random_accuracy)

random_accuracy: 0.36


In [29]:
# Confusion matrix of the trained (BGD) model, flattened into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
confusion_matrix(y_test, y_predict)

array([[38,  2],
       [ 1,  9]], dtype=int64)

In [30]:
# Accuracy = correct predictions / all predictions, for the trained model.
accuracy = (tp+tn) / (tp+fn+fp+tn)
print("accuracy:",accuracy)

accuracy: 0.94
