넘파이만 사용하여 조기 종료를 사용한 배치 경사 하강법으로 소프트맥스 회귀를 구현

In [255]:
import numpy as np

In [256]:
# Load the iris dataset with as_frame=True so the data and target come back as
# pandas objects.
from sklearn.datasets import load_iris

iris=load_iris(as_frame=True)
# List the keys available on the returned dataset object.
list(iris)

['data',
 'target',
 'frame',
 'target_names',
 'DESCR',
 'feature_names',
 'filename',
 'data_module']

In [257]:
# Preview the first rows of the feature table.
iris.data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [258]:
# Preview the first rows of the integer target labels.
iris.target.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [259]:
# Class names that correspond to the integer target labels.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [260]:
# Features: the two petal measurements. Labels: the integer class index of
# each sample.
petal_columns = ["petal length (cm)", "petal width (cm)"]
X = iris.data[petal_columns].values
y = iris.target.values

In [261]:
# Prepend a column of ones to every sample so that the first row of the
# parameter matrix acts as the bias term.
X_with_bias = np.hstack([np.ones((len(X), 1)), X])

In [262]:
# Split the samples into train / validation / test sets by hand (without
# scikit-learn's train_test_split).
test_ratio = 0.2
validation_ratio = 0.2
total_size = len(X_with_bias)

test_size = int(total_size * test_ratio)
validation_size = int(total_size * validation_ratio)
train_size = total_size - test_size - validation_size

# Fixed seed so the shuffle, and therefore the split, is reproducible.
np.random.seed(42)
rnd_indices = np.random.permutation(total_size)

# Slice with explicit non-negative boundaries. The negative-index form
# (`[-test_size:]`, `[train_size:-test_size]`) silently misbehaves when
# test_size is 0 because `-0` is just `0`: the test set would become the whole
# dataset and the validation set would become empty.
valid_end = train_size + validation_size
train_indices = rnd_indices[:train_size]
valid_indices = rnd_indices[train_size:valid_end]
test_indices = rnd_indices[valid_end:]

X_train = X_with_bias[train_indices]
y_train = y[train_indices]
X_valid = X_with_bias[valid_indices]
y_valid = y[valid_indices]
X_test = X_with_bias[test_indices]
y_test = y[test_indices]

In [263]:
# Convert each class-index vector into a matrix holding one one-hot row per
# sample.
# Use a single class count for all three splits: sizing each matrix from its
# own max() would give fewer columns than the model has outputs whenever a
# split happens not to contain the highest class index.
n_classes = max(y_train.max(), y_valid.max(), y_test.max()) + 1
one_hot_rows = np.eye(n_classes)  # row k is the one-hot vector of class k

y_train_one_hot = one_hot_rows[y_train]
y_valid_one_hot = one_hot_rows[y_valid]
y_test_one_hot = one_hot_rows[y_test]

### 소프트맥스 회귀

In [264]:
def softmax(logits):
    """Return the row-wise softmax of a 2-D array of logits.

    Args:
        logits: array-like of shape (n_samples, n_classes).

    Returns:
        ndarray of the same shape in which every row is non-negative and
        sums to 1.
    """
    logits = np.asarray(logits)
    # Subtract each row's maximum before exponentiating. Softmax is invariant
    # to a per-row shift, and this keeps np.exp from overflowing to inf (which
    # would turn the division below into inf/inf = nan) for large logits.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = exps.sum(axis=1, keepdims=True)
    return exps / exp_sums

### 배치 경사 하강법

In [276]:
# Settings for batch gradient descent; eta controls the size of each step.

m, n_inputs = X_train.shape  # number of training samples, inputs per sample
n_outputs = np.unique(y_train).size  # number of distinct classes in y_train

eta = 0.5  # learning rate
n_epochs = 5001
epsilon = 1e-5

np.random.seed(42)
theta = np.random.randn(n_inputs, n_outputs)  # random initial model parameters

In [266]:
# One pass over the whole training set is called an epoch.
for epoch in range(n_epochs):
    # Forward pass: class probabilities for every training sample.
    logits=X_train@theta
    y_proba=softmax(logits)
    
    # Every 1000 epochs, print the mean cross-entropy on the validation set.
    # epsilon is added inside log() so a zero probability cannot produce -inf.
    if epoch%1000==0:
        y_proba_valid=softmax(X_valid@theta)
        xentropy_losses=-(y_valid_one_hot*np.log(y_proba_valid+epsilon))
        print(epoch, xentropy_losses.sum(axis=1).mean())
        
    # Batch gradient step: gradient averaged over all m training samples.
    error=y_proba-y_train_one_hot    
    gradients=1/m*X_train.T@error
    theta=theta-eta*gradients

0 7.589413698235447
1000 0.23722564547268396
2000 0.20299776800079783
3000 0.184945459940424
4000 0.17234190163896407
5000 0.16251946024698435


In [267]:
# Display the learned parameter matrix.
theta

array([[ 14.36543757,   3.45861431, -16.81791349],
       [ -2.26955728,   0.93607207,   2.38822474],
       [ -4.24164817,  -0.84400575,   6.96282708]])

In [268]:
# Predict the most probable class for every validation sample.
logits = X_valid @ theta
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

# Accuracy: fraction of validation samples whose prediction matches the label.
accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9333333333333333

In [277]:
# Add L2 regularization: the loss gains an L2 penalty and the gradient gains
# the matching extra term. The bias row theta[0] is not regularized.
alpha = 0.01  # regularization strength

# Start from a fresh random initialization so this cell does not silently
# continue from parameters already trained by an earlier cell. The recorded
# epoch-0 loss is only reproduced from this seed-42 starting point.
np.random.seed(42)
theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(n_epochs):
    # Forward pass: class probabilities for every training sample.
    logits = X_train @ theta
    y_proba = softmax(logits)

    # Every 1000 epochs, print the regularized loss on the validation set.
    if epoch % 1000 == 0:
        y_proba_valid = softmax(X_valid @ theta)
        xentropy_losses = -(y_valid_one_hot * np.log(y_proba_valid + epsilon))
        l2_loss = 1 / 2 * (theta[1:] ** 2).sum()
        total_loss = xentropy_losses.sum(axis=1).mean() + alpha * l2_loss
        print(epoch, total_loss.round(4))

    # Batch gradient of the cross-entropy, plus the L2 term for every row
    # except the bias row (hence the leading row of zeros).
    error = y_proba - y_train_one_hot
    gradients = 1 / m * X_train.T @ error
    gradients += np.r_[np.zeros([1, n_outputs]), alpha * theta[1:]]
    theta = theta - eta * gradients

0 7.6181
1000 0.3435
2000 0.2753
3000 0.2746
4000 0.2743
5000 0.2741


In [278]:
# Predict the most probable class for every validation sample.
logits = X_valid @ theta
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

# Accuracy: fraction of validation samples whose prediction matches the label.
accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9333333333333333

### 조기종료

In [279]:
# Early stopping: measure the regularized validation loss at every epoch, stop
# as soon as it fails to improve, and roll theta back to the parameters that
# achieved the best loss.
eta = 0.5
n_epochs = 50_001
m = len(X_train)
epsilon = 1e-5
C = 100  # regularization hyperparameter (the L2 term is scaled by 1/C)
best_loss = np.inf  # np.infty was removed in NumPy 2.0; np.inf is the supported name

np.random.seed(42)
theta = np.random.randn(n_inputs, n_outputs)
best_theta = theta.copy()

for epoch in range(n_epochs):
    logits = X_train @ theta
    y_proba = softmax(logits)

    # Regularized validation loss for the current parameters.
    y_proba_valid = softmax(X_valid @ theta)
    xentropy_losses = -(y_valid_one_hot * np.log(y_proba_valid + epsilon))
    l2_loss = 1 / 2 * (theta[1:] ** 2).sum()
    total_loss = xentropy_losses.sum(axis=1).mean() + 1 / C * l2_loss
    if epoch % 1000 == 0:
        print(epoch, total_loss.round(4))

    if total_loss < best_loss:
        best_loss = total_loss
        # Remember the parameters that produced the best loss so far.
        best_theta = theta.copy()
    else:
        # The loss did not improve: report the best and current loss and stop.
        print(epoch - 1, best_loss.round(4))
        print(epoch, total_loss.round(4), "Ended")
        break

    # Batch gradient step with the L2 term applied to all rows but the bias.
    error = y_proba - y_train_one_hot
    gradients = 1 / m * X_train.T @ error
    gradients += np.r_[np.zeros([1, n_outputs]), 1 / C * theta[1:]]
    theta = theta - eta * gradients

# Without this, theta would be left at the parameters whose loss had already
# become worse than best_loss when the loop stopped.
# NOTE(review): stopping on the very first non-improving epoch is sensitive to
# a noisy loss curve; consider a patience window or a smaller eta.
theta = best_theta

0 7.6181
1 1.2429
2 1.276 Ended


In [281]:
# Predict the most probable class for every validation sample.
logits = X_valid @ theta
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

# Accuracy: fraction of validation samples whose prediction matches the label.
accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.4