In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [61]:
# Sigmoid-style squashing function (note the sign convention documented below)
def sigmoid(z):
    """Return ``1 / (1 + exp(z))`` element-wise.

    This is the logistic function evaluated at ``-z``: the exponent is NOT
    negated inside this function, so a caller that wants the usual logistic
    probability of a score ``s`` must pass ``z = -s``.

    Parameters
    ----------
    z : scalar or array-like
        Any input accepted by ``np.exp``.

    Returns
    -------
    NumPy scalar or ndarray
        Values between 0 and 1 with the same shape as ``z``.
    """
    denominator = np.exp(z) + 1
    return 1 / denominator

In [62]:
# Hypothesis function: linear score passed through the sigmoid
def hypothesis_function(x,theta):
    """Return the model output for every row of ``x``.

    The linear score is negated (``np.dot(-x, theta)``) before being handed
    to ``sigmoid``; since ``sigmoid`` computes ``1 / (1 + exp(z))``, the
    result is ``1 / (1 + exp(-x . theta))``.

    Parameters
    ----------
    x : ndarray
        Design matrix; it is negated and multiplied by ``theta`` via ``np.dot``.
    theta : ndarray
        Parameter vector.

    Returns
    -------
    ndarray
        Whatever ``sigmoid`` returns for the negated linear score.
    """
    return sigmoid(np.dot(-x, theta))

In [63]:
# Cost function: mean binary cross-entropy of the logistic model
def compute_cost(x,y,theta):
    """Return the mean binary cross-entropy (log-loss).

    J = -(1/m) * [ y^T log(h) + (1 - y)^T log(1 - h) ],
    where h = hypothesis_function(x, theta) and m = y.shape[0].

    Parameters
    ----------
    x : ndarray
        Design matrix passed straight to ``hypothesis_function``.
    y : ndarray
        Target labels; ``y.shape[0]`` is used as the sample count.
    theta : ndarray
        Parameter vector passed straight to ``hypothesis_function``.

    Returns
    -------
    ndarray or scalar
        The result of the dot products scaled by ``-1/m`` (a ``(1, 1)``
        array when ``y`` and the hypothesis are column vectors).
    """
    m = y.shape[0]

    # Evaluate the hypothesis once and reuse it for both log terms.
    h = hypothesis_function(x, theta)

    # Keep predictions strictly inside (0, 1) so that log() never receives 0,
    # which would yield -inf and turn the cost into inf/nan.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)

    log_likelihood = y.T.dot(np.log(h)) + (1 - y).T.dot(np.log(1 - h))

    # The -1/m factor must scale BOTH terms of the sum; without the grouping
    # it would only multiply the first dot product.
    return (-1 / m) * log_likelihood

In [64]:
def minimize_gradient(x,y,theta,iterations=10000,alpha=0.01):
    """Fit ``theta`` with batch gradient descent.

    Every iteration computes the residual ``hypothesis_function(x, theta) - y``
    once and updates all coordinates of ``theta`` simultaneously from it.
    The gradient is the un-averaged sum ``x.T.dot(residual)`` (it is not
    divided by the number of samples), so ``alpha`` multiplies that sum.

    Parameters
    ----------
    x : ndarray
        Design matrix.
    y : ndarray
        Target labels, shaped so that ``hypothesis_function(x, theta) - y``
        is a valid element-wise difference.
    theta : array-like
        Initial parameters. The caller's array is left untouched.
    iterations : int, default 10000
        Number of gradient steps.
    alpha : float, default 0.01
        Step size.

    Returns
    -------
    theta : ndarray
        Parameters after the last step.
    Cost : ndarray
        ``compute_cost`` values recorded at iterations 0, 100, 200, ...
        (taken right after that iteration's update).
    Theta : ndarray
        Parameter snapshots recorded at the same iterations as ``Cost``.
    """
    # Work on a float copy: the caller's array is not modified, and an
    # integer-typed initial guess cannot truncate the updates.
    theta = np.array(theta, dtype=float)

    Cost  = []  # cost recorded every 100 updates
    Theta = []  # parameter snapshot recorded every 100 updates

    for I in range(iterations):
        # Residual is evaluated once per iteration from the current theta, so
        # every coordinate is updated from the same (pre-update) parameters.
        delta = hypothesis_function(x, theta) - y

        # Entry i equals sum_j delta_j * x[j, i], i.e. the per-coordinate
        # gradient for all coordinates at once.
        grad = x.T.dot(delta)

        # Rebinding (not in-place assignment) yields a fresh array each step,
        # so snapshots stored in Theta are never overwritten later.
        theta = theta - alpha * grad

        if I % 100 == 0:
            Theta.append(theta)
            Cost.append(compute_cost(x, y, theta))
    return theta, np.array(Cost), np.array(Theta)
            

# 1. Breast Cancer 데이터에 logistic 회귀 적용

In [65]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset as plain NumPy arrays (as_frame=False)
X, y = load_breast_cancer(return_X_y=True, as_frame=False)

# Hold out 33% of the samples for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1234
)

# Keep only the first three feature columns of each split
X_train = X_train[:, :3]
X_test = X_test[:, :3]

# Reshape the label vectors into column vectors of shape (n_samples, 1)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [66]:
# Preprocessing (standard scaling): statistics are fitted on the training split only
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Prepend a column of ones to each design matrix (intercept term)
n = X_train.shape[0]
n_test = X_test.shape[0]
X_train = np.concatenate((np.ones((n, 1)), X_train), axis=1)
X_test = np.concatenate((np.ones((n_test, 1)), X_test), axis=1)

In [67]:
# One parameter per design-matrix column (derived from X_train instead of a
# hard-coded count, so it stays correct if the number of features changes)
theta = np.ones((X_train.shape[1], 1))
X_train.shape

(381, 4)

In [68]:
# Gradient descent: fit theta on the training split (default iterations / alpha)
theta, Cost, Theta = minimize_gradient(x=X_train, y=y_train, theta=theta)

In [69]:
# Measure accuracy: model outputs above 0.5 are labelled 1, everything else 0
train_output = hypothesis_function(X_train, theta)
test_output = hypothesis_function(X_test, theta)
y_train_pred = np.where(train_output > 0.5, 1, 0)
y_test_pred = np.where(test_output > 0.5, 1, 0)

# Percentage of samples whose predicted label equals the true label
train_accuracy = (y_train == y_train_pred).sum() / len(y_train) * 100
test_accuracy = (y_test == y_test_pred).sum() / len(y_test) * 100

print(f'학습 데이터셋 정확도:{train_accuracy: .2f}%')
print(f'테스트 데이터셋 정확도:{test_accuracy: .2f}%')

학습 데이터셋 정확도: 93.18%
테스트 데이터셋 정확도: 87.23%


# 2. LogisticRegression 클래스 사용하기

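1번에서 경사하강법을 직접 구현한 것과 달리, scikit-learn의 `LogisticRegression` 클래스를 사용하면 간단하게 로지스틱 회귀로 예측할 수 있음! (1번과 동일한 데이터 분할과 동일한 3개 특성을 사용하되, 표준 스케일링은 적용하지 않음)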

In [70]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the dataset as pandas objects this time (as_frame=True)
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Same hold-out fraction and seed as the split used in section 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1234
)

# Restrict both splits to the first three feature columns
X_train, X_test = X_train.iloc[:, :3], X_test.iloc[:, :3]

In [71]:
# Build the classifier and fit it on the training split (fit returns the estimator itself)
clf = LogisticRegression(random_state=1234, max_iter=100, C=100).fit(X_train, y_train)

# Predicted labels for both splits
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Percentage of samples whose predicted label equals the true label
train_accuracy = (y_train == y_train_pred).sum() / len(y_train) * 100
test_accuracy = (y_test == y_pred).sum() / len(y_test) * 100

print(f'학습 데이터셋 정확도:{train_accuracy: .2f}%')
print(f'테스트 데이터셋 정확도:{test_accuracy: .2f}%')

학습 데이터셋 정확도: 93.18%
테스트 데이터셋 정확도: 87.23%


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
