In [3]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

In [4]:
# Warning 무시
import warnings
warnings.filterwarnings('ignore')

## sklearn.linear_model.LogisticRegression

* _class_ sklearn.linear_model.LogisticRegression(_penalty='l2'_,  _*_,  _dual=False_,  _tol=0.0001_,  _C=1.0_,  _fit_intercept=True_,  _intercept_scaling=1_,  _class_weight=None_,  _random_state=None_,  _solver='lbfgs'_,  _max_iter=100_,  _multi_class='auto'_,  _verbose=0_,  _warm_start=False_,  _n_jobs=None_,  _l1_ratio=None_)[[source]](https://github.com/scikit-learn/scikit-learn/blob/364c77e04/sklearn/linear_model/_logistic.py#L783)[](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression "Permalink to this definition")


* **solver**
     : {‘lbfgs’, ‘liblinear’, ‘newton-cg’, ‘newton-cholesky’, ‘sag’, ‘saga’}, default=’lbfgs’

     - Algorithm to use in the optimization problem. Default is ‘lbfgs’. To choose a solver, you might want to consider the following aspects:
     
     > -   For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones;
     >     
     > -   For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss;
     >     
     > -   ‘liblinear’ is limited to one-versus-rest schemes.
     >     
     > -   ‘newton-cholesky’ is a good choice for  `n_samples`  >>  `n_features`, especially with one-hot encoded categorical features with rare categories. Note that it is limited to binary classification and the one-versus-rest reduction for multiclass classification. Be aware that the memory usage of this solver has a quadratic dependency on  `n_features`  because it explicitly computes the Hessian matrix.
>

* **multi_class**
     : {‘auto’, ‘ovr’, ‘multinomial’}, default=’auto’

     If the option chosen is ‘ovr’, then a binary problem is fit for each label. For ‘multinomial’ the loss minimised is the multinomial loss fit across the entire probability distribution,  _even when the data is binary_. ‘multinomial’ is unavailable when solver=’liblinear’. ‘auto’ selects ‘ovr’ if the data is binary, or if solver=’liblinear’, and otherwise selects ‘multinomial’.

     New in version 0.18: Stochastic Average Gradient descent solver for ‘multinomial’ case.

In [5]:
# 위스콘신 유방암 데이터 불러오기
cancer = load_breast_cancer()

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# StandardScaler()로 평균이 0, 분산 1로 데이터 분포도 변환
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

X_train, X_test, y_train, y_test = train_test_split(data_scaled, cancer.target, test_size=0.3, random_state=0)

In [7]:
from sklearn.metrics import accuracy_score, roc_auc_score

# 로지스틱 회귀를 이용하여 학습 및 예측 수행
lr_clf = LogisticRegression()

lr_clf.fit(X_train, y_train)

lr_preds = lr_clf.predict(X_test)

# accuracy와 roc_auc 측정
print(f'accuracy : {accuracy_score(y_test, lr_preds):0.3f}')
print(f'roc_auc : {roc_auc_score(y_test, lr_preds):0.3f}')

accuracy : 0.977
roc_auc : 0.972


In [8]:
from sklearn.model_selection import GridSearchCV

params = {'penalty' : ['l2', 'l1'],
          'C' : [0.01, 0.1, 1, 1, 5, 10]}

grid_clf = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=3)
grid_clf.fit(data_scaled, cancer.target)

print(f'최적 하이퍼 파라미터 : {grid_clf.best_params_}, 최적 평균 정확도 : {grid_clf.best_score_:.3f}')

최적 하이퍼 파라미터 : {'C': 1, 'penalty': 'l2'}, 최적 평균 정확도 : 0.975


> LogisticRegression(multi_class='auto') 가 기본값

파라미터 조정을 통해 값이 달라지는지 확인해보자

In [9]:
# multi_class='ovr'
lr_clf = LogisticRegression(multi_class='ovr')

lr_clf.fit(X_train, y_train)

lr_preds = lr_clf.predict(X_test)
print(f'accuracy : {accuracy_score(y_test, lr_preds):0.3f}')
print(f'roc_auc : {roc_auc_score(y_test, lr_preds):0.3f}')

accuracy : 0.977
roc_auc : 0.972


In [10]:
# multi_class='multinomial'
lr_clf = LogisticRegression(multi_class='multinomial')

lr_clf.fit(X_train, y_train)

lr_preds = lr_clf.predict(X_test)
print(f'accuracy : {accuracy_score(y_test, lr_preds):0.3f}')
print(f'roc_auc : {roc_auc_score(y_test, lr_preds):0.3f}')

accuracy : 0.953
roc_auc : 0.953
