In [46]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

In [47]:
import sys
import warnings

import warnings
warnings.filterwarnings('ignore')


## sklearn.linear_model.LogisticRegression
* class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)


* solver{‘lbfgs’, ‘liblinear’, ‘newton-cg’, ‘newton-cholesky’, ‘sag’, ‘saga’}, default=’lbfgs’
    Algorithm to use in the optimization problem. Default is ‘lbfgs’. To choose a solver, you might want to consider the following aspects:

    For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones;

    For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss;

    ‘liblinear’ is limited to one-versus-rest schemes.

    ‘newton-cholesky’ is a good choice for n_samples >> n_features, especially with one-hot encoded categorical features with rare categories. Note that it is limited to binary classification and the one-versus-rest reduction for multiclass classification. Be aware that the memory usage of this solver has a quadratic dependency on n_features because it explicitly computes the Hessian matrix.

In [48]:
cancer = load_breast_cancer()

In [49]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler= StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

X_train, X_test, y_train, y_test = train_test_split(\
    data_scaled, cancer.target, test_size=0.3)

In [50]:
from sklearn.metrics import accuracy_score, roc_auc_score

lr_clf = LogisticRegression(random_state=0)
lr_clf.fit(X_train, y_train)
lr_preds= lr_clf.predict(X_test)

print(accuracy_score(y_test, lr_preds))
print(roc_auc_score(y_test, lr_preds))

0.9649122807017544
0.9614864864864865


In [51]:
from sklearn.model_selection import GridSearchCV

params={'penalty':['l2','l1'], 'C':[0.01, 0.1, 1, 5, 10]}

grid_clf = GridSearchCV(lr_clf, param_grid=params, \
    scoring='accuracy', cv = 3)
grid_clf.fit(data_scaled, cancer.target)
print(grid_clf.best_params_, grid_clf.best_score_)

{'C': 1, 'penalty': 'l2'} 0.975392184164114


* multi_class{‘auto’, ‘ovr’, ‘multinomial’}, default=’auto’
    If the option chosen is ‘ovr’, then a binary problem is fit for each label. For ‘multinomial’ the loss minimised is the multinomial loss fit across the entire probability distribution, even when the data is binary. ‘multinomial’ is unavailable when solver=’liblinear’. ‘auto’ selects ‘ovr’ if the data is binary, or if solver=’liblinear’, and otherwise selects ‘multinomial’.

    New in version 0.18: Stochastic Average Gradient descent solver for ‘multinomial’ case.

    Changed in version 0.22: Default changed from ‘ovr’ to ‘auto’ in 0.22.

In [54]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(\
    iris.data, iris.target, test_size=0.3, random_state=0)
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)
lr.predict_proba(X_test)

array([[1.31716029e-04, 5.98488920e-02, 9.40019392e-01],
       [1.33565514e-02, 9.57041781e-01, 2.96016675e-02],
       [9.82927047e-01, 1.70728971e-02, 5.56552535e-08],
       [1.60489004e-06, 2.60795787e-02, 9.73918816e-01],
       [9.68275765e-01, 3.17239990e-02, 2.35580864e-07],
       [2.47764187e-06, 6.75430567e-03, 9.93243217e-01],
       [9.80358820e-01, 1.96410769e-02, 1.03121750e-07],
       [3.00907754e-03, 7.50151524e-01, 2.46839398e-01],
       [1.62309775e-03, 7.41284239e-01, 2.57092664e-01],
       [2.11392098e-02, 9.32798279e-01, 4.60625107e-02],
       [1.02523608e-04, 1.57158717e-01, 8.42738760e-01],
       [7.26676157e-03, 8.08697061e-01, 1.84036178e-01],
       [4.34749020e-03, 7.83988223e-01, 2.11664287e-01],
       [3.23126456e-03, 7.61206968e-01, 2.35561768e-01],
       [4.09222646e-03, 7.03027094e-01, 2.92880679e-01],
       [9.81573297e-01, 1.84266187e-02, 8.41351910e-08],
       [7.01146482e-03, 7.50156985e-01, 2.42831550e-01],
       [1.20109647e-02, 8.31519

In [55]:
lr_preds= lr.predict(X_test)
print(lr_preds)
print(accuracy_score(y_test, lr_preds))

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2 1 1 2 0 2 0 0]
0.9777777777777777


In [56]:
lr = LogisticRegression(multi_class='ovr', random_state=0)
lr.fit(X_train, y_train)
# lr_preds= lr_clf.predict(X_test)
lr.predict_proba(X_test)


array([[7.24937235e-04, 2.32958500e-01, 7.66316562e-01],
       [2.44958162e-02, 9.19568033e-01, 5.59361510e-02],
       [9.66252536e-01, 3.37431743e-02, 4.28956375e-06],
       [4.46524627e-05, 3.14857812e-01, 6.85097535e-01],
       [8.64639796e-01, 1.35351358e-01, 8.84606690e-06],
       [1.15801645e-04, 1.48520801e-01, 8.51363397e-01],
       [9.00857079e-01, 9.91367684e-02, 6.15241185e-06],
       [7.24851498e-03, 4.80178868e-01, 5.12572617e-01],
       [3.64042142e-03, 5.63422030e-01, 4.32937549e-01],
       [4.89451037e-02, 8.26714243e-01, 1.24340654e-01],
       [3.65863594e-04, 4.31798181e-01, 5.67835955e-01],
       [1.75762261e-02, 5.20334227e-01, 4.62089546e-01],
       [7.27580031e-03, 6.63148700e-01, 3.29575500e-01],
       [6.53117799e-03, 5.80335736e-01, 4.13133086e-01],
       [6.81018265e-03, 5.56360712e-01, 4.36829105e-01],
       [8.93154898e-01, 1.06839676e-01, 5.42675615e-06],
       [1.16903381e-02, 5.79073572e-01, 4.09236090e-01],
       [1.50936005e-02, 7.64295

In [57]:
lr_preds= lr.predict(X_test)
print(accuracy_score(y_test, lr_preds))

0.9555555555555556


In [58]:
lr = LogisticRegression(multi_class='multinomial', random_state=0)
lr.fit(X_train, y_train)
# lr_preds= lr_clf.predict(X_test)
lr.predict_proba(X_test)

array([[1.31716029e-04, 5.98488920e-02, 9.40019392e-01],
       [1.33565514e-02, 9.57041781e-01, 2.96016675e-02],
       [9.82927047e-01, 1.70728971e-02, 5.56552535e-08],
       [1.60489004e-06, 2.60795787e-02, 9.73918816e-01],
       [9.68275765e-01, 3.17239990e-02, 2.35580864e-07],
       [2.47764187e-06, 6.75430567e-03, 9.93243217e-01],
       [9.80358820e-01, 1.96410769e-02, 1.03121750e-07],
       [3.00907754e-03, 7.50151524e-01, 2.46839398e-01],
       [1.62309775e-03, 7.41284239e-01, 2.57092664e-01],
       [2.11392098e-02, 9.32798279e-01, 4.60625107e-02],
       [1.02523608e-04, 1.57158717e-01, 8.42738760e-01],
       [7.26676157e-03, 8.08697061e-01, 1.84036178e-01],
       [4.34749020e-03, 7.83988223e-01, 2.11664287e-01],
       [3.23126456e-03, 7.61206968e-01, 2.35561768e-01],
       [4.09222646e-03, 7.03027094e-01, 2.92880679e-01],
       [9.81573297e-01, 1.84266187e-02, 8.41351910e-08],
       [7.01146482e-03, 7.50156985e-01, 2.42831550e-01],
       [1.20109647e-02, 8.31519

In [59]:
lr_preds= lr.predict(X_test)
print(accuracy_score(y_test, lr_preds))

0.9777777777777777
