# ロジスティック回帰

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

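$  P(Y = 1| X = \boldsymbol{x}) = \sigma(\omega_0 + \sum^{d}_{j=1}x_j\omega_j) = \sigma(\boldsymbol{\omega}^T \tilde{\boldsymbol{x}}) $\
$ \sigma(\xi) = \frac{1}{1 + e^{-\xi}} $\
$ \tilde{\boldsymbol{x}} = (1, x_1, \ldots, x_d)^T, ~~ \boldsymbol{\omega} = (\omega_0, \omega_1, \ldots, \omega_d)^T $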

$ P(Y = y| X = \boldsymbol{x}) = P(Y = 1| X = \boldsymbol{x})^y P(Y = 0| X = \boldsymbol{x})^{1-y} $\
$ ~~~~~~~~~~~~~~~~~~~~~~~~~~~= \sigma(\tilde{\boldsymbol{x}}^T\boldsymbol{\omega})^y (1 - \sigma(\tilde{\boldsymbol{x}}^T\boldsymbol{\omega}))^{1-y} $

$ \boldsymbol{\tilde{y}} = ( \sigma(\boldsymbol{\omega}^T\tilde{\boldsymbol{x}}_1), \sigma(\boldsymbol{\omega}^T\tilde{\boldsymbol{x}}_2), \ldots , \sigma(\boldsymbol{\omega}^T\tilde{\boldsymbol{x}}_n) )^T $

$
    R = \begin{pmatrix}
        \tilde{y}_1(1 - \tilde{y}_1) \\
         & \tilde{y}_2(1 - \tilde{y}_2) \\
         && \ddots \\
         &&& \tilde{y}_n(1 - \tilde{y}_n) \\
    \end{pmatrix} \\
    H = \boldsymbol{\tilde{X}^TR\tilde{X}}
$

$ \boldsymbol{\omega}^{new} = \boldsymbol{\omega}^{old} - \boldsymbol{H}^{-1}\nabla E(\boldsymbol{\omega}^{old}), ~~ \nabla E(\boldsymbol{\omega}) = \boldsymbol{\tilde{X}}^{T}(\boldsymbol{\tilde{y}} - \boldsymbol{y}) $\
$ \boldsymbol{\omega}^{new} = (\boldsymbol{\tilde{X}}^{T}\boldsymbol{R}\boldsymbol{\tilde{X}})^{-1}\boldsymbol{\tilde{X}}^{T}\boldsymbol{R}\,[\boldsymbol{\tilde{X}}\boldsymbol{\omega}^{old} - \boldsymbol{R}^{-1}(\boldsymbol{\tilde{y}} - \boldsymbol{y}) ] $

In [2]:
import os, subprocess
import csv
from solver import LogisticRegression

In [3]:
# Download the WDBC data file from the UCI archive, but only when it is not
# already present in the working directory.
filename = "wdbc.data"
if not os.path.isfile(filename):
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
    # --fail: exit non-zero on an HTTP error instead of saving the error page
    #         as the data file (which would then be treated as "already downloaded").
    # --location: follow redirects.
    # --silent --show-error: no progress meter, but keep the error text on stderr.
    # --output: write to exactly the path the rest of the notebook reads.
    result = subprocess.run(
        ['curl', '--fail', '--location', '--silent', '--show-error',
         '--output', filename, url],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    print(result)
    if result.returncode != 0:
        # Do not leave a partial file behind: the existence check above would
        # otherwise skip the download on the next run.
        if os.path.isfile(filename):
            os.remove(filename)
        raise RuntimeError(
            "Failed to download {0} (curl exit code {1}): {2}".format(
                url, result.returncode, result.stderr.strip()
            )
        )

In [4]:
# Number of rows held out for testing; consumed by the train/test split cell.
n_test = 100
X = []
y = []

# Parse the CSV: column 1 is the diagnosis label ("B" -> 0, "M" -> 1) and
# columns 2.. are the numeric features. Column 0 is not used.
b_count, m_count = 0, 0
# newline="" leaves newline handling to the csv module, as its documentation recommends.
with open(filename, newline="") as fp:
    reader = csv.reader(fp)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on row[1].
            continue
        label = row[1]
        if label == "B":
            y.append(0)
            b_count += 1
        elif label == "M":
            y.append(1)
            m_count += 1
        else:
            # Anything else means the file is not the expected data (for
            # example a corrupted download); do not silently count it as "M".
            raise ValueError(
                "{0}, line {1}: unexpected diagnosis label {2!r}".format(
                    filename, reader.line_num, label
                )
            )
        X.append(row[2:])

# Feature strings are converted to floats here; labels are stored as 0.0 / 1.0.
X = np.array(X, dtype=np.float64)
y = np.array(y, dtype=np.float64)

print("良性:{0:d}, 悪性:{1:d}".format(b_count, m_count))

良性:357, 悪性:212


In [5]:
# Positional hold-out split: the last n_test rows form the test set and
# everything before them is used for training (no shuffling is applied).
X_train, X_test = X[:-n_test], X[-n_test:]
y_train, y_test = y[:-n_test], y[-n_test:]

# Fit the project-local logistic regression solver on the training rows.
model = LogisticRegression(
    tol=0.01,
    max_iter=100,
    random_seed=1,
)
model.fit(X_train, y_train)

   1: Diff=1.000000e+00, Abs=1.882873e+10
   2: Diff=5.571873e-01, Abs=2.977844e+10
   3: Diff=3.667929e-01, Abs=3.858524e+10
   4: Diff=2.436406e-01, Abs=4.413354e+10
   5: Diff=1.767668e-01, Abs=4.774337e+10
   6: Diff=1.199008e-01, Abs=4.962236e+10
   7: Diff=1.177193e-01, Abs=5.345022e+10
   8: Diff=8.657243e-02, Abs=5.495078e+10
   9: Diff=1.417272e-01, Abs=5.985155e+10
  10: Diff=1.324112e-01, Abs=5.819850e+10
  11: Diff=9.906476e-02, Abs=6.148620e+10
  12: Diff=7.040512e-02, Abs=6.208431e+10
  13: Diff=8.628514e-02, Abs=6.435014e+10
  14: Diff=5.809449e-02, Abs=6.477067e+10
  15: Diff=6.641806e-02, Abs=6.577039e+10
  16: Diff=8.245089e-02, Abs=6.656222e+10
  17: Diff=9.373977e-02, Abs=7.177143e+10
  18: Diff=9.291837e-02, Abs=7.059929e+10
  19: Diff=5.470154e-02, Abs=7.214281e+10
  20: Diff=1.281598e-01, Abs=6.984335e+10
  21: Diff=7.079397e-02, Abs=7.312536e+10
  22: Diff=1.430424e-01, Abs=6.695258e+10
  23: Diff=1.293375e-01, Abs=7.175126e+10
  24: Diff=5.283794e-02, Abs=7.285

  return 1 / (1 + np.exp(-x))


In [6]:
# Predict the held-out rows and report the fraction of exact label matches.
y_predict = model.predict(X_test)
matches = (y_test == y_predict)
n_hits = matches.sum()
accuracy = n_hits / n_test
print("Accuracy: {0:d}/{1:d} = {2:.2f}".format(n_hits, n_test, accuracy))

Accuracy: 97/100 = 0.97
