In [None]:
"""
Logistic regression: definition
The probability of class 1 is P(y = 1 | x) = exp(wx) / (1 + exp(wx)),
and correspondingly P(y = 0 | x) = 1 / (1 + exp(wx)).

Let P(y = 1 | x) = f(z) = exp(z) / (1 + exp(z)), where z = wx. Differentiating f with respect to z
gives df/dz = f(z) * (1 - f(z)).

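Loss for a single sample (negative log-likelihood), with f(z) = exp(z) / (1 + exp(z)):
loss = -[y * ln f(z) + (1 - y) * ln(1 - f(z))]
When y = 1, loss = -ln f(z): if f(z) -> 1 then loss -> 0; if f(z) -> 0 then loss -> infinity.
When y = 0, loss = -ln(1 - f(z)): if f(z) -> 0 then loss -> 0; if f(z) -> 1 then loss -> infinity.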

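Gradient of the loss
With loss = -[y * ln f(z) + (1 - y) * ln(1 - f(z))], z = wx and df/dz = f(z) * (1 - f(z)), the chain rule gives
d(loss)/dz = -[y / f(z) - (1 - y) / (1 - f(z))] * f(z) * (1 - f(z)) = f(z) - y, and dz/d(w_i) = x_i,
so d(loss)/d(w_i) = (f(z) - y) * x_i. Gradient descent therefore updates w_i += lr * (y - f(z)) * x_i.
Note (original author): I was still unsure about this derivation and wanted to re-check it carefully.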
"""

import numpy as np
import pandas as pd
from sklearn import datasets
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import accuracy_score

# Load the breast-cancer data set that ships with scikit-learn.
cancer = datasets.load_breast_cancer()

# Features become a DataFrame with one named column per feature; the labels
# become a single-column DataFrame called "target".
df_X = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
df_y = pd.DataFrame({"target": cancer.target})

"""
Note: the features must be standardized before they are passed to the model below.
"""
class LogisticRegression(object):
    """Binary logistic regression trained with single-sample SGD.

    The model is P(y = 1 | x) = sigmoid(w . [x, 1.0]). The trailing 1.0 is the
    bias input, so ``w`` stores one weight per feature followed by the bias.
    """

    def __init__(self, iteration, feature_num, lr=0.005):
        """Create an untrained model with all weights set to zero.

        Args:
            iteration: Number of single-sample updates ``train`` performs.
            feature_num: Number of input features (the bias is added here).
            lr: Learning rate used for every weight update.
        """
        self.lr = lr
        self.max_iteration = iteration
        # w.x + b: the last entry of w is the bias b.
        self.w = np.zeros(feature_num + 1)

    @staticmethod
    def _sigmoid(z):
        """Return 1 / (1 + exp(-z)) without overflowing for large ``|z|``."""
        # Only ever exponentiate a non-positive number: exp() of a large
        # positive argument raises OverflowError, while exp() of a large
        # negative argument simply underflows to 0.0.
        if z >= 0:
            return 1.0 / (1.0 + math.exp(-z))
        exp_z = math.exp(z)
        return exp_z / (1.0 + exp_z)

    @staticmethod
    def _with_bias(feature):
        """Return ``feature`` as a float vector with the bias input 1.0 appended."""
        return np.append(np.asarray(feature, dtype=float), 1.0)

    def train(self, features, labels):
        """Fit the weights in place using ``max_iteration`` random single-sample updates.

        Args:
            features: Indexable collection of feature vectors, one per sample.
            labels: Indexable collection of 0/1 labels aligned with ``features``.

        Raises:
            ValueError: If ``labels`` is empty.
        """
        sample_count = len(labels)
        if sample_count == 0:
            raise ValueError("cannot train on an empty data set")

        for _ in range(self.max_iteration):
            # Each step draws one random sample and uses it to update every weight.
            index = random.randint(0, sample_count - 1)
            x = self._with_bias(features[index])
            y = labels[index]
            yhat = self._sigmoid(np.dot(self.w, x))
            # If y = 1 but yhat is small, (y - yhat) > 0, so each w[i] moves in
            # the direction of x[i] and w.x grows, raising P(y = 1) next time.
            # If y = 0 but yhat is large, the sign flips and w.x shrinks.
            self.w += self.lr * (y - yhat) * x

    def predict_single(self, x):
        """Return the predicted class (1 or 0) for one bias-augmented vector ``x``.

        P(y = 1) > P(y = 0) holds exactly when w.x > 0, so the decision is made
        on the sign of w.x and no exponential has to be evaluated.
        """
        return 1 if np.dot(self.w, x) > 0 else 0

    def predict_batch(self, features):
        """Return a list with the predicted class (1 or 0) of every row in ``features``.

        The rows must not contain the bias input; it is appended here.
        """
        return [self.predict_single(self._with_bias(feature)) for feature in features]


# Raw (unscaled) feature matrix and label vector.
np_X = df_X.to_numpy()
np_y = df_y["target"].to_numpy()

# Split BEFORE scaling so that the scaler never sees the test rows; fitting it
# on the full data set would leak test-set statistics into preprocessing.
train_features, test_features, train_labels, test_labels = train_test_split(
    np_X, np_y, test_size=0.2, random_state=23323
)

# Standardize using statistics learned from the training split only, then
# apply the same transformation to the test split.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)


def _train_and_report(iterations):
    """Train a fresh model for ``iterations`` updates and print its test results.

    Prints the predicted test labels, the learned weights and the test
    accuracy, in that order, and returns ``(model, predicted_labels)``.
    """
    model = LogisticRegression(iterations, np_X.shape[1])
    model.train(train_features, train_labels)
    predictions = model.predict_batch(test_features)
    print(predictions)
    print(model.w)
    print(accuracy_score(test_labels, predictions))
    return model, predictions


# Compare how the number of SGD updates affects the test accuracy.
lr, predict_labels = _train_and_report(1000)
lr2, predict_labels2 = _train_and_report(10000)
lr3, predict_labels3 = _train_and_report(20000)