# 正则化逻辑回归
使用正则化逻辑回归预测芯片质量是否合格。ex2data2.txt文件包含用于训练的样本数据，每个样本包括两项测试的结果以及是否合格的标签。
## 样本数据可视化

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plotData(data):
    """Draw a scatter plot of the two test scores, split by the label.

    Rows whose ``accepted`` column equals 1 are drawn as black ``+``
    markers, rows where it equals 0 as yellow circles. The figure is shown
    immediately with ``plt.show()``.

    Args:
        data: Table indexable by the column names ``'Test1'``, ``'Test2'``
            and ``'accepted'``.
    """
    # Boolean masks for the positive (1) and negative (0) examples.
    is_accepted = data['accepted'] == 1
    is_rejected = data['accepted'] == 0

    # One entry per class: (rows, colour, marker shape, legend label).
    groups = (
        (data[is_accepted], 'black', '+', 'accepted'),
        (data[is_rejected], 'y', 'o', 'rejected'),
    )

    plt.figure()

    for subset, color, marker, label in groups:
        # s is the marker size.
        plt.scatter(subset['Test1'], subset['Test2'],
                    s=50, c=color, marker=marker, label=label)

    # Axis labels, legend and title.
    plt.xlabel('Microchip Test 1')
    plt.ylabel('Microchip Test 2')
    plt.legend()
    plt.title('Scatter Plot of Training Data')
    plt.show()

if __name__ == "__main__":
    # Column names are passed explicitly to read_csv rather than being
    # taken from the file.
    column_names = ['Test1', 'Test2', 'accepted']
    data = pd.read_csv("ex2data2.txt", names=column_names)
    plotData(data)

## 特征映射
为了更好地拟合数据，假设函数采用六阶非线性方程，需要基于现有特征进行拓展。映射后的特征向量为
$$mapFeature(x)=\begin{bmatrix}
1 \\
x_1 \\
x_2 \\
x_1^2 \\
x_1x_2 \\
x_2^2 \\
\vdots \\
x_1x_2^5 \\
x_2^6
\end{bmatrix}$$

In [None]:
# Polynomial feature mapping; the result includes the all-ones column for theta_0.
def mapFeature(X, degree):
    """Expand two input features into all polynomial terms up to ``degree``.

    The output columns are ordered 1, x1, x2, x1^2, x1*x2, x2^2, ... and end
    with x2^degree, giving (degree + 1) * (degree + 2) / 2 columns in total.

    Args:
        X: 2-D array; column 0 is used as x1 and column 1 as x2.
        degree: Highest total power to generate.

    Returns:
        Array with one row per row of ``X`` and one column per term.
    """
    bias_column = np.ones((len(X), 1))
    first = X[:, 0].reshape(-1, 1)
    second = X[:, 1].reshape(-1, 1)

    # Collect every column in a list and stack once at the end; growing an
    # ndarray with repeated hstack calls would reallocate and copy each time.
    polynomial_columns = [
        np.power(first, total - power_of_second) * np.power(second, power_of_second)
        for total in range(1, degree + 1)
        for power_of_second in range(total + 1)
    ]
    return np.hstack([bias_column] + polynomial_columns)
    
if __name__ == "__main__":
    # Split the table: every column except the last is a feature, the last
    # column is the label. The label is selected as a one-column table so
    # that y comes out 2-D.
    cols = data.shape[1]
    X = data.iloc[:, :cols - 1].to_numpy()
    y = data.iloc[:, [cols - 1]].to_numpy()

    # Expand the two raw features into polynomial terms up to this degree.
    degree = 6
    X_map = mapFeature(X, degree)
    print(f"Mapped feature:\n{X_map[:5, :5]}")

## 代价函数和梯度
为了控制拟合程度，正则化逻辑回归的代价函数表示如下，注意不对$\theta_0$进行正则化
$$J(\theta)=\frac{1}{m}\sum_{i=1}^{m}[-y^{(i)}\log(h_{\theta}(x^{(i)}))-(1-y^{(i)})\log(1-h_{\theta}(x^{(i)}))]+\frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$
代价函数偏导数表示为
$$\begin{align*}
\frac{\partial J(\theta)}{\partial \theta_0}&=\frac{1}{m}\sum_{i=1}^m(h_{\theta}(x^{(i)})-y^{(i)})x_0^{(i)}\\
\frac{\partial J(\theta)}{\partial \theta_j}&=\frac{1}{m}\sum_{i=1}^m(h_{\theta}(x^{(i)})-y^{(i)})x_j^{(i)}+\frac{\lambda}{m}\theta_j\quad(j=1,2,...,n)
\end{align*}$$
梯度下降算法：
$$\begin{align*}
Repeat:\{\theta_0&=\theta_0-\frac{\alpha}{m}\sum_{i=1}^m(h_{\theta}(x^{(i)})-y^{(i)})x_0^{(i)}
\\\theta_j&=\theta_j(1-\frac{\alpha\lambda}{m})-\frac{\alpha}{m}\sum_{i=1}^m(h_{\theta}(x^{(i)})-y^{(i)})x_j^{(i)}\quad(j=1,...,n)\\
&(同时更新所有\theta)\}
\end{align*}$$
其中$m$为样本数量，$n$为特征数量，$\alpha$为学习率，$\lambda$为正则化参数。

In [None]:
# Logistic sigmoid function
def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` element-wise.

    The value is computed from ``exp(-|z|)``, which lies in (0, 1], so no
    overflow occurs even for inputs of very large magnitude (the naive
    formula overflows in ``exp`` for large negative ``z``).

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        NumPy array with the same shape as ``z``, or a NumPy scalar when
        ``z`` is a scalar.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook form; for z < 0 numerator and
    # denominator are both multiplied by exp(z) to keep them bounded.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where always returns an array; unwrap the 0-d case so scalar input
    # still yields a scalar.
    return result if result.ndim else result[()]

def featureNormalization(X):
    """Standardize every feature (column) to zero mean and unit variance.

    Statistics are computed per column (``axis=0``) so that features on
    different scales are each normalized independently. The input is not
    modified.

    Args:
        X: Array-like of shape (m, n) with one feature per column.

    Returns:
        New array of the same shape holding the standardized values.
    """
    X = np.asarray(X)
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # A constant feature has zero spread; divide by 1 instead so it maps to
    # 0 rather than NaN.
    std = np.where(std == 0, 1.0, std)
    return (X - mean) / std

def costFunctionReg(theta, X, y, lambda_):
    """Return the L2-regularized logistic-regression cost J(theta).

    J = mean(-y*log(h) - (1-y)*log(1-h)) + lambda_/(2m) * sum(theta_j^2),
    where h = sigmoid(X @ theta) and the sum skips the first parameter
    (the bias term).

    Args:
        theta: Parameters, any shape with n elements; used as a column.
        X: Design matrix of shape (m, n); its first column corresponds to
            the unpenalized first parameter.
        y: Labels with m elements; used as a column so that a 1-D ``y``
            cannot broadcast against ``h`` into an (m, m) matrix.
        lambda_: Regularization strength.

    Returns:
        The scalar cost.
    """
    theta = np.asarray(theta).reshape(-1, 1)
    y = np.asarray(y).reshape(-1, 1)
    m = len(X)

    h = sigmoid(X @ theta)
    eps = 1e-15  # keeps log() finite when h is exactly 0 or 1

    log_likelihood = y * np.log(h + eps) + (1 - y) * np.log(1 - h + eps)

    # theta is a column vector, so the parameters run along axis 0: row 0 is
    # the bias and rows 1.. are penalized. (Slicing columns with
    # theta[:, 1:] selects nothing and would make the penalty always 0.)
    penalty = lambda_ / (2 * m) * np.sum(np.square(theta[1:]))
    return -np.mean(log_likelihood) + penalty

def grad(X, y, theta):
    """Return the unregularized gradient of the logistic-regression cost.

    Computes ``X.T @ (sigmoid(X @ theta) - y) / m``.

    Args:
        X: Design matrix of shape (m, n).
        y: Labels with m elements; used as a column so that a 1-D ``y``
            cannot broadcast the residual into an (m, m) matrix.
        theta: Parameters, any shape with n elements; used as a column.

    Returns:
        Column vector of shape (n, 1).
    """
    theta = np.asarray(theta).reshape(-1, 1)
    y = np.asarray(y).reshape(-1, 1)

    h = sigmoid(X @ theta)
    error = h - y
    return 1 / len(X) * (X.T @ error)

def gradReg(X, y, theta, lambda_):
    """Return the gradient of the L2-regularized logistic-regression cost.

    The result is ``grad(X, y, theta)`` plus ``(lambda_ / m) * theta`` with
    the first parameter (the bias) excluded from the penalty.

    Args:
        X: Design matrix of shape (m, n).
        y: Labels, passed through to ``grad``.
        theta: Parameters, any shape with n elements; used as a column.
        lambda_: Regularization strength.

    Returns:
        1-D array of length n, convenient for optimizer callbacks.
    """
    theta = np.asarray(theta).reshape(-1, 1)

    # 1. Unregularized gradient. The local must NOT be named ``grad``:
    #    assigning to that name inside this function would make it a local
    #    variable and the call below would raise UnboundLocalError.
    gradient = grad(X, y, theta)

    # 2. Regularization part: work on a copy of theta whose first entry is
    #    zeroed so that theta_0 receives no penalty.
    penalized_theta = np.copy(theta)
    penalized_theta[0] = 0

    # Final gradient: base gradient + (lambda / m) * adjusted theta.
    gradient = gradient + (lambda_ / len(X)) * penalized_theta

    return gradient.flatten()


def gradientDescent(X, y, theta, alpha, iters, lambda_):
    """Run batch gradient descent on the regularized logistic cost.

    Args:
        X: Design matrix of shape (m, n).
        y: Labels with m elements.
        theta: Initial parameters with n elements, 1-D or column-shaped.
            The caller's array is left untouched.
        alpha: Learning rate.
        iters: Number of update steps (0 is allowed).
        lambda_: Regularization strength.

    Returns:
        Tuple ``(theta, cost)``: the final parameters as a float array with
        the same shape as the input ``theta``, and an array of length
        ``iters + 1`` where ``cost[k]`` is the cost after ``k`` updates.
    """
    # Work on a private floating-point copy: the caller's array is not
    # modified and a lower-precision or integer dtype is not inherited.
    theta = np.array(theta, dtype=float)
    cost = np.zeros(iters + 1)

    for step in range(iters):
        cost[step] = costFunctionReg(theta, X, y, lambda_)
        # gradReg returns a flat vector; give it theta's shape so a column
        # theta (n, 1) is updated element-wise instead of hitting a
        # broadcast error.
        gradient = gradReg(X, y, theta, lambda_).reshape(theta.shape)
        theta -= alpha * gradient

    # Indexing with ``iters`` (not the loop variable) also covers iters == 0.
    cost[iters] = costFunctionReg(theta, X, y, lambda_)
    return theta, cost

if __name__ == "__main__":
    # Feature normalization; column 0 (the all-ones bias column) is skipped.
    X_map[:, 1:] = featureNormalization(X_map[:, 1:])

    print(f"X:shape{X_map.shape}\n{X_map[:5, :5]}")
    print(f"y:shape{y.shape}\n{y[:5, :]}")

    # Parameter initialization: one parameter per column of the MAPPED
    # design matrix (not the raw two-feature X). The default float64 dtype
    # is used so updates are not truncated to single precision.
    initial_theta = np.zeros((X_map.shape[1], 1))
    print(f"theta:shape{initial_theta.shape}\n{initial_theta[:5]}")

    # Sanity-check the cost function: with all-zero theta every prediction
    # is 0.5, so this should print "initial cost:0.693" (ln 2).
    lambda_ = 1
    print(f"initial cost:{costFunctionReg(initial_theta, X_map, y, lambda_):.3f}")

    alphas = np.array([0.01, 0.03, 0.1, 0.3])
    iters = 1000

    plt.figure(figsize=(12,8))

    for alpha in alphas:
        # Start every run from a fresh copy of the initial theta so one
        # learning rate's result cannot leak into the next.
        current_theta = initial_theta.copy()
        # Train on the mapped features and pass the regularization strength,
        # which gradientDescent requires as its last argument.
        theta, cost = gradientDescent(X_map, y, current_theta, alpha, iters, lambda_)
        print(f"theta for alpha = {alpha}:{theta}")
        print(f"final cost for alpha = {alpha}:{cost[-1]:.3f}")
        plt.plot(range(len(cost)), cost, linewidth=2, label=f"alpha={alpha}")

    plt.title("Convergence of Cost Function for different Alphas")
    plt.xlabel("Iterations")
    plt.ylabel("Cost J")
    plt.legend()  # one legend entry per learning rate
    plt.grid(True)
    plt.show()  # display all curves together at the end