### 1. 数据集的加载

In [None]:
import pandas as pd
# Read 'dataset.csv' (path relative to the current working directory) into a DataFrame.
dataset = pd.read_csv('dataset.csv')

In [None]:
# Convert the 'Sex' column to integer category codes.
dataset['Sex'] = dataset['Sex'].astype('category').cat.codes
# Fill missing 'Age' values with the mean of the available ages.
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
# Labels of all samples.
y = dataset['Survived'].to_numpy()
# Features of all samples (one row per sample, one column per feature).
X = dataset[['Pclass', 'Sex', 'Fare', 'Age']].to_numpy()
# Standardise every feature column: subtract its mean, divide by its
# standard deviation.
# NOTE(review): the statistics are computed over all samples, i.e. before the
# train/test split performed later in this file -- confirm this is intended.
mean, std = X.mean(axis=0), X.std(axis=0)
X = (X - mean) / std

In [None]:
from sklearn.model_selection import train_test_split
# Split the data into a training set and a test set (test_size=0.2).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Rows of X_train are the training samples, columns are the feature values
# of each sample.
n_train, n_features = X_train.shape

### 2. 模型的构建与训练

In [1]:
import numpy as np
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``, element-wise.

    The textbook formula overflows inside ``exp`` for large negative inputs
    (NumPy then emits a RuntimeWarning). The value is therefore computed from
    ``exp(-|x|)``, which never exceeds 1.

    Args:
        x: A real scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) <= 1, so neither expression below can overflow.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    y = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return y[()]

In [None]:
# Initial parameters of the logistic-regression model: a fixed bias and
# random weights, one per feature.
b = 1.1
w = np.random.rand(n_features)


def model(x):
    """Return the logistic-regression output ``sigmoid(w . x + b)`` for ``x``.

    Uses the module-level parameters ``w`` and ``b`` at call time.
    """
    return sigmoid(np.dot(w, x) + b)

In [None]:
# Number of training passes over the training set.
epochs = 10000
# Learning rate of the gradient-descent updates.
lr = 0.01
# Train the logistic-regression model with gradient descent: accumulate the
# per-sample terms over the whole training set, then update once per epoch.
for epoch in range(epochs):
    sum_w, sum_b = np.zeros(n_features), 0.0
    for xi, yi in zip(X_train, y_train):
        # Difference between the model output and the label of this sample.
        error = model(xi) - yi
        sum_w += error * xi
        sum_b += error
    # Gradients of the loss with respect to the weights and the bias.
    grad_w, grad_b = (1 / n_train) * sum_w, (1 / n_train) * sum_b
    # Step both parameters against their gradients.
    w, b = w - lr * grad_w, b - lr * grad_b

### 3. 模型的评估

In [None]:
def predict(X):
    """Classify every row of ``X`` with the current ``model``.

    Args:
        X: Samples to classify; row ``i`` is read as ``X[i]`` for ``i`` in
            ``range(X.shape[0])``.

    Returns:
        A list with one label per row: 0 (first class) when the model output
        is below 0.5, otherwise 1 (second class).
    """
    return [0 if model(X[i]) < 0.5 else 1 for i in range(X.shape[0])]

In [None]:
def get_accuracy(X, y):
    """Return the prediction accuracy of the model on the samples in ``X``.

    Args:
        X: Samples to classify; ``X.shape[0]`` is the number of samples.
        y: True labels, indexable by sample position.

    Returns:
        The fraction of samples whose predicted label equals the true label.

    Raises:
        ZeroDivisionError: If ``X`` contains no samples.
    """
    total = X.shape[0]
    predicted = predict(X)
    # Number of samples whose predicted label differs from the true label.
    wrong = sum(1 for i in range(total) if y[i] != predicted[i])
    return (total - wrong) / total

In [None]:
# Accuracy of the trained model on the training set and on the test set.
train_accuracy, test_accuracy = (
    get_accuracy(X_train, y_train),
    get_accuracy(X_test, y_test),
)

### 4. 使用矩阵的方式加速模型的训练

In [None]:
# Re-initialise the parameters: random weights (one per feature), zero bias.
w, b = np.random.rand(n_features), 0


def model(X):
    """Return ``sigmoid(w . X.T + b)`` for the samples in ``X``.

    Uses the module-level parameters ``w`` and ``b`` at call time. Passing a
    2-D array with one sample per row yields one output per row.
    """
    return sigmoid(np.dot(w, X.T) + b)

In [None]:
# Number of training passes and learning rate of the gradient-descent updates.
epochs = 10000
lr = 0.01
for epoch in range(epochs):
    # Predict all training samples in a single call to the model.
    y_hat = model(X_train)
    # Prediction errors, computed once and shared by both gradient sums.
    error = y_hat - y_train
    # Sums of the per-sample gradient terms for the weights and the bias.
    sum_w = np.dot(error, X_train)
    sum_b = np.sum(error)
    grad_w = (1 / n_train) * sum_w
    grad_b = (1 / n_train) * sum_b
    # Update the model parameters with gradient descent.
    w = w - lr * grad_w
    b = b - lr * grad_b

In [None]:
# Accuracy of the retrained model on the training set and on the test set.
train_accuracy, test_accuracy = (
    get_accuracy(X_train, y_train),
    get_accuracy(X_test, y_test),
)