In [None]:
import pandas as pd  # tabular data handling
import numpy as np  # numerical arrays
# Load the Titanic training set; the path is relative to the notebook's working directory.
TRAIN_CSV_PATH = '../input/titanic/train.csv'
df_titanic = pd.read_csv(TRAIN_CSV_PATH)
df_titanic.head()  # preview the first 5 rows

In [None]:
# Replace missing ages with 0 so the Age column contains no NaN values.
# NOTE(review): 0 is not a plausible age; a median/mean imputation may be a better choice — confirm.
df_titanic['Age'] = df_titanic['Age'].fillna(0)
# Show how many rows fall into each label class. This has to be the LAST expression
# of the cell: Jupyter only renders the value of the final expression, so placing it
# before an assignment would compute the counts and silently discard them.
df_titanic['Survived'].value_counts()

In [None]:
# One-hot encode the two categorical columns into dummy (indicator) columns.
a = pd.get_dummies(df_titanic['Sex'], prefix="Sex")
b = pd.get_dummies(df_titanic['Embarked'], prefix="Em")
# Append the dummy columns to the frame, then drop the categorical originals.
df_titanic = (
    pd.concat([df_titanic, a, b], axis=1)
    .drop(columns=['Sex', 'Embarked'])
)
df_titanic.head()  # preview the widened frame

In [None]:
# Feature set: every remaining column except the label and three fields treated as less relevant.
# NOTE(review): any identifier column in the CSV (e.g. PassengerId, if present) stays in X — confirm intended.
X = df_titanic.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
# Label set as a column vector; -1 lets NumPy infer the row count (i.e. len(y)).
y = df_titanic.Survived.values.reshape(-1, 1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# "Restart Kernel -> Run All" reproduces the same split, and therefore the same
# accuracies and loss curves further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.preprocessing import StandardScaler  # feature scaler
# StandardScaler standardizes each feature (it is not the min-max normalizer MinMaxScaler).
scaler = StandardScaler()
# Learn the scaling statistics from the training split only ...
scaler.fit(X_train)
# ... then apply that same fitted transform to both splits.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression  # scikit-learn's logistic regression
lr = LogisticRegression()  # baseline model to compare the hand-written version against
# y_train is an (n, 1) column vector; scikit-learn expects 1-D labels and emits a
# DataConversionWarning otherwise, so flatten it before fitting.
# fit() runs scikit-learn's own solver; it plays the role that gradient descent
# plays in the hand-written implementation.
lr.fit(X_train, y_train.ravel())
print("SK-learn逻辑回归测试准确率 {:.2f}%".format(lr.score(X_test, y_test.ravel()) * 100))

In [None]:
# Logistic (sigmoid) function: takes the linear score z and returns y'
def sigmoid(z):
    """Return 1 / (1 + e^(-z)); evaluated element-wise when z is an array."""
    return 1 / (1 + np.exp(-z))

In [None]:
# Loss function: mean binary cross-entropy of the logistic model
def cost_function(X,y,w,b):
    """Return the mean binary cross-entropy of the model sigmoid(X.w + b).

    Args:
        X: feature matrix, one row per sample.
        y: labels, broadcast-compatible with np.dot(X, w)
           (assumes 0/1 labels — TODO confirm against caller).
        w: weight array multiplied as np.dot(X, w).
        b: bias added to every linear score.

    Returns:
        The average per-sample loss as a float.
    """
    z = np.dot(X, w) + b  # linear score; sigmoid(z) would be the predicted probability
    # With s = sigmoid(z):  -[y*log(s) + (1-y)*log(1-s)]  ==  log(1 + e^z) - y*z.
    # The right-hand form never evaluates log(0), so the cost stays finite even when
    # the sigmoid saturates to exactly 0 or 1 (the naive form yields NaN/inf there).
    loss = np.logaddexp(0, z) - y * z
    cost = np.mean(loss)  # average loss over the whole data set
    return cost

In [None]:
def gradient_descent(X,y,w,b,lr,iter): # batch gradient descent for logistic regression
    """Run `iter` gradient-descent steps and record the loss, weights and bias after each.

    Args:
        X: feature matrix, one row per sample.
        y: labels, broadcast-compatible with np.dot(X, w)
           (assumes an (n, 1) column of 0/1 values — TODO confirm against caller).
        w: initial weights; must be 2-D because w.shape[1] is used to size the history.
        b: initial bias.
        lr: learning rate (alpha).
        iter: number of iterations to run.

    Returns:
        (l_history, w_history, b_history): per-iteration loss, weights and bias,
        each recorded AFTER that iteration's update.
    """
    n_samples = X.shape[0]
    l_history = np.zeros(iter) # loss after each iteration
    w_history = np.zeros((iter,w.shape[0],w.shape[1])) # weights after each iteration
    b_history = np.zeros(iter) # bias after each iteration
    for i in range(iter):
        y_hat = sigmoid(np.dot(X,w) + b) # predicted probabilities y'
        error = y_hat - y # residual shared by both gradients
        derivative_w = np.dot(X.T,error)/n_samples # gradient w.r.t. the weights
        derivative_b = np.sum(error)/n_samples # gradient w.r.t. the bias
        w = w - lr * derivative_w # rebinds w; the caller's array is not modified
        b = b - lr * derivative_b
        l_history[i] = cost_function(X,y,w,b) # loss with the updated parameters
        print ("轮次", i+1 , "当前轮训练集损失：",l_history[i])
        w_history[i] = w # note: w_history is 3-D, w is 2-D
        b_history[i] = b
    return l_history, w_history, b_history

In [None]:
def predict(X,w,b): # prediction function
    """Return hard 0/1 class predictions as an (n, 1) float array.

    Args:
        X: feature matrix, one row per sample.
        w: weights, multiplied as np.dot(X, w); the result must be 2-D.
        b: bias added to every linear score.

    Returns:
        Array of shape (n, 1): 0.0 where the predicted probability is below 0.5,
        1.0 otherwise.
    """
    y_hat = sigmoid(np.dot(X,w) + b) # linear score -> probability
    # Threshold the first output column in one vectorized step: probability < 0.5
    # gives class 0, everything else (>= 0.5) gives class 1.
    return np.where(y_hat[:, :1] < 0.5, 0.0, 1.0)

In [None]:
def logistic_regression(X,y,w,b,lr,iter): # train the logistic regression model
    """Train with gradient descent, print the final loss and training accuracy.

    Args:
        X: training feature matrix.
        y: training labels matching X (assumes an (n, 1) column of 0/1 values —
           TODO confirm against caller).
        w: initial weights.
        b: initial bias.
        lr: learning rate.
        iter: number of gradient-descent iterations.

    Returns:
        (l_history, w_history, b_history) exactly as returned by gradient_descent.
    """
    l_history,w_history,b_history = gradient_descent(X,y,w,b,lr,iter) # gradient descent
    print("训练最终损失:", l_history[-1]) # final loss
    y_pred = predict(X,w_history[-1],b_history[-1]) # predict with the final parameters
    # Score against the labels that were passed in (y), not a notebook-level global,
    # so the function reports the right accuracy for whatever data it was given.
    training_acc = 100 - np.mean(np.abs(y_pred - y))*100
    print("逻辑回归训练准确率: {:.2f}%".format(training_acc))
    return l_history, w_history, b_history # training history

In [None]:
# --- Initial model parameters ---
dimension = X.shape[1]  # feature count = number of columns (len(X) would be the row count)
weight = np.full(shape=(dimension, 1), fill_value=0.1)  # weights kept as a 2-D column rather than a 1-D vector
bias = 0  # bias term
# --- Hyperparameters ---
alpha = 1  # learning rate
iterations = 100  # number of iterations

In [None]:
# Train the hand-written logistic regression on the training split
loss_history, weight_history, bias_history = logistic_regression(
    X_train, y_train, weight, bias, alpha, iterations
)

In [None]:
# Predict the test split with the parameters from the last training iteration
y_pred = predict(X_test, weight_history[-1], bias_history[-1])
# Mean absolute difference between predictions and labels, as a fraction
test_error_rate = np.mean(np.abs(y_pred - y_test))
testing_acc = 100 - test_error_rate*100  # accuracy in percent
print("逻辑回归测试准确率: {:.2f}%".format(testing_acc))

In [None]:
import matplotlib.pyplot as plt  # plotting
# Test-set loss for the parameters recorded at each training iteration
loss_history_test = np.zeros(iterations)
for i in range(iterations):
    loss_history_test[i] = cost_function(
        X_test, y_test, weight_history[i], bias_history[i]
    )
index = np.arange(iterations)
# Draw the training and test loss curves on the same axes
fig, ax = plt.subplots()
ax.plot(index, loss_history, c='blue', linestyle='solid')
ax.plot(index, loss_history_test, c='red', linestyle='dashed')
ax.legend(["Training Loss", "Test Loss"])
ax.set_xlabel("Number of Iteration")
ax.set_ylabel("Cost")
plt.show()