## Logistic回归2
### ——从疝气病症状预测病马的死亡率
***Date: 2019-3-5***

***Author: wwt117@163.com***

***Source: 《Machine Learning in Action》(Chapter 5) -- Peter Harrington***

---

#### 1、使用随机梯度上升法预测病马死亡率


In [12]:
import numpy as np
import matplotlib.pyplot as plt
import random

def SGAscent(data,label,iter_num=150):
    """Fit logistic-regression weights with stochastic gradient ascent.

    Parameters
    ----------
    data : array-like of float, shape (m, n)
        Feature rows, one sample per row.
    label : array-like of float or int, length m
        Label of each sample.
    iter_num : int, optional
        Number of full passes over the training set (default 150).

    Returns
    -------
    weights : numpy.matrix, shape (n, 1)
        Weights of the logistic model.

    Notes
    -----
    The weights are updated from a single sample at a time.  Within each
    pass every sample is visited exactly once, in random order (sampling
    without replacement).

    The step size is ``4 / (1 + j + i) + 0.01``.  Because ``i`` restarts at
    zero on every pass, the step size jumps back up at the start of each
    pass and is therefore not strictly decreasing over the whole run; the
    ``0.01`` term keeps it from ever reaching zero.
    """
    features = np.atleast_2d(np.asarray(data, dtype=float))
    targets = np.asarray(label, dtype=float).ravel()
    m, n = features.shape
    weights = np.ones(n)
    for j in range(iter_num):
        # Indices of the samples not yet used in this pass.
        remaining = list(range(m))
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01
            # Pick a random *position* in `remaining`, then translate it to
            # the actual sample index.  Indexing the data with the position
            # itself would make the later deletion meaningless.
            position = random.randrange(len(remaining))
            sample = remaining.pop(position)
            row = features[sample]
            h = sigmoid(float(np.dot(row, weights)))
            error = targets[sample] - h
            weights = weights + alpha * error * row
    # Returned as an (n, 1) matrix, the shape and type callers already rely on.
    return np.asmatrix(weights).T


def sigmoid(z):
    """Numerically stable logistic function for a single value.

    The textbook form ``1.0 / (1 + np.exp(-z))`` is used only for
    non-negative ``z``; for negative ``z`` the algebraically equivalent
    ``exp(z) / (1 + exp(z))`` is used, so the exponential is never taken
    of a large positive number.

    Parameters
    ----------
    z : float
        Sum of products of the sample features and the model parameters.

    Returns
    -------
    g : float
        Value of the logistic function at ``z``.
    """
    if z >= 0:
        return 1.0 / (1 + np.exp(-z))
    exp_z = np.exp(z)
    return exp_z / (1 + exp_z)

    
def predict(weights,data,label):
    """Print and return the classification accuracy of a logistic model.

    Parameters
    ----------
    weights : array-like of float, shape (n, 1) or (n,)
        Model parameters.
    data : array-like of float, shape (m, n)
        Sample features, one sample per row.
    label : array-like of int, length m
        True label of each sample.

    Returns
    -------
    accur : float
        Percentage (0-100) of samples whose predicted class equals the label.
        The same value is printed.
    """
    features = np.atleast_2d(np.asarray(data, dtype=float))
    targets = np.asarray(label).ravel()
    m = features.shape[0]
    # Compute every score once and flatten, instead of converting the
    # product back to an array on each loop iteration.
    scores = np.asarray(np.dot(features, np.asarray(weights))).ravel()
    count = 0
    for score, target in zip(scores, targets):
        # The predicted class is the sigmoid output rounded to 0 or 1.
        h = int(round(sigmoid(score)))
        if h == target:
            count += 1
    accur = count / m * 100
    print("模型分类的准确率为 %f%%" % accur)
    return accur
    
    
def _load_colic_file(path, int_labels=False):
    """Parse a tab-separated horse-colic file into (features, labels) lists.

    Every field except the last is read as a float feature; the last field
    is the label.  Labels are returned as floats, or as ints when
    ``int_labels`` is true (``"1"`` and ``"1.000000"`` are both accepted).
    Blank lines are skipped.
    """
    features = []
    labels = []
    # `with` guarantees the file is closed even if parsing fails.
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if fields == ['']:
                continue
            features.append([float(field) for field in fields[:-1]])
            label = float(fields[-1])
            labels.append(int(label) if int_labels else label)
    return features, labels


def colic_test():
    """Train on the horse-colic training set with SGA and report test accuracy.

    Reads 'horseColicTraining.txt' and 'horseColicTest.txt' from the current
    working directory, fits the weights with ``SGAscent`` (500 passes) and
    hands the test set to ``predict``, which prints the accuracy.
    """
    # Load both files up front so that a missing file fails before training.
    train_data, train_label = _load_colic_file('horseColicTraining.txt')
    test_data, test_label = _load_colic_file('horseColicTest.txt', int_labels=True)

    # Fit the model parameters with stochastic gradient ascent.
    train_weights = SGAscent(train_data,train_label,500)

    # Evaluate classification accuracy on the test set.
    predict(train_weights,test_data,test_label)
    

    
# Run the stochastic-gradient-ascent experiment when executed as a script.
if __name__ == "__main__":
    colic_test()

模型分类的准确率为 68.656716%


#### 2、使用梯度上升法预测病马死亡率

In [15]:
import numpy as np
import matplotlib.pyplot as plt
import random

def grad_ascent(data,label,alpha=0.001,iter_num=1000):
    """Fit logistic-regression weights with batch gradient ascent.

    Parameters
    ----------
    data : array-like of float, shape (m, n)
        Feature rows, one sample per row.
    label : array-like of float or int, length m
        Label of each sample.
    alpha : float, optional
        Step size of each update (default 0.001).
    iter_num : int, optional
        Number of updates (default 1000).

    Returns
    -------
    weights : numpy.matrix, shape (n, 1)
        Weights of the logistic model.

    Notes
    -----
    Every update uses all samples at once.  ``sigmoid`` is called on the
    whole (m, 1) score column, so it must accept array input.
    """
    features = np.atleast_2d(np.asarray(data, dtype=float))
    targets = np.asarray(label, dtype=float).reshape(-1, 1)
    n = features.shape[1]
    weights = np.ones((n, 1))  # one initial weight per feature, shape n x 1
    for _ in range(iter_num):
        h = sigmoid(np.dot(features, weights))
        error = targets - h
        # features.T (n x m) times error (m x 1) sums each sample's
        # contribution into an n x 1 update.
        weights = weights + alpha * np.dot(features.T, error)
    # Returned as an (n, 1) matrix, the shape and type callers already rely on.
    return np.asmatrix(weights)


def sigmoid(z):
    """Numerically stable logistic function for scalars and arrays.

    Computes ``1 / (1 + exp(-z))`` element-wise without ever taking the
    exponential of a positive number, so very negative inputs do not
    trigger an overflow warning.

    Parameters
    ----------
    z : float or array-like of float
        Sum(s) of products of the sample features and the model parameters.

    Returns
    -------
    g : numpy.float64, numpy.ndarray or numpy.matrix
        Logistic function of ``z``: a scalar for scalar input, a matrix for
        matrix input, otherwise an array of the same shape as ``z``.
    """
    z_arr = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in [0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(z_arr))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    g = np.where(z_arr >= 0, 1.0, decay) / (1.0 + decay)
    if isinstance(z, np.matrix):
        return np.asmatrix(g)
    if np.ndim(g) == 0:
        return np.float64(g)
    return g
    
    
def predict(weights,data,label):
    """Print and return the classification accuracy of a logistic model.

    Parameters
    ----------
    weights : array-like of float, shape (n, 1) or (n,)
        Model parameters.
    data : array-like of float, shape (m, n)
        Sample features, one sample per row.
    label : array-like of int, length m
        True label of each sample.

    Returns
    -------
    accur : float
        Percentage (0-100) of samples whose predicted class equals the label.
        The same value is printed.
    """
    features = np.atleast_2d(np.asarray(data, dtype=float))
    targets = np.asarray(label).ravel()
    m = features.shape[0]
    # Compute every score once and flatten, instead of converting the
    # product back to an array on each loop iteration.
    scores = np.asarray(np.dot(features, np.asarray(weights))).ravel()
    count = 0
    for score, target in zip(scores, targets):
        # The predicted class is the sigmoid output rounded to 0 or 1.
        h = int(round(sigmoid(score)))
        if h == target:
            count += 1
    accur = count / m * 100
    print("模型分类的准确率为 %f%%" % accur)
    return accur
    
    
def _load_colic_file(path, int_labels=False):
    """Parse a tab-separated horse-colic file into (features, labels) lists.

    Every field except the last is read as a float feature; the last field
    is the label.  Labels are returned as floats, or as ints when
    ``int_labels`` is true (``"1"`` and ``"1.000000"`` are both accepted).
    Blank lines are skipped.
    """
    features = []
    labels = []
    # `with` guarantees the file is closed even if parsing fails.
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if fields == ['']:
                continue
            features.append([float(field) for field in fields[:-1]])
            label = float(fields[-1])
            labels.append(int(label) if int_labels else label)
    return features, labels


def colic_test():
    """Train on the horse-colic training set with batch gradient ascent.

    Reads 'horseColicTraining.txt' and 'horseColicTest.txt' from the current
    working directory, fits the weights with ``grad_ascent`` and hands the
    test set to ``predict``, which prints the accuracy.
    """
    # Load both files up front so that a missing file fails before training.
    train_data, train_label = _load_colic_file('horseColicTraining.txt')
    test_data, test_label = _load_colic_file('horseColicTest.txt', int_labels=True)

    # Fit the model parameters with (batch) gradient ascent.
    train_weights = grad_ascent(train_data,train_label)

    # Evaluate classification accuracy on the test set.
    predict(train_weights,test_data,test_label)
    

    
# Run the batch-gradient-ascent experiment when executed as a script.
if __name__ == "__main__":
    colic_test()

模型分类的准确率为 71.641791%




#### 3、使用Sklearn函数求解

In [14]:
from sklearn.linear_model import LogisticRegression


def _load_colic_file(path, int_labels=False):
    """Parse a tab-separated horse-colic file into (features, labels) lists.

    Every field except the last is read as a float feature; the last field
    is the label.  Labels are returned as floats, or as ints when
    ``int_labels`` is true (``"1"`` and ``"1.000000"`` are both accepted).
    Blank lines are skipped.
    """
    features = []
    labels = []
    # `with` guarantees the file is closed even if parsing fails.
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if fields == ['']:
                continue
            features.append([float(field) for field in fields[:-1]])
            label = float(fields[-1])
            labels.append(int(label) if int_labels else label)
    return features, labels


def _report_sklearn_accuracy(solver, max_iter):
    """Fit sklearn's LogisticRegression on the training file and print test accuracy."""
    train_data, train_label = _load_colic_file('horseColicTraining.txt')
    test_data, test_label = _load_colic_file('horseColicTest.txt', int_labels=True)

    classifier = LogisticRegression(solver=solver,max_iter=max_iter).fit(train_data, train_label)
    test_accuracy = classifier.score(test_data, test_label) * 100
    print('模型分类的准确率为:%f%%' % test_accuracy)


def colic_test1():
    """Predict horse-colic outcomes with sklearn's LogisticRegression.

    Uses ``solver='liblinear'`` with ``max_iter=10``; this run is meant as
    the counterpart of the hand-written batch gradient-ascent result.
    """
    _report_sklearn_accuracy('liblinear', 10)


def colic_test2():
    """Predict horse-colic outcomes with sklearn's LogisticRegression.

    Uses ``solver='sag'`` with ``max_iter=2000``; this run is meant as the
    counterpart of the hand-written stochastic gradient-ascent result.
    """
    _report_sklearn_accuracy('sag', 2000)

# Run both sklearn experiments when executed as a script.
if __name__ == "__main__":
    colic_test1()
    colic_test2()


模型分类的准确率为:73.134328%
模型分类的准确率为:73.134328%
