# Logistic Regression

In [56]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import classification_report

In [2]:
import numpy as np

## 设置映射字典

In [28]:
x_dict = {}
x_string = 'ARNDCQEGHILKMFPSTWYV'
for i in range(len(x_string)):
    key = x_string[i]
    value = i + 1
    x_dict[key] = value
    
y_dict = {1:1, -1:0}

## 读取文件函数

In [4]:
def read_file(file):
    data_x = []
    data_y = []
    with open(file) as f:
        for line in f.readlines():
            x, y = line.strip().split(',')
            x_list = [x_dict[i] for i in x]
            data_x.append(x_list)
            data_y.append(int(y))
            
    return data_x, data_y

## 读取所有数据文件

In [5]:
file_765 = './newHIV-1_data/746Data.txt'
file_1625 = './newHIV-1_data/1625Data.txt'
file_impens = './newHIV-1_data/impensData.txt'
file_schilling = './newHIV-1_data/schillingData.txt'

data_765_x, data_765_y = read_file(file_765)
data_1625_x, data_1625_y = read_file(file_1625)
data_impens_x, data_impens_y = read_file(file_impens)
data_schilling_x, data_schilling_y = read_file(file_schilling)

data_dict_x = {}
data_dict_y = {}
data_dict_x['765'], data_dict_y['765'] = data_765_x, data_765_y
data_dict_x['1625'], data_dict_y['1625'] = data_1625_x, data_1625_y
data_dict_x['impens'], data_dict_y['impens'] = data_impens_x, data_impens_y
data_dict_x['schilling'], data_dict_y['schilling'] = data_schilling_x, data_schilling_y

data_set = ['765', '1625', 'impens', 'schilling']

## 调包实现Logistic回归

In [85]:
def Logistic_package(training_set, dataset, data_dict_x, data_dict_y):
    testing_set = dataset[:]
    testing_set.remove(training_set)
    x_train, y_train = data_dict_x[training_set], data_dict_y[training_set]
    x_test, y_test = data_dict_x[testing_set[0]], data_dict_y[testing_set[0]]
    for i in range(2):
        x_test = x_test + data_dict_x[testing_set[i+1]]
        y_test = y_test + data_dict_y[testing_set[i+1]]
    
    clf = LogisticRegressionCV(multi_class="ovr", cv=2, penalty="l2", solver="sag", tol=0.01)
    clf.fit(x_train, y_train)
    
    y_pred = clf.predict(x_test)
    report = classification_report(y_test, y_pred)
#     acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    print("report: ", report)
#     print("acc: ", acc)
    print("auc: ", auc)

## 分别以一个数据集作为训练集并在其他三个数据集上做测试

In [86]:
training_set = '765'
Logistic_package(training_set, data_set, data_dict_x, data_dict_y)

report:                precision    recall  f1-score   support

          -1       0.87      0.61      0.72      4886
           1       0.22      0.55      0.31       958

    accuracy                           0.60      5844
   macro avg       0.55      0.58      0.52      5844
weighted avg       0.77      0.60      0.65      5844

auc:  0.582604467452916


In [87]:
training_set = '1625'
Logistic_package(training_set, data_set, data_dict_x, data_dict_y)

report:                precision    recall  f1-score   support

          -1       0.80      1.00      0.89      3980
           1       0.73      0.01      0.02       985

    accuracy                           0.80      4965
   macro avg       0.77      0.50      0.45      4965
weighted avg       0.79      0.80      0.72      4965

auc:  0.5036840292834731


In [88]:
training_set = 'impens'
Logistic_package(training_set, data_set, data_dict_x, data_dict_y)

report:                precision    recall  f1-score   support

          -1       0.79      1.00      0.88      4432
           1       0.00      0.00      0.00      1211

    accuracy                           0.79      5643
   macro avg       0.39      0.50      0.44      5643
weighted avg       0.62      0.79      0.69      5643

auc:  0.5


  'precision', 'predicted', average, warn_for)


In [89]:
training_set = 'schilling'
Logistic_package(training_set, data_set, data_dict_x, data_dict_y)

report:                precision    recall  f1-score   support

          -1       0.72      1.00      0.84      2392
           1       0.00      0.00      0.00       926

    accuracy                           0.72      3318
   macro avg       0.36      0.50      0.42      3318
weighted avg       0.52      0.72      0.60      3318

auc:  0.5


  'precision', 'predicted', average, warn_for)


## 手写Logistic回归

In [31]:
class Logis():
    def sigmoid(self, z):
        return 1.0 / (1 + np.exp(-z))
    
    def train(self, data, label):
        m, n = np.shape(data)
        maxCycles = 200
        weights = np.ones(n)
        for cycle in range(maxCycles):
            dataIndex=list(range(m))
            for i in range(m):
                alpha = 4 / (1.0 + cycle + i) + 0.01
                randIndex=int(np.random.uniform(0, len(dataIndex)))
                y = self.sigmoid(sum(data[randIndex] * weights ))
                error = label[randIndex] - y
                weights = weights + alpha  * error * data[randIndex]
                del(dataIndex[randIndex])
                # print(type(weights))
        return weights
    
    def predict(self, data, weights):
        m, n = np.shape(data)
        y_pred = []
        for i in range(m):
            y = self.sigmoid(sum(data[i] * weights))
            if y > 0.5:
                y_pred.append(1)
            else:
                y_pred.append(0)
        return y_pred

In [80]:
def Logistic_write(training_set, dataset, data_dict_x, data_dict_y):
    testing_set = dataset[:]
    testing_set.remove(training_set)
    x_train, y_train = data_dict_x[training_set], data_dict_y[training_set]
    x_test, y_test = data_dict_x[testing_set[0]], data_dict_y[testing_set[0]]
    for i in range(2):
        x_test = x_test + data_dict_x[testing_set[i+1]]
        y_test = y_test + data_dict_y[testing_set[i+1]]
    
    y_train = [y_dict[i] for i in y_train]
    y_test = [y_dict[i] for i in y_test]
    
    clf = Logis()
    weights = clf.train(np.array(x_train), y_train)
    y_pred = clf.predict(np.array(x_test), weights)
    report = classification_report(y_test, y_pred)
#     acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    print("report: ", report)
#     print("acc: ", acc)
    print("auc: ", auc)

## 分别以一个数据集作为训练集并在其他三个数据集上做测试

In [81]:
training_set = '765'
Logistic_write(training_set, data_set, data_dict_x, data_dict_y)

report:                precision    recall  f1-score   support

           0       0.84      1.00      0.91      4886
           1       0.00      0.00      0.00       958

    accuracy                           0.84      5844
   macro avg       0.42      0.50      0.46      5844
weighted avg       0.70      0.84      0.76      5844

auc:  0.4993860008186656


In [92]:
training_set = '1625'
Logistic_write(training_set, data_set, data_dict_x, data_dict_y)

report:                precision    recall  f1-score   support

           0       0.77      0.03      0.06      3980
           1       0.20      0.97      0.33       985

    accuracy                           0.21      4965
   macro avg       0.49      0.50      0.19      4965
weighted avg       0.66      0.21      0.11      4965

auc:  0.49731398107287705


In [91]:
training_set = 'impens'
Logistic_write(training_set, data_set, data_dict_x, data_dict_y)

report:                precision    recall  f1-score   support

           0       0.79      0.95      0.86      4432
           1       0.25      0.06      0.09      1211

    accuracy                           0.76      5643
   macro avg       0.52      0.51      0.48      5643
weighted avg       0.67      0.76      0.70      5643

auc:  0.5056616619018802


In [84]:
training_set = 'schilling'
Logistic_write(training_set, data_set, data_dict_x, data_dict_y)

report:                precision    recall  f1-score   support

           0       0.72      0.81      0.76      2392
           1       0.27      0.18      0.22       926

    accuracy                           0.63      3318
   macro avg       0.49      0.50      0.49      3318
weighted avg       0.59      0.63      0.61      3318

auc:  0.4959697371367481


- 样本不均衡问题：损失做操作，采样
- 编码问题（标签编码，one-hot编码）：目标编码、证据权重（WOE）
- 证据权重
- 指标问题：auc
- 先探索数据
- 验证集
- 超参数调整
- 损失函数的选择问题