# In [None]:
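"""
AdaBoost: both the sample-weight update and the model-weight update are
computed directly from the standard formulas; see Li Hang's book on
statistical machine learning for the derivation.
"""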

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_hastie_10_2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def sample_wrong(predict, labels):
    """Return a 0/1 indicator vector marking the misclassified samples.

    Args:
        predict: 1-D sequence of predicted labels.
        labels: 1-D sequence of true labels, same length as ``predict``.

    Returns:
        A float NumPy array with one entry per sample: 1.0 where the
        prediction differs from the label, 0.0 where they agree.

    Raises:
        ValueError: if ``predict`` and ``labels`` do not have the same shape.
    """
    predict = np.asarray(predict)
    labels = np.asarray(labels)
    # Fail loudly on a length mismatch instead of silently comparing a prefix
    # or relying on NumPy broadcasting.
    if predict.shape != labels.shape:
        raise ValueError(
            "predict and labels must have the same shape, got "
            f"{predict.shape} and {labels.shape}"
        )
    # Element-wise comparison replaces the per-sample Python loop; the cast
    # keeps the float dtype that np.zeros() produced before.
    return (predict != labels).astype(float)


def update_sample_weight(predict, labels, alpha_m, sample_weight):
    """Compute the re-normalised AdaBoost sample weights for the next round.

    Each weight is multiplied by ``exp(-alpha_m * predict[i] * labels[i])``
    and the result is divided by the sum of all updated weights (the
    normaliser Z_m), so the returned weights sum to 1.

    Args:
        predict: 1-D sequence of the current weak classifier's predictions.
        labels: 1-D sequence of true labels, same length as ``predict``.
            The formula multiplies predictions by labels, so both are
            presumably encoded as -1/+1 -- confirm against the caller.
        alpha_m: weight (vote) of the current weak classifier.
        sample_weight: 1-D sequence of the current sample weights.

    Returns:
        A NumPy array of the updated, normalised sample weights. If the
        updated weights sum to zero the division yields NaN entries.
    """
    predict = np.asarray(predict)
    labels = np.asarray(labels)
    sample_weight = np.asarray(sample_weight)
    # Vectorised form of the per-sample update; samples where predict and
    # labels have opposite signs get a positive exponent and grow.
    unnormalised = sample_weight * np.exp(-1 * alpha_m * predict * labels)
    z_m = unnormalised.sum()
    return unnormalised / z_m

"""
Use a DecisionTree as the weak classifier.
"""
def adaboost(train_features, train_labels, test_features, test_labels, classifier_number):
    """Train an AdaBoost ensemble of depth-2 decision trees.

    Each round fits a tree on the training data using the current sample
    weights, measures its weighted error, derives the tree's vote
    ``alpha_m = 0.5 * log((1 - e_m) / e_m)`` and re-weights the samples.

    Args:
        train_features: training samples, passed to ``fit`` / ``predict``.
        train_labels: training labels. The weight update multiplies
            predictions by labels, so they are presumably encoded as -1/+1
            -- confirm against the caller.
        test_features: unused; kept so existing callers keep working.
        test_labels: unused; kept so existing callers keep working.
        classifier_number: number of boosting rounds (weak classifiers).

    Returns:
        A list with one ``(alpha_m, model, e_m)`` tuple per round: the
        classifier's vote, the fitted tree, and its raw weighted training
        error.
    """
    # Bounds for the error used inside the logarithm. Without them a weak
    # classifier with weighted error exactly 0 (or 1) gives alpha_m = +/-inf,
    # which turns every later sample weight into NaN.
    eps = 1e-10

    # Collected (alpha_m, model, e_m) tuples, one per boosting round.
    weight_model_pair = []
    # Sample weights change every round; they start out uniform.
    train_sample_num = len(train_features)
    sample_weight = np.ones(train_sample_num) / train_sample_num
    for _ in range(classifier_number):
        model = DecisionTreeClassifier(max_depth=2)
        # NOTE: open question from the original author -- what to do for
        # models that do not support sample_weight?
        model.fit(train_features, train_labels, sample_weight=sample_weight)
        train_predict = model.predict(train_features)
        # 0/1 vector: 1 where the prediction is wrong, 0 where it is right.
        wrong_array = sample_wrong(train_predict, train_labels)
        # Weighted error rate of this weak classifier.
        e_m = np.dot(sample_weight, wrong_array)
        # Vote of this classifier, computed from the clipped error so the
        # log ratio stays finite.
        e_m_safe = np.clip(e_m, eps, 1 - eps)
        alpha_m = 0.5 * np.log((1 - e_m_safe) / e_m_safe)
        # Re-weight samples: misclassified ones go up, correct ones go down.
        sample_weight = update_sample_weight(train_predict, train_labels, alpha_m, sample_weight)
        weight_model_pair.append((alpha_m, model, e_m))
    return weight_model_pair

def adaboost_predict(weight_model_pair, test_features):
    """Combine the weighted votes of an ensemble into predictions.

    Args:
        weight_model_pair: iterable of sequences whose element 0 is a
            model's weight and whose element 1 is a fitted model exposing
            ``predict``; any further elements are ignored.
        test_features: samples to classify; handed unchanged to each
            model's ``predict``.

    Returns:
        A ``(test_predict, test_score)`` tuple: the element-wise sign of the
        weighted vote sum, and the weighted vote sum itself.
    """
    vote_total = np.zeros(len(test_features))
    for entry in weight_model_pair:
        alpha, learner = entry[0], entry[1]
        # Accumulate this learner's predictions scaled by its weight.
        vote_total += alpha * learner.predict(test_features)
    # np.sign builds a new array, so the raw scores stay intact.
    return np.sign(vote_total), vote_total


# Generate the synthetic data set and hold out 20% of it for testing.
x, y = make_hastie_10_2(n_samples=10000)
train_features, test_features, train_labels, test_labels = train_test_split(
    x, y, test_size=0.2, random_state=23323
)

classifier_nums = [10, 50, 100, 400]
# Each entry is a number of weak classifiers; a fresh ensemble is trained for
# every entry so the accuracies can be compared as the ensemble grows.
for classifier_number in classifier_nums:
    print(f"classifier numbers: {classifier_number}")
    weight_model_pair = adaboost(
        train_features, train_labels, test_features, test_labels, classifier_number
    )
    test_predict, _ = adaboost_predict(weight_model_pair, test_features)
    print(f"accuracy : {accuracy_score(test_labels, test_predict)}")
