### 本节课调用adaboost以及基于简单决策树实现adaboost模型  

#### 我们将利用adaboost模型对肿瘤类型进行判断与分类

- 数据来源： https://www.kaggle.com/uciml/breast-cancer-wisconsin-data  
- 对比模型： Logistic回归、决策树、Adaboost模型

In [3]:
# necessary imports
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

np.random.seed(19)

In [4]:
# Load the tumour dataset from the input folder into a DataFrame.
data_folder = "../input"
csv_path = os.path.join(data_folder, "breastCancer.csv")
data = pd.read_csv(csv_path)

In [5]:
# Print part of the data (the first rows of the DataFrame)
data.head()

#### 检查需要预测的目标

In [6]:
# Count the samples per diagnosis label once, print the counts (a bare
# expression that is not the last line of a cell is silently discarded),
# then draw the same counts as a bar chart.
diagnosis_counts = data['diagnosis'].value_counts()
print(diagnosis_counts)
diagnosis_counts.plot(kind='bar')

#### 过滤不需要的信息

In [7]:
# Remove the two columns that carry no predictive information,
# modifying the DataFrame in place.
data.drop(['id', 'Unnamed: 32'], axis=1, inplace=True)

#### 转换预测的目标   M：1， B：-1

In [8]:
# Encode the label column numerically: 'M' becomes +1, any other value -1.
data['diagnosis'] = data['diagnosis'].map(lambda label: -1 if label != 'M' else 1)

#### 观察数据的基本信息

In [9]:
# Basic information about the data: descriptive summary of the columns
data.describe()

In [12]:
# Print the DataFrame's info summary
data.info()

In [10]:
# Bar chart of how many samples fall into each diagnosis class
import seaborn as sns
# Name the column via `x=` and the frame via `data=`: a Series passed
# positionally is taken as `data` by current seaborn releases, where the
# column argument is keyword-only.
sns.countplot(x='diagnosis', data=data)

#### 利用前6个特征，设定目标变量

In [11]:
# Name of the column holding the label to predict.
target = 'diagnosis'
# The six columns at positions 1..6 are the features explored below.
features = data.columns[1:7]
# Display the selected column names as the cell output.
features

#### 特征探索  

- 比较6个特征与恶性肿瘤的关系

In [12]:
# Number of histogram bins; identical for every feature, so it is set once
# outside the loop.
bins = 25
for feature in features:
    # Split the feature's values by class (-1 and +1 in the target column).
    benign_values = data.loc[data[target] == -1, feature]
    malignant_values = data.loc[data[target] == 1, feature]

    # Overlay both class distributions; the second histogram is
    # half-transparent so the first one stays visible underneath.
    plt.hist(benign_values, bins=bins, color='lightblue', label='B-healthy', alpha=1)
    plt.hist(malignant_values, bins=bins, color='k', label='M-bad', alpha=0.5)

    plt.xlabel(feature)
    plt.ylabel('Amount of count')

    plt.legend()

    # One figure per feature.
    plt.show()

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set. A fixed random_state makes the
# split identical on every run of this cell, independent of how much of the
# global NumPy random stream earlier cells have already consumed.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=19)

In [14]:
# Separate inputs and labels: every column after the first is an input,
# the `target` column is the label.
trainX = train_data[data.columns[1:]]
trainY = train_data[target]
testX = test_data[data.columns[1:]]
testY = test_data[target]

### Logistic回归的表现

In [15]:
# Mean 8-fold cross-validation score of a default logistic regression
# on the training set.
logistic_model = LogisticRegression()
logistic_cv_scores = cross_val_score(logistic_model, trainX, trainY, cv=8)
print("Logistic Regression performance: %f" % logistic_cv_scores.mean())

### 决策树的表现

In [16]:
# Mean 8-fold cross-validation score of a default decision tree
# on the training set.
tree_model = DecisionTreeClassifier()
tree_cv_scores = cross_val_score(tree_model, trainX, trainY, cv=8)
print("Decision Tree performance: %f" % tree_cv_scores.mean())


### 直接调用adaboost模型的表现

In [17]:
# Mean 8-fold cross-validation score of scikit-learn's AdaBoost with
# 200 estimators on the training set. The printed label names the model
# that is actually being evaluated.
ada_model = AdaBoostClassifier(n_estimators=200)
ada_cv_scores = cross_val_score(ada_model, trainX, trainY, cv=8)
print("Adaboost performance: %f" % ada_cv_scores.mean())

### 测试集的表现

In [18]:
# Fit a fresh logistic regression on the whole training set and report
# its score on the held-out test set.
logistic_model = LogisticRegression()
logistic_model.fit(trainX, trainY)
logistic_test_score = logistic_model.score(testX, testY)
print("Logistic Regression test performance: %f" % logistic_test_score)

In [19]:
# Fit a fresh decision tree on the whole training set and report its
# score on the held-out test set.
tree_model = DecisionTreeClassifier()
tree_model.fit(trainX, trainY)
tree_test_score = tree_model.score(testX, testY)
print("Decision Tree test performance: %f" % tree_test_score)

In [20]:
# Fit a fresh 200-estimator AdaBoost on the whole training set and report
# its score on the held-out test set.
ada_model = AdaBoostClassifier(n_estimators=200)
ada_model.fit(trainX, trainY)
ada_test_score = ada_model.score(testX, testY)
print("Adaboost test performance: %f" % ada_test_score)

#### Adaboost的实现

In [25]:
from sklearn.base import BaseEstimator
class Adaboost(BaseEstimator):
    """AdaBoost binary classifier built from depth-2 decision trees.

    Labels must be encoded as +1 / -1: the weight update multiplies the
    labels with the predictions, and ``predict`` takes the sign of the
    weighted vote.

    Parameters
    ----------
    M : int
        Maximum number of boosting rounds (weak learners) to train.

    Attributes set by ``fit``
    -------------------------
    models : list
        The fitted decision trees, in training order.
    model_weights : list of float
        The vote weight of each tree, aligned with ``models``.
    """

    def __init__(self, M):
        self.M = M

    def fit(self, X, Y):
        """Train up to ``M`` weighted trees on ``X`` / ``Y``.

        Parameters
        ----------
        X : 2-D array-like
            One row per sample.
        Y : 1-D array-like
            Labels in {-1, +1}, one per row of ``X``.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.
        """
        # Work on a plain array so element-wise arithmetic with the
        # predictions never depends on a pandas index.
        Y = np.asarray(Y)

        self.models = []
        self.model_weights = []

        N, _ = X.shape
        # Start with uniform sample weights.
        alpha = np.ones(N) / N

        # Smallest error fed to the logarithms below; keeps the model
        # weight finite when a tree makes no (or only) mistakes.
        eps = np.finfo(np.float64).eps

        for _ in range(self.M):
            tree = DecisionTreeClassifier(max_depth=2)
            tree.fit(X, Y, sample_weight=alpha)
            prediction = tree.predict(X)

            # Weighted error: total weight of the misclassified samples.
            weighted_error = alpha.dot(prediction != Y)

            # Weight of the current model: half the log-odds of being right.
            # The error is clipped so that log(0) can never occur.
            clipped_error = np.clip(weighted_error, eps, 1 - eps)
            model_weight = 0.5 * (np.log(1 - clipped_error) - np.log(clipped_error))

            self.models.append(tree)
            self.model_weights.append(model_weight)

            if weighted_error <= 0:
                # The tree classifies every weighted sample correctly, so
                # there is nothing left to re-weight. Continuing would
                # drive all sample weights to zero and normalise to NaN.
                break

            # Update the sample weights: misclassified samples
            # (Y * prediction == -1) gain weight, correct ones lose weight.
            alpha = alpha * np.exp(-model_weight * Y * prediction)

            # Normalise so the weights sum to one again.
            alpha = alpha / alpha.sum()

        return self

    def predict(self, X):
        """Return the weighted-majority label (+1.0 or -1.0) for each row of ``X``."""
        N, _ = X.shape
        result = np.zeros(N)
        for wt, tree in zip(self.model_weights, self.models):
            result += wt * tree.predict(X)

        # np.sign would return 0 on an exact tie, which is not a valid
        # label; ties are resolved towards +1.
        return np.where(result >= 0, 1.0, -1.0)

    def score(self, X, Y):
        """Return the fraction of rows of ``X`` whose prediction equals ``Y``."""
        prediction = self.predict(X)
        return np.mean(prediction == np.asarray(Y))

### Adaboost的表现

In [26]:
# Mean 8-fold cross-validation score of the hand-written Adaboost on the
# training set. `.values` replaces `as_matrix()`, which current pandas
# releases no longer provide; the estimator is fed float64 arrays.
adamodel = Adaboost(200)
train_features = trainX.values.astype(np.float64)
train_labels = trainY.values.astype(np.float64)
print("Adaboost model performance: %f" % cross_val_score(adamodel, train_features, train_labels, cv=8).mean())

### 测试集的表现

In [27]:
# Refit the hand-written Adaboost on the whole training set, then report
# its score on the held-out test set. `.values` replaces `as_matrix()`,
# which current pandas releases no longer provide.
adamodel.fit(trainX.values.astype(np.float64), trainY.values.astype(np.float64))
print("Adaboost model test performance: %f" % adamodel.score(testX.values.astype(np.float64), testY.values.astype(np.float64)))