# 集成学习

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# 用于在jupyter中进行绘图
%matplotlib inline

## 水果识别

### 1. 数据加载

In [2]:
# Load the fruit measurements from the text file into a DataFrame
# (read_table parses tab-delimited text by default).
fruits_df = pd.read_table(filepath_or_buffer='fruit_data_with_colors.txt')

# Preview the first five rows as the cell's displayed result.
fruits_df.head(n=5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [3]:
# Report the number of rows (one row per fruit sample) in the dataset.
print('样本个数：', fruits_df.shape[0])

样本个数： 59


In [4]:
# Build a lookup from the numeric fruit label to its fruit name.
# Rows are visited in order, so a label that appears on several rows simply
# keeps the name from its last row.
fruit_name_dict = {
    label: name
    for label, name in zip(fruits_df['fruit_label'], fruits_df['fruit_name'])
}
print(fruit_name_dict)

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}


In [5]:
# Features: the four measurement columns. Target: the fruit label column.
X = fruits_df.loc[:, ['mass', 'width', 'height', 'color_score']]
y = fruits_df.loc[:, 'fruit_label']

# Hold out a quarter of the samples for testing; the fixed random_state
# keeps the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [6]:
# Report the size of the full dataset and of the train / test partitions.
print('数据集样本数：{}，训练集样本数：{}，测试集样本数：{}'.format(
    *(len(part) for part in (X, X_train, X_test))))

数据集样本数：59，训练集样本数：44，测试集样本数：15


### 2. 特征归一化

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training set only, then apply that
# same fitted transform to the test set.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# For each of the four features, print the training-set range before and
# after scaling. X_train is a DataFrame (positional .iloc access) while the
# scaled result is indexed as a plain 2-D array.
before_fmt = '归一化前，训练数据第{}维特征最大值：{:.3f}，最小值：{:.3f}'
after_fmt = '归一化后，训练数据第{}维特征最大值：{:.3f}，最小值：{:.3f}'
for dim in range(1, 5):
    raw_col = X_train.iloc[:, dim - 1]
    scaled_col = X_train_scaled[:, dim - 1]
    print(before_fmt.format(dim, raw_col.max(), raw_col.min()))
    print(after_fmt.format(dim, scaled_col.max(), scaled_col.min()))
    print()

归一化前，训练数据第1维特征最大值：356.000，最小值：76.000
归一化后，训练数据第1维特征最大值：1.000，最小值：0.000

归一化前，训练数据第2维特征最大值：9.200，最小值：5.800
归一化后，训练数据第2维特征最大值：1.000，最小值：0.000

归一化前，训练数据第3维特征最大值：10.500，最小值：4.000
归一化后，训练数据第3维特征最大值：1.000，最小值：0.000

归一化前，训练数据第4维特征最大值：0.920，最小值：0.550
归一化后，训练数据第4维特征最大值：1.000，最小值：0.000



### 3. 数据建模

#### 3.1 [Stacking](https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier

# Base learners for the stacked ensemble: 1-nearest-neighbour, a linear SVM
# and a decision tree.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = SVC(kernel='linear')
# Seed the tree: scikit-learn permutes candidate features at every split, so
# an unseeded tree (and the stack built on it) can score differently on each
# Restart & Run All.
clf3 = DecisionTreeClassifier(random_state=0)

# Meta-learner that combines the base learners' predictions.
lr = LogisticRegression(C=100)

sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=lr)

# Fit every base learner on its own as well as the stacked model, so each can
# be scored individually below.
# NOTE(review): mlxtend's StackingClassifier presumably fits clones of the
# base learners by default, which is why the standalone fits are kept —
# confirm against the mlxtend documentation.
for model in (clf1, clf2, clf3, sclf):
    model.fit(X_train_scaled, y_train)

# Compare held-out accuracy of each base learner against the stack.
print('kNN测试集准确率：{:.3f}'.format(clf1.score(X_test_scaled, y_test)))
print('SVM测试集准确率：{:.3f}'.format(clf2.score(X_test_scaled, y_test)))
print('DT测试集准确率：{:.3f}'.format(clf3.score(X_test_scaled, y_test)))
print('Stacking测试集准确率：{:.3f}'.format(sclf.score(X_test_scaled, y_test)))

kNN测试集准确率：0.867
SVM测试集准确率：0.533
DT测试集准确率：0.733
Stacking测试集准确率：0.867


#### 3.2 [AdaBoost](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# Candidate ensemble sizes to compare.
parameters = {'n_estimators': [20, 40, 60, 80, 100, 120, 140]}

# 3-fold cross-validated grid search on the training set, ranked by accuracy.
# The estimator is seeded so the chosen n_estimators and the reported scores
# are reproducible across runs (matching the seeded random-forest search
# later in this notebook).
clf = GridSearchCV(AdaBoostClassifier(random_state=0), parameters, cv=3, scoring='accuracy')
clf.fit(X_train_scaled, y_train)

# best_score_ is the mean cross-validated accuracy of the best setting; the
# final line scores the held-out test set with the fitted search object.
print('最优参数：', clf.best_params_)
print('验证集最高得分：', clf.best_score_)
print('测试集准确率：{:.3f}'.format(clf.score(X_test_scaled, y_test)))

#### 3.3 [GBDT](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Imported here as well so this cell does not rely on another cell having
# brought GridSearchCV into the namespace.
from sklearn.model_selection import GridSearchCV

# Candidate learning rates, spaced by powers of ten.
# NOTE(review): rates above 1 are unusually large for boosting shrinkage —
# confirm that 10 and 100 are intentional.
parameters = {'learning_rate': [0.001, 0.01, 0.1, 1, 10, 100]}

# 3-fold cross-validated grid search on the training set, ranked by accuracy.
# The estimator is seeded so the selected learning rate and the reported
# scores are reproducible across runs.
clf = GridSearchCV(GradientBoostingClassifier(random_state=0), parameters, cv=3, scoring='accuracy')
clf.fit(X_train_scaled, y_train)

# best_score_ is the mean cross-validated accuracy of the best setting; the
# final line scores the held-out test set with the fitted search object.
print('最优参数：', clf.best_params_)
print('验证集最高得分：', clf.best_score_)
print('测试集准确率：{:.3f}'.format(clf.score(X_test_scaled, y_test)))

#### 3.4 [随机森林](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes to compare.
parameters = dict(n_estimators=[10, 50, 100, 150, 200])

# 3-fold cross-validated grid search over a seeded random forest, ranked by
# accuracy and fitted on the scaled training data.
clf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=parameters,
    cv=3,
    scoring='accuracy',
)
clf.fit(X_train_scaled, y_train)

# Report the winning setting, its mean cross-validated accuracy, and the
# accuracy of the fitted search object on the held-out test set.
test_accuracy = clf.score(X_test_scaled, y_test)
print('最优参数：', clf.best_params_)
print('验证集最高得分：', clf.best_score_)
print('测试集准确率：{:.3f}'.format(test_accuracy))