# 交叉验证

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

## 水果识别

### 1、数据加载

In [25]:
data = pd.read_table('./fruit_data_with_colors.txt')
data.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [26]:
# 创建标签和名称字典
fruit_label = dict(zip(data['fruit_label'], data['fruit_name']))
fruit_label

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

In [27]:
# 划分数据集
X = data[['mass', 'width', 'height', 'color_score']]  # 特征列
y = data['fruit_label']  # 标准标签

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)
print('数据集：{}，训练集：{}，测试集：{}'.format(len(data), len(X_train), len(X_test)))

数据集：59，训练集：44，测试集：15


### 2、特征归一化

In [28]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

### 3、交叉验证

#### 单一超参数

In [29]:
from sklearn.neighbors import KNeighborsClassifier  # KNN
from sklearn.model_selection import cross_val_score  # 交叉验证

k_range = [5, 10, 15, 20]
cv_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X=X_train_scaled, y=y_train, cv=3)  # 交叉验证
    cv_score = np.mean(scores)
    print('k={}，验证集上的准确率：{:.3f}'.format(k, cv_score))
    cv_scores.append(cv_score)

k=5，验证集上的准确率：0.845
k=10，验证集上的准确率：0.495
k=15，验证集上的准确率：0.521
k=20，验证集上的准确率：0.546


In [30]:
best_k = k_range[np.argmax(cv_scores)]
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(X_train_scaled, y_train)
print('测试集准确率：', best_knn.score(X_test_scaled, y_test))

测试集准确率： 0.8


In [31]:
# 调用  validation_curve 绘制超参数对训练集和验证集的影响
from sklearn.model_selection import validation_curve
from sklearn.svm import SVC  # SVM 
c_range = [1e-3, 1e-2, 0.1, 1, 10, 100, 1000, 10000]
train_scores, test_scores = validation_curve(SVC(), 
                                             X_train_scaled, y_train, 
                                             param_name='C', param_range=c_range, 
                                             cv=3, scoring='accuracy')
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 8))
plt.title('Validation Curve with SVM')
plt.xlabel('C')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(c_range, train_scores_mean, label='Training score', color='darkorange', lw=lw)
plt.fill_between(c_range, train_scores_mean - train_scores_std, 
                 train_scores_mean + train_scores_std, alpha=0.2, 
                color='navy', lw=lw)
plt.semilogx(c_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(c_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

<IPython.core.display.Javascript object>

In [32]:
# 从上图可知对SVM，C=10, 100为最优参数
svm_model = SVC(C=100)
svm_model.fit(X_train_scaled, y_train)
print(svm_model.score(X_test_scaled, y_test))

0.8


#### 多个超参数 

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

parameters = {'max_depth':[3, 5, 7, 9], 'min_samples_leaf':[1, 2, 3, 4]}
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, scoring='accuracy')
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 5, 7, 9], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [34]:
print('最优参数：', clf.best_params_)
print('验证集最高得分：', clf.best_score_)

最优参数： {'max_depth': 7, 'min_samples_leaf': 1}
验证集最高得分： 0.75


In [35]:
# 获得最优模型
best_model = clf.best_estimator_
print('测试集上准确率：', best_model.score(X_test, y_test))

测试集上准确率： 0.8666666666666667


# 模型评价指标

In [36]:
k = 7

# 转换为二分类问题
y_train_binary = y_train.copy()
y_test_binary = y_test.copy()

y_train_binary[y_train_binary != 1] = 0
y_test_binary[y_test_binary != 1] = 0

knn = KNeighborsClassifier(k)
knn.fit(X_train_scaled, y_train_binary)
y_pred = knn.predict(X=X_test_scaled)

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 准确率
print('准确率：', accuracy_score(y_test_binary, y_pred))

# 精确率      
print('精确率：', precision_score(y_test_binary, y_pred))

# 召回率
print('召回率：', recall_score(y_test_binary, y_pred))

# F1 值
print('F1 值：', f1_score(y_test_binary, y_pred))

准确率： 0.8666666666666667
精确率： 0.6666666666666666
召回率： 1.0
F1 值： 0.8


## PR 曲线

In [38]:
from sklearn.metrics import precision_recall_curve, average_precision_score

print('AP 值：', average_precision_score(y_test_binary, y_pred))
# precision, recall, _ = precision_recall_curve(y_test, y_pred)  # 这样得到两个值

AP 值： 0.6666666666666666


## ROC 曲线

In [39]:
from sklearn.metrics import roc_auc_score, roc_curve

print('AUC 值：', roc_auc_score(y_test_binary, y_pred))
# fpr, tpr, _ = roc_curve(y_test, y_pred)

AUC 值： 0.9090909090909091


## 混淆矩阵

In [40]:
from sklearn.metrics import confusion_matrix

y_pred = svm_model.predict(X=X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[4 0 0 0]
 [0 1 0 0]
 [3 0 5 0]
 [0 0 0 2]]


In [41]:
plt.figure()
plt.grid(False)
plt.imshow(cm, cmap='jet')
plt.colorbar()

<IPython.core.display.Javascript object>

<matplotlib.colorbar.Colorbar at 0x1b368cfa860>