In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

## 水果识别

### 1、数据加载

In [2]:
data = pd.read_table('./fruit_data_with_colors.txt')
data.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [3]:
# 创建标签和名称字典
fruit_label = dict(zip(data['fruit_label'], data['fruit_name']))
fruit_label

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

In [22]:
# 划分数据集
X = data[['mass', 'width', 'height', 'color_score']]  # 特征列
y = data['fruit_label']  # 标准标签

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)
print('数据集：{}，训练集：{}，测试集：{}'.format(len(data), len(X_train), len(X_test)))

数据集：59，训练集：44，测试集：15


### 2、特征归一化

In [23]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.fit_transform(X_test)

### 3、交叉验证

#### 单一超参数

In [26]:
from sklearn.neighbors import KNeighborsClassifier  # KNN
from sklearn.model_selection import cross_val_score  # 交叉验证

k_range = [5, 10, 15, 20]
cv_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X=X_train_scaler, y=y_train, cv=3)  # 交叉验证
    cv_score = np.mean(scores)
    print('k={}，验证集上的准确率：{:.3f}'.format(k, cv_score))
    cv_scores.append(cv_score)

k=5，验证集上的准确率：0.845
k=10，验证集上的准确率：0.495
k=15，验证集上的准确率：0.521
k=20，验证集上的准确率：0.546


In [27]:
best_k = k_range[np.argmax(cv_scores)]
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(X_train_scaler, y_train)
print('测试集准确率：', best_knn.score(X_test_scaler, y_test))

测试集准确率： 0.8
