In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 水果识别

### 数据读取、划分、特征归一化

In [2]:
# 数据读取
data = pd.read_table('./fruit_data_with_colors.txt')
data.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [5]:
# 数据集划分
X = data[['mass', 'width', 'height', 'color_score']]
y = data['fruit_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)
print('数据集：{} 个，训练集：{} 个，测试集：{} 个'.format(len(data), len(X_train), len(X_test)))

fruit_label = dict(zip(data['fruit_label'], data['fruit_name']))
print('水果标签：', fruit_label)

数据集：59 个，训练集：44 个，测试集：15 个
水果标签： {1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}


In [7]:
# 特征归一化
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

### 数据建模

#### 朴素贝叶斯

In [10]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)
score = gnb.score(X_test_scaled, y_test)
score

0.4666666666666667

#### 随机森林

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators':[10, 50, 100, 150, 200]}
clf = GridSearchCV(RandomForestClassifier(random_state=0), parameters, cv=3, scoring='accuracy')
clf.fit(X_train, y_train)

print('最优参数：', clf.best_params_)
print('验证集最高得分：', clf.best_score_)
print('测试集准确率：{:.3f}'.format(clf.score(X_test, y_test)))

最优参数： {'n_estimators': 150}
验证集最高得分： 0.8181818181818182
测试集准确率：0.867


#### GBDT

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

parameters = {'learning_rate':[0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(GradientBoostingClassifier(), parameters, cv=3, scoring='accuracy')
clf.fit(X=X_train, y=y_train)

print('最优参数：', clf.best_params_)
print('验证集最高得分：', clf.best_score_)
print('测试集准确率：{:.3f}'.format(clf.score(X=X_test, y=y_test)))

最优参数： {'learning_rate': 0.001}
验证集最高得分： 0.7954545454545454
测试集准确率：0.733
