In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 水果识别

### 1、数据加载

In [2]:
input_file = './fruit_data_with_colors.txt'
data = pd.read_table(input_file)
data.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [3]:
print('样本个数：', len(data))

样本个数： 59


In [4]:
# 创建目标标签和名称的字典
fruit_name_dict = dict(zip(data['fruit_label'], data['fruit_name']))
fruit_name_dict

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

In [5]:
# 划分数据集
X = data[['mass', 'width', 'height', 'color_score']]  # 特征
y = data['fruit_label']  # 标准标签

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)

print('数据集样本总数：{}，训练集样本数：{}，测试集样本数：{}'.format(len(data), len(X_train), len(X_test))) 

数据集样本总数：59，训练集样本数：44，测试集样本数：15


### 2、特征归一化

In [6]:
from sklearn.preprocessing import MinMaxScaler  # 最小最大值归一化

scaler = MinMaxScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.fit_transform(X_test)

for i in range(len(X.columns)):
    print('归一化前，训练集 {} 特征，最大值：{}，最小值：{}'.format(X_train.columns[i], 
                                                X_train.iloc[:, i].max(), 
                                                X_train.iloc[:, i].min()))
    print('归一化后，训练集 {} 特征，最大值：{:.3f}，最小值：{:.3f}'.format(X_train.columns[i], 
                                                X_train_scaler[:, i].max(), 
                                                X_train_scaler[:, i].min()))
    print()

归一化前，训练集 mass 特征，最大值：356，最小值：76
归一化后，训练集 mass 特征，最大值：1.000，最小值：0.000

归一化前，训练集 width 特征，最大值：9.2，最小值：5.8
归一化后，训练集 width 特征，最大值：1.000，最小值：0.000

归一化前，训练集 height 特征，最大值：10.5，最小值：4.0
归一化后，训练集 height 特征，最大值：1.000，最小值：0.000

归一化前，训练集 color_score 特征，最大值：0.92，最小值：0.55
归一化后，训练集 color_score 特征，最大值：1.000，最小值：0.000



In [None]:
# 可视化归一化效果


### 3、归一化对结果的影响

In [7]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)

# 在未进行归一化的训练集上计算
knn.fit(X=X_train, y=y_train)
print('未进行归一化，准确率：{}'.format(knn.score(X_test, y_test)))

# 进行归一化后的计算
knn.fit(X=X_train_scaler, y=y_train)
print('归一化后，准确率：{}'.format(knn.score(X=X_test_scaler, y=y_test)))

未进行归一化，准确率：0.5333333333333333
归一化后，准确率：0.8


### 4、标签编码和独热编码

In [8]:
# 随机生成有序型特征和类别特征作为例子
X_train = np.array([['male', 'low'],
                  ['female', 'low'],
                  ['female', 'middle'],
                  ['male', 'low'],
                  ['female', 'high'],
                  ['male', 'low'],
                  ['female', 'low'],
                  ['female', 'high'],
                  ['male', 'low'],
                  ['male', 'high']])

X_test = np.array([['male', 'low'],
                  ['male', 'low'],
                  ['female', 'middle'],
                  ['female', 'low'],
                  ['female', 'high']])

X_train

array([['male', 'low'],
       ['female', 'low'],
       ['female', 'middle'],
       ['male', 'low'],
       ['female', 'high'],
       ['male', 'low'],
       ['female', 'low'],
       ['female', 'high'],
       ['male', 'low'],
       ['male', 'high']], dtype='<U6')

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# 在训练集上进行编码操作
label_enc1 = LabelEncoder()  # 首先将male, female用数字编码
one_hot_enc = OneHotEncoder()  # 将数字编码转换为独热编码

label_enc2 = LabelEncoder()  # 将low, middle, high用数字编码

tr_feat1_tmp = label_enc1.fit_transform(X_train[:, 0]).reshape(-1, 1)  # reshape(-1, 1)保证为一维列向量
tr_feat1 = one_hot_enc.fit_transform(tr_feat1_tmp)
tr_feat1 = tr_feat1.todense()
print(tr_feat1)
print()

tr_feat2 = label_enc2.fit_transform(X_train[:, 1]).reshape(-1, 1)
print(tr_feat2)
print()

X_train_enc = np.hstack((tr_feat1, tr_feat2))
print(X_train_enc)

[[0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]

[[1]
 [1]
 [2]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]]

[[0. 1. 1.]
 [1. 0. 1.]
 [1. 0. 2.]
 [0. 1. 1.]
 [1. 0. 0.]
 [0. 1. 1.]
 [1. 0. 1.]
 [1. 0. 0.]
 [0. 1. 1.]
 [0. 1. 0.]]


In [12]:
# 在测试集上进行编码操作
te_feat1_tmp = label_enc1.transform(X_test[:, 0]).reshape(-1, 1)
te_feat1 = one_hot_enc.fit_transform(te_feat1_tmp)
te_feat1 = te_feat1.todense()

te_feat2 = label_enc2.fit_transform(X_test[:, 1]).reshape(-1, 1)

X_test_enc = np.hstack((te_feat1, te_feat2))
print(X_test_enc)

[[0. 1. 1.]
 [0. 1. 1.]
 [1. 0. 2.]
 [1. 0. 1.]
 [1. 0. 0.]]
