## 使用鸢尾花的数据集来测试我们的算法

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
y = iris.target

In [2]:
X.shape

(150, 4)

In [3]:
y.shape

(150,)

### train_test_split 把数据集拆为两部分，一大部分是训练用，一小部分是测试用

In [4]:
# 防止数据集是有序，会影响训练和测试的效果，所以第一步要把数据乱序（shuffle）处理
# 同时要注意，X 和 y 中的数据是一一对应的，所以乱序处理不能把他们的对应关系搞乱，so专门做一个乱序数组按照下标分别到 X 和 y 中取值
shuffle_indexes = np.random.permutation(len(X))
shuffle_indexes

array([ 10, 118,  15, 127, 101,  70,  54,   8,  75,  55, 141,  29,  96,
       102,  97,  50, 106,   7,  62,   6,  98,  34,   3,  46,  22,  13,
        44,  56, 126, 100,  90,  68, 113, 115,  85,  21,  61,  26,  66,
        19, 121,  38, 134,  58,  69,  81, 108, 120, 148,   9, 129,  28,
       130,  35,  37,   0,  49, 145, 142, 136,  91, 135, 107, 112,  71,
        59,  12,  95, 146,  51, 131,  64, 119,  72,  53,  80,  94,   1,
        77, 128,  65,  92,  40,   5, 140,  73,  39,  17, 105,  87, 132,
       125, 124,  16, 147,  36,  86, 116,  63,  84,  67, 114,  47,  76,
        82, 138,  57,  41,  33,  30,  45,  83, 122, 104,  20,  24, 144,
       149,  99,  32,  11, 111,  52, 139, 103, 109,  27,  79,  18,  42,
        89,  23, 117,  74,  88,  60,  43,  14, 137, 133,   2,  31, 143,
       110,  48,  93, 123,  25,  78,   4])

In [5]:
# 确定测试数据集占整个数据集的比率
test_ratio = 0.2
# 测试数据集大小
test_size = int(len(X) * test_ratio)
test_size

30

In [6]:
# 测试数据集的索引
test_indexes = shuffle_indexes[test_size:]
# 训练数据集的索引
train_indexes = shuffle_indexes[:test_size]

# 测试数据集
X_test = X[test_indexes]
y_test = y[test_indexes]
# 训练数据集
X_train = X[train_indexes]
y_train = y[train_indexes]

print(X_test.shape)
print(y_test.shape)
print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)
(30, 4)
(30,)


In [7]:
# 用这些准备好的数据来测试我们自己写的kNN算法

from my_kNN.kNN import kNNClassifier

knn = kNNClassifier(k=3)
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)
y_predict

array([1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 2, 0, 2, 1, 1, 1, 2, 2, 2, 0, 2, 0,
       2, 0, 0, 0, 0, 2, 2, 2, 1, 2, 2, 2, 1, 1, 0, 1, 2, 1, 2, 1, 2, 2,
       1, 1, 1, 0, 1, 2, 1, 1, 0, 0, 2, 1, 0, 0, 2, 1, 2, 2, 2, 0, 2, 0,
       1, 2, 1, 1, 1, 2, 0, 1, 1, 2, 1, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2,
       1, 0, 0, 2, 1, 2, 2, 2, 0, 1, 0, 0, 1, 0, 2, 1, 1, 1, 0, 0, 2, 2,
       0, 0, 2, 2, 0, 1, 2, 0, 1, 0])

In [9]:
# 预测准确率
sum(y_predict == y_test) / len(y_test)

0.9833333333333333

## sklearn 中封装的 train_test_split

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, )
print(X_test.shape)
print(y_test.shape)
print(X_train.shape)
print(y_train.shape)

(30, 4)
(30,)
(120, 4)
(120,)


In [12]:
# 使用 sklearn 的 kNN 算法
from sklearn.neighbors import KNeighborsClassifier
sk_knn = KNeighborsClassifier(n_neighbors=3)
sk_knn.fit(X_train, y_train)
y_predict = sk_knn.predict(X_test)


## sklearn 中封装的 计算准确度方法

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)

0.9333333333333333

In [17]:
# 如果不关心预测结果，只想知道模型的准确度，可以直接调用算法的方法的得到准确度：
sk_knn2 = KNeighborsClassifier(n_neighbors=3)
sk_knn2.fit(X_train, y_train)
sk_knn2.score(X_test, y_test)

0.9333333333333333