## 测试我们的算法

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the iris dataset that ships with scikit-learn
iris = datasets.load_iris()
# Echo the loaded object; the captured output below starts with its 'data' array
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [3]:
# Original dataset: feature matrix X and label vector y
X, y = iris.data, iris.target
# One print call, one shape per line
print(X.shape, y.shape, sep="\n")

(150, 4)
(150,)


## train test split

In [4]:
# Do not split in the original sample order: the values of X or y may be sorted.
# The output below shows y grouped by class (all 0s, then all 1s, then all 2s).
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
# Build a random permutation of the sample indexes 0 .. len(X) - 1
# NOTE(review): no random seed is set in the visible cells, so this ordering
# (and any split derived from it) will differ on every run.
shuffled_indexes = np.random.permutation(len(X))
shuffled_indexes

array([ 14,  44, 117, 130,  39, 111, 100,  35, 119,   0, 128,  10,  94,
        17,  21, 148,   1, 102,  16, 129,  88,  19,  93, 142,  64,  61,
        70, 146, 114,   5,  79,  22, 131,  80, 118,  27, 139, 123,  87,
        91, 109,   8,  42, 107, 125,  90,  49,  96,  60,  67, 149, 103,
        54,  78, 127,  75,  12, 101,  24,  47, 145, 121,  50, 120, 144,
        26,  85,  97,  51,  53,  34, 134,  23, 105,  56,   7,   9,  37,
       124,  46,  25, 133,  86,  48,  99,  28, 135, 106,  92,  30,  76,
       108, 122,  63,  66, 143,  52,  83, 147,  55, 113,  32,  73,  29,
        68,  15,  77, 136,   2,  41, 112, 140,  31,  58,   4,  95,  43,
        84,  65,  59, 116, 137, 138, 126,  74,  20,  69,  71, 132,  18,
        89, 104,  36,  72,  33, 110,  13,  82,  98,  45,   3, 115,  40,
        11,  62,  81,  38,   6,  57, 141])

In [6]:
# Fraction of the samples to hold out as the test set
test_ratio = 0.2
# int() truncates, so a fractional sample count is rounded down
test_size = int(len(X) * test_ratio)
test_size

30

In [7]:
# Carve the shuffled indexes into a test part (the first test_size entries)
# and a training part (everything after them)
test_indexes, train_indexes = shuffled_indexes[:test_size], shuffled_indexes[test_size:]

# Fancy-index features and labels with the same index arrays so rows stay aligned
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(120, 4)
(120,)
(30, 4)
(30,)


## 把上面的过程封装成自己的方法 playML.model_selection.train_test_split，这里直接调用

In [8]:
from playML.model_selection import train_test_split

# Split X and y with the project's own train_test_split helper (default arguments)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Report the shape of each part, one per line
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(120, 4)
(120,)
(30, 4)
(30,)


## 接下来就使用 train_test_split 的数据来测试我们写好的 kNN 算法

In [9]:
from playML.kNN import kNNClassifier
# Create the project's kNN classifier with k=3, fit it on the training split,
# then predict a label for every sample in the held-out test split.
my_knn_clf = kNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)
y_predict = my_knn_clf.predict(X_test)
# Echo the predicted labels so they can be compared with y_test
y_predict

array([1, 1, 2, 2, 1, 2, 2, 0, 1, 0, 0, 2, 2, 1, 2, 2, 0, 2, 2, 0, 0, 2,
       1, 2, 0, 1, 2, 2, 0, 0])

In [10]:
# True labels of the test split, shown for comparison with the predictions
y_test

array([1, 1, 2, 2, 1, 2, 2, 0, 1, 0, 0, 2, 2, 1, 2, 2, 0, 2, 1, 0, 0, 2,
       1, 2, 0, 1, 2, 2, 0, 0])

In [12]:
# The rest is simple: compare y_predict with y_test element-wise
# and count how many positions agree
np.sum(y_predict == y_test)

29

In [14]:
# Accuracy: the fraction of test samples whose predicted label matches the true
# label (the mean of the boolean match mask equals matches / number of samples)
np.mean(y_predict == y_test)

0.9666666666666667

## sklearn 中也提供了现成的 train_test_split

In [18]:
from sklearn.model_selection import train_test_split as sklearn_train_test_split

# Hold out 20% of the samples for testing; random_state pins the shuffle so the
# split is the same on every run
X_train, X_test, y_train, y_test = sklearn_train_test_split(
    X, y, test_size=0.2, random_state=666
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(120, 4)
(120,)
(30, 4)
(30,)
