## 超参数

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
# 使用手写数字图片数据集
digits = datasets.load_digits()
X = digits.data
y = digits.target

print(X.shape)
print(y.shape)

(1797, 64)
(1797,)


In [3]:
# 使用 sklearn 的 train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
# 使用 sklearn 的 kNN 算法
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)

0.9888888888888889

## 寻找最好的 k

In [5]:
# 用一个变量保存搜寻过的最好的分数（准确度）
best_score = 0.0
best_k = -1

In [6]:
# 用不同的 k 运行 kNN，搜寻最好的分数和参数
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    if score > best_score:
        best_score = score
        best_k = k

print("best score is", best_score)
print("best k is", best_k)

best score is 0.9888888888888889
best k is 1


In [7]:
## 要注意，如果计算出来 best_k = 10， 也就是循环的边界， 那说明 10 可能还不是最好的值，只是循环结束了停在这个值 

## 考虑另外一个超参数：是否考虑距离

In [8]:
# sklearn 的 kNN 算法本来就有一个构造参数：weights:  uniform=不考虑距离；  distance=考虑距离
best_method = ""
best_score = 0.0
best_k = -1
for method in ["uniform", "distance"]:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_k = k
            best_method = method

print("best score is", best_score)
print("best k is", best_k)
print("best method is", best_method)

best score is 0.9888888888888889
best k is 1
best method is uniform


## 再来一个超参数：距离计算模式（明科夫斯基距离公式）中的 p

In [9]:
%%time

# sklearn 的 kNN 算法本来就有一个构造参数：p:  默认就是 2 即 欧拉距离
best_p = -1
best_score = 0.0
best_k = -1
for p in range(1, 6):
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_k = k
            best_p = p

print("best score is", best_score)
print("best k is", best_k)
print("best p is", best_p)

best score is 0.9916666666666667
best k is 1
best p is 3
CPU times: user 12.8 s, sys: 6.93 ms, total: 12.8 s
Wall time: 12.8 s
