In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# Seed NumPy's global random generator so that anything drawn from np.random
# in this notebook is the same on every fresh run.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure.
plt.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``; the directory is
    created first if it does not exist yet.

    Parameters
    ----------
    fig_id : str
        File name of the figure, without the ``.png`` extension.
    tight_layout : bool, optional
        When true (the default), ``plt.tight_layout()`` is applied before
        saving.
    """
    target_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # plt.savefig does not create missing directories, so make sure the
    # target exists. (isdir + makedirs instead of exist_ok=True keeps this
    # runnable on Python 2, which this notebook still supports.)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    path = os.path.join(target_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

def plot_precision_recall_vs_treshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    A new 12x6 figure is opened. The last element of ``precisions`` and of
    ``recalls`` is dropped before plotting against ``thresholds``
    (presumably because they come from sklearn's ``precision_recall_curve``,
    which returns one more precision/recall value than thresholds - confirm
    against the caller).

    The legend and axis texts are in Russian: "Точность" = precision,
    "Полнота" = recall, "Порог" = threshold.
    """
    plt.figure(figsize=(12, 6))
    curves = (
        (precisions, "b--", "Точность"),
        (recalls, "g-", "Полнота"),
    )
    for series, line_fmt, legend_text in curves:
        plt.plot(thresholds, series[:-1], line_fmt, label=legend_text)
    plt.xlabel("Порог", fontsize=16)
    plt.legend(loc="center left", fontsize=16)
    plt.ylim([0, 1])

def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis) in a new 8x6 figure.

    Both axes are fixed to the range [0, 1]. The axis labels are in Russian:
    "Полнота" = recall, "Точность" = precision.
    """
    unit_square = [0, 1, 0, 1]  # xmin, xmax, ymin, ymax
    label_font = 16

    plt.figure(figsize=(8, 6))
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Полнота", fontsize=label_font)
    plt.ylabel("Точность", fontsize=label_font)
    plt.axis(unit_square)

def plot_roc_curve(rates):
    """Draw one or more ROC curves plus the diagonal of a random classifier.

    Parameters
    ----------
    rates : iterable
        One entry per curve. Each entry is indexed at positions 0-3, which
        are passed to ``plt.plot`` as x values, y values, format string and
        legend label respectively. Judging by the axis labels, x is the
        false positive rate and y the true positive rate.

    The axis and legend texts are in Russian ("Случайная классификация" =
    random classification).
    """
    plt.figure(figsize=(10, 8))
    for curve in rates:
        xs, ys, line_fmt, legend_text = curve[0], curve[1], curve[2], curve[3]
        plt.plot(xs, ys, line_fmt, linewidth=2, label=legend_text)
    # Dashed diagonal: the reference line labelled as random classification.
    plt.plot([0, 1], [0, 1], 'k--', label="Случайная классификация")
    plt.axis([0, 1, 0, 1])
    plt.xlabel("Доля ложноположительных классификаций")
    plt.ylabel("Доля истинно положительных классификаций")
    plt.legend(loc="lower right", fontsize=16)

def draw_digit(X, i):
    """Display entry ``i`` of ``X`` as a 28x28 image and return that entry.

    ``X[i]`` is reshaped to 28x28 for display only; the value returned is
    the original, un-reshaped ``X[i]``.
    """
    pixels = X[i]
    plt.imshow(
        pixels.reshape(28, 28),
        cmap=matplotlib.cm.binary,
        interpolation="nearest",
    )
    return pixels

In [2]:
# fetch_mldata was deprecated in scikit-learn 0.20 and removed in 0.22 (the
# mldata.org service behind it is no longer available), so prefer
# fetch_openml and only fall back to the legacy loader on installations that
# do not provide it.
try:
    from sklearn.datasets import fetch_openml
except ImportError:
    from sklearn.datasets import fetch_mldata

    mnist = fetch_mldata("MNIST original")
    X, y = mnist["data"], mnist["target"]
else:
    mnist = fetch_openml("mnist_784", version=1)
    # Depending on the scikit-learn version, fetch_openml may return pandas
    # objects, and it returns the labels as strings. Normalise to plain numpy
    # arrays with numeric labels so positional indexing with an index array
    # and numeric comparisons on y keep working.
    # NOTE(review): the OpenML copy is not sorted by label the way the mldata
    # copy was; the first 60000 rows are assumed to still be the training
    # split - confirm.
    X = np.asarray(mnist["data"])
    y = np.asarray(mnist["target"]).astype(np.float64)

In [3]:
# The first 60000 rows are used for training, the remainder for testing.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# Shuffle the training rows, applying the same permutation to the features
# and to the labels so they stay aligned.
shuffle_index = np.random.permutation(60000)
X_train = X_train[shuffle_index]
y_train = y_train[shuffle_index]

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default hyper-parameters. It is left
# unfitted here: it is handed to GridSearchCV, which fits it with each
# candidate hyper-parameter setting.
knn_clf = KNeighborsClassifier()

In [9]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the k-NN classifier: both weighting schemes
# combined with 1, 2 and 3 neighbours.
grid_params = [
    {
        "weights": ["uniform", "distance"],
        "n_neighbors": [1, 2, 3],
    },
]

# 5-fold cross-validated search scored by accuracy, using 4 parallel jobs.
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=grid_params,
    scoring="accuracy",
    cv=5,
    n_jobs=4,
)

In [None]:
# Run the grid search on the shuffled training set: 2 weightings x 3 neighbour
# counts = 6 candidate settings, each cross-validated on 5 folds.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# appears never to have been run to completion - likely slow on 60000 samples.
grid_search.fit(X_train, y_train)