In [None]:
! pip install mglearn==0.1.9

In [None]:
%matplotlib inline
import sys
from scipy import sparse
print("Python version: {}".format(sys.version))
import pandas as pd
print("pandas version: {}".format(pd.__version__))
import matplotlib
print("matplotlib version: {}".format(matplotlib.__version__))
import numpy as np
print("NumPy version: {}".format(np.__version__))
import scipy as sp
print("SciPy version: {}".format(sp.__version__))
import IPython
print("IPython version: {}".format(IPython.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))
import mglearn

In [None]:
# Load the iris dataset bundled with scikit-learn; later cells read its
# 'data', 'target', 'target_names', 'feature_names' and 'DESCR' entries.
from sklearn.datasets import load_iris
iris_dataset = load_iris()

# Data Exploration

In [None]:
# List the entries available in the dataset bundle
print(f"Keys of iris_dataset: \n{iris_dataset.keys()}")

In [None]:
# Show only the first 193 characters of the description, followed by an ellipsis line
print(iris_dataset.DESCR[:193], "...", sep="\n")

In [None]:
# Names of the classes the model will predict
print(f"Target names: {iris_dataset.target_names}")

In [None]:
# Names of the feature columns
print(f"Feature names: \n{iris_dataset.feature_names}")

In [None]:
# Container type holding the feature values
print(f"Type of data: {type(iris_dataset.data)}")

In [None]:
# Dimensions of the feature array
print(f"Shape of data: {iris_dataset.data.shape}")

In [None]:
# Preview the first five samples. Slicing with [:5] selects along the first
# axis, i.e. rows, so the label says "rows" rather than "columns".
print("First five rows of data:\n{}".format(iris_dataset['data'][:5]))

In [None]:
# Container type holding the labels
print(f"Type of target: {type(iris_dataset.target)}")

In [None]:
# Dimensions of the label array
print(f"Shape of target: {iris_dataset.target.shape}")

In [None]:
# Dump the full label array
print(f"Target:\n{iris_dataset.target}")

# Training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Partition features and labels into training and test portions;
# random_state=0 pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    random_state=0,
)

In [None]:
# Sizes of the training features and labels
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

In [None]:
# Sizes of the held-out features and labels
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Data Visualization

In [None]:
# Wrap the training features in a DataFrame whose columns are labelled
# with the dataset's feature names.
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset['feature_names'])
# Pairwise scatter plots of every feature, with points coloured by y_train
# and 20-bin histograms on the diagonal.
grr = pd.plotting.scatter_matrix(
    iris_dataframe,
    c=y_train,
    figsize=(15, 15),
    marker='o',
    hist_kwds={'bins': 20},
    s=60,
    alpha=.8,
    cmap=mglearn.cm3,
)

# Building the model

In [None]:
# Create a k-nearest-neighbors classifier configured with n_neighbors=1;
# it is not trained until fit() is called in a later cell.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Train the classifier on the training split; as the cell's last expression,
# the value returned by fit() is echoed as the cell output.
knn.fit(X_train, y_train)

# Making Predictions and Evaluating the model

In [None]:
# One new measurement of four feature values, reshaped to a single-row
# 2-D array (1 sample x 4 features).
X_new = np.array([5, 2.9, 1, 0.2]).reshape(1, -1)
print(f"X_new.shape: {X_new.shape}")

In [None]:
# Classify the new sample, then translate the predicted class index
# into its name via the target_names array.
prediction = knn.predict(X_new)
print(f"Prediction: {prediction}")
print(f"Predicted target name: {iris_dataset.target_names[prediction]}")

In [None]:
# Predict a label for every held-out sample
y_pred = knn.predict(X_test)
print(f"Test set predictions:\n {y_pred}")

In [None]:
# Accuracy computed by hand: the fraction of predictions equal to the true label
print(f"Test set score: {(y_pred == y_test).mean():.2f}")

In [None]:
# Same evaluation, this time through the classifier's own score() method
print(f"Test set score: {knn.score(X_test, y_test):.2f}")

# Visualizing the algorithm with some sample datasets

In [None]:
import matplotlib.pyplot as plt

# Generate the two-feature "forge" sample dataset from mglearn
X, y = mglearn.datasets.make_forge()
# Scatter the two feature columns against each other, marked by class
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
# Legend location code 4 is the lower-right corner
plt.legend(["Class 0", "Class 1"], loc="lower right")
plt.xlabel("First feature")
plt.ylabel("Second feature")
print(f"X.shape: {X.shape}")

In [None]:
# Generate the "wave" sample dataset with 40 samples and plot target vs. feature
X, y = mglearn.datasets.make_wave(n_samples=40)
plt.plot(X, y, 'o')
# Fix the vertical range to [-3, 3]
plt.ylim((-3, 3))
plt.xlabel("Feature")
plt.ylabel("Target")

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset and list the entries it provides
cancer = load_breast_cancer()
print(f"cancer.keys(): \n{cancer.keys()}")

In [None]:
# Dimensions of the cancer feature array
print(f"Shape of cancer data: {cancer.data.shape}")

In [None]:
# Pair each class name with the number of samples carrying that label;
# np.bincount counts occurrences of each integer label value.
print("Sample counts per class:\n{}".format(
    dict(zip(cancer.target_names, np.bincount(cancer.target)))))

In [None]:
# Names of the cancer dataset's feature columns
print(f"Feature names:\n{cancer.feature_names}")

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed scikit-learn version still provides it.
from sklearn.datasets import load_boston

# Load the Boston housing data and report the size of its feature array
boston = load_boston()
print(f"Data shape: {boston.data.shape}")

In [None]:
# Load mglearn's extended variant of the Boston data and report its shape
X, y = mglearn.datasets.load_extended_boston()
print(f"X.shape: {X.shape}")

In [None]:
 from sklearn.datasets import make_blobs
mglearn.plots.plot_knn_classification(n_neighbors=5)

In [None]:
# Call mglearn's k-NN classification plotting helper with n_neighbors=10
# (the helper's data and drawing logic are not visible from this notebook).
mglearn.plots.plot_knn_classification(n_neighbors=10)

In [None]:
# Regenerate the forge dataset and split it into training and test parts.
# This rebinds X, y and the X_train/X_test/y_train/y_test names that
# earlier cells used for other datasets.
from sklearn.model_selection import train_test_split
X, y = mglearn.datasets.make_forge()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Create a k-nearest-neighbors classifier configured with n_neighbors=3
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Train the classifier on the forge training split; the value returned by
# fit() is echoed as the cell output.
clf.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out forge samples
print(f"Test set predictions: {clf.predict(X_test)}")

In [None]:
# Score the classifier on the held-out forge samples
print(f"Test set accuracy: {clf.score(X_test, y_test):.2f}")

# Analyzing KNeighborsClassifier further

In [None]:
# One panel per neighbor count, each showing the fitted classifier's
# decision regions with the data points drawn on top.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 3))
for n_neighbors, ax in zip([1, 3, 9], axes):
    # fit() returns the estimator itself, so construction and training
    # can be chained in a single expression
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, ax=ax, alpha=.4)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set(
        title=f"{n_neighbors} neighbor(s)",
        xlabel="feature 0",
        ylabel="feature 1",
    )
# Legend location code 3 is the lower-left corner
axes[0].legend(loc="lower left")

# Final Training
**Here, we are evaluating training and test set performance with different numbers of neighbors.**

In [None]:
from sklearn.datasets import load_breast_cancer

# Stratified split of the breast cancer data, so both parts keep the
# class proportions of cancer.target.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=66,
)

training_accuracy = []
test_accuracy = []
# Sweep n_neighbors over 1..10 inclusive
neighbors_settings = range(1, 11)
for n_neighbors in neighbors_settings:
    # Build and train the model in one step (fit() returns the estimator)
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    # Accuracy on the data the model was trained on
    training_accuracy.append(clf.score(X_train, y_train))
    # Accuracy on the held-out data (generalization)
    test_accuracy.append(clf.score(X_test, y_test))

# Plot both accuracy curves against the number of neighbors
plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()

# Regression using KNN

In [None]:
# Call mglearn's k-NN regression plotting helper with n_neighbors=3
# (the helper's data and drawing logic are not visible from this notebook).
mglearn.plots.plot_knn_regression(n_neighbors=3)

In [None]:
# Call mglearn's k-NN regression plotting helper with n_neighbors=5
# (the helper's data and drawing logic are not visible from this notebook).
mglearn.plots.plot_knn_regression(n_neighbors=5)

In [None]:
# Train a 3-neighbor k-NN regressor on the wave dataset. This rebinds X, y
# and the train/test names used by the earlier classification cells.
from sklearn.neighbors import KNeighborsRegressor
X, y = mglearn.datasets.make_wave(n_samples=40)
# split the wave dataset into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# instantiate the model and set the number of neighbors to consider to 3
reg = KNeighborsRegressor(n_neighbors=3)
# fit the model using the training data and training targets;
# kept as the last expression so the returned estimator is echoed
reg.fit(X_train, y_train)

In [None]:
# Predicted targets for the held-out wave samples
print(f"Test set predictions:\n{reg.predict(X_test)}")

In [None]:
# Report the regressor's score() on the held-out data, labelled R^2
# (coefficient of determination)
print(f"Test set R^2: {reg.score(X_test, y_test):.2f}")

In [None]:
# One panel per neighbor count: the model's prediction curve over a dense
# grid, overlaid with the training and test points.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
# 1,000 evenly spaced points between -3 and 3, as a single-column 2-D array
line = np.linspace(-3, 3, 1000)[:, np.newaxis]
for n_neighbors, ax in zip([1, 3, 9], axes):
    # train a regressor using 1, 3, or 9 neighbors
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)
    ax.plot(line, reg.predict(line))
    ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
    ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
    # Title carries the neighbor count and both scores to two decimals
    ax.set_title(
        f"{n_neighbors} neighbor(s)\n"
        f" train score: {reg.score(X_train, y_train):.2f}"
        f" test score: {reg.score(X_test, y_test):.2f}"
    )
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")
# Legend entries are listed in the order the three plot calls were made
axes[0].legend(
    ["Model predictions", "Training data/target", "Test data/target"],
    loc="best",
)

In [None]:
# Call mglearn's linear-regression plotting helper for the wave data
# (the helper's fitting and drawing logic are not visible from this notebook).
mglearn.plots.plot_linear_regression_wave()