In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Render matplotlib figures inline in the notebook output
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Enable the autoreload extension so that edits to external python modules
# are picked up without restarting the kernel;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

# Load The Data

In [None]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
import tqdm
import itertools
from sklearn import datasets

# Download the raw Iris data; the file has no header row, so columns are
# addressed by position below.
iris_df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/iris/iris.data', header=None)

# select setosa and versicolor (the first 100 rows); column 4 holds the class name.
# The frame is named `iris_df` -- the original code read from an undefined `df`.
y = iris_df.iloc[0:100, 4].values
# binary target: 0 for 'Iris-setosa', 1 for everything else
y = np.where(y == 'Iris-setosa', 0, 1)

# extract sepal length and petal length (columns 0 and 2)
X = iris_df.iloc[0:100, [0, 2]].values

# Visualize The Data

In [None]:
# Visualise the two selected features with the project's plotting helper
# (what exactly is drawn is defined in utils.plot_data).
utils.plot_data(X)

# Split The Data Into Training, Validation, and Testing

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 33% of all samples as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Second split: carve 33% of the remaining training portion off as the validation set.
# X_train / y_train are rebound to the smaller, final training set here.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

# KNN - Supervised
## Implementation

In [None]:
import statistics

class KNN(object):
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Args:
        k: number of nearest training samples that vote on each prediction.
    """

    def __init__(self, k):
        self.X_train = None
        self.y_train = None
        self.k = k

    def train(self, X_train, y_train):
        """Memorise the training set; KNN does no fitting beyond storing it.

        Args:
            X_train: 2-D array of training inputs, one row per sample.
            y_train: 1-D array of numeric labels aligned with the rows of X_train.
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every row of X_test.

        For each input, the distances to all training samples are computed,
        the k closest samples are selected and the most common label among
        them is returned.

        Args:
            X_test: 2-D array of inputs with the same number of columns as
                the training inputs.

        Returns:
            1-D float array with one predicted label per row of X_test.

        Raises:
            RuntimeError: if called before train().
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict() called before KNN.train()")

        X_train = np.asarray(self.X_train)
        y_train = np.asarray(self.y_train)
        X_test = np.asarray(X_test)

        predictions = np.zeros((X_test.shape[0]))
        for i in range(len(X_test)):
            # Euclidean distance from this input to every training sample
            # (norm taken across the feature axis).
            dists = np.linalg.norm(X_train - X_test[i], axis=1)
            # Indices of the k smallest distances; a stable sort keeps
            # equidistant neighbours in training-set order, so results are
            # deterministic.
            nearest = np.argsort(dists, kind="stable")[:self.k]
            closest_k = y_train[nearest]
            # Majority vote. Labels are listed nearest-first; on Python 3.8+
            # statistics.mode resolves a tie to the first label encountered.
            predictions[i] = statistics.mode(closest_k.tolist())
        return predictions

## Training & Searching For The Best K

In [None]:
# Best hyper-parameter, its validation accuracy and the corresponding model so far.
best_K = 0
best_val = 0
best_KNN = None

# Candidate neighbourhood sizes: 1, 3, 5, 7 (odd, so a two-class vote cannot tie).
Ks = np.arange(1, 8, 2)

# Iterate over the candidates directly: wrapping a single iterable in
# itertools.product only produced 1-tuples and hid the total from tqdm.
for K in tqdm.tqdm(Ks, unit="k"):
    # set up the KNN and learn the parameters
    knn = KNN(k = K)
    knn.train(X_train,y_train)

    # calculate accuracy on the validation split and update
    val_acc = np.mean(y_val == knn.predict(X_val))

    # `>=` (not `>`) means that on equal accuracy the larger K wins.
    if val_acc >= best_val:
        best_K = K
        best_KNN = knn
        best_val = val_acc

# K is an integer and the accuracy a fraction; print them as such rather than
# in scientific notation.
print('Best K: %d, Best Val: %.4f' % (best_K, best_val))

## Model Evaluation

In [None]:
# Accuracy is the fraction of predictions that match the true labels,
# reported for the data the model memorised and for the held-out test split.
print('Train Accuracy:', (best_KNN.predict(X_train) == y_train).mean())
print('Test Accuracy:', (best_KNN.predict(X_test) == y_test).mean())

## Plot Decision Boundary

In [None]:
# Draw the decision boundary of the selected KNN model over the full data set
# using the project's plotting helper (details live in utils.plot_decision_boundary).
utils.plot_decision_boundary(X, y, best_KNN)

# KMeans - Unsupervised
## Implementation

In [None]:
from sklearn.metrics import pairwise_distances_argmin

class KMeans(object):
    """k-means clustering using Lloyd's algorithm with Euclidean distance.

    Args:
        k: number of clusters.

    Attributes:
        X_train: the data passed to the last train() call (None before that).
        centers: array with one row per cluster centre (None before train()).
    """

    def __init__(self, k):
        self.X_train = None
        self.k = k
        self.centers = None

    @staticmethod
    def _nearest_center(X, centers):
        """Return, for every row of X, the index of the closest row of centers."""
        # (n_samples, 1, n_features) - (1, k, n_features) -> (n_samples, k, n_features)
        diffs = X[:, np.newaxis, :] - centers[np.newaxis, :, :]
        # argmin picks the lowest index on ties.
        return np.linalg.norm(diffs, axis=2).argmin(axis=1)

    def train(self, X, max_iter=300):
        """Fit the cluster centres to X.

        Args:
            X: 2-D array of samples, one row per sample.
            max_iter: upper bound on the number of update steps, so the loop
                always terminates.

        Returns:
            Tuple (centers, labels): the fitted centres and, for every row of
            X, the index of the centre it is assigned to.

        Raises:
            ValueError: if k is not between 1 and the number of samples.
        """
        X = np.asarray(X, dtype=float)
        n_samples = X.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                "k must be between 1 and the number of samples (%d), got %r"
                % (n_samples, self.k))
        self.X_train = X

        # 1. Randomly choose k distinct samples as the initial centres
        #    (fixed seed, so every run starts from the same centres).
        rng = np.random.RandomState(2)
        seed_rows = rng.permutation(n_samples)[:self.k]
        self.centers = X[seed_rows]

        # 2a. Assign labels based on closest center
        labels = self._nearest_center(X, self.centers)

        for _ in range(max_iter):
            # 2b. Find new centers from means of points. A cluster that lost
            #     all its points keeps its previous centre instead of turning
            #     into NaN.
            new_centers = self.centers.copy()
            for j in range(self.k):
                members = X[labels == j]
                if len(members):
                    new_centers[j] = members.mean(axis=0)

            # 2c. Check for convergence: the centres stopped moving.
            converged = np.array_equal(new_centers, self.centers)
            self.centers = new_centers
            if converged:
                break

            # Re-assign against the *updated* centres; the original loop kept
            # assigning against the initial centres and so never iterated.
            labels = self._nearest_center(X, self.centers)

        return self.centers, labels

    def predict(self, X_test):
        """Assign every row of X_test to its closest fitted centre.

        Args:
            X_test: 2-D array with the same number of columns as the training data.

        Returns:
            1-D float array holding the index of the closest centre per row.

        Raises:
            RuntimeError: if called before train().
        """
        if self.centers is None:
            raise RuntimeError("KMeans.predict() called before KMeans.train()")
        X_test = np.asarray(X_test, dtype=float)
        # Cast to float to keep the dtype of the original zero-initialised output.
        return self._nearest_center(X_test, self.centers).astype(float)

## Training And Plotting Clusters

In [None]:
# Cluster the iris features into two groups; the true labels y are not used here.
kmeans = KMeans(2)
centers, labels = kmeans.train(X)
# Colour every sample by the cluster it was assigned to.
plt.scatter(X[:, 0], X[:, 1], c=labels,
            s=50, cmap='viridis');
# Overlay the centres returned by train() as "+" markers.
plt.scatter(centers[:, 0], centers[:, 1], marker = "+")

## Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score

cluster_ids = kmeans.predict(X)
# Cluster indices are arbitrary: cluster 0 may correspond to class 0 or to
# class 1 depending on the initial centres. Score both possible mappings and
# report the better one instead of hard-coding the `1 - y` mapping.
max(accuracy_score(y, cluster_ids), accuracy_score(1 - y, cluster_ids))

## Cool Example - Digit Recognition
### Load The Data

In [None]:
from sklearn.datasets import load_digits
# Load scikit-learn's bundled digits data set; the cells below use
# digits.data (features) and digits.target (true digit labels).
digits = load_digits()

### Train KMeans

In [None]:
# One cluster per digit class; note this rebinds `kmeans`, replacing the iris model.
kmeans = KMeans(k=10)
kmeans.train(digits.data)
# Cluster index assigned to every sample.
clusters = kmeans.predict(digits.data)

### Visualize The Centers

In [None]:
# One small subplot per cluster centre, laid out as a 2x5 grid.
fig, ax = plt.subplots(2, 5, figsize=(8, 3))
# View each of the 10 centres as an 8x8 image (rebinds the name `centers`).
centers = kmeans.centers.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers):
    # Hide the tick marks; only the image matters here.
    axi.set(xticks=[], yticks=[])
    axi.imshow(center, interpolation='nearest', cmap=plt.cm.binary)

### Evaluate The Model

In [None]:
from scipy.stats import mode
from sklearn.metrics import accuracy_score

# Cluster indices carry no meaning by themselves, so map every cluster to the
# true digit that occurs most often among its members before scoring.
labels = np.zeros_like(clusters)
for i in range(10):
    # Samples that were assigned to cluster i.
    mask = (clusters == i)
    # mode(...)[0] is the most frequent true label within the cluster.
    labels[mask] = mode(digits.target[mask])[0]
# Fraction of samples whose mapped cluster label equals the true digit.
accuracy_score(digits.target, labels)