# Machine Learning Implementation

## imports

In [1]:
import json

import numpy as np
import pandas as pd
import plotly.offline as py
from plotly import graph_objects as go

## knn

### the maths

The k-means clustering algorithm (labelled `knn` throughout this notebook) partitions the data into k clusters. It starts with k centroids (often randomly selected) and assigns each point in the data to the cluster of its closest centroid. Each centroid is then updated to the mean of all points in its cluster. This process is repeated until the centroids stop changing.

1. Choose k
2. randomly select centroids $c_1,\dots,c_k \in \mathbb{R}^n$
3. until convergence repeat
    * for each $x_i$ assign to cluster $C_j \in \{C_1,\dots,C_k\}$ where
    $$
    j = \arg\min_{l \in \{1,\dots,k\}}\lVert x_i - c_l\rVert_2
    $$
    * update each $c_j$ as
    $$
    c_j = \frac{1}{|C_j|}\sum_{x_i\in C_j}x_i
    $$
    

### make fake data

In [2]:
def get_2d_blob(n,x,y,r=1):
    """Sample ``n`` random points of a 2-D blob centred on ``(x, y)``.

    The x-coordinates are drawn from a normal distribution with mean ``x``
    and standard deviation ``r``.  Each y-coordinate is drawn from a normal
    distribution with mean ``y`` and standard deviation
    ``1 / sqrt(1 + |x_i - x|)``, so the vertical spread narrows for points
    whose x-coordinate lies further from the centre.

    Returns
    -------
    tuple of two arrays
        ``(xs, ys)``, each of length ``n``.
    """
    xs = np.random.normal(loc=x, scale=r, size=n)
    horizontal_offset = abs(xs - x)
    y_scale = 1 / (1 + horizontal_offset) ** .5
    ys = np.random.normal(loc=y, scale=y_scale, size=n)
    return xs, ys

In [3]:
# Seed NumPy's global random generator so the fake data is identical on every
# "Restart & Run All"; change RANDOM_SEED to get a different data set.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

n = 100  # number of points per blob

# Each get_2d_blob call returns an (x-values, y-values) pair.  Joining the
# three pairs along axis 1 and transposing yields one row per point with the
# x-coordinate in column 0 and the y-coordinate in column 1.
xy = np.concatenate((
    get_2d_blob(n, 1, 2, 3),
    get_2d_blob(n, 14, 5),
    get_2d_blob(n, 7, 10)
), axis=1).T

x = xy[:, 0]
y = xy[:, 1]

In [4]:
# Interactive figure holding the raw data points as its first trace.
knn_fig = go.FigureWidget()
scatter = go.Scatter(
    x=x,
    y=y,
    mode='markers',
    name='blob data + noise',
)
knn_fig.add_trace(scatter)

# Iteration counter shown in the figure title.
iteration = 0
knn_fig.layout.title = f'knn Iteration {iteration}'

# Colours for the clusters, taken from entries 1-3 of the figure template's
# colourway and keyed by those same positions.
template_colorway = knn_fig.layout.template.layout.colorway
color_mapping = {slot: template_colorway[slot] for slot in (1, 2, 3)}

In [5]:
# Display the figure widget as this cell's output.
knn_fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'blob data + noise',
              'type…

### knn implementation

In [436]:
# set k and define random centroids
# Each centroid is drawn uniformly from the bounding box of the data; `c` has
# one row per centroid (column 0 = x, column 1 = y) and `cs` keeps the history
# of centroid positions, newest last.
k = 3
c = np.column_stack((
    np.random.uniform(x.min(), x.max(), k),
    np.random.uniform(y.min(), y.max(), k)))
cs = [c]

# Make this cell safe to re-run: trace 0 is the data scatter, everything after
# it is a centroid trace left over from a previous run.  Without this, new
# centroid traces would pile up behind the old ones while update_centroids
# keeps moving the stale traces at positions 1..k.
knn_fig.data = knn_fig.data[:1]
iteration = 0
knn_fig.layout.title = f'knn Iteration {iteration}'

# add the centroids to the fig
for i in range(len(c)):
    knn_fig.add_scatter(
        x=[c[i, 0]],
        y=[c[i, 1]],
        mode='markers',
        marker=dict(
            size=12,
            line=dict(width=2, color='DarkSlateGrey')),
        text=[f'Centroid-{i}'],
        name=f'Centroid-{i}')

# Euclidean distance calculator
def dist(a, b, ax=1):
    """Return the Euclidean (L2) norm of ``a - b`` taken along axis ``ax``.

    With the default ``ax=1`` and ``a`` holding one point per row, the result
    contains one distance per row.
    """
    difference = a - b
    return np.linalg.norm(difference, axis=ax)

def assign_clusters(xy, c):
    """Label every point with the index of its nearest centroid.

    Parameters
    ----------
    xy : array with one point per row.
    c : iterable of centroids.

    Returns
    -------
    Array holding, for each row of ``xy``, the index into ``c`` of the
    closest centroid.

    Side effect: recolours the data trace ``knn_fig.data[0]`` so that each
    point takes the colour of its cluster.
    """
    # One array of distances per centroid, transposed so that each row
    # belongs to a point and each column to a centroid.
    per_centroid = [dist(xy, centre) for centre in c]
    distance_table = np.array(per_centroid).T
    cluster_labels = np.argmin(distance_table, axis=1)

    # color_mapping is looked up with label + 1.
    knn_fig.data[0].marker.color = [
        color_mapping[label + 1] for label in cluster_labels
    ]
    return cluster_labels

def update_centroids(xy, cluster_labels, cs):
    """Move every centroid to the mean of the points assigned to it.

    Parameters
    ----------
    xy : array with one point per row (column 0 = x, column 1 = y).
    cluster_labels : array giving, for each row of ``xy``, the index of the
        cluster the point belongs to.
    cs : non-empty list of centroid arrays, newest last.  The new centroids
        are appended to it in place.

    Returns
    -------
    (c, cs) : the new centroid array and the (mutated) history list.

    Side effects: repositions the centroid traces ``knn_fig.data[1:]``,
    increments the global ``iteration`` counter, updates the figure title and
    prints how far each centroid moved.
    """
    global iteration
    previous = cs[-1]
    c = []
    # Take the number of clusters from the current centroids instead of
    # hard-coding 3, so the function follows whatever k was chosen.
    for i in range(len(previous)):
        members = xy[cluster_labels == i, :]
        if len(members):
            centroid = members.mean(axis=0)
        else:
            # A cluster with no points has no mean: NumPy would return NaN,
            # and on the next assignment step argmin would pick that NaN
            # distance for every point.  Leave the centroid where it was.
            centroid = previous[i]
        knn_fig.data[1+i].x = (centroid[0],)
        knn_fig.data[1+i].y = (centroid[1],)
        c.append(centroid)
    c = np.array(c)
    cs.append(c)
    iteration += 1
    knn_fig.layout.title = f'knn Iteration {iteration}'
    print(f'Change in centroids {dist(cs[-1],cs[-2])}')
    return c, cs

def update_step():
    """Run one iteration: reassign points to clusters, then move the centroids.

    Reads the module-level ``xy`` and rebinds the module-level ``c`` and
    ``cs`` to the values returned by ``update_centroids``.
    """
    global c, cs
    labels = assign_clusters(xy, c)
    c, cs = update_centroids(xy, labels, cs)

# Display the figure again, now including the centroid traces added above.
knn_fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'blob data + noise',
              'type…

In [441]:
# Run a single iteration; re-execute this cell until the printed change in
# centroids is zero for every centroid.
update_step()

Change in centroids [0. 0. 0.]


## end