# machine_learning_implementation 

> This is a notebook for the detail - both mathematically and of a raw Ward-Jones python implementation

## imports

In [442]:
import pandas as pd
import plotly.offline as py
from plotly import graph_objects as go
import numpy as np
import json

## Linear regression

### the maths

The linear model (or line of best fit in 2D) aims to describe the continuous y vairable a.k.a the target variable (e.g. house prices) as a linear combination of features (e.g. square footage / number of bedrooms) the features are also refered to as the design matrix.

$$
\begin{align}
\hat{y}&=\beta_0x_0+\cdots+\beta_nx_n\quad &n\in \mathbb{N}, x_o = 1 \\
\hat{y}&=\sum^{n}_{i=0}\beta_ix_i \\
\hat{y}&=\mathbf{\boldsymbol{\beta}^Tx}\quad&\boldsymbol{\beta},\mathbf{x}\in\mathbb{R}^{n\times1}\\
\hat{y}&=g(\boldsymbol{\beta}^T\mathbf{x})
\end{align}
$$

where g, the activation function, is the identidy in linear regression  

We define the cost function as half of the mean square error:

$$
\begin{align}
J(\boldsymbol{\beta})
&= \frac{1}{2m}\sum^{m}_{j=1}\left(
y^j-\hat{y}^j
\right)^2,\quad m\in \mathbb{N} \text{ is the number of training samples}\\
&= \frac{1}{2m}\sum^{m}_{j=1}\left(
y^j-g(\boldsymbol{\beta}^T\mathbf{x}^j)
\right)^2
\end{align}
$$

We need to differentiate the cost function i.e. find the gradient

$$
\begin{align}
\frac{\partial J}{\partial\beta_k}\left(\boldsymbol{\beta}\right) &= \frac{\partial}{\partial\beta_k}\left(
\frac{1}{2m}\sum^{m}_{j=1}\left(
y^j-g(\boldsymbol{\beta}^T\mathbf{x}^j)\right)^2
\right)\\
&= \frac{\partial}{\partial\beta_k}\left(
\frac{1}{2m}\sum^{m}_{j=1}
\left(
y^j-\sum^{n}_{i=0}\beta_ix_i^j
\right)^2
\right)\\
&=
\frac{1}{m}\sum^{m}_{j=1}
\left(
y^j-\sum^{n}_{i=0}\beta_ix_i^j
\right)(-x^j_k)\\
\end{align}
$$

hence

$$
\nabla_{\boldsymbol{\beta}} J
=
\begin{bmatrix}
       \frac{\partial J}{\partial\beta_1} \\
       \vdots \\
       \frac{\partial J}{\partial\beta_n}
\end{bmatrix}
=
\begin{bmatrix}
       -\frac{1}{m}\sum^{m}_{j=1}
           \left(y^j-\sum^{n}_{i=0}\beta_ix_i^j\right)x^j_1\\
       \vdots \\
       -\frac{1}{m}\sum^{m}_{j=1}
           \left(y^j-\sum^{n}_{i=0}\beta_ix_i^j\right)x^j_n\\
\end{bmatrix}
$$

Define the design matrix and column representation of y. Here each row of X and y are training examples hence there are m rows

$$\mathbf{X}\in\mathbb{R}^{m\times n},
\quad \mathbf{y}\in\mathbb{R}^{m\times 1}
$$

$$\\
\mathbf{X}=\begin{bmatrix}
       \dots & (\mathbf{x}^1)^T & \dots\\
       \dots & (\mathbf{x}^2)^T & \dots\\
       \dots & \vdots  & \dots\\
       \dots & (\mathbf{x}^m)^T & \dots
\end{bmatrix}\quad
\mathbf{y}=\begin{bmatrix}
    y_1\\y_2\\\vdots\\y_m
\end{bmatrix}
$$

$$
\begin{align}
\nabla_{\boldsymbol{\beta}} J
&=
\begin{bmatrix}
       -\frac{1}{m}\sum^{m}_{j=1}
           \left(y^j-\sum^{n}_{i=0}\beta_ix_i^j\right)x^j_1\\
       \vdots \\
       -\frac{1}{m}\sum^{m}_{j=1}
           \left(y^j-\sum^{n}_{i=0}\beta_ix_i^j\right)x^j_n\\
\end{bmatrix}
=-\frac{1}{m}
\begin{bmatrix}
       \sum^{m}_{j=1}y^jx^j_1\\
       \vdots \\
       \sum^{m}_{j=1}y^jx^j_n\\
\end{bmatrix}+
\frac{1}{m}
\begin{bmatrix}
       \sum^{m}_{j=0}\sum^{n}_{i=0}\beta_ix_i^jx^j_1\\
       \vdots \\
       \sum^{m}_{j=0}\sum^{n}_{i=0}\beta_ix_i^jx^j_n
\end{bmatrix}\\
\end{align}
$$

so

$$
\begin{align}
\nabla_{\boldsymbol{\beta}} J
&=\frac{1}{m}\left(
\mathbf{X}^T\mathbf{X}\mathbf{\boldsymbol{\beta}}-\mathbf{X}^T\mathbf{y}
\right)\\
&=\frac{1}{m}\mathbf{X}^T\left(
\mathbf{X}\mathbf{\boldsymbol{\beta}}-\mathbf{y}
\right)\\
&=\frac{1}{m}\mathbf{X}^T\left(
\mathbf{\hat{y}}-\mathbf{y}
\right)
\end{align}
$$

where

$$
\mathbf{\hat{y}} = \mathbf{X}\mathbf{\boldsymbol{\beta}}
$$

We could have derived the same thing using matrix calculus - noting the following:

$$
\begin{align}
J(\boldsymbol{\beta}) &= \frac{1}{2m}\sum^{m}_{j=1}\left(
y^j-g(\boldsymbol{\beta}^T\mathbf{x}^j)
\right)^2\\
&= \frac{1}{2m}\left(
\mathbf{y}-\mathbf{\hat{y}}
\right)^T
\left(
\mathbf{y}-\mathbf{\hat{y}}
\right)\\
&= \frac{1}{2m}\left(
\mathbf{y}-\mathbf{X}\boldsymbol{\beta}
\right)^T
\left(
\mathbf{y}-\mathbf{X}\boldsymbol{\beta}
\right)\\
&= \frac{1}{2m}\left(
\mathbf{y}^T\mathbf{y}
-\boldsymbol{\beta}^T\mathbf{X}^T\mathbf{y}
-\mathbf{y}^T\mathbf{X}\boldsymbol{\beta}
+\boldsymbol{\beta}^T\mathbf{X}^T\mathbf{X}\boldsymbol{\beta}
\right)\\
\end{align}
$$

and

$$
\frac{\partial}{\partial\mathbf{\boldsymbol{\beta}}}
\left(
A^T\boldsymbol{\beta}
\right) = A,\quad \forall A\in\mathbb{R}^{n\times1}\\
$$

and

$$
\frac{\partial}{\partial\mathbf{\boldsymbol{\beta}}}
\left(
\boldsymbol{\beta}^TA\boldsymbol{\beta}
\right) = 2A\boldsymbol{\beta},\quad \forall A\in\mathbb{R}^{m\times n}\\
$$

so

$$
\nabla_{\boldsymbol{\beta}}J=\frac{1}{m}\left(
\mathbf{X}^T\mathbf{X}\mathbf{\boldsymbol{\beta}}-\mathbf{X}^T\mathbf{y}
\right)$$

### make fake data

In [4]:
m = 100
x0 = np.ones(shape=(m, 1))
x1 = np.linspace(0, 10, m).reshape(-1, 1)
X = np.column_stack((x0, x1))

# let y = 0.5 * x + 1 + epsilon
epsilon =  np.random.normal(scale=0.5, size=(m, 1))
y = x1 + 1 + epsilon

In [5]:
fig = go.FigureWidget()
fig = fig.add_scatter(
    x=X[:,1],
    y=y[:,0],
    mode='markers',
    name='linear data + noise')
fig.layout.title = 'Fake linear data with noise'
fig.layout.xaxis.title = 'x1'
fig.layout.yaxis.title = 'y'
fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'linear data + noise',
              'ty…

### gradient descent

In [7]:
class LinearRegression():
    
    def __init__(self, learning_rate=0.05):
        self.learning_rate = learning_rate
        print('Creating linear model instance')

    def __repr__(self):
        return (
            f'<LinearRegression '
            f'learning_rate={self.learning_rate}>')

    def fit(self, X, y, n_iter=1000):
        m, n = X.shape
        print(f'fitting with m={m} samples with n={n-1} features\n')
        self.beta = np.zeros(shape=(n, 1))
        self.costs = []
        self.betas = [self.beta]
        for iteration in range(n_iter):
            y_pred = self.predict(X)
            cost = self.cost(y, y_pred, m)
            self.costs.append(cost[0][0])
            gradient = self.gradient(y, y_pred, X, m)
            self.beta = self.beta - (
                self.learning_rate * gradient)
            self.betas.append(self.beta)

    def cost(self, y, y_pred, m):
        cost = (1 / (2 * m)) * (y - y_pred).T @ (y - y_pred)
        return cost

    def gradient(self, y, y_pred, X, m):
        gradient = (1 / m) * X.T @ (y_pred - y)
        return gradient

    def predict(self, X):
        y_pred = X @ self.beta
        return y_pred

In [8]:
linear_regression = LinearRegression()
linear_regression

Creating linear model instance


<LinearRegression learning_rate=0.05>

In [16]:
linear_regression.fit(X, y)

fitting with m=100 samples with n=1 features



### plot the best fit

In [17]:
fig = fig.add_scatter(
    x=X[:,1], 
    y=linear_regression.predict(X)[:,0],
    mode='markers',
    name='best fit')
fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'linear data + noise',
              'ty…

### plot the cost function

In [18]:
def plot_surface(linear_regression):
    cost_fig = go.FigureWidget()
    cost_fig = cost_fig.add_scatter(
        x=list(range(len(linear_regression.costs))),
        y=linear_regression.costs,
        mode='markers+lines')
    cost_fig.layout.title = 'Cost by iteration'
    return cost_fig

In [19]:
cost_fig = plot_surface(linear_regression)
cost_fig

FigureWidget({
    'data': [{'mode': 'markers+lines',
              'type': 'scatter',
              'uid': '8…

In [20]:
def plot_surface(linear_regression):
    beta0s = [beta[0][0] for beta in linear_regression.betas]
    beta1s = [beta[1][0] for beta in linear_regression.betas]
    beta0_max = max(map(abs, beta0s)) * 1.05
    beta1_max = max(map(abs, beta1s)) * 1.05

    gradient_descent_fig = go.FigureWidget()
    gradient_descent_fig = gradient_descent_fig.add_scatter3d(
        x=beta0s,
        y=beta1s,
        z=linear_regression.costs,
        mode='markers+lines',
        marker={'size':3, 'color':'red'})

    beta0, beta1 = np.meshgrid(
        np.linspace(-beta0_max, beta0_max, 100),
        np.linspace(-beta1_max, beta1_max, 100))

    z = np.diag(
        (1 / (2 * m)) * \
        (y - (X @ np.column_stack((beta0.ravel(), beta1.ravel())).T)).T @ \
        (y - (X @ np.column_stack((beta0.ravel(), beta1.ravel())).T))
        ).reshape(beta1.shape)

    gradient_descent_fig = gradient_descent_fig.add_surface(
        x=beta0,
        y=beta1,
        z=z,
        opacity=0.8)
    
    gradient_descent_fig.layout.title = 'Cost function surface'
    gradient_descent_fig.layout.scene.xaxis.title = 'beta0'
    gradient_descent_fig.layout.scene.yaxis.title = 'beta1'
    gradient_descent_fig.layout.scene.zaxis.title = 'cost' 
    # cost = average sum square residuals

    return gradient_descent_fig

In [21]:
gradient_descent_fig = plot_surface(linear_regression)
gradient_descent_fig

FigureWidget({
    'data': [{'marker': {'color': 'red', 'size': 3},
              'mode': 'markers+lines',
   …

In [15]:
py.plot(gradient_descent_fig, filename='gradient_descent.html')

'gradient_descent.html'

## logistic regression

### the maths

The logistic model aims to predict the discrete y vairable a.k.a the target variable (e.g. whether something will happen) based on a collection of features. It does this by transforming a linear combination of the features into a curve and fitting this curve to the data.

The curve used in logistic regression is the sigmoid function

$$
\sigma(x) = \frac{1}{1+e^{-x}}
$$

Define y as

$$
\begin{align}
\hat{y} &= h_{\boldsymbol{\beta}}(\mathbf{x})\\
\hat{y}&= \sigma\left(\beta_0x_0+\cdots+\beta_nx_n\right)\quad &n\in \mathbb{N},x_0=1 \\
\hat{y}&=\sigma\left(\sum^{n}_{i=0}\beta_ix_i\right) \\
\hat{y}&=\sigma\left(\mathbf{\boldsymbol{\beta}^Tx}\right)\quad&\boldsymbol{\beta},\mathbf{x}\in\mathbb{R}^{n\times1}\\
\hat{y}&=\sigma\left(\boldsymbol{\beta}^T\mathbf{x}\right)
\end{align}
$$

notice

$$
\hat{y} = \frac{1}{1+e^{-\boldsymbol{\beta}^T\mathbf{x}}}
$$

so

$$
\begin{align}
\hat{y} + \hat{y}e^{\boldsymbol{\beta}^T\mathbf{x}} &= 1\\
\hat{y}e^{\boldsymbol{\beta}^T\mathbf{x}} &= 1 - \hat{y}\\
\frac{\hat{y}}{1 - \hat{y}} &= e^{\boldsymbol{\beta}^T\mathbf{x}}\\
\ln\left(\frac{\hat{y}}{1 - \hat{y}}\right)&=\boldsymbol{\beta}^T\mathbf{x}
\end{align}
$$

This above is the logit form of logistic regression. We model the logit as a linear combination of the x variables

We define the cost function as follows for each y and corresponding x

$$
\begin{align}
J(\mathbf{x})
&= \begin{cases}
-\log\left(h_{\boldsymbol{\beta}}(\mathbf{x})\right) &\text{if y=1}\\
-\log\left(1-h_{\boldsymbol{\beta}}(\mathbf{x})\right) &\text{if y=0}\\
\end{cases}
\end{align}
$$

$$
\begin{align}
J(\mathbf{x})
&= -\frac{1}{m}\sum_{j=1}^my^j\log\left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)\right)
+(1-y^j)\log\left(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j)\right)\\
&= -\frac{1}{m}\sum_{j=1}^my^j\log\left(\frac{1}{1+e^{-\boldsymbol{\beta}^T\mathbf{x}}}\right)
+(1-y^j)\log\left(1-\frac{1}{1+e^{-\boldsymbol{\beta}^T\mathbf{x}}}\right)\\
&= -\frac{1}{m}\sum_{j=1}^my^j\log\left(\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}\right)
+(1-y^j)\log\left(1-\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}\right)
\end{align}
$$

note

$$
\begin{align}
h_{\boldsymbol{\beta}}(\mathbf{x}^j)&=\frac{1}{1+e^{-\boldsymbol{\beta}^T\mathbf{x}^j}}\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\end{align}
$$

so

$$
\begin{align}
\frac{\partial h}{\partial \beta_k} &= \left(1+e^{-\sum^{n}_{i=0}\beta_ix_i}\right)^{-2}e^{-\sum^{n}_{i=0}\beta_ix_i} (-x_k^j)\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\frac{e^{-\sum^{n}_{i=0}\beta_ix_i} (-x_k^j)}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\frac{(1-1+e^{-\sum^{n}_{i=0}\beta_ix_i})(-x_k^j)}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\left(
\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}-
\frac{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\right)(-x_k^j)\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\left(
\frac{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}-
\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\right)(x_k^j)\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\left(
1-
\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i}}
\right)(x_k^j)\\
&=h_{\boldsymbol{\beta}}(\mathbf{x}^j)(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j))x_k^j
\end{align}
$$

We need to differentiate the cost function i.e. find the gradient

$$
\begin{align}
\frac{\partial J}{\partial\beta_k}\left(\boldsymbol{\beta}\right) 
&=\frac{\partial}{\partial\beta_k}\left(
-\frac{1}{m}\sum_{j=1}^my^j\log\left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)\right)
+(1-y^j)\log\left(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j)\right)
\right)\\
&=-\frac{1}{m}\sum_{j=1}^m\frac{y^j}{h_{\boldsymbol{\beta}}(\mathbf{x}^j)}\frac{\partial h}{\partial \beta_k}
+\frac{-(1-y^j)}{1-h_{\boldsymbol{\beta}}(\mathbf{x}^j)}\frac{\partial h}{\partial \beta_k}\\
&=-\frac{1}{m}\sum_{j=1}^m\frac{y^j}{h_{\boldsymbol{\beta}}(\mathbf{x}^j)}
h_{\boldsymbol{\beta}}(\mathbf{x}^j)(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j))x_k^j
+\frac{-(1-y^j)}{1-h_{\boldsymbol{\beta}}(\mathbf{x}^j)}
h_{\boldsymbol{\beta}}(\mathbf{x}^j)(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j))x_k^j\\
&=-\frac{1}{m}\sum_{j=1}^my^j(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j))x_k^j
-(1-y^j)
h_{\boldsymbol{\beta}}(\mathbf{x}^j)x_k^j\\
&=\frac{1}{m}\sum_{j=1}^m
\left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_k^j
\end{align}
$$

hence

$$
\nabla_{\boldsymbol{\beta}} J
=
\begin{bmatrix}
       \frac{\partial J}{\partial\beta_1} \\
       \vdots \\
       \frac{\partial J}{\partial\beta_n}
\end{bmatrix}
=
\begin{bmatrix}
       \frac{1}{m}\sum_{j=1}^m
            \left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_1^j\\
       \vdots \\
       \frac{1}{m}\sum_{j=1}^m
           \left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_n^j
\end{bmatrix}
$$

Define the design matrix and column representation of y. Here each row of X and y are training examples hence there are m rows

$$\mathbf{X}\in\mathbb{R}^{m\times n},
\quad \mathbf{y}\in\mathbb{R}^{m\times 1}
$$

$$\\
\mathbf{X}=\begin{bmatrix}
       \dots & (\mathbf{x}^1)^T & \dots\\
       \dots & (\mathbf{x}^2)^T & \dots\\
       \dots & \vdots  & \dots\\
       \dots & (\mathbf{x}^m)^T & \dots
\end{bmatrix}\quad
\mathbf{y}=\begin{bmatrix}
    y_1\\y_2\\\vdots\\y_m
\end{bmatrix}
$$

$$
\begin{align}
\nabla_{\boldsymbol{\beta}} J
=
\begin{bmatrix}
       \frac{1}{m}\sum_{j=1}^m
            \left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_1^j\\
       \vdots \\
       \frac{1}{m}\sum_{j=1}^m
           \left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_n^j
\end{bmatrix}
=
\frac{1}{m}
\begin{bmatrix}
       \sum^{n}_{i=0}h_{\boldsymbol{\beta}}(\mathbf{x}^j)^jx^j_1\\
       \vdots \\
       \sum^{n}_{i=0}h_{\boldsymbol{\beta}}(\mathbf{x}^j)x^j_n
\end{bmatrix}
-
\frac{1}{m}
\begin{bmatrix}
       \sum^{m}_{j=1}y^jx^j_1\\
       \vdots \\
       \sum^{m}_{j=1}y^jx^j_n\\
\end{bmatrix}
\end{align}
$$

$$
h_{\boldsymbol{\beta}}(\mathbf{x}^j) = \sigma({\mathbf{x}^j}^T\boldsymbol{\beta})
$$

so

$$
\begin{align}
\nabla_{\boldsymbol{\beta}} J
&=\frac{1}{m}\left(
\mathbf{X}^T\sigma(\mathbf{X}\mathbf{\boldsymbol{\beta}})-\mathbf{X}^T\mathbf{y}
\right)\\
&=\frac{1}{m}\mathbf{X}^T\left(
\sigma(\mathbf{X}\mathbf{\boldsymbol{\beta}})-\mathbf{y}
\right)\\
&=\frac{1}{m}\mathbf{X}^T\left(
\mathbf{\hat{y}}-\mathbf{y}
\right)
\end{align}
$$

where

$$
\mathbf{\hat{y}} = \sigma(\mathbf{X}\mathbf{\boldsymbol{\beta}})
$$

We could have derived the same thing using matrix calculus

### example sigmoid

The curve used in logistic regression is the sigmoid function

$$
\sigma(x) = \frac{1}{1+e^{-x}}
$$

In [91]:
sigmoid_fig = go.FigureWidget()
demo_x = np.arange(-10,10,0.1)
demo_y = 1 / (1 + np.exp(-demo_x))
sigmoid_fig.add_scatter(
    x=demo_x,
    y=demo_y)
sigmoid_fig.layout.title = 'Sigmoid Function'
sigmoid_fig

FigureWidget({
    'data': [{'type': 'scatter',
              'uid': '53f53c41-ef72-45d3-adca-5b96c6807a35',
 …

### make fake data

In [85]:
m = 100
x0 = np.ones(shape=(m, 1))
x1 = np.linspace(0, 10, m).reshape(-1, 1)
X = np.column_stack((x0, x1))

# let y = 0.5 * x + 1 + epsilon
epsilon =  np.random.normal(scale=2, size=(m, 1))
y = x1 + epsilon
y = (y > 5).astype(int)

In [86]:
fig = go.FigureWidget()
fig = fig.add_scatter(
    x=X[:,1],
    y=y[:,0],
    mode='markers',
    name='linear data + noise')
fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'linear data + noise',
              'ty…

### graident descent

In [125]:
class LogisticRegression():
    
    def __init__(self, learning_rate=0.05):
        self.learning_rate = learning_rate
        print('Creating logistic model instance')

    def __repr__(self):
        return (
            f'<LogisticRegression '
            f'learning_rate={self.learning_rate}>')

    def fit(self, X, y, n_iter=1000):
        m, n = X.shape
        print(f'fitting with m={m} samples with n={n} features\n')
        self.beta = np.zeros(shape=(n, 1))
        self.costs = []
        self.betas = [self.beta]
        for iteration in range(n_iter):
            y_pred = self.predict(X)
            cost = (-1 / m) * (
                (y.T @ np.log(y_pred)) +
                ((np.ones(shape=y.shape) - y).T @ np.log(
                    np.ones(shape=y_pred.shape) - y_pred))
            )
            self.costs.append(cost[0][0])
            gradient = (1 / m) * X.T @ (y_pred - y)
            self.beta = self.beta - (
                self.learning_rate * gradient)
            self.betas.append(self.beta)

    def predict(self, X):
        y_pred = self.sigmoid(X @ self.beta)
        return y_pred
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    

In [126]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X, y)

Creating logistic model instance
fitting with m=100 samples with n=2 features



### plot the best fit

In [127]:
fig = fig.add_scatter(
    x=X[:,1], 
    y=logistic_regression.predict(X)[:,0],
    mode='markers',
    name='logistic best fit')
fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'linear data + noise',
              'ty…

### plot the cost function

In [128]:
cost_fig = plot_surface(logistic_regression)
cost_fig

FigureWidget({
    'data': [{'mode': 'markers+lines',
              'type': 'scatter',
              'uid': 'c…

## knn

### the maths

The k-nearest neighbours algorithm finds k clusters from the data. It does this by starting with k centroids (often randomly selected) and then assigning each point in the data to a cluster based on it's closest centroid. The centroids are then updated as the mean of all points in the cluster. This process is repeated untill the centroids stop changing

1. Choose k
2. randomly select centroids $c_1,\dots,c_k \in \mathbb{R}^n$
3. until convergence repeat
    * for each $x_i$ assign to cluster $C_j \in \{C_1\dots C_k\}$ where
    $$
    j = \arg\min_{j}\sqrt{(x_i - c_j)^2}
    $$ 
    * update each $c_j$ as
    $$
    c_j = \frac{1}{|C_j|}\sum_{x_j\in C_j}x_j
    $$
    

### make fake data

In [432]:
def get_2d_blob(n,x,y,r=1):
    x1 = np.random.normal(x, r, n)
    y1 = np.random.normal(y, 1 /(1 + abs(x1 - x))**.5, n)
    return x1, y1

In [433]:
n=100
xy = np.concatenate((
    get_2d_blob(n,1,2,3),
    get_2d_blob(n,14,5),
    get_2d_blob(n,7,10)
),axis=1).T

x = xy[:,0]
y = xy[:,1]

In [434]:
knn_fig = go.FigureWidget()
scatter  = go.Scatter(x=x,
                      y=y,
                      mode='markers',
                      name='blob data + noise')
knn_fig.add_trace(scatter)
iteration = 0
knn_fig.layout.title = f'knn Iteration {iteration}'

color_mapping={}
for i in [1,2,3]:
    color_mapping[i] = knn_fig.layout.template.layout.colorway[i]

In [435]:
knn_fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'blob data + noise',
              'type…

### knn implementaion

In [436]:
# set k and define random centroids
k=3
c = np.column_stack((
    np.random.uniform(x.min(),x.max(),k),
    np.random.uniform(y.min(),y.max(),k)))
cs = [c]

# add the centroids to the fig
for i in range(len(c)):
    knn_fig.add_scatter(
        x=[c[i,0]],
        y=[c[i,1]],
        mode='markers',
        marker=dict(
            size=12,
            line=dict(width=2,color='DarkSlateGrey')),
        text=[f'Centroid-{i}'],
        name=f'Centroid-{i}')

# Euclidean Distance Caculator
def dist(a, b, ax=1):
    return np.linalg.norm(a - b, axis=ax)

def assign_clusters(xy, c):
    distances = []
    for centroid in c:
#         print(centroid)
        centorid_distances = dist(xy,centroid)
#         print(min(centorid_distances))
        distances.append(centorid_distances)
    all_distaces = np.array(distances).T
    cluster_labels = np.argmin(all_distaces,axis=1)
    data_colors = list(map(lambda x: color_mapping[x+1], cluster_labels))
    knn_fig.data[0].marker.color = data_colors
    return cluster_labels

def update_centroids(xy, cluster_labels, cs):
    global iteration
    c = []
    for i in range(3):
        centroid = xy[cluster_labels == i,:].mean(axis=0)
        knn_fig.data[1+i].x = (centroid[0],)
        knn_fig.data[1+i].y = (centroid[1],)
        c.append(centroid)
    c = np.array(c)
    cs.append(c)
    iteration +=1
    title = f'knn Iteration {iteration}'
    knn_fig.layout.title = title
    print(f'Change in centroids {dist(cs[-1],cs[-2])}')
    return c, cs

def update_step():
    global xy, c, cs
    cluster_labels = assign_clusters(xy, c)
    c, cs = update_centroids(xy, cluster_labels, cs)

knn_fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'blob data + noise',
              'type…

In [441]:
update_step()

Change in centroids [0. 0. 0.]


# end