# Machine Learning Implementation

## Imports

In [1]:
import pandas as pd
import plotly.offline as py
from plotly import graph_objects as go
import numpy as np
import json

## Linear regression

### The maths

The linear model (or line of best fit in 2D) aims to describe the continuous y variable, a.k.a. the target variable (e.g. house prices), as a linear combination of features (e.g. square footage / number of bedrooms). Collected over all training samples, the features are also referred to as the design matrix.

$$
\begin{align}
\hat{y}&=\beta_0x_0+\cdots+\beta_nx_n\quad &n\in \mathbb{N}, x_0 = 1 \\
\hat{y}&=\sum^{n}_{i=0}\beta_ix_i \\
\hat{y}&=\mathbf{\boldsymbol{\beta}^Tx}\quad&\boldsymbol{\beta},\mathbf{x}\in\mathbb{R}^{(n+1)\times1}\\
\hat{y}&=g(\boldsymbol{\beta}^T\mathbf{x})
\end{align}
$$

where g, the activation function, is the identity in linear regression.

We define the cost function as half of the mean square error:

$$
\begin{align}
J(\boldsymbol{\beta})
&= \frac{1}{2m}\sum^{m}_{j=1}\left(
y^j-\hat{y}^j
\right)^2,\quad m\in \mathbb{N} \text{ is the number of training samples}\\
&= \frac{1}{2m}\sum^{m}_{j=1}\left(
y^j-g(\boldsymbol{\beta}^T\mathbf{x}^j)
\right)^2
\end{align}
$$

We need to differentiate the cost function i.e. find the gradient

$$
\begin{align}
\frac{\partial J}{\partial\beta_k}\left(\boldsymbol{\beta}\right) &= \frac{\partial}{\partial\beta_k}\left(
\frac{1}{2m}\sum^{m}_{j=1}\left(
y^j-g(\boldsymbol{\beta}^T\mathbf{x}^j)\right)^2
\right)\\
&= \frac{\partial}{\partial\beta_k}\left(
\frac{1}{2m}\sum^{m}_{j=1}
\left(
y^j-\sum^{n}_{i=0}\beta_ix_i^j
\right)^2
\right)\\
&=
\frac{1}{m}\sum^{m}_{j=1}
\left(
y^j-\sum^{n}_{i=0}\beta_ix_i^j
\right)(-x^j_k)\\
\end{align}
$$

hence

$$
\nabla_{\boldsymbol{\beta}} J
=
\begin{bmatrix}
       \frac{\partial J}{\partial\beta_0} \\
       \vdots \\
       \frac{\partial J}{\partial\beta_n}
\end{bmatrix}
=
\begin{bmatrix}
       -\frac{1}{m}\sum^{m}_{j=1}
           \left(y^j-\sum^{n}_{i=0}\beta_ix_i^j\right)x^j_0\\
       \vdots \\
       -\frac{1}{m}\sum^{m}_{j=1}
           \left(y^j-\sum^{n}_{i=0}\beta_ix_i^j\right)x^j_n\\
\end{bmatrix}
$$

Define the design matrix and the column representation of y. Each row of X (and the matching entry of y) is one training example, hence both have m rows.

$$\mathbf{X}\in\mathbb{R}^{m\times (n+1)},
\quad \mathbf{y}\in\mathbb{R}^{m\times 1}
$$

$$
\mathbf{X}=\begin{bmatrix}
       \dots & (\mathbf{x}^1)^T & \dots\\
       \dots & (\mathbf{x}^2)^T & \dots\\
       \dots & \vdots  & \dots\\
       \dots & (\mathbf{x}^m)^T & \dots
\end{bmatrix}\quad
\mathbf{y}=\begin{bmatrix}
    y^1\\y^2\\\vdots\\y^m
\end{bmatrix}
$$

$$
\begin{align}
\nabla_{\boldsymbol{\beta}} J
&=
\begin{bmatrix}
       -\frac{1}{m}\sum^{m}_{j=1}
           \left(y^j-\sum^{n}_{i=0}\beta_ix_i^j\right)x^j_0\\
       \vdots \\
       -\frac{1}{m}\sum^{m}_{j=1}
           \left(y^j-\sum^{n}_{i=0}\beta_ix_i^j\right)x^j_n\\
\end{bmatrix}
=-\frac{1}{m}
\begin{bmatrix}
       \sum^{m}_{j=1}y^jx^j_0\\
       \vdots \\
       \sum^{m}_{j=1}y^jx^j_n\\
\end{bmatrix}+
\frac{1}{m}
\begin{bmatrix}
       \sum^{m}_{j=1}\sum^{n}_{i=0}\beta_ix_i^jx^j_0\\
       \vdots \\
       \sum^{m}_{j=1}\sum^{n}_{i=0}\beta_ix_i^jx^j_n
\end{bmatrix}\\
\end{align}
$$

so

$$
\begin{align}
\nabla_{\boldsymbol{\beta}} J
&=\frac{1}{m}\left(
\mathbf{X}^T\mathbf{X}\mathbf{\boldsymbol{\beta}}-\mathbf{X}^T\mathbf{y}
\right)\\
&=\frac{1}{m}\mathbf{X}^T\left(
\mathbf{X}\mathbf{\boldsymbol{\beta}}-\mathbf{y}
\right)\\
&=\frac{1}{m}\mathbf{X}^T\left(
\mathbf{\hat{y}}-\mathbf{y}
\right)
\end{align}
$$

where

$$
\mathbf{\hat{y}} = \mathbf{X}\mathbf{\boldsymbol{\beta}}
$$

We could have derived the same thing using matrix calculus - noting the following:

$$
\begin{align}
J(\boldsymbol{\beta}) &= \frac{1}{2m}\sum^{m}_{j=1}\left(
y^j-g(\boldsymbol{\beta}^T\mathbf{x}^j)
\right)^2\\
&= \frac{1}{2m}\left(
\mathbf{y}-\mathbf{\hat{y}}
\right)^T
\left(
\mathbf{y}-\mathbf{\hat{y}}
\right)\\
&= \frac{1}{2m}\left(
\mathbf{y}-\mathbf{X}\boldsymbol{\beta}
\right)^T
\left(
\mathbf{y}-\mathbf{X}\boldsymbol{\beta}
\right)\\
&= \frac{1}{2m}\left(
\mathbf{y}^T\mathbf{y}
-\boldsymbol{\beta}^T\mathbf{X}^T\mathbf{y}
-\mathbf{y}^T\mathbf{X}\boldsymbol{\beta}
+\boldsymbol{\beta}^T\mathbf{X}^T\mathbf{X}\boldsymbol{\beta}
\right)\\
\end{align}
$$

and

$$
\frac{\partial}{\partial\mathbf{\boldsymbol{\beta}}}
\left(
A^T\boldsymbol{\beta}
\right) = A,\quad \forall A\in\mathbb{R}^{(n+1)\times1}\\
$$

and

$$
\frac{\partial}{\partial\mathbf{\boldsymbol{\beta}}}
\left(
\boldsymbol{\beta}^TA\boldsymbol{\beta}
\right) = 2A\boldsymbol{\beta},\quad \forall \text{ symmetric } A\in\mathbb{R}^{(n+1)\times (n+1)}\\
$$

so

$$
\nabla_{\boldsymbol{\beta}}J=\frac{1}{m}\left(
\mathbf{X}^T\mathbf{X}\mathbf{\boldsymbol{\beta}}-\mathbf{X}^T\mathbf{y}
\right)$$

### Make fake data

In [2]:
# Seed the noise generator so that Restart & Run All reproduces the same sample.
SEED = 42
rng = np.random.default_rng(SEED)

m = 100  # number of training samples
x0 = np.ones(shape=(m, 1))  # intercept column, x_0 = 1 for every sample
x1 = np.linspace(0, 10, m).reshape(-1, 1)  # single feature, evenly spaced on [0, 10]
X = np.column_stack((x0, x1))  # design matrix of shape (m, 2)

# let y = 1 * x1 + 1 + epsilon, i.e. the true parameters are beta0 = 1, beta1 = 1
epsilon = rng.normal(scale=0.5, size=(m, 1))  # Gaussian noise with standard deviation 0.5
y = x1 + 1 + epsilon

In [6]:
# Scatter the synthetic samples: the feature column of X against the noisy target y.
fig = go.FigureWidget(
    data=[
        go.Scatter(
            x=X[:, 1],
            y=y[:, 0],
            mode='markers',
            name='linear data + noise',
        )
    ]
)
fig.update_layout(
    title='Fake linear data with noise',
    xaxis_title='x1',
    yaxis_title='y',
)
fig.show()

### Gradient descent

In [7]:
class LinearRegression():
    """Linear regression fitted by batch gradient descent on half the mean squared error.

    Attributes set by ``fit``:
        beta: current parameter column vector, shape (n, 1).
        costs: cost evaluated before each update, one entry per iteration.
        betas: parameter vectors, starting with the initial zeros and followed
            by one entry per update (so ``len(betas) == len(costs) + 1``).
    """

    def __init__(self, learning_rate=0.05):
        """Store the gradient-descent step size and announce the new instance."""
        self.learning_rate = learning_rate
        print('Creating linear model instance')

    def __repr__(self):
        return (
            f'<LinearRegression '
            f'learning_rate={self.learning_rate}>')

    def fit(self, X, y, n_iter=1000):
        """Run ``n_iter`` gradient-descent updates starting from beta = 0.

        Args:
            X: design matrix of shape (m, n); any intercept column must
                already be included by the caller.
            y: targets, either shape (m, 1) or a flat sequence of length m.
            n_iter: number of gradient-descent updates to perform.

        Raises:
            ValueError: if X is not 2-D or y does not hold one value per row of X.
        """
        X = np.asarray(X, dtype=float)
        # Force y into a column. A flat (m,) y would otherwise broadcast
        # against the (m, 1) predictions into an (m, m) residual matrix and
        # silently corrupt both the cost and the shape of beta.
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        if X.ndim != 2:
            raise ValueError(
                f'X must be 2-dimensional, got {X.ndim} dimension(s)')
        m, n = X.shape
        if y.shape[0] != m:
            raise ValueError(
                f'X has {m} rows but y has {y.shape[0]} values')
        print(f'fitting with m={m} samples with n={n-1} features\n')
        self.beta = np.zeros(shape=(n, 1))
        self.costs = []
        self.betas = [self.beta]
        for iteration in range(n_iter):
            y_pred = self.predict(X)
            # The cost is recorded for the parameters *before* this update.
            cost = self.cost(y, y_pred, m)
            self.costs.append(cost[0][0])
            gradient = self.gradient(y, y_pred, X, m)
            # A fresh array is created on every step, so entries already
            # stored in self.betas are never mutated.
            self.beta = self.beta - (
                self.learning_rate * gradient)
            self.betas.append(self.beta)

    def cost(self, y, y_pred, m):
        """Return half the mean squared error as a (1, 1) array.

        Expects ``y`` and ``y_pred`` as (m, 1) column vectors.
        """
        cost = (1 / (2 * m)) * (y - y_pred).T @ (y - y_pred)
        return cost

    def gradient(self, y, y_pred, X, m):
        """Return the gradient of the cost w.r.t. beta: (1/m) X^T (y_pred - y)."""
        gradient = (1 / m) * X.T @ (y_pred - y)
        return gradient

    def predict(self, X):
        """Return X @ beta; requires ``fit`` to have set ``self.beta``."""
        y_pred = X @ self.beta
        return y_pred

In [8]:
# Build the model with the default learning rate; the bare name on the last
# line displays the instance through its __repr__.
linear_regression = LinearRegression()
linear_regression

Creating linear model instance


<LinearRegression learning_rate=0.05>

In [9]:
# Run gradient descent on the fake data using fit's default n_iter=1000.
linear_regression.fit(X, y)

fitting with m=100 samples with n=1 features



### Plot the best fit

In [10]:
# Overlay the fitted model's predictions for the training inputs on the data figure.
fig = fig.add_trace(
    go.Scatter(
        x=X[:, 1],
        y=linear_regression.predict(X)[:, 0],
        mode='markers',
        name='best fit',
    )
)
fig.show()

### Plot the cost function

In [11]:
def plot_cost_history(linear_regression):
    """Plot the recorded training cost against the iteration index.

    Args:
        linear_regression: a fitted model exposing ``costs``, the list of
            cost values recorded during ``fit``.

    Returns:
        A plotly FigureWidget with a single markers+lines trace.
    """
    costs = linear_regression.costs
    cost_fig = go.FigureWidget()
    cost_fig = cost_fig.add_scatter(
        x=list(range(len(costs))),
        y=costs,
        mode='markers+lines')
    cost_fig.layout.title = 'Cost by iteration'
    cost_fig.layout.xaxis.title = 'iteration'
    cost_fig.layout.yaxis.title = 'cost'
    return cost_fig


# Backward-compatible alias: this function used to be called plot_surface,
# which described it poorly and collided with the 3-D cost-surface function
# of the same name defined later in the notebook (that later definition
# rebinds the name).
plot_surface = plot_cost_history

In [12]:
# At this point plot_surface is the cost-by-iteration plotter defined in the
# cell above; a later cell rebinds the same name to the 3-D cost-surface plot.
cost_fig = plot_surface(linear_regression)
cost_fig.show()

In [13]:
def _cost_on_grid(features, targets, beta0_grid, beta1_grid):
    """Return half the mean squared error at every (beta0, beta1) grid node."""
    candidates = np.column_stack((beta0_grid.ravel(), beta1_grid.ravel()))
    # One column of residuals per candidate parameter pair: shape (m, k).
    residuals = targets - features @ candidates.T
    # Column-wise sum of squares equals the diagonal of residuals.T @ residuals
    # without materialising that (k, k) matrix (about 800 MB for a 100x100 grid).
    costs = (residuals ** 2).sum(axis=0) / (2 * features.shape[0])
    return costs.reshape(beta0_grid.shape)


def plot_surface(linear_regression, features=None, targets=None, grid_size=100):
    """Plot the cost surface over (beta0, beta1) with the descent path on top.

    Args:
        linear_regression: a fitted two-parameter model exposing ``betas``
            (list of (2, 1) arrays) and ``costs`` (list of scalars).
        features: design matrix of shape (m, 2) used to evaluate the surface.
            Defaults to the notebook-level ``X``.
        targets: target values, shape (m, 1) or (m,). Defaults to the
            notebook-level ``y``.
        grid_size: number of grid points along each parameter axis.

    Returns:
        A plotly FigureWidget holding the 3-D trajectory and the surface.

    Raises:
        ValueError: if ``features`` does not have exactly two columns.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    features = np.asarray(X if features is None else features, dtype=float)
    targets = np.asarray(
        y if targets is None else targets, dtype=float).reshape(-1, 1)
    if features.ndim != 2 or features.shape[1] != 2:
        raise ValueError(
            'plot_surface needs a design matrix with exactly two columns')

    beta0s = [beta[0][0] for beta in linear_regression.betas]
    beta1s = [beta[1][0] for beta in linear_regression.betas]
    # Symmetric plotting range with a 5% margin around the visited parameters.
    beta0_max = max(map(abs, beta0s)) * 1.05
    beta1_max = max(map(abs, beta1s)) * 1.05

    # costs[i] is the cost at betas[i]; betas may hold one extra (final) entry
    # that has no recorded cost, so plot only the points that have all three
    # coordinates.
    n_points = min(len(beta0s), len(linear_regression.costs))

    gradient_descent_fig = go.FigureWidget()
    gradient_descent_fig = gradient_descent_fig.add_scatter3d(
        x=beta0s[:n_points],
        y=beta1s[:n_points],
        z=linear_regression.costs[:n_points],
        mode='markers+lines',
        marker={'size':3, 'color':'red'})

    beta0, beta1 = np.meshgrid(
        np.linspace(-beta0_max, beta0_max, grid_size),
        np.linspace(-beta1_max, beta1_max, grid_size))

    z = _cost_on_grid(features, targets, beta0, beta1)

    gradient_descent_fig = gradient_descent_fig.add_surface(
        x=beta0,
        y=beta1,
        z=z,
        opacity=0.8)

    gradient_descent_fig.layout.title = 'Cost function surface'
    gradient_descent_fig.layout.scene.xaxis.title = 'beta0'
    gradient_descent_fig.layout.scene.yaxis.title = 'beta1'
    gradient_descent_fig.layout.scene.zaxis.title = 'cost'
    # cost = half of the mean squared residual

    return gradient_descent_fig

In [14]:
# plot_surface now refers to the 3-D cost-surface function defined in the cell above.
gradient_descent_fig = plot_surface(linear_regression)
gradient_descent_fig.show()

In [16]:
# Export the 3-D figure to an HTML file via plotly.offline; the call returns
# the file name, which is what the cell displays as its output.
py.plot(gradient_descent_fig, filename='gradient_descent.html')

'gradient_descent.html'

## End