# Machine Learning Implementation

## imports

In [3]:
import pandas as pd
import plotly.offline as py
from plotly import graph_objects as go
import numpy as np
import json

## Logistic regression

### the maths

The logistic model aims to predict the discrete y variable, a.k.a. the target variable (e.g. whether something will happen), based on a collection of features. It does this by transforming a linear combination of the features into a curve and fitting this curve to the data.

The curve used in logistic regression is the sigmoid function

$$
\sigma(x) = \frac{1}{1+e^{-x}}
$$

Define the prediction $\hat{y}$ as

$$
\begin{align}
\hat{y} &= h_{\boldsymbol{\beta}}(\mathbf{x})\\
\hat{y}&= \sigma\left(\beta_0x_0+\cdots+\beta_nx_n\right)\quad &n\in \mathbb{N},x_0=1 \\
\hat{y}&=\sigma\left(\sum^{n}_{i=0}\beta_ix_i\right) \\
\hat{y}&=\sigma\left(\boldsymbol{\beta}^T\mathbf{x}\right)\quad&\boldsymbol{\beta},\mathbf{x}\in\mathbb{R}^{(n+1)\times1}
\end{align}
$$

notice

$$
\hat{y} = \frac{1}{1+e^{-\boldsymbol{\beta}^T\mathbf{x}}}
$$

so

$$
\begin{align}
\hat{y} + \hat{y}e^{-\boldsymbol{\beta}^T\mathbf{x}} &= 1\\
\hat{y}e^{-\boldsymbol{\beta}^T\mathbf{x}} &= 1 - \hat{y}\\
\frac{\hat{y}}{1 - \hat{y}} &= e^{\boldsymbol{\beta}^T\mathbf{x}}\\
\ln\left(\frac{\hat{y}}{1 - \hat{y}}\right)&=\boldsymbol{\beta}^T\mathbf{x}
\end{align}
$$

The above is the logit form of logistic regression: we model the logit (the log-odds of $\hat{y}$) as a linear combination of the x variables.

We define the cost function as follows for each y and corresponding x

$$
\begin{align}
J(\mathbf{x})
&= \begin{cases}
-\log\left(h_{\boldsymbol{\beta}}(\mathbf{x})\right) &\text{if y=1}\\
-\log\left(1-h_{\boldsymbol{\beta}}(\mathbf{x})\right) &\text{if y=0}\\
\end{cases}
\end{align}
$$

Averaging this per-example cost over all m training examples gives the cost as a function of the parameters $\boldsymbol{\beta}$

$$
\begin{align}
J(\boldsymbol{\beta})
&= -\frac{1}{m}\sum_{j=1}^m\left[y^j\log\left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)\right)
+(1-y^j)\log\left(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j)\right)\right]\\
&= -\frac{1}{m}\sum_{j=1}^m\left[y^j\log\left(\frac{1}{1+e^{-\boldsymbol{\beta}^T\mathbf{x}^j}}\right)
+(1-y^j)\log\left(1-\frac{1}{1+e^{-\boldsymbol{\beta}^T\mathbf{x}^j}}\right)\right]\\
&= -\frac{1}{m}\sum_{j=1}^m\left[y^j\log\left(\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}\right)
+(1-y^j)\log\left(1-\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}\right)\right]
\end{align}
$$

note

$$
\begin{align}
h_{\boldsymbol{\beta}}(\mathbf{x}^j)&=\frac{1}{1+e^{-\boldsymbol{\beta}^T\mathbf{x}^j}}\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}
\end{align}
$$

so

$$
\begin{align}
\frac{\partial h}{\partial \beta_k} &= -\left(1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}\right)^{-2}e^{-\sum^{n}_{i=0}\beta_ix_i^j} (-x_k^j)\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}
\frac{e^{-\sum^{n}_{i=0}\beta_ix_i^j}\,x_k^j}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}
\frac{\left(1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}-1\right)x_k^j}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}
\left(
\frac{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}-
\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}
\right)x_k^j\\
&=\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}
\left(
1-
\frac{1}{1+e^{-\sum^{n}_{i=0}\beta_ix_i^j}}
\right)x_k^j\\
&=h_{\boldsymbol{\beta}}(\mathbf{x}^j)(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j))x_k^j
\end{align}
$$

We need to differentiate the cost function i.e. find the gradient

$$
\begin{align}
\frac{\partial J}{\partial\beta_k}\left(\boldsymbol{\beta}\right) 
&=\frac{\partial}{\partial\beta_k}\left(
-\frac{1}{m}\sum_{j=1}^my^j\log\left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)\right)
+(1-y^j)\log\left(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j)\right)
\right)\\
&=-\frac{1}{m}\sum_{j=1}^m\frac{y^j}{h_{\boldsymbol{\beta}}(\mathbf{x}^j)}\frac{\partial h}{\partial \beta_k}
+\frac{-(1-y^j)}{1-h_{\boldsymbol{\beta}}(\mathbf{x}^j)}\frac{\partial h}{\partial \beta_k}\\
&=-\frac{1}{m}\sum_{j=1}^m\frac{y^j}{h_{\boldsymbol{\beta}}(\mathbf{x}^j)}
h_{\boldsymbol{\beta}}(\mathbf{x}^j)(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j))x_k^j
+\frac{-(1-y^j)}{1-h_{\boldsymbol{\beta}}(\mathbf{x}^j)}
h_{\boldsymbol{\beta}}(\mathbf{x}^j)(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j))x_k^j\\
&=-\frac{1}{m}\sum_{j=1}^my^j(1-h_{\boldsymbol{\beta}}(\mathbf{x}^j))x_k^j
-(1-y^j)
h_{\boldsymbol{\beta}}(\mathbf{x}^j)x_k^j\\
&=\frac{1}{m}\sum_{j=1}^m
\left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_k^j
\end{align}
$$

hence

$$
\nabla_{\boldsymbol{\beta}} J
=
\begin{bmatrix}
       \frac{\partial J}{\partial\beta_0} \\
       \vdots \\
       \frac{\partial J}{\partial\beta_n}
\end{bmatrix}
=
\begin{bmatrix}
       \frac{1}{m}\sum_{j=1}^m
            \left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_0^j\\
       \vdots \\
       \frac{1}{m}\sum_{j=1}^m
           \left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_n^j
\end{bmatrix}
$$

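Define the design matrix $\mathbf{X}$ and the column vector $\mathbf{y}$. Each row of $\mathbf{X}$ is one training example (including the constant $x_0=1$) and the corresponding row of $\mathbf{y}$ is its label, hence both have m rows

$$\mathbf{X}\in\mathbb{R}^{m\times (n+1)},
\quad \mathbf{y}\in\mathbb{R}^{m\times 1}
$$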

$$
\mathbf{X}=\begin{bmatrix}
       \dots & (\mathbf{x}^1)^T & \dots\\
       \dots & (\mathbf{x}^2)^T & \dots\\
       \dots & \vdots  & \dots\\
       \dots & (\mathbf{x}^m)^T & \dots
\end{bmatrix}\quad
\mathbf{y}=\begin{bmatrix}
    y^1\\y^2\\\vdots\\y^m
\end{bmatrix}
$$

$$
\begin{align}
\nabla_{\boldsymbol{\beta}} J
=
\begin{bmatrix}
       \frac{1}{m}\sum_{j=1}^m
            \left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_0^j\\
       \vdots \\
       \frac{1}{m}\sum_{j=1}^m
           \left(h_{\boldsymbol{\beta}}(\mathbf{x}^j)-y^j\right)x_n^j
\end{bmatrix}
=
\frac{1}{m}
\begin{bmatrix}
       \sum^{m}_{j=1}h_{\boldsymbol{\beta}}(\mathbf{x}^j)x^j_0\\
       \vdots \\
       \sum^{m}_{j=1}h_{\boldsymbol{\beta}}(\mathbf{x}^j)x^j_n
\end{bmatrix}
-
\frac{1}{m}
\begin{bmatrix}
       \sum^{m}_{j=1}y^jx^j_0\\
       \vdots \\
       \sum^{m}_{j=1}y^jx^j_n\\
\end{bmatrix}
\end{align}
$$

$$
h_{\boldsymbol{\beta}}(\mathbf{x}^j) = \sigma({\mathbf{x}^j}^T\boldsymbol{\beta})
$$

so

$$
\begin{align}
\nabla_{\boldsymbol{\beta}} J
&=\frac{1}{m}\left(
\mathbf{X}^T\sigma(\mathbf{X}\mathbf{\boldsymbol{\beta}})-\mathbf{X}^T\mathbf{y}
\right)\\
&=\frac{1}{m}\mathbf{X}^T\left(
\sigma(\mathbf{X}\mathbf{\boldsymbol{\beta}})-\mathbf{y}
\right)\\
&=\frac{1}{m}\mathbf{X}^T\left(
\mathbf{\hat{y}}-\mathbf{y}
\right)
\end{align}
$$

where

$$
\mathbf{\hat{y}} = \sigma(\mathbf{X}\mathbf{\boldsymbol{\beta}})
$$

We could have derived the same thing using matrix calculus

### example sigmoid

The curve used in logistic regression is the sigmoid function

$$
\sigma(x) = \frac{1}{1+e^{-x}}
$$

In [4]:
# Sample the sigmoid 1 / (1 + e^-x) on a 0.1-spaced grid over [-10, 10)
# first, then hand the finished curve to a fresh figure widget.
demo_x = np.arange(-10, 10, 0.1)
demo_y = 1 / (1 + np.exp(-demo_x))

sigmoid_fig = go.FigureWidget()
sigmoid_fig.add_scatter(x=demo_x, y=demo_y)
sigmoid_fig.layout.title = 'Sigmoid Function'

# Bare last expression so the notebook renders the widget.
sigmoid_fig

FigureWidget({
    'data': [{'type': 'scatter',
              'uid': 'a748f270-1d78-4f12-833b-8b181fa39b15',
 …

### make fake data

In [5]:
# Seed a dedicated generator so the noisy labels (and therefore every
# downstream plot and fit) are identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

m = 100

# Design matrix: a column of ones (the intercept term x0) next to a single
# feature x1 spaced evenly over [0, 10].
x0 = np.ones(shape=(m, 1))
x1 = np.linspace(0, 10, m).reshape(-1, 1)
X = np.column_stack((x0, x1))

# Binary target: y = 1 where x1 + epsilon > 5 and 0 otherwise, with Gaussian
# noise epsilon of standard deviation 2. The noise makes the two classes
# overlap around x1 = 5 instead of being perfectly separable.
epsilon = rng.normal(scale=2, size=(m, 1))
y = ((x1 + epsilon) > 5).astype(int)

In [6]:
# Scatter the binary labels against the single feature (column 1 of X).
# add_scatter hands back the figure it was called on, so the widget can be
# built and populated in one chained expression.
fig = go.FigureWidget().add_scatter(
    x=X[:, 1],
    y=y[:, 0],
    mode='markers',
    name='linear data + noise',
)
fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'linear data + noise',
              'ty…

### gradient descent

In [7]:
class LogisticRegression:
    """Binary logistic regression fitted with batch gradient descent.

    The model predicts ``sigmoid(X @ beta)``. ``fit`` minimises the mean
    cross-entropy cost using the gradient ``(1 / m) * X.T @ (y_pred - y)``.

    Attributes set by ``fit``:
        beta: column vector of fitted coefficients, shape (n, 1).
        costs: cost evaluated before each update, one entry per iteration.
        betas: coefficient vector before the first update and after every
            update (``n_iter + 1`` entries).
    """

    def __init__(self, learning_rate=0.05):
        """Store the gradient-descent step size; no fitting happens here."""
        self.learning_rate = learning_rate
        # Placeholders so the attributes exist (and are inspectable) before
        # fit() has been called.
        self.beta = None
        self.costs = []
        self.betas = []
        print('Creating logistic model instance')

    def __repr__(self):
        return (
            f'<LogisticRegression '
            f'learning_rate={self.learning_rate}>')

    def fit(self, X, y, n_iter=1000):
        """Run ``n_iter`` steps of batch gradient descent from beta = 0.

        Args:
            X: design matrix of shape (m, n); include a column of ones if
                an intercept is wanted.
            y: binary targets with m entries. Any shape holding m values is
                accepted; it is reshaped to a column vector.
            n_iter: number of gradient-descent updates to perform.

        Raises:
            ValueError: if X and y disagree on the number of samples.
        """
        X = np.asarray(X)
        # Force y into a column. A 1-D y would otherwise broadcast against
        # the (m, 1) predictions into an (m, m) residual matrix.
        y = np.asarray(y).reshape(-1, 1)
        m, n = X.shape
        if y.shape[0] != m:
            raise ValueError(
                f'X has {m} samples but y has {y.shape[0]}')
        print(f'fitting with m={m} samples with n={n} features\n')
        self.beta = np.zeros(shape=(n, 1))
        self.costs = []
        self.betas = [self.beta]
        # Keep predictions strictly inside (0, 1) when taking logs so a
        # saturated sigmoid cannot produce 0 * log(0) = nan in the cost.
        eps = np.finfo(float).eps
        for _ in range(n_iter):
            y_pred = self.predict(X)
            safe_pred = np.clip(y_pred, eps, 1 - eps)
            cost = (-1 / m) * (
                (y.T @ np.log(safe_pred)) +
                ((1 - y).T @ np.log(1 - safe_pred))
            )
            self.costs.append(cost[0, 0])
            # The gradient uses the unclipped predictions; it needs no logs.
            gradient = (1 / m) * X.T @ (y_pred - y)
            # Rebind rather than update in place so every entry appended to
            # self.betas stays a distinct snapshot.
            self.beta = self.beta - (
                self.learning_rate * gradient)
            self.betas.append(self.beta)

    def predict(self, X):
        """Return ``sigmoid(X @ beta)`` for each row of X, shape (m, 1).

        Raises:
            AttributeError: if the model has not been fitted yet.
        """
        if getattr(self, 'beta', None) is None:
            raise AttributeError(
                'LogisticRegression has no fitted beta; call fit() first')
        y_pred = self.sigmoid(X @ self.beta)
        return y_pred

    def sigmoid(self, x):
        """Element-wise logistic function 1 / (1 + exp(-x)).

        Only ever exponentiates a non-positive number, so large-magnitude
        inputs cannot overflow.
        """
        x = np.asarray(x)
        z = np.exp(-np.abs(x))  # always in (0, 1]
        # For x >= 0 this is the textbook form; for x < 0 the algebraically
        # equal z / (1 + z) with z = exp(x) is used instead.
        result = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
        # Indexing with () unwraps a 0-d result to a scalar and leaves
        # n-d arrays untouched.
        return result[()]
    

In [8]:
# Build a model with the default learning rate and fit it to the fake data
# using the default number of gradient-descent iterations.
logistic_regression = LogisticRegression()
logistic_regression.fit(X, y)

Creating logistic model instance
fitting with m=100 samples with n=2 features



### plot the best fit

In [9]:
# Overlay the fitted probability for every training point on the existing
# scatter of raw labels, then redisplay the figure.
fitted_probabilities = logistic_regression.predict(X)[:, 0]
fig = fig.add_scatter(
    x=X[:, 1],
    y=fitted_probabilities,
    mode='markers',
    name='logistic best fit',
)
fig

FigureWidget({
    'data': [{'mode': 'markers',
              'name': 'linear data + noise',
              'ty…

### plot the cost function

In [12]:
# Haven't got round to this yet - see linear regression for an example error
# surface descent.

## end