# Multiclass Classification

<font color='blue'><h2>Multiclass Classification</h2></font>

<font color='red'>Note: on matrix dot product

$$ X_{m x n} \cdot W_{p x q} = O_{m x q} $$</font>

Above problem was binary classification, What if number class is more than two classes to classify

Logistics function can easily handled binary as it's output between [0, 1], We need function that can able to distribute the result between n classes, and top distributions class will be output result

Introducing one more activation function which is softmax

$$ Softmax = \frac{e^{h_i}}{\sum_k e^{h_k}}$$

where:
$$ h = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + .. + \theta_n x_n = \theta^T X $$

Input $\theta^T X$ to softmax to get probability matrix where each row will contains probailities for each classess

Now we need minimized the cost $J(\theta)$ using optimized algorithm

Here we take cross-entropy as cost function

$$ Cost= J(\theta) =  - \sum_{i=1}^{n} T_i \cdot log(O_i)$$

Compute the gradient of $\nabla_\theta J(\theta)$

Using chain rule will differentiate binary entropy


$$\frac{d J(\theta)}{d\theta} = \frac{d J(\theta)}{dz} \cdot \frac{dz}{dh} \cdot \frac{dh}{d\theta} \qquad \text{where z = $\frac{e^{h_i}}{(\sum_{k}e^{h_k})}$  $\qquad$ h = $\theta^T x$} \qquad equation(*)$$ </font>

$$\nabla_\theta J(\theta) = - \frac{1}{n} \sum_{i=1}^{n} [X^i \cdot (T_i - O_i)]$$

<font color='green'>__Repeat__:
    {
    $$\theta_j: \theta_j - \alpha \cdot \nabla_\theta J(\theta) \qquad \alpha \text{ is learning rate}$$
    }</font>

In [1]:
# Import required methods
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# from sklearn
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

%matplotlib inline

In [2]:
# Define Softmax
def softmax(h):
    return np.exp(h) / np.sum(np.exp(h) ,axis=1, keepdims=True)

# Define Loss
def cross_entropy(O, T):
    return - np.sum(T * np.log(O), axis=1)

In [3]:
# Train model

In [4]:
iris = load_iris()

In [5]:
X = iris.data
y = iris.target

In [6]:
X = X[:, [0, 3]] # sepal length and petal width

In [7]:
# standardize
X[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std()
X[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std()

In [8]:
index = np.random.randint(0,len(y), int(len(y)*0.3)) # create index id

Xtrain, Xtest = np.delete(X, index, axis=0) , X[index] # subset based on index id
ytrain, ytest = np.delete(y, index, axis=0) , y[index]

In [9]:
print(Xtrain.shape, ytrain.shape) # Verify the shape of data

(110, 2) (110,)


In [10]:
nfeature, nclassess = X.shape[1], np.max(y)+1

<!-- # %%time
# weight_, bias_, loss_, acc_  = gradient(weight, bias, Xtrain, ytrain, alpha, iterations=itern) -->

In [46]:
def _cost(cross_entropy,l2, w_):
    L2_term = l2 * np.sum(w_ ** 2)
    cross_entropy = cross_entropy + L2_term
    return 0.5 * np.mean(cross_entropy)

def new_gradient(X, y, w, b, eta, itern, l2):
    
    acc_ = []
    cost_ = []
    bias_ = []
    weight_ = []
    
    y_enc = np.zeros((y.size, y.max()+1))
    y_enc[np.arange(y.size),y] = 1
    y_enc = y_enc.astype('float64')
    
    for epoch in range(itern):
        
        h = X.dot(w) + b
        Output = softmax(h)
        
        yhat= Output.argmax(axis=1)
        acc_.append(accuracy_score(y, yhat))
        
        diff = Output - y_enc
        
        grad = X.T.dot(diff)
        
        weight_.append(w)
        w -= (eta * grad + eta * l2* w)
        
        bias_.append(b)
        b -= (eta * np.sum(diff))
        
        entropy = cross_entropy(Output, y_enc)
        cost  = _cost(entropy, l2, w)
        cost_.append(cost)
        
    return weight_, bias_,cost_, acc_

In [179]:
rgen = np.random.RandomState(None) # set random state
theta = rgen.normal(loc=0.0, scale=0.01, size=(nfeature, nclassess),) # initalized weight and bias
theta

array([[-0.00104031, -0.01475533,  0.01376637],
       [ 0.00801639,  0.00384731, -0.00678715]])

In [180]:
const = np.zeros(shape=nclassess)
const

array([0., 0., 0.])

In [192]:
coeff, b, cost, score = new_gradient(X=Xtrain, y=ytrain, w = theta, b = const, eta = 0.1, itern = 10, l2=0.1)

In [193]:
score

[0.6363636363636364,
 0.7272727272727273,
 0.8181818181818182,
 0.6363636363636364,
 0.6363636363636364,
 0.7272727272727273,
 0.8181818181818182,
 0.8181818181818182,
 0.7272727272727273,
 0.5454545454545454,
 0.8181818181818182,
 0.7272727272727273,
 0.8181818181818182,
 0.8181818181818182,
 0.7272727272727273,
 0.6363636363636364,
 0.8181818181818182,
 0.6363636363636364,
 0.8181818181818182,
 0.5454545454545454,
 0.8181818181818182,
 0.9090909090909091,
 0.5454545454545454,
 0.5454545454545454,
 0.45454545454545453,
 0.45454545454545453,
 0.7272727272727273,
 0.8181818181818182,
 0.8181818181818182,
 0.9090909090909091,
 0.5454545454545454,
 0.9090909090909091,
 0.7272727272727273,
 0.7272727272727273,
 0.8181818181818182,
 0.6363636363636364,
 0.8181818181818182,
 0.6363636363636364,
 0.8181818181818182,
 0.6363636363636364,
 0.6363636363636364,
 0.8181818181818182,
 0.6363636363636364,
 0.45454545454545453,
 0.8181818181818182,
 0.6363636363636364,
 0.45454545454545453,
 1.0,
 0.

In [183]:
h = Xtrain.dot(coeff[-1])  + b[-1]

In [184]:
z = softmax(h)

In [185]:
accuracy_score(ytrain, z.argmax(axis=1))

0.7090909090909091

In [186]:
def _yield_minibatches_idx(rgen, n_batches, data_ary, shuffle=True):
        indices = np.arange(data_ary.shape[0])

        if shuffle:
            indices = rgen.permutation(indices)
        if n_batches > 1:
            remainder = data_ary.shape[0] % n_batches

            if remainder:
                minis = np.array_split(indices[:-remainder], n_batches)
                minis[-1] = np.concatenate((minis[-1],
                                            indices[-remainder:]),
                                           axis=0)
            else:
                minis = np.array_split(indices, n_batches)

        else:
            minis = (indices,)

        for idx_batch in minis:
            yield idx_batch

In [191]:
def _cost(cross_entropy,l2, w_):
    L2_term = l2 * np.sum(w_ ** 2)
    cross_entropy = cross_entropy + L2_term
    return 0.5 * np.mean(cross_entropy)

def new_gradient(X, y, w, b, eta, itern, l2):
    
    acc_ = []
    cost_ = []
    bias_ = []
    weight_ = []
    
    y_enc = np.zeros((y.size, y.max()+1))
    y_enc[np.arange(y.size),y] = 1
    y_enc = y_enc.astype('float64')

    for epoch in range(itern):
        
        for idx in list(_yield_minibatches_idx(rgen, 10, data_ary=Xtrain)):
            Xt ,yt, y_enct  = X[idx], y[idx], y_enc[idx]
            
            h = Xt.dot(w) + b
            Output = softmax(h)

            yhat= Output.argmax(axis=1)
            acc_.append(accuracy_score(yt, yhat))

            diff = Output - y_enct

            grad = Xt.T.dot(diff)

            weight_.append(w)
            w -= (eta * grad + eta * l2* w)

            bias_.append(b)
            b -= (eta * np.sum(diff))

            entropy = cross_entropy(Output, y_enct)
            cost  = _cost(entropy, l2, w)
            cost_.append(cost)

    return weight_, bias_,cost_, acc_