In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In Multiclass Logistic Regression

The Prediction function is:
$$S(\theta_0, \theta) = \frac{e ^{\theta_0 + \theta^T X}}{\Sigma_{i = 0}^{k} e ^ {\theta_{i0} + \theta_i^T X}}$$

Gradient Descent Function

$$\frac{\partial}{\partial\theta} \log(L(\theta_0, \theta) = \frac{1}{N} \Sigma_{i = 0}^{N} \big(c_0^i - S_i(\theta_0, \theta)\big) X^i$$

Loss function:

$$L(\theta_0, \theta) = -\frac{1}{N}\Sigma_{i = 0}^{N}\Sigma_{j = 0}^{K}C_i^j log_e(S(\theta_0^j, \theta^j))$$

Where $N$ is number of example in dataset. <br>
$K$ number of classes. 


In [2]:
class MultiClassLogisticRegression(object):
    
    def __init__(self, alpha = 0.1, tolerance = 0.01, fit_intercept = True, theta_init = 'rand'):
        '''
        : alpha: 
              It is the learning rate default (0.01)
          tolerance:
              Minimum value to reach default (0.001)
          fit_intercept:
              Adding 0th Vetor default (True)
          theta_init:
              Initial value of theta
              rand - random values for initial vector (Default)
              zero - zero values for initial vector
              value - Giving own values
        '''
        self.alpha = alpha
        self.tolerance = tolerance 
        self.intercept = fit_intercept
        self.theta_init = theta_init
    
    def __softmax__(self, X, weight):
        softmax_val = np.exp(np.dot(X, weight))
        total_val = np.sum(softmax_val, axis = 1)
        return softmax_val / total_val
    
    
    def __add_0th_feature__(self, X):
        
#         if X.shape[0] == self.theta.shape:
#             return (np.concatenate(([1], X)))
        ones = np.ones((X.shape[0], 1))
        return np.concatenate((ones, X), axis = 1)
    
    def predict_prob(self, X):
        if self.intercept:
            X = self.__add_0th_feature__(X)
        
        logit = self.__sigmoid__(X, self.theta).reshape((X.shape[0], 1))
        return np.concatenate((1 - logit, logit), axis = 1)
    
    def predict(self, X):
        val = self.predict_prob(X)
        return np.argmax(val, axis = 1)
    
    def one_hot_encoding(self, Y):
        
        lab_vector = None
        if np.min(Y) == 0:
            mat = np.eye(np.max(Y) + 1)
            lab_vector = mat[Y]
        else:
            mat = np.eye(np.max(Y))
            lab_vector = mat[Y - 1]
        return lab_vector
        
    
    def plot_loss(self):
        plt.xlabel('# of interation -->')
        plt.ylabel('Rate of loss -->')
        plt.title('Gradient loss graph')
        plt.plot(range(self.count), self.loss_history)
    
    def __loss__(self, X, W):
        
            
    def __gradient_descent__(self, X, Y):
        if self.intercept:
            w = np.random.randn(X.shape[1] + 1, np.unique(Y).shape[0])
        else:
            w = np.random.randn(X.shape[1], np.unique(Y).shape[0])
        Converged = False
        vec = self.one_hot_encoding(Y)
        while not Converged:
            del_theta = np.mean((vec - self.__softmax__(X, w)) @ X)
            w_new = w - (self.alpha * del_theta)
            
            pass
    def fit(self, X, Y):
        
#         if (self.intercept):
#             X = self.__add_0th_feature__(X)
            
        
        loss = self.__gradient_descent__(X, Y)
        return loss 
#         raise(NotImplementedError('Not Imp'))

In [3]:
lb = MultiClassLogisticRegression()


In [5]:
x = np.random.randint(1, 10, (4, 5))
w = np.random.randn(5, 4)
l = lb.__softmax__(x, w)

In [39]:
val = np.exp(x @ w)
total = np.sum(val, axis = 1)
val / total

array([[1.34016381e-04, 1.00126892e-14, 5.83185673e-02, 1.38762086e-22],
       [4.23535375e-03, 1.65919812e-16, 9.84275756e-08, 3.35339064e-17],
       [1.10103091e-04, 1.91204106e-11, 9.99993578e-01, 5.28709605e-20],
       [1.13636975e+00, 6.45526378e-17, 2.58449876e-02, 6.74304964e-21]])

array([[1.10133632e+04, 3.48638631e-09, 8.21682182e+07, 1.80113756e-14],
       [3.48058117e+05, 5.77727472e-11, 1.38679993e+02, 4.35271478e-09],
       [9.04818744e+03, 6.65766571e-06, 1.40894563e+09, 6.86267231e-12],
       [9.33859930e+07, 2.24770217e-11, 3.64144163e+07, 8.75250603e-13]])

In [40]:
sum([1.34016381e-04, 1.00126892e-14, 5.83185673e-02, 1.38762086e-22])

0.058452583681010015

In [34]:
total

array([7.51875529e+02, 8.52119840e-01, 1.85850736e+06, 9.34363118e+02,
       1.09357981e-01])

In [7]:
y = [0, 1, 3, 2]

In [8]:
c = lb.one_hot_encoding(y)

In [13]:
(c - l) @ x

array([[ 2.47473085e+00,  6.53251335e+00,  5.53264736e+00,
         4.94101135e+00,  7.88229073e+00],
       [ 1.98729305e+00,  5.97035174e+00,  9.74587090e-01,
         8.97882313e+00,  6.96611697e+00],
       [-6.00027251e+00, -3.00071935e+00, -6.09243245e-04,
         7.99945591e+00,  4.99913202e+00],
       [ 5.35828586e+00, -1.61348163e-01,  9.75021588e-01,
        -4.70769375e+00, -7.14264799e+00]])

In [24]:
l

array([[1.34016381e-04, 1.00126892e-14, 5.83185673e-02, 1.38762086e-22],
       [4.23535375e-03, 1.65919812e-16, 9.84275756e-08, 3.35339064e-17],
       [1.10103091e-04, 1.91204106e-11, 9.99993578e-01, 5.28709605e-20],
       [1.13636975e+00, 6.45526378e-17, 2.58449876e-02, 6.74304964e-21]])

In [19]:
l.shape

(4, 4)

In [81]:
(c - np.max(l * c, axis = 0)) * x

ValueError: operands could not be broadcast together with shapes (4,4) (4,5) 

In [86]:
c - np.max(l * c, axis = 0).reshape(1, 4)

array([[ 1.81960925e-02, -6.01610329e-02, -4.93113134e-07,
        -4.98997783e-05],
       [-9.81803907e-01,  9.39838967e-01, -4.93113134e-07,
        -4.98997783e-05],
       [-9.81803907e-01, -6.01610329e-02, -4.93113134e-07,
         9.99950100e-01],
       [-9.81803907e-01, -6.01610329e-02,  9.99999507e-01,
        -4.98997783e-05]])