# softmax derivative with neural network

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def softmax(data):
    """Return the column-wise softmax of ``data``.

    The softmax is taken along axis 0, so for a 2-D input every column is
    normalised independently and sums to 1.

    Args:
        data: Array-like of scores (logits).

    Returns:
        ``numpy.ndarray`` with the same shape as ``data``.
    """
    data = np.asarray(data)
    if data.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction.
        return np.exp(data)
    # Softmax is invariant to subtracting a per-column constant. Shifting by
    # the column maximum keeps every exponent <= 0 so np.exp cannot overflow
    # (which would otherwise produce inf / inf = nan).
    shifted = data - np.max(data, axis=0, keepdims=True)
    exp_data = np.exp(shifted)
    return exp_data / np.sum(exp_data, axis=0, keepdims=True)

def softmaxPrime(data, targets):
    """Return the derivative of the target-class softmax output w.r.t. the logits.

    Each column of ``data`` is one prediction (a vector of logits ``f``) and
    each column of ``targets`` is the matching one-hot vector selecting the
    true class ``y``. With ``S = softmax(f)`` the quotient rule gives

        dS_y / df_i = S_y * (delta_iy - S_i)

    which, with the one-hot vector standing in for ``delta_iy``, is
    ``S_y * (targets - S)`` column by column.

    Args:
        data: Array of logits, one prediction per column.
        targets: One-hot array with the same shape as ``data``.

    Returns:
        ``numpy.ndarray`` with the same shape as ``data``; entry ``[i, j]`` is
        ``dS_y / df_i`` for column ``j``.
    """
    data = np.asarray(data)
    targets = np.asarray(targets)
    # Shift by the column maximum so np.exp cannot overflow; softmax is
    # unchanged by a per-column constant shift.
    exp_data = np.exp(data - np.max(data, axis=0, keepdims=True))
    probs = exp_data / np.sum(exp_data, axis=0, keepdims=True)
    # The one-hot mask picks out S_y for every column.
    target_prob = np.sum(probs * targets, axis=0, keepdims=True)
    return target_prob * (targets - probs)
    
    

In [None]:
# use softmax activation function
class WordModel:
    """Two-layer network: a sigmoid hidden layer followed by a softmax output layer.

    Data points are stored as columns, so inputs have shape ``(input_dim, n)``
    and one-hot targets have shape ``(output_dim, n)``. The output dimension is
    set equal to the input dimension.
    """

    def __init__(self, input_dim, hidden_dim, text=None, learning_rate=0.01, epochs=10, lam=0.5):
        """Store the hyperparameters and randomly initialise both layers.

        Args:
            input_dim: Number of input (and output) units.
            hidden_dim: Number of hidden units.
            text: Optional value kept on the instance; not used by the methods here.
            learning_rate: Step size applied in ``backprop``.
            epochs: Kept on the instance; not used by the methods here.
            lam: L2 regularisation strength for the output-layer weights.
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = input_dim
        self.text = text
        self.lam = lam

        # model has 2 layers, hidden ('L1') and output ('L2')
        # np.random.rand takes the dimensions as separate ints, not a tuple.
        self.weights = {
            'L1': np.random.rand(hidden_dim, input_dim),
            'L2': np.random.rand(input_dim, hidden_dim),
        }
        # Biases are column vectors so they broadcast across data columns.
        self.biases = {
            'L1': np.random.rand(hidden_dim, 1),
            'L2': np.random.rand(input_dim, 1),
        }

    def sigmoid(self, z):
        """Return the element-wise logistic function of ``z``."""
        return 1 / (1 + np.exp(-z))

    def sigmoidPrime(self, z):
        """Return the element-wise derivative of the logistic function at ``z``."""
        oz = self.sigmoid(z)
        return oz * (1 - oz)

    def softmax(self, data):
        """Return the softmax of ``data`` taken along axis 0 (per column)."""
        data = np.asarray(data)
        if data.size == 0:
            # Nothing to normalise; np.max would raise on an empty reduction.
            return np.exp(data)
        # Shift by the column maximum so np.exp cannot overflow; the softmax
        # value is unchanged by a per-column constant shift.
        exp_data = np.exp(data - np.max(data, axis=0, keepdims=True))
        return exp_data / np.sum(exp_data, axis=0, keepdims=True)

    def softmaxPrime(self, data, targets):
        """Return ``dS_y / df`` for every column of logits in ``data``.

        Each column of ``data`` is a prediction (logits ``f``) and each column
        of ``targets`` is a one-hot vector selecting the true class ``y``.
        With ``S = softmax(f)``: ``dS_y / df_i = S_y * (delta_iy - S_i)``.

        Returns:
            Array with the same shape as ``data``.
        """
        targets = np.asarray(targets)
        probs = self.softmax(data)
        # The one-hot mask picks out S_y for every column.
        target_prob = np.sum(probs * targets, axis=0, keepdims=True)
        return target_prob * (targets - probs)

    def forward(self, X, y):
        """Run a forward pass and cache the intermediate activations.

        Caches ``z1``, ``a1`` and ``z2`` on the instance for ``backprop``.

        Args:
            X: Inputs of shape ``(input_dim, n)``; columns are data points.
            y: One-hot targets; accepted for interface symmetry, not used here.

        Returns:
            Softmax probabilities of shape ``(output_dim, n)``.
        """
        # Z = WX + b (matrix product, bias broadcast over the columns)
        self.z1 = np.dot(self.weights['L1'], X) + self.biases['L1']
        self.a1 = self.sigmoid(self.z1)
        self.z2 = np.dot(self.weights['L2'], self.a1) + self.biases['L2']
        return self.softmax(self.z2)

    def backprop(self, result, targets):
        """Take one gradient-descent step on the output layer.

        Uses the cross-entropy loss ``L = -mean(log S_y)`` plus an L2 penalty
        on the output-layer weights. Only ``weights['L2']`` and
        ``biases['L2']`` are updated; the hidden layer is left untouched.
        Relies on ``a1`` and ``z2`` cached by the most recent ``forward`` call.

        Args:
            result: Softmax output of that ``forward`` call, shape ``(output_dim, n)``.
            targets: One-hot targets with the same shape as ``result``.
        """
        n = result.shape[1]

        # Probability assigned to the true class, one value per column.
        Sy = np.sum(result * targets, axis=0, keepdims=True)

        # Chain rule: dL/df = dL/dSy * dSy/df, averaged over the n columns.
        # (Algebraically this product equals (result - targets) / n.)
        dLdSy = -1 / Sy
        dSydf = self.softmaxPrime(self.z2, targets)
        deltaL2 = dLdSy * dSydf / n

        # add regularization term...
        regL2 = self.lam * self.weights['L2']

        # The gradient w.r.t. W2 pairs the output error with the layer's
        # input activations (a1), not with its own pre-activations.
        dLdW2 = np.dot(deltaL2, self.a1.transpose()) + regL2
        # keepdims keeps the bias gradient a column vector like the bias.
        dLdb2 = np.sum(deltaL2, axis=1, keepdims=True)

        # Step against the gradient and store the new parameters.
        self.weights['L2'] = self.weights['L2'] - self.learning_rate * dLdW2
        self.biases['L2'] = self.biases['L2'] - self.learning_rate * dLdb2
        
        
        
        
    

In [8]:
# Broadcasting demo: a (1, 4) row multiplied element-wise with a (4, 4)
# matrix scales each column of the matrix by the matching row entry.
# `a` is the 4x4 identity with its first two rows swapped.
a = np.eye(4, dtype=int)[[1, 0, 2, 3]]
b = np.full((1, 4), 5)
scaled = np.multiply(a, b)
print(scaled)

[[0 5 0 0]
 [5 0 0 0]
 [0 0 5 0]
 [0 0 0 5]]


In [9]:
# Stack two copies of each row of `b` along axis 0.
np.repeat(b, repeats=2, axis=0)

array([[5, 5, 5, 5],
       [5, 5, 5, 5]])