In [49]:
import numpy as np

In [112]:
def softmax_forward(x):
    exps = np.exp(x)
    sexps = np.sum(exps, axis=1).reshape(-1,1)
    divexps = (1/sexps).reshape(-1,1)
    mul = exps * divexps
    return mul, {"exps":exps, "sexps":sexps, "divexps":divexps}

def softmax_backward(dout, cache, print_=False):
    softmax_grad = {}
    
    dexps_0 = cache["divexps"] * dout
    softmax_grad["dexps_0"] = dexps_0

    ddivexps = np.sum(cache["exps"] * dout, axis=1).reshape(-1,1)
    softmax_grad["ddivexps"] = ddivexps
    
    dsexps = -1.0/(cache["sexps"]**2) * ddivexps
    softmax_grad["dsexps"] = dsexps
    
    # dexps_1 = dsexps/(np.sum(cache["exps"], axis=1).reshape(-1,1))*cache["exps"]
    dexps_1 = dsexps * np.ones(dout.shape)
    softmax_grad["dexps_1"] = dexps_1
    
    dexps = dexps_0 + dexps_1
    softmax_grad["dexps"] = dexps
    
    dx = cache["exps"] * (dexps)
    softmax_grad["dx"] = dx
    
    if print_:
        print("dexps_0:")
        print(dexps_0.shape)
        print(dexps_0)
        print("ddivexps:")
        print(ddivexps.shape)
        print(ddivexps)
        print("dsexps:")
        print(dsexps.shape)
        print(dsexps)
        print("dexps_1:")
        print(dexps_1.shape)
        print(dexps_1)
        print("dexps:")
        print(dexps.shape)
        print(dexps)
        print("dx:")
        print(dx.shape)
        print(dx)

    return softmax_grad

In [113]:
w1 = np.asarray([[0.5, 0.4, 0.3]])
w2 = np.asarray([[0.8, -0.1], [1.3, 0.15], [-1.1, 0.95,]])
b1 = np.asarray([-1, 3, 1])
b2 = np.asarray([0.65, 0.8])
x = np.asarray([2, -5]).reshape(-1,1)
y = [1,0]
reg = 0.05

In [256]:
l1_relu, w2)

array([[3.18, 2.09],
       [1.3 , 0.15]])

In [235]:
w1_out = np.dot(x, w1)
l1_lin = w1_out + b1
l1_relu = np.maximum(l1_lin, 0)
w2_out = np.dot(l1_relu, w2)
l2_lin = w2_out + b2
softmax, cache = softmax_forward(l2_lin)
#softmax = np.exp(l2_lin)/np.sum(np.exp(l2_lin), axis=1).reshape(-1,1)
print("softmax = {}".format(softmax))
neg_log = -np.log(softmax[range(x.shape[0]), y])
loss_data = np.mean(neg_log)
print("neg_log = {}".format(neg_log))
print("loss_data = {}".format(loss_data))
w1_l2 = np.sum(w1 * w1)
w2_l2 = np.sum(w2 * w2)
b1_l2 = np.sum(b1 * b1)
b2_l2 = np.sum(b2 * b2)
loss_reg = (w1_l2 +w2_l2 +b1_l2 +b2_l2 )*reg
print("loss_reg = {}".format(loss_reg))
loss = loss_data + loss_reg
loss

softmax = [[0.71909966 0.28090034]
 [0.73105858 0.26894142]]
neg_log = [1.26975533 0.31326169]
loss_data = 0.7915085063990551
loss_reg = 0.8518750000000002


1.6433835063990552

In [337]:
dsoftmax = np.asarray([[0.0, -1.78], [-0.6839, 0.0]])
#dsoftmax = softmax_backward(np.asarray(dsoftmax), cache, False)
dl2_lin = softmax_backward(np.asarray(dsoftmax), cache, False)
db2 = np.sum(dl2_lin["dx"], axis=0) + 2 * 0.05 * b2
dw2 = np.dot(l1_relu.T, dl2_lin["dx"]) + 2 * 0.05 * w2
dl1_relu = np.dot(dl2_lin["dx"], w2.T)
dl1_lin = np.maximum(l1_lin, 0)
dl1_lin[dl1_lin > 0] = 1
dl1_lin *= dl1_relu
db1 = np.sum(dl1_lin, axis=0) + 2 * 0.05 * b1
dw1 = np.dot(x.T, dl1_lin) + 2 * 0.05 * w1

In [335]:
np.dot(x.T, dl1_lin) + 2 * 0.05 * w1

array([[ 0.05      ,  1.6401306 , -1.44416199]])

In [339]:
dw1

array([[ 0.05      ,  1.6401306 , -1.44416199]])

In [348]:
w1_new = w1.copy()
w1_new[0, 0] += 1e-10
(nn_loss(w1_new, w2, b1, b2, x, y) - nn_loss(w1, w2, b1, b2, x, y))/1e-10

0.6971889732199088

In [347]:
dw1

array([[ 0.05      ,  1.6401306 , -1.44416199]])

In [341]:
np.dot(dl1_lin, x) + 2 * 0.05 * w1

ValueError: shapes (2,3) and (2,1) not aligned: 3 (dim 1) != 2 (dim 0)

## Checking grads

### dl1_lin

In [304]:
def from_l1_lin_loss(w1, w2, b1, b2, x, y, l1_lin, reg=0.05):
    l1_relu = np.maximum(l1_lin, 0)
    w2_out = np.dot(l1_relu, w2)
    l2_lin = w2_out + b2
    softmax, cache = softmax_forward(l2_lin)
    
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)

    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg

    loss = loss_data + loss_reg
    return loss

In [315]:
l1_lin_new = l1_lin.copy()
l1_lin_new[0,0] += 1e-8
(from_l1_lin_loss(w1, w2, b1, b2, x, y, l1_lin_new) -
 from_l1_lin_loss(w1, w2, b1, b2, x, y, l1_lin))/1e-8

0.323594884221734

In [308]:
from_l1_lin_loss(w1, w2, b1, b2, x, y, l1_lin)

1.6433835063990552

### dl1_relu

In [281]:
def from_l1_relu_loss(w1, w2, b1, b2, x, y, l1_relu, reg=0.05):
    w2_out = np.dot(l1_relu, w2)
    l2_lin = w2_out + b2
    softmax, cache = softmax_forward(l2_lin)
    
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)

    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg

    loss = loss_data + loss_reg
    return loss

In [292]:
l1_relu_new = l1_relu.copy()
l1_relu_new[1,2] += 1e-8
(from_l1_relu_loss(w1, w2, b1, b2, x, y, l1_relu_new) -
 from_l1_relu_loss(w1, w2, b1, b2, x, y, l1_relu))/1e-8

0.2756649575275105

In [291]:
dl1_relu

array([[ 0.32359653,  0.41348446, -0.737081  ],
       [-0.12101661, -0.15463234,  0.27564895]])

### dw2_out

In [227]:
def from_w2_out_loss(w1, w2, b1, b2, x, y, w2_out, reg=0.05):
    
    l2_lin = w2_out + b2
    softmax, cache = softmax_forward(l2_lin)
    
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)

    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg

    loss = loss_data + loss_reg
    return loss

In [253]:
w2_out_new = w2_out.copy()
w2_out_new[1,0] += 1e-8
(from_w2_out_loss(w1, w2, b1, b2, x, y, w2_out_new) -
 from_w2_out_loss(w1, w2, b1, b2, x, y, w2_out))/1e-8

-0.1344707012407298

### dw2

In [182]:
def nn_loss(w1, w2, b1, b2, x, y, reg=0.05):
    w1_out = np.dot(x, w1)
    l1_lin = w1_out + b1
    l1_relu = np.maximum(l1_lin, 0)
    w2_out = np.dot(l1_relu, w2)
    l2_lin = w2_out + b2
    softmax, cache = softmax_forward(l2_lin)
    
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)

    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg

    loss = loss_data + loss_reg
    return loss

In [266]:
w2_new = w2.copy()
w2_new[2, 1] += 1e-10
(nn_loss(w1, w2_new, b1, b2, x, y) - nn_loss(w1, w2, b1, b2, x, y))/1e-10

-0.4802780395607442

In [267]:
dw2_empirical = np.asarray([[0.08, -0.01],
                            [1.3618186578412406, -1.2168186458438868],
                            [0.4652797258586361, -0.48027972709974165]
                           ])

In [272]:
dw2_empirical - dw2

array([[-1.38777878e-17,  1.73472348e-18],
       [-1.49215867e-05,  1.49335841e-05],
       [-3.00280319e-06,  3.00156208e-06]])

### db2

In [135]:
def nn_loss(w1, w2, b1, b2, x, y, reg=0.05):
    w1_out = np.dot(x, w1)
    l1_lin = w1_out + b1
    l1_relu = np.maximum(l1_lin, 0)
    w2_out = np.dot(l1_relu, w2)
    l2_lin = w2_out + b2
    softmax, cache = softmax_forward(l2_lin)
    
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)

    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg

    loss = loss_data + loss_reg
    return loss

In [165]:
b2_new = b2.copy()
b2_new[1] += 1e-7
(nn_loss(w1, w2, b1, b2_new, x, y) - nn_loss(w1, w2, b1, b2, x, y))/1e-7

-0.1450791042856281

In [134]:
b2, b2_new

(array([0.65, 0.8 ]), array([0.65, 0.8 ]))

### dsoftmax

In [60]:
def from_softmax_loss(softmax, y, w1, w2, b1, b2, reg=0.05):
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)
    # print("neg_log = {}".format(neg_log))
    # print("loss_data = {}".format(loss_data))
    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg
    # print(loss_reg)
    loss = loss_data + loss_reg
    return loss

In [61]:
from_softmax_loss(softmax, y, w1, w2, b1, b2, reg=0.05)

1.6433835063990552

In [62]:
softmax

array([[0.71909966, 0.28090034],
       [0.73105858, 0.26894142]])

In [63]:
softmax_new = softmax.copy()
softmax_new[0,1] += 1e-8
(from_softmax_loss(softmax_new, y, w1, w2, b1, b2, reg=0.05) - 
 from_softmax_loss(softmax, y, w1, w2, b1, b2, reg=0.05))/1e-8

-1.7799906437687696

In [64]:
softmax

array([[0.71909966, 0.28090034],
       [0.73105858, 0.26894142]])

### dl2_lin

In [127]:
def from_softmax_loss(l2_lin, x, y, w1, w2, b1, b2, reg=0.05):
    softmax, cache = softmax_forward(l2_lin)
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)
    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg
    loss = loss_data + loss_reg
    return loss

In [148]:
l2_lin_new = l2_lin.copy()
l2_lin_new[0,1] += 1e-8

In [149]:
l2_lin

array([[3.83, 2.89],
       [1.95, 0.95]])

In [150]:
from_softmax_loss(l2_lin, x, y, w1, w2, b1, b2)

1.6433835063990552

In [151]:
(from_softmax_loss(l2_lin_new, x, y, w1, w2, b1, b2) 
 - from_softmax_loss(l2_lin, x, y, w1, w2, b1, b2))/1e-8

-0.35954981214558757

In [40]:
0.35954983435004806 -0.35954981214558757
-0.1344706612727009 0.1344707234451903

SyntaxError: invalid syntax (<ipython-input-40-f98aa03b7f79>, line 2)

In [130]:
dsoftmax["dx"]

array([[ 0.35955171, -0.35955171],
       [-0.1344629 ,  0.1344629 ]])

# gradients inside softmax

In [70]:
def from_exps_loss(exps, x, y, w1, w2, b1, b2, reg=0.05):
    
    sexps = np.sum(exps, axis=1).reshape(-1,1)
    divexps = (1/sexps).reshape(-1,1)
    softmax = exps * divexps
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)
    
    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg
    loss = loss_data + loss_reg
    
    return loss

In [71]:
exps_orig = np.exp(l2_lin)
exps_new = exps_orig.copy()
exps_new[1, 0] += 1e-8

In [72]:
exps_new

array([[46.06253823, 17.9933096 ],
       [ 7.02868759,  2.58570966]])

In [73]:
exps_orig

array([[46.06253823, 17.9933096 ],
       [ 7.02868758,  2.58570966]])

In [74]:
exps_new - exps_orig

array([[0.00000000e+00, 0.00000000e+00],
       [9.99999994e-09, 0.00000000e+00]])

In [75]:
(from_exps_loss(exps_new, x, y, w1, w2, b1, b2) 
 - from_exps_loss(exps_orig, x, y, w1, w2, b1, b2))/1e-8

-0.019131674022787593

#### real gradient for exps

In [76]:
dexps_empirical = np.asarray([[0.007805711632613566, -0.019982415722097358],
[-0.019131674022787593, 0.052005355577477985]])

In [77]:
dexps_empirical

array([[ 0.00780571, -0.01998242],
       [-0.01913167,  0.05200536]])

#### analytical gradient for exps

In [119]:
dsoftmax["dexps"]

array([[ 0.00780573, -0.01998252],
       [-0.01913058,  0.05200232]])

In [120]:
dsoftmax["dexps_1"]

array([[0.00780573, 0.00780573],
       [0.05200232, 0.05200232]])

In [121]:
dsoftmax["dexps_0"]

array([[ 0.        , -0.02778825],
       [-0.0711329 ,  0.        ]])

In [86]:
dexps_empirical.sum(axis=1) - dsoftmax["dexps"].sum(axis=1)

array([0.00780582, 0.05200427])

In [82]:
dexps_empirical.sum(axis=1)

array([-0.0121767 ,  0.03287368])

In [83]:
dsoftmax["dexps"].sum(axis=1)

array([-0.01998252, -0.01913058])

In [111]:
dexps_empirical - dsoftmax["dexps_0"]

array([[0.00780571, 0.00780584],
       [0.05200123, 0.05200536]])

### sexps

In [91]:
def from_sexps_loss(exps, sexps, x, y, w1, w2, b1, b2, reg=0.05):

    divexps = (1/sexps).reshape(-1,1)
    softmax = exps * divexps
    neg_log = -np.log(softmax[range(x.shape[0]), y])
    loss_data = np.mean(neg_log)
    
    w1_l2 = np.sum(w1 * w1)
    w2_l2 = np.sum(w2 * w2)
    b1_l2 = np.sum(b1 * b1)
    b2_l2 = np.sum(b2 * b2)
    
    loss_reg = (w1_l2 + w2_l2 + b1_l2 + b2_l2 )*reg
    loss = loss_data + loss_reg
    
    return loss

In [104]:
sexps_orig = np.sum(exps_orig, axis=1).reshape(-1,1)

In [105]:
sexps_new = sexps_orig.copy()
sexps_new[1, 0] += 1e-8
sexps_new - sexps_orig

array([[0.00000000e+00],
       [1.00000008e-08]])

In [106]:
(from_sexps_loss(exps_orig, sexps_new, x, y, w1, w2, b1, b2) 
 - from_sexps_loss(exps_orig, sexps_orig, x, y, w1, w2, b1, b2))/1e-8

0.052005355577477985

In [107]:
dsexps_empirical = np.asarray([[0.007805711632613566],
[0.052005355577477985]])

In [108]:
dsexps_empirical

array([[0.00780571],
       [0.05200536]])

In [109]:
dsoftmax["dsexps"]

array([[0.00780573],
       [0.05200232]])