In [1]:
%matplotlib inline
import mxnet as mx
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [2]:
# GPU context; later cells pass it as `ctx` when allocating NDArrays.
dev = mx.gpu()
batch_size = 100
# Per-sample input shape handed to both MNISTIter instances below.
# NOTE: later cells rebind `data_shape` to a 4-tuple with a leading batch dimension.
data_shape = (1, 28, 28)

# Training iterator over the MNIST idx files; shuffling is requested (shuffle=True)
# and flat=False keeps the `input_shape` layout.
train_iter = mx.io.MNISTIter(
        image       = "../data/mnist/train-images-idx3-ubyte",
        label       = "../data/mnist/train-labels-idx1-ubyte",
        input_shape = data_shape,
        batch_size  = batch_size,
        shuffle     = True,
        flat        = False)

# Validation iterator over the t10k files; no `shuffle` argument is passed here.
val_iter = mx.io.MNISTIter(
        image       = "../data/mnist/t10k-images-idx3-ubyte",
        label       = "../data/mnist/t10k-labels-idx1-ubyte",
        input_shape = data_shape,
        batch_size  = batch_size,
        flat        = False)

In [3]:
def Softmax(theta):
    """Row-wise softmax of a 2-D score matrix.

    Each row's maximum is subtracted before exponentiating, so large scores
    do not overflow; the result has the same shape as `theta` and every row
    sums to one.
    """
    shifted = theta - np.max(theta, axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=1, keepdims=True)

def MultiHinge(theta, label):
    """Multiclass hinge loss for a single sample.

    Computes ``max(0, 1 + max_{j != label} theta[j] - theta[label])``.

    Args:
        theta: 1-D sequence of raw class scores.
        label: index of the true class.

    Returns:
        The non-negative hinge loss as a Python float.
    """
    label = int(label)
    # The subtraction builds a fresh float array, so the caller's `theta` is not modified.
    margins = np.asarray(theta, dtype=float) - theta[label]
    # Exclude the true class from the "best wrong class" search.
    margins[label] = -np.inf
    # The original `np.max(value, 0)` passed 0 as an *axis*, so the loss was
    # never clamped; the builtin max performs the intended clamp at zero.
    return float(max(np.max(margins) + 1, 0.0))
    
def MultiHingeGrad(theta, label):
    """(Sub)gradient of the multiclass hinge loss w.r.t. the scores of one sample.

    Returns an all-zero array shaped like `theta` when the best wrong class is
    beaten by a margin of more than one; otherwise the array has +1 at the
    best wrong class and -1 at `label`.
    """
    grad = np.zeros(theta.shape)
    margins = theta - theta[label]
    # Take the true class out of the running for "best wrong class".
    margins[label] = -np.inf
    margin_satisfied = np.max(margins) + 1 < 0
    if not margin_satisfied:
        grad[np.argmax(margins)] = 1
        grad[label] = -1
    return grad
    
def LogLossGrad(alpha, label):
    """Gradient of the softmax log-loss with respect to the raw scores.

    Args:
        alpha: (n, classes) array of softmax probabilities.
        label: sequence with at least n class ids. Float-valued ids (as
            produced by ``NDArray.asnumpy()`` on a label batch) are accepted
            and truncated to integers.

    Returns:
        A new array equal to `alpha` with 1 subtracted at each row's label
        column; `alpha` itself is left untouched.
    """
    grad = np.copy(alpha)
    n_rows = grad.shape[0]
    # Current NumPy rejects float indices, so cast the labels explicitly.
    cols = np.asarray(label)[:n_rows].astype(np.int64)
    grad[np.arange(n_rows), cols] -= 1.
    return grad

def SGD(weight, grad, lr=0.1, grad_norm=batch_size):
    """In-place gradient-descent step: ``weight -= lr * grad / grad_norm``.

    Args:
        weight: array updated in place through slice assignment.
        grad: gradient with the same shape as `weight`.
        lr: learning rate.
        grad_norm: divisor applied to the gradient. The default is the value
            the global `batch_size` had when this function was defined.
    """
    # The original divided by the global `batch_size` and silently ignored
    # `grad_norm`; honour the parameter (its default keeps existing calls identical).
    weight[:] -= lr * grad / grad_norm

def CalAcc(pred_prob, label):
    """Number of rows whose arg-max column equals the label, returned as a float count (not a ratio)."""
    predicted_class = np.argmax(pred_prob, axis=1)
    is_correct = (predicted_class == label)
    return 1.0 * np.sum(is_correct)

def CalLoss(pred_prob, label):
    """Summed negative log-likelihood of the true classes.

    Args:
        pred_prob: (n, classes) array of predicted probabilities.
        label: sequence with at least n class ids; float-valued ids are
            truncated to integers.

    Returns:
        Sum over rows of ``-log(max(p_true, 1e-10))`` as a float (0.0 for an
        empty batch). The 1e-10 floor keeps the loss finite when the true
        class has probability zero.
    """
    n_rows = pred_prob.shape[0]
    # Current NumPy rejects float indices, so cast the labels explicitly.
    cols = np.asarray(label)[:n_rows].astype(np.int64)
    true_prob = pred_prob[np.arange(n_rows), cols]
    return float(-np.log(np.maximum(true_prob, 1e-10)).sum())

In [4]:
def acc_normal(model, val_iter, arg_map, grad_map):
    """Accuracy of `model` on the unmodified batches of `val_iter`.

    The iterator is reset, every batch is copied into ``arg_map["data"]`` and
    run through an inference-mode forward pass; the return value is the
    number of correct predictions divided by the number of labels seen.
    `grad_map` is accepted for signature parity with the perturbation
    evaluators and is not used.
    """
    val_iter.reset()
    n_correct = 0.0
    n_seen = 0
    for batch in val_iter:
        labels = batch.label[0].asnumpy()
        arg_map["data"][:] = batch.data[0]

        model.forward(is_train=False)
        probs = Softmax(model.outputs[0].asnumpy())
        n_correct += CalAcc(probs, labels)
        n_seen += labels.shape[0]
    return n_correct / n_seen
    
def acc_perb_L0(model, val_iter, coe_pb,arg_map, grad_map):
    """Accuracy under a gradient-sign perturbation of the input.

    For each batch the log-loss gradient with respect to the input is taken,
    reduced to its element-wise sign, and each sample's sign pattern is scaled
    to unit L2 norm. Samples the model already misclassifies are left
    unperturbed. The perturbed batch ``data + coe_pb * noise`` is then
    classified in inference mode.

    Relies on the module-level `out_grad` buffer and on `Softmax`,
    `LogLossGrad` and `CalAcc` from this notebook.

    Args:
        model: bound executor whose "data" argument is ``arg_map["data"]``.
        val_iter: data iterator; it is reset before use.
        coe_pb: L2 length of the perturbation added to each sample.
        arg_map: argument-name -> NDArray mapping of the executor.
        grad_map: argument-name -> gradient NDArray mapping of the executor.

    Returns:
        Correct predictions on the perturbed data divided by samples seen.
    """
    val_iter.reset()
    val_acc = 0.0
    num_samp = 0
    nn = 0  # samples whose perturbation direction has zero norm
    for dbatch in val_iter:
        data = dbatch.data[0]
        # Integer class ids: current NumPy rejects float labels used as indices.
        labels = dbatch.label[0].asnumpy().astype(np.int64)
        batch_size = labels.shape[0]
        arg_map["data"][:] = data

        # The gradient pass needs is_train=True so that backward() is allowed.
        model.forward(is_train=True)
        alpha = Softmax(model.outputs[0].asnumpy())

        out_grad[:] = LogLossGrad(alpha, labels)
        model.backward([out_grad])
        noise = np.sign(grad_map["data"].asnumpy())

        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            if norm == 0:
                nn += 1
            # Only attack correctly classified samples, and never divide by a
            # zero norm (the original produced NaN inputs in that case).
            if labels[j] == np.argmax(alpha[j]) and norm > 0:
                noise[j] = noise[j] / norm
            else:
                noise[j] = 0

        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        pred = Softmax(model.outputs[0].asnumpy())

        val_acc += CalAcc(pred, labels)
        num_samp += batch_size
    if nn > 0:
        print('L0 gradien being 0 :', nn)
    return val_acc / num_samp

def acc_perb_L2(model, val_iter, coe_pb, arg_map, grad_map):
    """Accuracy under a perturbation along the L2-normalised input gradient.

    For each batch the log-loss gradient with respect to the input is taken
    and each sample's gradient is scaled to unit L2 norm. Samples the model
    already misclassifies are left unperturbed. The perturbed batch
    ``data + coe_pb * noise`` is then classified in inference mode.

    Relies on the module-level `out_grad` buffer and on `Softmax`,
    `LogLossGrad` and `CalAcc` from this notebook.

    Args:
        model: bound executor whose "data" argument is ``arg_map["data"]``.
        val_iter: data iterator; it is reset before use.
        coe_pb: L2 length of the perturbation added to each sample.
        arg_map: argument-name -> NDArray mapping of the executor.
        grad_map: argument-name -> gradient NDArray mapping of the executor.

    Returns:
        Mean of the per-batch accuracies on the perturbed data.
    """
    val_iter.reset()
    val_acc = 0.0
    num_batch = 0
    nn = 0  # samples whose input gradient has zero norm
    for dbatch in val_iter:
        data = dbatch.data[0]
        # Integer class ids: current NumPy rejects float labels used as indices.
        labels = dbatch.label[0].asnumpy().astype(np.int64)
        batch_size = labels.shape[0]
        arg_map["data"][:] = data

        # The gradient pass needs is_train=True so that backward() is allowed.
        model.forward(is_train=True)
        alpha = Softmax(model.outputs[0].asnumpy())

        out_grad[:] = LogLossGrad(alpha, labels)
        model.backward([out_grad])
        noise = grad_map["data"].asnumpy()

        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            if norm == 0:
                nn += 1
            # Only attack correctly classified samples, and never divide by a
            # zero norm (the original produced NaN inputs in that case).
            if labels[j] == np.argmax(alpha[j]) and norm > 0:
                noise[j] = noise[j] / norm
            else:
                noise[j] = 0
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        pred = Softmax(model.outputs[0].asnumpy())

        # Accumulates per-batch accuracy, hence the division by batch count below.
        val_acc += CalAcc(pred, labels) / batch_size
        num_batch += 1
    if nn > 0:
        print('L2 gradien being 0 :', nn)
    return val_acc / num_batch

def acc_perb_alpha(model, val_iter, coe_pb,arg_map, grad_map):
    """Accuracy under a perturbation aimed at the closest competing class.

    For every class i an input-gradient map ``T[i]`` is obtained by
    back-propagating ``-alpha[:, i] * (alpha - onehot(i))``. For each correctly
    classified sample, the competing class with the smallest ratio
    ``(alpha_y - alpha_i) / ||T[i] - T[y]||`` is selected and the sample is
    moved by `coe_pb` along the unit-normalised direction ``T[i] - T[y]``.
    Misclassified samples, and samples whose direction has zero norm, are left
    unperturbed.

    Relies on the module-level `out_grad` buffer and on `Softmax`,
    `LogLossGrad` and `CalAcc` from this notebook. The number of classes is
    fixed at 10.

    Args:
        model: bound executor whose "data" argument is ``arg_map["data"]``.
        val_iter: data iterator; it is reset before use.
        coe_pb: L2 length of the perturbation added to each sample.
        arg_map: argument-name -> NDArray mapping of the executor.
        grad_map: argument-name -> gradient NDArray mapping of the executor.

    Returns:
        Mean of the per-batch accuracies on the perturbed data.
    """
    num_class = 10
    val_iter.reset()
    val_acc = 0.0
    num_samp = 0  # counts batches; val_acc accumulates per-batch accuracy
    nn = 0  # samples left without a usable perturbation direction
    for dbatch in val_iter:
        data = dbatch.data[0]
        # Integer class ids: `y` indexes T and alpha below, and current NumPy
        # rejects float indices.
        labels = dbatch.label[0].asnumpy().astype(np.int64)
        batch_size = labels.shape[0]

        # Sized from the batch itself; the original read the global
        # `data_shape`, which is only a 4-tuple after a later cell rebinds it.
        T = np.zeros((num_class,) + tuple(data.shape))
        noise = np.zeros(data.shape)
        #===================
        for i in range(num_class):
            arg_map["data"][:] = data
            model.forward(is_train=True)
            alpha = Softmax(model.outputs[0].asnumpy())

            grad = LogLossGrad(alpha, np.full(alpha.shape[0], i, dtype=np.int64))
            # Row-wise scaling by -alpha[:, i].
            grad = -alpha[:, i:i + 1] * grad
            out_grad[:] = grad
            model.backward([out_grad])
            T[i] = grad_map["data"].asnumpy()

        # `alpha` below is the softmax output of the last forward pass above.
        for j in range(batch_size):
            y = labels[j]
            if y == np.argmax(alpha[j]):
                perb_scale = np.zeros(num_class)
                for i in range(num_class):
                    direction_norm = np.linalg.norm((T[i][j] - T[y][j]).flatten(), 2)
                    if i == y or direction_norm == 0:
                        # The true class and zero-length directions are never selected.
                        perb_scale[i] = np.inf
                    else:
                        perb_scale[i] = (alpha[j][y] - alpha[j][i]) / direction_norm
                noise[j] = T[np.argmin(perb_scale)][j] - T[y][j]
        #====================
        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            if norm == 0:
                nn += 1
            else:
                noise[j] = noise[j] / norm
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        pred = Softmax(model.outputs[0].asnumpy())

        val_acc += CalAcc(pred, labels) / batch_size
        num_samp += 1
    if nn > 0:
        print('Alpha gradien being 0 :', nn)
    return val_acc / num_samp

# Generate Fixed Perturbed Data

In [19]:
# Symbol graph: two conv/tanh/max-pool stages followed by two fully connected layers.
# The last layer emits 10 raw scores; no softmax or loss op is attached in this cell.
data = mx.symbol.Variable('data')
# first conv: 20 filters of 5x5, tanh, 2x2 max-pool with stride 2
conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)
tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# second conv: 50 filters of 5x5, tanh, 2x2 max-pool with stride 2
conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50)
tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# first fullc: flatten, 100 hidden units, tanh
flatten = mx.symbol.Flatten(data=pool2)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=100)
tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
# second fullc: 10 output scores
fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=10)

In [20]:
# Rebinds the global `data_shape` to the batched input shape used for shape inference.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc2.list_arguments() # argument names of the symbol, 'data' among them
arg_shapes, output_shapes, aux_shapes = fc2.infer_shape(data=data_shape)

# One zero-filled value array and one zero-filled gradient array per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# The same grad_req is used for every argument, including 'data'.
reqs = ["write" for name in arg_names]

# Bind the symbol to an executor and index its arrays by argument name.
model = fc2.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(); the acc_perb_* functions read it as a global.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [21]:
# Fill every argument whose name contains "weight" with uniform(-0.07, 0.07) samples;
# all other arguments keep the zeros they were allocated with.
# NOTE(review): no random seed is set in the visible cells, so runs are not repeatable.
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)

In [22]:
# Mini-batch SGD on the clean training set for `num_round` passes over train_iter.
# NOTE: the loop rebinds the global name `data` (previously the input symbol) to batch arrays.
num_round = 30
train_acc = 0.
nbatch = 0
for i in range(num_round):
    # Running sums for this pass; reset at the start of every pass.
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        # Per-batch accuracy and mean loss, using the global batch_size as divisor.
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        # Gradient w.r.t. the raw scores, fed to backward() as the head gradient.
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        # Update every argument except the input buffer.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], grad_map[name])

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    print("Train Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, train_loss))

Train Accuracy: 0.8846	 Train Loss: 0.41799
Train Accuracy: 0.9713	 Train Loss: 0.09705
Train Accuracy: 0.9808	 Train Loss: 0.06569
Train Accuracy: 0.9856	 Train Loss: 0.05111
Train Accuracy: 0.9883	 Train Loss: 0.04211
Train Accuracy: 0.9902	 Train Loss: 0.03571
Train Accuracy: 0.9916	 Train Loss: 0.03078
Train Accuracy: 0.9930	 Train Loss: 0.02681
Train Accuracy: 0.9940	 Train Loss: 0.02350
Train Accuracy: 0.9948	 Train Loss: 0.02069
Train Accuracy: 0.9957	 Train Loss: 0.01825
Train Accuracy: 0.9962	 Train Loss: 0.01614
Train Accuracy: 0.9968	 Train Loss: 0.01433
Train Accuracy: 0.9973	 Train Loss: 0.01275
Train Accuracy: 0.9978	 Train Loss: 0.01139
Train Accuracy: 0.9982	 Train Loss: 0.01021
Train Accuracy: 0.9985	 Train Loss: 0.00918
Train Accuracy: 0.9988	 Train Loss: 0.00827
Train Accuracy: 0.9989	 Train Loss: 0.00748
Train Accuracy: 0.9991	 Train Loss: 0.00678
Train Accuracy: 0.9992	 Train Loss: 0.00617
Train Accuracy: 0.9994	 Train Loss: 0.00563
Train Accuracy: 0.9994	 Train Lo



In [23]:
# Build a fixed perturbed copy of the validation set against the current model:
# every sample is moved by `coe_pb` along its L2-normalised input gradient.
val_iter.reset()
val_acc = 0.0
val_acc_pb = 0.0
coe_pb = 1.5
num_samp = 0

perb_data = []
perb_lab = []

for dbatch in val_iter:
    data = dbatch.data[0]
    label = dbatch.label[0]
    label_np = label.asnumpy()
    arg_map["data"][:] = data

    model.forward(is_train=True)
    theta = model.outputs[0].asnumpy()
    alpha = Softmax(theta)
    val_acc += CalAcc(alpha, label_np)
    #########
    # Integer class ids: current NumPy rejects float labels used as indices.
    grad = LogLossGrad(alpha, label_np.astype(np.int64))
    out_grad[:] = grad
    model.backward([out_grad])
    noise = data_grad.asnumpy()
    for j in range(noise.shape[0]):
        norm = np.linalg.norm(noise[j].flatten(), 2)
        # A zero gradient stays a zero perturbation instead of becoming NaN.
        if norm > 0:
            noise[j] = noise[j] / norm
    pdata = data.asnumpy() + coe_pb * noise
    arg_map["data"][:] = pdata
    # Pure evaluation pass, so run it in inference mode like the acc_perb_* helpers.
    model.forward(is_train=False)
    raw_output = model.outputs[0].asnumpy()
    pred = Softmax(raw_output)
    val_acc_pb += CalAcc(pred, label_np)
    num_samp += batch_size

    # Keep the perturbed batch and its labels for the fixed evaluation set.
    perb_data.append(pdata)
    perb_lab.append(label_np)
print("Val Batch Accuracy: ", val_acc / num_samp)
print("Val Batch Accuracy after pertubation: ", val_acc_pb / num_samp)

Val Batch Accuracy:  0.9921
Val Batch Accuracy after pertubation:  0.543




In [24]:
# Stack the stored perturbed batches and labels and wrap them in an unshuffled iterator.
# NOTE: this rebinds `pdata` from the last batch to the full concatenated array.
pdata = np.concatenate(perb_data, axis = 0)
plabel = np.concatenate(perb_lab, axis = 0)
perb_iter = mx.io.NDArrayIter(
    data = pdata,
    label = plabel,
    batch_size = 100,
    shuffle = False    
)

In [25]:
# Sanity check: accuracy of the current model on the stored perturbed set.
# NOTE(review): the forward pass uses is_train=True although nothing is back-propagated here;
# acc_normal() performs the same measurement with is_train=False.
perb_iter.reset()
num_samp = 0
val_acc = 0.0
for dbatch in perb_iter:
    data = dbatch.data[0]
    label = dbatch.label[0]
    arg_map["data"][:] = data    
    
    model.forward(is_train=True)
    theta = model.outputs[0].asnumpy()
    alpha = Softmax(theta)
    val_acc += CalAcc(alpha, label.asnumpy()) 
    # Uses the global batch_size as the per-batch sample count.
    num_samp += batch_size
print("Val Batch Accuracy after pertubation: ", val_acc / num_samp)

Val Batch Accuracy after pertubation:  0.543


# Normal Training

In [26]:
# Symbol graph for the "normal training" run: two conv/tanh/max-pool stages followed by
# two fully connected layers. The last layer emits 10 raw scores; no loss op is attached here.
data = mx.symbol.Variable('data')
# first conv: 20 filters of 5x5, tanh, 2x2 max-pool with stride 2
conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)
tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# second conv: 50 filters of 5x5, tanh, 2x2 max-pool with stride 2
conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50)
tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# first fullc: flatten, 100 hidden units, tanh
flatten = mx.symbol.Flatten(data=pool2)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=100)
tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
# second fullc: 10 output scores
fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=10)

In [27]:
# Batched input shape used for shape inference.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc2.list_arguments() # argument names of the symbol, 'data' among them
arg_shapes, output_shapes, aux_shapes = fc2.infer_shape(data=data_shape)

# Fresh zero-filled value and gradient arrays per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# The same grad_req is used for every argument, including 'data'.
reqs = ["write" for name in arg_names]

# Rebinds the globals `model`, `arg_map`, `grad_map`, `data_grad` and `out_grad` to a new executor.
model = fc2.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(); the acc_perb_* functions read it as a global.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [32]:
# Fill every argument whose name contains "weight" with uniform(-0.07, 0.07) samples;
# all other arguments keep the zeros they were allocated with.
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)

In [33]:
# Mini-batch SGD on the clean training set (SGD's default learning rate), reporting
# clean validation accuracy after every pass.
num_round = 30
train_acc = 0.
nbatch = 0
for i in range(num_round):
    # Running sums for this pass; reset at the start of every pass.
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        # Gradient w.r.t. the raw scores, fed to backward() as the head gradient.
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        # Update every argument except the input buffer.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], grad_map[name])

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8919	 Val Accuracy: 0.9689	 Train Loss: 0.38585
Train Accuracy: 0.9732	 Val Accuracy: 0.9795	 Train Loss: 0.09228
Train Accuracy: 0.9816	 Val Accuracy: 0.9843	 Train Loss: 0.06318
Train Accuracy: 0.9860	 Val Accuracy: 0.9866	 Train Loss: 0.04923
Train Accuracy: 0.9887	 Val Accuracy: 0.9871	 Train Loss: 0.04052
Train Accuracy: 0.9906	 Val Accuracy: 0.9883	 Train Loss: 0.03427
Train Accuracy: 0.9920	 Val Accuracy: 0.9892	 Train Loss: 0.02944
Train Accuracy: 0.9932	 Val Accuracy: 0.9895	 Train Loss: 0.02553
Train Accuracy: 0.9943	 Val Accuracy: 0.9896	 Train Loss: 0.02228
Train Accuracy: 0.9952	 Val Accuracy: 0.9898	 Train Loss: 0.01953
Train Accuracy: 0.9959	 Val Accuracy: 0.9907	 Train Loss: 0.01721
Train Accuracy: 0.9968	 Val Accuracy: 0.9908	 Train Loss: 0.01523
Train Accuracy: 0.9972	 Val Accuracy: 0.9911	 Train Loss: 0.01351
Train Accuracy: 0.9977	 Val Accuracy: 0.9912	 Train Loss: 0.01201
Train Accuracy: 0.9980	 Val Accuracy: 0.9912	 Train Loss: 0.01072
Train Accu



In [34]:
# Second training stage with a smaller learning rate (lr = 0.01). Nothing in this cell
# re-initialises the arguments, so it continues from whatever weights `arg_map` currently holds.
num_round = 20
train_acc = 0.
nbatch = 0
lr = 0.01
for i in range(num_round):
    # Running sums for this pass; reset at the start of every pass.
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        # Gradient w.r.t. the raw scores, fed to backward() as the head gradient.
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        # Update every argument except the input buffer, with the reduced lr.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], grad_map[name],lr)

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00265
Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00257
Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00254
Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00251
Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00249
Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00247
Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00245
Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00244
Train Accuracy: 0.9999	 Val Accuracy: 0.9913	 Train Loss: 0.00242
Train Accuracy: 0.9999	 Val Accuracy: 0.9912	 Train Loss: 0.00240
Train Accuracy: 0.9999	 Val Accuracy: 0.9912	 Train Loss: 0.00239
Train Accuracy: 0.9999	 Val Accuracy: 0.9912	 Train Loss: 0.00237
Train Accuracy: 0.9999	 Val Accuracy: 0.9912	 Train Loss: 0.00236
Train Accuracy: 1.0000	 Val Accuracy: 0.9912	 Train Loss: 0.00234
Train Accuracy: 1.0000	 Val Accuracy: 0.9912	 Train Loss: 0.00233
Train Accu



In [35]:
# Report accuracy on clean data, on the stored perturbed set (perb_iter), and under the
# three on-the-fly perturbations, each with perturbation length 1.5.
print('Normal Validation: %.4f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.4f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.4f' % acc_perb_L0(model, val_iter, 1.5,arg_map, grad_map))
print('L2 perturbation: %.4f' % acc_perb_L2(model, val_iter, 1.5,arg_map, grad_map))
print('Alpha perturbation: %.4f' % acc_perb_alpha(model, val_iter, 1.5,arg_map, grad_map))

Normal Validation: 0.9912
Fixed set perturbation: 0.7456
L0 perturbation: 0.8082
L2 perturbation: 0.5194




Alpha gradien being 0 : 88
Alpha perturbation: 0.5014




# Dropout Training

In [215]:
# Symbol graph for the dropout run: same conv stages as the earlier networks, but the first
# fully connected layer has 200 hidden units and is followed by Dropout(p=0.5).
data = mx.symbol.Variable('data')
# first conv: 20 filters of 5x5, tanh, 2x2 max-pool with stride 2
conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)
tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# second conv: 50 filters of 5x5, tanh, 2x2 max-pool with stride 2
conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50)
tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# first fullc: flatten, 200 hidden units, tanh, then dropout with p=0.5
flatten = mx.symbol.Flatten(data=pool2)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=200)
tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
dropout1 = mx.symbol.Dropout(data=tanh3, p=0.5)
# second fullc: 10 output scores
fc2 = mx.symbol.FullyConnected(data=dropout1, num_hidden=10)

In [216]:
# Batched input shape used for shape inference.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc2.list_arguments() # argument names of the symbol, 'data' among them
arg_shapes, output_shapes, aux_shapes = fc2.infer_shape(data=data_shape)

# Fresh zero-filled value and gradient arrays per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# The same grad_req is used for every argument, including 'data'.
reqs = ["write" for name in arg_names]

# Rebinds the globals `model`, `arg_map`, `grad_map`, `data_grad` and `out_grad` to a new executor.
model = fc2.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(); the acc_perb_* functions read it as a global.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [217]:
# Fill every argument whose name contains "weight" with uniform(-0.07, 0.07) samples;
# all other arguments keep the zeros they were allocated with.
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)

In [218]:
# Mini-batch SGD on the clean training set for the dropout network (SGD's default learning
# rate), reporting clean validation accuracy after every pass.
num_round = 45
train_acc = 0.
nbatch = 0
for i in range(num_round):
    # Running sums for this pass; reset at the start of every pass.
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        # Training metrics come from the is_train=True forward pass above.
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        # Gradient w.r.t. the raw scores, fed to backward() as the head gradient.
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        # Update every argument except the input buffer.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], grad_map[name])

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8906	 Val Accuracy: 0.9654	 Train Loss: 0.37487
Train Accuracy: 0.9666	 Val Accuracy: 0.9790	 Train Loss: 0.11069
Train Accuracy: 0.9748	 Val Accuracy: 0.9846	 Train Loss: 0.08163
Train Accuracy: 0.9801	 Val Accuracy: 0.9861	 Train Loss: 0.06559
Train Accuracy: 0.9827	 Val Accuracy: 0.9874	 Train Loss: 0.05679
Train Accuracy: 0.9851	 Val Accuracy: 0.9885	 Train Loss: 0.04814
Train Accuracy: 0.9867	 Val Accuracy: 0.9886	 Train Loss: 0.04364
Train Accuracy: 0.9880	 Val Accuracy: 0.9895	 Train Loss: 0.03850
Train Accuracy: 0.9890	 Val Accuracy: 0.9894	 Train Loss: 0.03575
Train Accuracy: 0.9902	 Val Accuracy: 0.9895	 Train Loss: 0.03230
Train Accuracy: 0.9910	 Val Accuracy: 0.9892	 Train Loss: 0.02969
Train Accuracy: 0.9918	 Val Accuracy: 0.9902	 Train Loss: 0.02639
Train Accuracy: 0.9918	 Val Accuracy: 0.9904	 Train Loss: 0.02586
Train Accuracy: 0.9929	 Val Accuracy: 0.9902	 Train Loss: 0.02344
Train Accuracy: 0.9932	 Val Accuracy: 0.9902	 Train Loss: 0.02105
Train Accu



In [219]:
# Second training stage for the dropout network with a smaller learning rate (lr = 0.01).
# Nothing in this cell re-initialises the arguments, so it continues from the current weights.
num_round = 40
train_acc = 0.
nbatch = 0
lr = 0.01
for i in range(num_round):
    # Running sums for this pass; reset at the start of every pass.
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        # Gradient w.r.t. the raw scores, fed to backward() as the head gradient.
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        # Update every argument except the input buffer, with the reduced lr.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], grad_map[name],lr)

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9993	 Val Accuracy: 0.9926	 Train Loss: 0.00275
Train Accuracy: 0.9993	 Val Accuracy: 0.9925	 Train Loss: 0.00294
Train Accuracy: 0.9994	 Val Accuracy: 0.9925	 Train Loss: 0.00260
Train Accuracy: 0.9995	 Val Accuracy: 0.9925	 Train Loss: 0.00233
Train Accuracy: 0.9992	 Val Accuracy: 0.9925	 Train Loss: 0.00274
Train Accuracy: 0.9994	 Val Accuracy: 0.9923	 Train Loss: 0.00245
Train Accuracy: 0.9995	 Val Accuracy: 0.9925	 Train Loss: 0.00241
Train Accuracy: 0.9996	 Val Accuracy: 0.9926	 Train Loss: 0.00219
Train Accuracy: 0.9996	 Val Accuracy: 0.9925	 Train Loss: 0.00246
Train Accuracy: 0.9995	 Val Accuracy: 0.9922	 Train Loss: 0.00230
Train Accuracy: 0.9995	 Val Accuracy: 0.9924	 Train Loss: 0.00240
Train Accuracy: 0.9994	 Val Accuracy: 0.9925	 Train Loss: 0.00231
Train Accuracy: 0.9996	 Val Accuracy: 0.9924	 Train Loss: 0.00214
Train Accuracy: 0.9996	 Val Accuracy: 0.9926	 Train Loss: 0.00231
Train Accuracy: 0.9996	 Val Accuracy: 0.9922	 Train Loss: 0.00207
Train Accu



In [221]:
# Report accuracy on clean data, on the stored perturbed set (perb_iter), and under the
# three on-the-fly perturbations, each with perturbation length 1.5.
print('Normal Validation: %.4f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.4f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.4f' % acc_perb_L0(model, val_iter, 1.5,arg_map, grad_map))
print('L2 perturbation: %.4f' % acc_perb_L2(model, val_iter, 1.5,arg_map, grad_map))
print('Alpha perturbation: %.4f' % acc_perb_alpha(model, val_iter, 1.5,arg_map, grad_map))

Normal Validation: 0.9922
Fixed set perturbation: 0.7217
L0 perturbation: 0.7694
L2 perturbation: 0.5322




Alpha gradien being 0 : 86
Alpha perturbation: 0.4893




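# Ian's Method (adversarial training: each step sums the gradients of the clean batch and of its gradient-sign-perturbed copy)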

In [222]:
# Symbol graph for the adversarial-training run: two conv/tanh/max-pool stages, a 200-unit
# fully connected layer with tanh and Dropout(p=0.5), then a 10-way output layer.
data = mx.symbol.Variable('data')
# first conv: 20 filters of 5x5, tanh, 2x2 max-pool with stride 2
conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)
tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# second conv: 50 filters of 5x5, tanh, 2x2 max-pool with stride 2
conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50)
tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# first fullc: flatten, 200 hidden units, tanh, then dropout with p=0.5
flatten = mx.symbol.Flatten(data=pool2)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=200)
tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
dropout1 = mx.symbol.Dropout(data=tanh3, p=0.5)
# second fullc: 10 output scores
fc2 = mx.symbol.FullyConnected(data=dropout1, num_hidden=10)

In [223]:
# Batched input shape used for shape inference.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc2.list_arguments() # argument names of the symbol, 'data' among them
arg_shapes, output_shapes, aux_shapes = fc2.infer_shape(data=data_shape)

# Fresh zero-filled value and gradient arrays per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# Extra per-argument buffers; the training cells accumulate gradients into them.
sum_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]

# The same grad_req is used for every argument, including 'data'.
reqs = ["write" for name in arg_names]

# Rebinds the globals `model`, `arg_map`, `grad_map`, `data_grad` and `out_grad` to a new executor.
model = fc2.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
sum_map = dict(zip(arg_names, sum_arrays))
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(); the acc_perb_* functions read it as a global.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [224]:
# Fill every argument whose name contains "weight" with uniform(-0.07, 0.07) samples;
# all other arguments keep the zeros they were allocated with.
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)
# Clear every gradient-accumulation buffer before training starts.
for name in arg_names:
    sum_map[name][:] = 0.

In [225]:
# Adversarial training: every step sums the parameter gradients of the clean batch and of a
# copy moved by `coe_pb` along the unit-L2-normalised sign of the input gradient, then
# applies one SGD update with that sum.
num_round = 45
train_acc = 0.
nbatch = 0
coe_pb = 1.75
lr= 0.05
for i in range(num_round):
    # Running sums for this pass; reset at the start of every pass.
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # Integer class ids: current NumPy rejects float labels used as indices.
        label_idx = label.asnumpy().astype(np.int64)

        # --- pass 1: clean batch (metrics are taken from this pass) ---
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label_idx) / batch_size
        train_loss += CalLoss(alpha, label_idx) / batch_size
        losGrad_theta = LogLossGrad(alpha, label_idx)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        # --- build the perturbed batch from the input gradient of pass 1 ---
        noise = np.sign(data_grad.asnumpy())
        for j in range(noise.shape[0]):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            # An all-zero gradient stays a zero perturbation; dividing by a zero
            # norm would feed NaN inputs and corrupt the weights.
            if norm > 0:
                noise[j] = noise[j] / norm
        pdata = data.asnumpy() + coe_pb * noise

        # --- pass 2: perturbed batch ---
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label_idx)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        # One update with the summed gradient, then clear every accumulator.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], sum_map[name], lr)
            sum_map[name][:] = 0.

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8979	 Val Accuracy: 0.9706	 Train Loss: 0.37182
Train Accuracy: 0.9708	 Val Accuracy: 0.9814	 Train Loss: 0.10097
Train Accuracy: 0.9787	 Val Accuracy: 0.9839	 Train Loss: 0.07308
Train Accuracy: 0.9826	 Val Accuracy: 0.9859	 Train Loss: 0.05980
Train Accuracy: 0.9843	 Val Accuracy: 0.9874	 Train Loss: 0.05306
Train Accuracy: 0.9863	 Val Accuracy: 0.9884	 Train Loss: 0.04731
Train Accuracy: 0.9870	 Val Accuracy: 0.9889	 Train Loss: 0.04398
Train Accuracy: 0.9880	 Val Accuracy: 0.9887	 Train Loss: 0.03935
Train Accuracy: 0.9897	 Val Accuracy: 0.9891	 Train Loss: 0.03555
Train Accuracy: 0.9900	 Val Accuracy: 0.9898	 Train Loss: 0.03411
Train Accuracy: 0.9910	 Val Accuracy: 0.9902	 Train Loss: 0.03197
Train Accuracy: 0.9912	 Val Accuracy: 0.9910	 Train Loss: 0.02973
Train Accuracy: 0.9916	 Val Accuracy: 0.9913	 Train Loss: 0.02859
Train Accuracy: 0.9924	 Val Accuracy: 0.9918	 Train Loss: 0.02640
Train Accuracy: 0.9928	 Val Accuracy: 0.9912	 Train Loss: 0.02452
Train Accu



In [226]:
# Second adversarial-training stage with a smaller learning rate (lr = 0.005). Nothing here
# re-initialises the arguments, so it continues from the current weights. Every step sums
# the parameter gradients of the clean batch and of its gradient-sign-perturbed copy.
num_round = 40
train_acc = 0.
nbatch = 0
coe_pb = 1.75
lr= 0.005
for i in range(num_round):
    # Running sums for this pass; reset at the start of every pass.
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # Integer class ids: current NumPy rejects float labels used as indices.
        label_idx = label.asnumpy().astype(np.int64)

        # --- pass 1: clean batch (metrics are taken from this pass) ---
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label_idx) / batch_size
        train_loss += CalLoss(alpha, label_idx) / batch_size
        losGrad_theta = LogLossGrad(alpha, label_idx)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        # --- build the perturbed batch from the input gradient of pass 1 ---
        noise = np.sign(data_grad.asnumpy())
        for j in range(noise.shape[0]):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            # An all-zero gradient stays a zero perturbation; dividing by a zero
            # norm would feed NaN inputs and corrupt the weights.
            if norm > 0:
                noise[j] = noise[j] / norm
        pdata = data.asnumpy() + coe_pb * noise

        # --- pass 2: perturbed batch ---
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label_idx)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        # One update with the summed gradient, then clear every accumulator.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], sum_map[name], lr)
            sum_map[name][:] = 0.

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9986	 Val Accuracy: 0.9937	 Train Loss: 0.00561
Train Accuracy: 0.9986	 Val Accuracy: 0.9937	 Train Loss: 0.00533
Train Accuracy: 0.9986	 Val Accuracy: 0.9938	 Train Loss: 0.00527
Train Accuracy: 0.9988	 Val Accuracy: 0.9938	 Train Loss: 0.00502
Train Accuracy: 0.9989	 Val Accuracy: 0.9937	 Train Loss: 0.00440
Train Accuracy: 0.9986	 Val Accuracy: 0.9936	 Train Loss: 0.00487
Train Accuracy: 0.9986	 Val Accuracy: 0.9936	 Train Loss: 0.00496
Train Accuracy: 0.9986	 Val Accuracy: 0.9935	 Train Loss: 0.00501
Train Accuracy: 0.9988	 Val Accuracy: 0.9935	 Train Loss: 0.00485
Train Accuracy: 0.9988	 Val Accuracy: 0.9937	 Train Loss: 0.00478
Train Accuracy: 0.9988	 Val Accuracy: 0.9935	 Train Loss: 0.00463
Train Accuracy: 0.9986	 Val Accuracy: 0.9936	 Train Loss: 0.00496
Train Accuracy: 0.9988	 Val Accuracy: 0.9936	 Train Loss: 0.00450
Train Accuracy: 0.9989	 Val Accuracy: 0.9935	 Train Loss: 0.00457
Train Accuracy: 0.9989	 Val Accuracy: 0.9936	 Train Loss: 0.00447
Train Accu



In [227]:
# Report accuracy of the current model on the clean validation set, on the
# pre-built perturbed set (perb_iter), and under three perturbations generated
# on the fly from the validation set.
perb_arg = 2  # third argument of the acc_perb_* helpers; presumably the perturbation magnitude -- TODO confirm
evaluations = [
    ('Normal Validation: %.4f',
     lambda: acc_normal(model, val_iter, arg_map, grad_map)),
    ('Fixed set perturbation: %.4f',
     lambda: acc_normal(model, perb_iter, arg_map, grad_map)),
    ('L0 perturbation: %.4f',
     lambda: acc_perb_L0(model, val_iter, perb_arg, arg_map, grad_map)),
    ('L2 perturbation: %.4f',
     lambda: acc_perb_L2(model, val_iter, perb_arg, arg_map, grad_map)),
    ('Alpha perturbation: %.4f',
     lambda: acc_perb_alpha(model, val_iter, perb_arg, arg_map, grad_map)),
]
# Each metric is computed immediately before its line is printed, in the order listed.
for template, run_eval in evaluations:
    print(template % run_eval())

Normal Validation: 0.9937
Fixed set perturbation: 0.9744
L0 perturbation: 0.9755
L2 perturbation: 0.9066




Alpha gradien being 0 : 78
Alpha perturbation: 0.9035




# LWA

In [245]:
# Symbolic network used for the experiments below: two conv -> tanh -> max-pool
# stages, then a 400-unit tanh layer with dropout, then a 10-way linear output.
# `fc2` is the network output; no softmax op is attached to the symbol.
data = mx.symbol.Variable('data')
# first conv stage: 20 filters of size 5x5, tanh, 2x2 max-pool with stride 2
conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)
tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# second conv stage: 50 filters of size 5x5, tanh, 2x2 max-pool with stride 2
conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50)
tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max",
                              kernel=(2,2), stride=(2,2))
# first fully connected layer: flatten, 400 hidden units, tanh, dropout with p=0.5
flatten = mx.symbol.Flatten(data=pool2)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=400)
tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
dropout1 = mx.symbol.Dropout(data=tanh3, p=0.5)

# second fully connected layer: 10 outputs, one per class
fc2 = mx.symbol.FullyConnected(data=dropout1, num_hidden=10)

In [246]:
# Bind `fc2` on `dev`, giving every argument (the 'data' input included) its
# own value buffer and its own gradient buffer.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc2.list_arguments()  # contains 'data' alongside the layer parameters
arg_shapes, output_shapes, aux_shapes = fc2.infer_shape(data=data_shape)

# Zero-filled storage on the device, one array per inferred argument shape.
arg_arrays = [mx.nd.zeros(arg_shape, ctx=dev) for arg_shape in arg_shapes]
grad_arrays = [mx.nd.zeros(arg_shape, ctx=dev) for arg_shape in arg_shapes]
# Same gradient request ("write") for every argument.
reqs = ["write"] * len(arg_names)

model = fc2.bind(
    ctx=dev,
    args=arg_arrays,
    args_grad=grad_arrays,
    grad_req=reqs,
)

# Name -> array lookups used by the training and evaluation code.
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
# Gradient w.r.t. the input batch; the training loop reads it to build perturbations.
data_grad = grad_map["data"]
# Buffer for the head gradient handed to model.backward().
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [253]:
# (Re-)initialise every argument whose name contains "weight" with samples
# from U(-0.07, 0.07). All other arguments are left as they are.
for name in arg_names:
    if "weight" not in name:
        continue
    arr = arg_map[name]
    # Slice-assign so the array bound to the executor is updated in place.
    arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)

In [254]:
# Adversarial training loop (learning rate 0.1).
#
# For every mini-batch:
#   1. forward/backward on the clean batch to obtain d(loss)/d(data);
#      training accuracy and loss are measured on this clean pass;
#   2. move each sample by `coe_pb` along its L2-normalised input gradient;
#   3. forward/backward on the perturbed batch and take an SGD step using the
#      parameter gradients of this second pass only.
num_round = 60
coe_pb = 1.2
lr = 0.1
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # Copy the labels to host memory once instead of once per use.
        label_np = label.asnumpy()

        # --- pass 1: clean batch ---
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label_np) / batch_size
        train_loss += CalLoss(alpha, label_np) / batch_size
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # --- build the perturbation: per-sample L2-normalised input gradient ---
        noise = data_grad.asnumpy()
        norms = np.linalg.norm(noise.reshape(noise.shape[0], -1), axis=1)
        # A sample whose input gradient is exactly zero would otherwise yield
        # 0/0 = NaN and poison the batch (and then the weights). Dividing by 1
        # instead leaves its noise at zero, i.e. the sample is not perturbed.
        norms[norms == 0] = 1.
        noise /= norms.reshape((-1,) + (1,) * (noise.ndim - 1))
        pdata = data.asnumpy() + coe_pb * noise

        # --- pass 2: perturbed batch, then parameter update ---
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], grad_map[name], lr)

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8749	 Val Accuracy: 0.9691	 Train Loss: 0.41803
Train Accuracy: 0.9707	 Val Accuracy: 0.9801	 Train Loss: 0.10303
Train Accuracy: 0.9767	 Val Accuracy: 0.9830	 Train Loss: 0.07761
Train Accuracy: 0.9804	 Val Accuracy: 0.9843	 Train Loss: 0.06567
Train Accuracy: 0.9831	 Val Accuracy: 0.9856	 Train Loss: 0.05692
Train Accuracy: 0.9849	 Val Accuracy: 0.9870	 Train Loss: 0.05127
Train Accuracy: 0.9860	 Val Accuracy: 0.9880	 Train Loss: 0.04645
Train Accuracy: 0.9871	 Val Accuracy: 0.9873	 Train Loss: 0.04308
Train Accuracy: 0.9883	 Val Accuracy: 0.9881	 Train Loss: 0.04005
Train Accuracy: 0.9889	 Val Accuracy: 0.9888	 Train Loss: 0.03809
Train Accuracy: 0.9897	 Val Accuracy: 0.9895	 Train Loss: 0.03504
Train Accuracy: 0.9900	 Val Accuracy: 0.9889	 Train Loss: 0.03339
Train Accuracy: 0.9906	 Val Accuracy: 0.9898	 Train Loss: 0.03234
Train Accuracy: 0.9916	 Val Accuracy: 0.9897	 Train Loss: 0.02991
Train Accuracy: 0.9917	 Val Accuracy: 0.9900	 Train Loss: 0.02870
Train Accu



In [255]:
# Adversarial training loop (learning rate 0.01).
#
# The weights are not re-initialised in this cell, so training continues from
# whatever parameters the bound model currently holds.
#
# For every mini-batch:
#   1. forward/backward on the clean batch to obtain d(loss)/d(data);
#      training accuracy and loss are measured on this clean pass;
#   2. move each sample by `coe_pb` along its L2-normalised input gradient;
#   3. forward/backward on the perturbed batch and take an SGD step using the
#      parameter gradients of this second pass only.
num_round = 60
coe_pb = 1.2
lr = 0.01
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # Copy the labels to host memory once instead of once per use.
        label_np = label.asnumpy()

        # --- pass 1: clean batch ---
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label_np) / batch_size
        train_loss += CalLoss(alpha, label_np) / batch_size
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # --- build the perturbation: per-sample L2-normalised input gradient ---
        noise = data_grad.asnumpy()
        norms = np.linalg.norm(noise.reshape(noise.shape[0], -1), axis=1)
        # A sample whose input gradient is exactly zero would otherwise yield
        # 0/0 = NaN and poison the batch (and then the weights). Dividing by 1
        # instead leaves its noise at zero, i.e. the sample is not perturbed.
        norms[norms == 0] = 1.
        noise /= norms.reshape((-1,) + (1,) * (noise.ndim - 1))
        pdata = data.asnumpy() + coe_pb * noise

        # --- pass 2: perturbed batch, then parameter update ---
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], grad_map[name], lr)

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9983	 Val Accuracy: 0.9934	 Train Loss: 0.00601
Train Accuracy: 0.9984	 Val Accuracy: 0.9929	 Train Loss: 0.00547
Train Accuracy: 0.9986	 Val Accuracy: 0.9930	 Train Loss: 0.00498
Train Accuracy: 0.9985	 Val Accuracy: 0.9930	 Train Loss: 0.00512
Train Accuracy: 0.9986	 Val Accuracy: 0.9932	 Train Loss: 0.00499
Train Accuracy: 0.9986	 Val Accuracy: 0.9930	 Train Loss: 0.00501
Train Accuracy: 0.9986	 Val Accuracy: 0.9930	 Train Loss: 0.00506
Train Accuracy: 0.9987	 Val Accuracy: 0.9933	 Train Loss: 0.00515
Train Accuracy: 0.9990	 Val Accuracy: 0.9932	 Train Loss: 0.00472
Train Accuracy: 0.9988	 Val Accuracy: 0.9932	 Train Loss: 0.00488
Train Accuracy: 0.9987	 Val Accuracy: 0.9933	 Train Loss: 0.00478
Train Accuracy: 0.9987	 Val Accuracy: 0.9933	 Train Loss: 0.00474
Train Accuracy: 0.9988	 Val Accuracy: 0.9931	 Train Loss: 0.00430
Train Accuracy: 0.9988	 Val Accuracy: 0.9931	 Train Loss: 0.00462
Train Accuracy: 0.9988	 Val Accuracy: 0.9930	 Train Loss: 0.00481
Train Accu



In [256]:
# Report accuracy of the current model on the clean validation set, on the
# pre-built perturbed set (perb_iter), and under three perturbations generated
# on the fly from the validation set.
# NOTE(review): this cell passes 1.5 to the acc_perb_* helpers, whereas the
# evaluation cell before the "# LWA" section passes 2. If that argument is the
# perturbation magnitude, the two sets of numbers are not directly comparable
# -- confirm this is intended.
perb_arg = 1.5
evaluations = [
    ('Normal Validation: %.4f',
     lambda: acc_normal(model, val_iter, arg_map, grad_map)),
    ('Fixed set perturbation: %.4f',
     lambda: acc_normal(model, perb_iter, arg_map, grad_map)),
    ('L0 perturbation: %.4f',
     lambda: acc_perb_L0(model, val_iter, perb_arg, arg_map, grad_map)),
    ('L2 perturbation: %.4f',
     lambda: acc_perb_L2(model, val_iter, perb_arg, arg_map, grad_map)),
    ('Alpha perturbation: %.4f',
     lambda: acc_perb_alpha(model, val_iter, perb_arg, arg_map, grad_map)),
]
# Each metric is computed immediately before its line is printed, in the order listed.
for template, run_eval in evaluations:
    print(template % run_eval())

Normal Validation: 0.9934
Fixed set perturbation: 0.9822
L0 perturbation: 0.9859
L2 perturbation: 0.9632




Alpha gradien being 0 : 79
Alpha perturbation: 0.9627


