In [67]:
%matplotlib inline
import mxnet as mx
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [2]:
# Device context on which every NDArray and bound executor in this notebook lives.
dev = mx.gpu()
# Mini-batch size shared by both iterators and by the bound input shape in later cells.
batch_size = 100
# Per-sample shape handed to MNISTIter (no batch axis) -- presumably
# (channels, height, width); TODO confirm. Later cells rebind this name to a
# 4-tuple that includes the batch axis.
data_shape = (1, 28, 28)

# Training-set iterator: shuffled, images kept unflattened (flat=False).
train_iter = mx.io.MNISTIter(
        image       = "../data/mnist/train-images-idx3-ubyte",
        label       = "../data/mnist/train-labels-idx1-ubyte",
        input_shape = data_shape,
        batch_size  = batch_size,
        shuffle     = True,
        flat        = False)

# Validation-set iterator (t10k files); no shuffle argument is passed here.
val_iter = mx.io.MNISTIter(
        image       = "../data/mnist/t10k-images-idx3-ubyte",
        label       = "../data/mnist/t10k-labels-idx1-ubyte",
        input_shape = data_shape,
        batch_size  = batch_size,
        flat        = False)

In [3]:
def Softmax(theta):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        theta: array of shape (rows, classes) holding raw scores.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Shift every row by its own maximum before exponentiating so that
    # np.exp never sees a large positive argument.
    shifted = theta - np.max(theta, axis=1, keepdims=True)
    unnormalized = np.exp(shifted)
    return unnormalized / np.sum(unnormalized, axis=1, keepdims=True)
    
def LogLossGrad(alpha, label):
    """Gradient of the summed log-loss with respect to the pre-softmax scores.

    For every row the result is ``alpha[i]`` with 1 subtracted at the column
    of the true class.

    Args:
        alpha: array of shape (rows, classes) with softmax probabilities.
            It is not modified.
        label: sequence of at least ``rows`` class ids. Float-valued ids are
            accepted and truncated to integers.

    Returns:
        A new array with the same shape and dtype as ``alpha``.
    """
    grad = np.copy(alpha)
    n_rows = alpha.shape[0]
    # Callers pass float label arrays (e.g. ``i*np.ones(n)``); NumPy refuses
    # non-integer indices, so convert once up front.
    cols = np.asarray(label).astype(np.int64)[:n_rows]
    grad[np.arange(n_rows), cols] -= 1.
    return grad

def SGD(weight, grad, lr=0.1, grad_norm=batch_size):
    """Apply one in-place gradient-descent step to ``weight``.

    Args:
        weight: array updated in place through slice assignment.
        grad: gradient with the same shape as ``weight``.
        lr: learning rate.
        grad_norm: divisor applied to ``grad`` before the step. It defaults to
            the notebook's ``batch_size`` at definition time. The previous
            body ignored this argument and always divided by the global.
    """
    weight[:] -= lr * grad / grad_norm

def CalAcc(pred_prob, label):
    """Count how many rows of ``pred_prob`` have their largest entry at the labelled column.

    Args:
        pred_prob: array of shape (rows, classes) with per-class scores.
        label: array of ``rows`` class ids.

    Returns:
        The number of matches as a float (a count, not a ratio).
    """
    predicted = np.argmax(pred_prob, axis=1)
    hits = (predicted == label)
    return np.sum(hits) * 1.0

def CalLoss(pred_prob, label):
    """Summed negative log-likelihood of the labelled classes.

    Args:
        pred_prob: array of shape (rows, classes) with class probabilities.
        label: sequence of at least ``rows`` class ids. Float-valued ids are
            accepted and truncated to integers.

    Returns:
        The sum over rows of ``-log(max(p_true, 1e-10))`` (a sum, not a mean).
    """
    n_rows = pred_prob.shape[0]
    # Labels reach this function as float arrays; NumPy refuses non-integer
    # indices, so convert before indexing.
    cols = np.asarray(label).astype(np.int64)[:n_rows]
    picked = np.asarray(pred_prob)[np.arange(n_rows), cols].astype(np.float64)
    # Floor the probability so that a zero entry cannot yield an infinite loss.
    return np.sum(-np.log(np.maximum(picked, 1e-10)))

In [27]:
def acc_normal(model, val_iter, arg_map, grad_map):
    """Classification accuracy of ``model`` on the unmodified batches of ``val_iter``.

    Args:
        model: bound executor; its first output is read as per-class scores.
        val_iter: data iterator, reset before use.
        arg_map: name -> argument array map; ``arg_map["data"]`` receives each batch.
        grad_map: unused here; accepted so the signature matches the acc_perb_* functions.

    Returns:
        Correct predictions divided by the number of labels seen.
    """
    val_iter.reset()
    correct = 0.0
    seen = 0
    for batch in val_iter:
        labels = batch.label[0].asnumpy()
        arg_map["data"][:] = batch.data[0]

        # Inference-mode forward pass; probabilities are computed in NumPy.
        model.forward(is_train=False)
        probs = Softmax(model.outputs[0].asnumpy())
        correct += CalAcc(probs, labels)
        seen += labels.shape[0]
    return(correct / seen)
    
def acc_perb_L0(model, val_iter, coe_pb, arg_map, grad_map):
    """Accuracy after a sign-of-gradient perturbation of each correctly classified sample.

    For every batch the loss gradient with respect to the input is computed,
    its element-wise sign is taken, and each sample's sign pattern is scaled
    to unit L2 norm. Samples that are already misclassified, or whose sign
    pattern is all zeros, are left unperturbed. The previous code divided by
    the zero norm in the second case and fed NaN inputs to the model.

    Relies on the notebook-level ``out_grad`` buffer and on ``Softmax``,
    ``LogLossGrad`` and ``CalAcc``.

    Args:
        model: bound executor.
        val_iter: data iterator, reset before use.
        coe_pb: scale applied to the unit-norm perturbation.
        arg_map: name -> argument array map; ``arg_map["data"]`` is overwritten.
        grad_map: name -> gradient array map; ``grad_map["data"]`` is read.

    Returns:
        Correct predictions on the perturbed inputs divided by the number of samples.
    """
    val_iter.reset()
    val_acc = 0.0
    num_samp = 0
    nn = 0  # number of samples whose perturbation direction was exactly zero
    for dbatch in val_iter:
        data = dbatch.data[0]
        label_np = dbatch.label[0].asnumpy()
        batch_size = label_np.shape[0]
        arg_map["data"][:] = data

        # Training-mode forward pass followed by backward to get d(loss)/d(input).
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)

        grad = LogLossGrad(alpha, label_np)
        out_grad[:] = grad
        model.backward([out_grad])
        noise = np.sign(grad_map["data"].asnumpy())

        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            if norm == 0:
                nn += 1
            # Only perturb samples the model currently gets right, and only
            # when there is a non-zero direction to normalise.
            if label_np[j] == np.argmax(alpha[j]) and norm > 0:
                noise[j] = noise[j] / norm
            else:
                noise[j] = 0

        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        raw_output = model.outputs[0].asnumpy()
        pred = Softmax(raw_output)

        val_acc += CalAcc(pred, label_np)
        num_samp += batch_size
    if nn > 0:
        print('L0 gradien being 0 :', nn)
    return(val_acc / num_samp)

def acc_perb_L2(model, val_iter, coe_pb, arg_map, grad_map):
    """Accuracy after a unit-L2-norm gradient perturbation of each correctly classified sample.

    For every batch the loss gradient with respect to the input is computed
    and each sample's gradient is scaled to unit L2 norm. Samples that are
    already misclassified, or whose gradient is exactly zero, are left
    unperturbed. The previous code divided by the zero norm in the second case
    and fed NaN inputs to the model.

    Accuracy is accumulated per sample, as in ``acc_normal`` and
    ``acc_perb_L0``. This equals the former mean of per-batch accuracies
    whenever all batches have the same size.

    Relies on the notebook-level ``out_grad`` buffer and on ``Softmax``,
    ``LogLossGrad`` and ``CalAcc``.

    Args:
        model: bound executor.
        val_iter: data iterator, reset before use.
        coe_pb: scale applied to the unit-norm perturbation.
        arg_map: name -> argument array map; ``arg_map["data"]`` is overwritten.
        grad_map: name -> gradient array map; ``grad_map["data"]`` is read.

    Returns:
        Correct predictions on the perturbed inputs divided by the number of samples.
    """
    val_iter.reset()
    val_acc = 0.0
    num_samp = 0
    nn = 0  # number of samples whose input gradient was exactly zero
    for dbatch in val_iter:
        data = dbatch.data[0]
        label_np = dbatch.label[0].asnumpy()
        batch_size = label_np.shape[0]
        arg_map["data"][:] = data

        # Training-mode forward pass followed by backward to get d(loss)/d(input).
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)

        grad = LogLossGrad(alpha, label_np)
        out_grad[:] = grad
        model.backward([out_grad])
        noise = grad_map["data"].asnumpy()

        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            if norm == 0:
                nn += 1
            # Only perturb samples the model currently gets right, and only
            # when there is a non-zero direction to normalise.
            if label_np[j] == np.argmax(alpha[j]) and norm > 0:
                noise[j] = noise[j] / norm
            else:
                noise[j] = 0
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        raw_output = model.outputs[0].asnumpy()
        pred = Softmax(raw_output)

        val_acc += CalAcc(pred, label_np)
        num_samp += batch_size
    if nn > 0:
        print('L2 gradien being 0 :', nn)
    return(val_acc / num_samp)


def acc_perb_alpha(model, val_iter, coe_pb, arg_map, grad_map):
    """Accuracy after a perturbation aimed at the nearest competing class.

    For each of the 10 classes ``i`` a backward pass is run with head gradient
    ``-alpha[:, i] * (alpha - onehot(i))`` and the resulting input gradient is
    stored in ``T[i]``. For every correctly classified sample the competing
    class with the smallest ratio
    ``(alpha_y - alpha_i) / ||T[i] - T[y]||`` is chosen, and the sample is
    moved along ``T[i] - T[y]`` scaled to unit L2 norm and multiplied by
    ``coe_pb``. Misclassified samples are left unperturbed.

    Changes relative to the earlier version:
      * ``T`` is sized from the batch itself instead of the global
        ``data_shape`` (which only had four entries after later cells rebound it);
      * class ids are converted to integers before being used as indices;
      * a zero ``||T[i] - T[y]||`` is treated as "unreachable" (ratio = inf)
        instead of dividing by zero;
      * accuracy is accumulated per sample, which equals the former mean of
        per-batch accuracies whenever all batches have the same size.

    Relies on the notebook-level ``out_grad`` buffer and on ``Softmax``,
    ``LogLossGrad`` and ``CalAcc``.

    Args:
        model: bound executor.
        val_iter: data iterator, reset before use.
        coe_pb: scale applied to the unit-norm perturbation.
        arg_map: name -> argument array map; ``arg_map["data"]`` is overwritten.
        grad_map: name -> gradient array map; ``grad_map["data"]`` is read.

    Returns:
        Correct predictions on the perturbed inputs divided by the number of samples.
    """
    num_class = 10
    val_iter.reset()
    val_acc = 0.0
    num_samp = 0
    nn = 0  # number of samples that received no perturbation
    for dbatch in val_iter:
        data = dbatch.data[0]
        data_np = data.asnumpy()
        label_np = dbatch.label[0].asnumpy()
        batch_size = label_np.shape[0]

        T = np.zeros((num_class,) + data_np.shape)
        noise = np.zeros(data_np.shape)
        #===================
        for i in range(num_class):
            arg_map["data"][:] = data
            model.forward(is_train=True)
            theta = model.outputs[0].asnumpy()
            alpha = Softmax(theta)

            grad = LogLossGrad(alpha, np.full(alpha.shape[0], i, dtype=np.int64))
            # Weight every row by minus its own probability for class i.
            grad = -alpha[:, i:i + 1] * grad
            out_grad[:] = grad
            model.backward([out_grad])

            T[i] = grad_map["data"].asnumpy()

        # NOTE: ``alpha`` below is the one from the last forward pass of the loop above.
        for j in range(batch_size):
            y = int(label_np[j])
            if (y == np.argmax(alpha[j])):
                # The true class itself, and classes with no usable direction, stay at inf.
                perb_scale = np.full(num_class, np.inf)
                for i in range(num_class):
                    if i == y:
                        continue
                    denom = np.linalg.norm((T[i][j] - T[y][j]).flatten(), 2)
                    if denom > 0:
                        perb_scale[i] = (alpha[j][y] - alpha[j][i]) / denom
                noise[j] = T[np.argmin(perb_scale)][j] - T[y][j]
        #====================
        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            if norm == 0:
                nn += 1
            else:
                noise[j] = noise[j] / norm
        pdata = data_np + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        raw_output = model.outputs[0].asnumpy()
        pred = Softmax(raw_output)

        val_acc += CalAcc(pred, label_np)
        num_samp += batch_size
    if nn > 0:
        print('Alpha gradien being 0 :', nn)
    return(val_acc / num_samp)

# Generate Fixed Perturbed Data

In [5]:
# Symbolic input placeholder; a concrete array is bound to it in a later cell.
data = mx.symbol.Variable('data')

# Hidden part: flatten the input, then two 100-unit fully connected layers,
# each followed by a ReLU.
flatten = mx.symbol.Flatten(data=data)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=100)
relu1 = mx.symbol.Activation(data=fc1, act_type="relu")
fc2 = mx.symbol.FullyConnected(data=relu1, num_hidden=100)
relu2 = mx.symbol.Activation(data=fc2, act_type="relu")

# Output layer with 10 units. No softmax op is attached to the symbol; the
# notebook applies Softmax() in NumPy to the executor output.
fc3 = mx.symbol.FullyConnected(data=relu2, num_hidden=10)

In [6]:
# Full input shape including the batch axis. This rebinds the per-sample
# data_shape that was passed to the MNIST iterators.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc3.list_arguments() # includes 'data' alongside the layer parameters
arg_shapes, output_shapes, aux_shapes = fc3.infer_shape(data=data_shape)

# One zero-initialised value buffer and one gradient buffer per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# Same gradient request ("write") for every argument, the input included.
reqs = ["write" for name in arg_names]

# Bind the symbol to those buffers; the two maps give by-name access to them.
model = fc3.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
# Gradient with respect to the input; used to build perturbations.
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(). The acc_perb_* functions
# read this global, so it must belong to the currently bound model.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [7]:
# Randomise only the arguments whose name contains "weight"; everything else
# keeps the value it was allocated with.
for name in arg_names:
    if "weight" not in name:
        continue
    arr = arg_map[name]
    arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)

In [8]:
num_round = 30
train_acc = 0.
nbatch = 0
for i in range(num_round):
    # Fresh running totals for this pass over the training set.
    train_loss, train_acc, nbatch = 0., 0., 0
    train_iter.reset()
    for dbatch in train_iter:
        data, label = dbatch.data[0], dbatch.label[0]
        label_np = label.asnumpy()

        # Forward pass; class probabilities are computed in NumPy.
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc = train_acc + CalAcc(alpha, label_np) / batch_size
        train_loss = train_loss + CalLoss(alpha, label_np) / batch_size

        # Backward pass seeded with the log-loss gradient w.r.t. the scores.
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # Update every bound argument except the input itself.
        for name in arg_names:
            if name == "data":
                continue
            SGD(arg_map[name], grad_map[name])

        nbatch = nbatch + 1
    train_acc = train_acc / nbatch
    train_loss = train_loss / nbatch
    print("Train Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, train_loss))

Train Accuracy: 0.8073	 Train Loss: 0.69494
Train Accuracy: 0.9251	 Train Loss: 0.25785
Train Accuracy: 0.9454	 Train Loss: 0.18794
Train Accuracy: 0.9568	 Train Loss: 0.14847
Train Accuracy: 0.9641	 Train Loss: 0.12197
Train Accuracy: 0.9701	 Train Loss: 0.10288
Train Accuracy: 0.9746	 Train Loss: 0.08853
Train Accuracy: 0.9780	 Train Loss: 0.07745
Train Accuracy: 0.9808	 Train Loss: 0.06838
Train Accuracy: 0.9830	 Train Loss: 0.06078
Train Accuracy: 0.9850	 Train Loss: 0.05423
Train Accuracy: 0.9868	 Train Loss: 0.04839
Train Accuracy: 0.9886	 Train Loss: 0.04338
Train Accuracy: 0.9900	 Train Loss: 0.03876
Train Accuracy: 0.9911	 Train Loss: 0.03477
Train Accuracy: 0.9923	 Train Loss: 0.03109
Train Accuracy: 0.9934	 Train Loss: 0.02783
Train Accuracy: 0.9943	 Train Loss: 0.02477
Train Accuracy: 0.9953	 Train Loss: 0.02215
Train Accuracy: 0.9961	 Train Loss: 0.01968
Train Accuracy: 0.9969	 Train Loss: 0.01745
Train Accuracy: 0.9975	 Train Loss: 0.01553
Train Accuracy: 0.9980	 Train Lo



In [9]:
# Build a fixed perturbed copy of the validation set from the current model:
# every sample is moved by coe_pb along its unit-L2-norm input gradient.
val_iter.reset()
val_acc = 0.0
val_acc_pb = 0.0
coe_pb = 1.5
num_samp = 0

perb_data = []
perb_lab = []

for dbatch in val_iter:
    data = dbatch.data[0]
    label = dbatch.label[0]
    arg_map["data"][:] = data

    model.forward(is_train=True)
    theta = model.outputs[0].asnumpy()
    alpha = Softmax(theta)
    val_acc += CalAcc(alpha, label.asnumpy())
    # Back-propagate the loss gradient to obtain d(loss)/d(input).
    grad = LogLossGrad(alpha, label.asnumpy())
    out_grad[:] = grad
    model.backward([out_grad])
    noise = data_grad.asnumpy()
    for j in range(batch_size):
        norm = np.linalg.norm(noise[j].flatten(), 2)
        # A sample with an exactly-zero gradient is left unperturbed; dividing
        # by the zero norm would store NaN pixels in the perturbed set.
        if norm > 0:
            noise[j] = noise[j] / norm
    pdata = data.asnumpy() + coe_pb * noise
    arg_map["data"][:] = pdata
    model.forward(is_train=True)
    raw_output = model.outputs[0].asnumpy()
    pred = Softmax(raw_output)
    val_acc_pb += CalAcc(pred, label.asnumpy())
    num_samp += batch_size

    # Keep the perturbed batch and its labels for the fixed-set iterator.
    perb_data.append(pdata)
    perb_lab.append(label.asnumpy())
print("Val Batch Accuracy: ", val_acc / num_samp)
print("Val Batch Accuracy after pertubation: ", val_acc_pb / num_samp)

Val Batch Accuracy:  0.9782
Val Batch Accuracy after pertubation:  0.1947




In [10]:
# Stack the stored perturbed batches into single arrays and wrap them in an
# iterator so they can be replayed against later models.
pdata = np.concatenate(perb_data, axis = 0)
plabel = np.concatenate(perb_lab, axis = 0)
perb_iter = mx.io.NDArrayIter(
    data = pdata,
    label = plabel,
    # Must match the batch axis of the bound "data" array, which is sized by
    # batch_size; use the shared setting instead of a separate literal.
    batch_size = batch_size,
    shuffle = False
)

In [11]:
# Replay the stored perturbed set through the model that generated it.
perb_iter.reset()
num_samp, val_acc = 0, 0.0
for dbatch in perb_iter:
    data, label = dbatch.data[0], dbatch.label[0]
    arg_map["data"][:] = data

    model.forward(is_train=True)
    theta = model.outputs[0].asnumpy()
    alpha = Softmax(theta)
    val_acc = val_acc + CalAcc(alpha, label.asnumpy())
    num_samp = num_samp + batch_size
print("Val Batch Accuracy after pertubation: ", val_acc / num_samp)

Val Batch Accuracy after pertubation:  0.1947


# Normal Training

In [12]:
# Symbolic input placeholder; a concrete array is bound to it in the next cell.
data = mx.symbol.Variable('data')
# Hidden part: flatten the input, then two 100-unit fully connected layers,
# each followed by a ReLU.
flatten = mx.symbol.Flatten(data=data)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=100)
relu1 = mx.symbol.Activation(data=fc1, act_type="relu")
fc2 = mx.symbol.FullyConnected(data=relu1, num_hidden=100)
relu2 = mx.symbol.Activation(data=fc2, act_type="relu")

# Output layer with 10 units. No softmax op is attached to the symbol; the
# notebook applies Softmax() in NumPy to the executor output.
fc3 = mx.symbol.FullyConnected(data=relu2, num_hidden=10)

In [13]:
# Full input shape including the batch axis.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc3.list_arguments() # includes 'data' alongside the layer parameters
arg_shapes, output_shapes, aux_shapes = fc3.infer_shape(data=data_shape)

# One zero-initialised value buffer and one gradient buffer per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# Same gradient request ("write") for every argument, the input included.
reqs = ["write" for name in arg_names]

# Bind the symbol to those buffers. This replaces the previous `model`,
# `arg_map`, `grad_map`, `data_grad` and `out_grad` globals.
model = fc3.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
# Gradient with respect to the input; used to build perturbations.
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(). The acc_perb_* functions
# read this global, so it must belong to the currently bound model.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [20]:
# Randomise only the arguments whose name contains "weight"; everything else
# keeps the value it was allocated with.
for name in arg_names:
    if "weight" not in name:
        continue
    arr = arg_map[name]
    arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)

In [21]:
num_round = 30
train_acc = 0.
nbatch = 0
for i in range(num_round):
    # Fresh running totals for this pass over the training set.
    train_loss, train_acc, nbatch = 0., 0., 0
    train_iter.reset()
    for dbatch in train_iter:
        data, label = dbatch.data[0], dbatch.label[0]
        label_np = label.asnumpy()

        # Forward pass; class probabilities are computed in NumPy.
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc = train_acc + CalAcc(alpha, label_np) / batch_size
        train_loss = train_loss + CalLoss(alpha, label_np) / batch_size

        # Backward pass seeded with the log-loss gradient w.r.t. the scores.
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # Update every bound argument except the input itself.
        for name in arg_names:
            if name == "data":
                continue
            SGD(arg_map[name], grad_map[name])

        nbatch = nbatch + 1
    train_acc = train_acc / nbatch
    train_loss = train_loss / nbatch
    # Clean validation accuracy after each pass.
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8096	 Val Accuracy: 0.9143	 Train Loss: 0.65890
Train Accuracy: 0.9245	 Val Accuracy: 0.9390	 Train Loss: 0.25878
Train Accuracy: 0.9446	 Val Accuracy: 0.9514	 Train Loss: 0.18797
Train Accuracy: 0.9568	 Val Accuracy: 0.9584	 Train Loss: 0.14671
Train Accuracy: 0.9647	 Val Accuracy: 0.9632	 Train Loss: 0.11978
Train Accuracy: 0.9711	 Val Accuracy: 0.9666	 Train Loss: 0.10056
Train Accuracy: 0.9753	 Val Accuracy: 0.9690	 Train Loss: 0.08656
Train Accuracy: 0.9789	 Val Accuracy: 0.9708	 Train Loss: 0.07536
Train Accuracy: 0.9815	 Val Accuracy: 0.9729	 Train Loss: 0.06649
Train Accuracy: 0.9836	 Val Accuracy: 0.9728	 Train Loss: 0.05914
Train Accuracy: 0.9854	 Val Accuracy: 0.9739	 Train Loss: 0.05305
Train Accuracy: 0.9870	 Val Accuracy: 0.9742	 Train Loss: 0.04754
Train Accuracy: 0.9886	 Val Accuracy: 0.9742	 Train Loss: 0.04274
Train Accuracy: 0.9899	 Val Accuracy: 0.9750	 Train Loss: 0.03850
Train Accuracy: 0.9914	 Val Accuracy: 0.9750	 Train Loss: 0.03461
Train Accu



In [22]:
# Continue training the same model with a smaller learning rate.
num_round = 20
train_acc = 0.
nbatch = 0
lr = 0.01
for i in range(num_round):
    # Fresh running totals for this pass over the training set.
    train_loss, train_acc, nbatch = 0., 0., 0
    train_iter.reset()
    for dbatch in train_iter:
        data, label = dbatch.data[0], dbatch.label[0]
        label_np = label.asnumpy()

        # Forward pass; class probabilities are computed in NumPy.
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc = train_acc + CalAcc(alpha, label_np) / batch_size
        train_loss = train_loss + CalLoss(alpha, label_np) / batch_size

        # Backward pass seeded with the log-loss gradient w.r.t. the scores.
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # Update every bound argument except the input itself.
        for name in arg_names:
            if name == "data":
                continue
            SGD(arg_map[name], grad_map[name],lr)

        nbatch = nbatch + 1
    train_acc = train_acc / nbatch
    train_loss = train_loss / nbatch
    # Clean validation accuracy after each pass.
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9993	 Val Accuracy: 0.9770	 Train Loss: 0.00614
Train Accuracy: 0.9994	 Val Accuracy: 0.9770	 Train Loss: 0.00555
Train Accuracy: 0.9995	 Val Accuracy: 0.9770	 Train Loss: 0.00533
Train Accuracy: 0.9996	 Val Accuracy: 0.9771	 Train Loss: 0.00518
Train Accuracy: 0.9996	 Val Accuracy: 0.9769	 Train Loss: 0.00506
Train Accuracy: 0.9996	 Val Accuracy: 0.9770	 Train Loss: 0.00495
Train Accuracy: 0.9996	 Val Accuracy: 0.9771	 Train Loss: 0.00485
Train Accuracy: 0.9997	 Val Accuracy: 0.9769	 Train Loss: 0.00477
Train Accuracy: 0.9997	 Val Accuracy: 0.9769	 Train Loss: 0.00469
Train Accuracy: 0.9998	 Val Accuracy: 0.9768	 Train Loss: 0.00461
Train Accuracy: 0.9998	 Val Accuracy: 0.9768	 Train Loss: 0.00454
Train Accuracy: 0.9998	 Val Accuracy: 0.9768	 Train Loss: 0.00447
Train Accuracy: 0.9998	 Val Accuracy: 0.9768	 Train Loss: 0.00441
Train Accuracy: 0.9998	 Val Accuracy: 0.9767	 Train Loss: 0.00435
Train Accuracy: 0.9999	 Val Accuracy: 0.9767	 Train Loss: 0.00429
Train Accu



In [28]:
# Clean accuracy, accuracy on the stored perturbed set, then the three
# per-model perturbations, each with scale 1.5.
normal_acc = acc_normal(model,val_iter,arg_map, grad_map)
print('Normal Validation: %.3f' % normal_acc)
fixed_acc = acc_normal(model, perb_iter,arg_map, grad_map)
print('Fixed set perturbation: %.3f' % fixed_acc)
l0_acc = acc_perb_L0(model, val_iter, 1.5,arg_map, grad_map)
print('L0 perturbation: %.3f' % l0_acc)
l2_acc = acc_perb_L2(model, val_iter, 1.5,arg_map, grad_map)
print('L2 perturbation: %.3f' % l2_acc)
alpha_acc = acc_perb_alpha(model, val_iter, 1.5,arg_map, grad_map)
print('Alpha perturbation: %.3f' % alpha_acc)

Normal Validation: 0.977
Fixed set perturbation: 0.361
L0 perturbation: 0.287
L2 perturbation: 0.201




Alpha perturbation: 0.074




# Dropout Training

In [29]:
# Symbolic input placeholder; a concrete array is bound to it in the next cell.
data = mx.symbol.Variable('data')
# Hidden part: flatten the input, then two 200-unit fully connected layers,
# each followed by a ReLU.
flatten = mx.symbol.Flatten(data=data)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=200)
relu1 = mx.symbol.Activation(data=fc1, act_type="relu")
fc2 = mx.symbol.FullyConnected(data=relu1, num_hidden=200)
relu2 = mx.symbol.Activation(data=fc2, act_type="relu")
# Dropout with p=0.5 on the second hidden activation only.
dropout1 = mx.symbol.Dropout(data=relu2, p=0.5)
# Output layer with 10 units. No softmax op is attached to the symbol; the
# notebook applies Softmax() in NumPy to the executor output.
fc3 = mx.symbol.FullyConnected(data=dropout1, num_hidden=10)

In [30]:
# Full input shape including the batch axis.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc3.list_arguments() # includes 'data' alongside the layer parameters
arg_shapes, output_shapes, aux_shapes = fc3.infer_shape(data=data_shape)

# One zero-initialised value buffer and one gradient buffer per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# Same gradient request ("write") for every argument, the input included.
reqs = ["write" for name in arg_names]

# Bind the symbol to those buffers. This replaces the previous `model`,
# `arg_map`, `grad_map`, `data_grad` and `out_grad` globals.
model = fc3.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
# Gradient with respect to the input; used to build perturbations.
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(). The acc_perb_* functions
# read this global, so it must belong to the currently bound model.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [31]:
# Randomise only the arguments whose name contains "weight"; everything else
# keeps the value it was allocated with.
for name in arg_names:
    if "weight" not in name:
        continue
    arr = arg_map[name]
    arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)

In [32]:
num_round = 50
train_acc = 0.
nbatch = 0
for i in range(num_round):
    # Fresh running totals for this pass over the training set.
    train_loss, train_acc, nbatch = 0., 0., 0
    train_iter.reset()
    for dbatch in train_iter:
        data, label = dbatch.data[0], dbatch.label[0]
        label_np = label.asnumpy()

        # Forward pass; class probabilities are computed in NumPy.
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc = train_acc + CalAcc(alpha, label_np) / batch_size
        train_loss = train_loss + CalLoss(alpha, label_np) / batch_size

        # Backward pass seeded with the log-loss gradient w.r.t. the scores.
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # Update every bound argument except the input itself.
        for name in arg_names:
            if name == "data":
                continue
            SGD(arg_map[name], grad_map[name])

        nbatch = nbatch + 1
    train_acc = train_acc / nbatch
    train_loss = train_loss / nbatch
    # Clean validation accuracy after each pass.
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8128	 Val Accuracy: 0.9254	 Train Loss: 0.64124
Train Accuracy: 0.9200	 Val Accuracy: 0.9455	 Train Loss: 0.27623
Train Accuracy: 0.9415	 Val Accuracy: 0.9567	 Train Loss: 0.20205
Train Accuracy: 0.9525	 Val Accuracy: 0.9622	 Train Loss: 0.16100
Train Accuracy: 0.9611	 Val Accuracy: 0.9672	 Train Loss: 0.13313
Train Accuracy: 0.9671	 Val Accuracy: 0.9698	 Train Loss: 0.11352
Train Accuracy: 0.9712	 Val Accuracy: 0.9716	 Train Loss: 0.09879
Train Accuracy: 0.9753	 Val Accuracy: 0.9733	 Train Loss: 0.08655
Train Accuracy: 0.9774	 Val Accuracy: 0.9752	 Train Loss: 0.07768
Train Accuracy: 0.9792	 Val Accuracy: 0.9773	 Train Loss: 0.06974
Train Accuracy: 0.9817	 Val Accuracy: 0.9784	 Train Loss: 0.06287
Train Accuracy: 0.9825	 Val Accuracy: 0.9790	 Train Loss: 0.05925
Train Accuracy: 0.9843	 Val Accuracy: 0.9782	 Train Loss: 0.05370
Train Accuracy: 0.9851	 Val Accuracy: 0.9783	 Train Loss: 0.05061
Train Accuracy: 0.9868	 Val Accuracy: 0.9792	 Train Loss: 0.04545
Train Accu



In [33]:
# Continue training the same model with a smaller learning rate.
num_round = 30
train_acc = 0.
nbatch = 0
lr = 0.01
for i in range(num_round):
    # Fresh running totals for this pass over the training set.
    train_loss, train_acc, nbatch = 0., 0., 0
    train_iter.reset()
    for dbatch in train_iter:
        data, label = dbatch.data[0], dbatch.label[0]
        label_np = label.asnumpy()

        # Forward pass; class probabilities are computed in NumPy.
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc = train_acc + CalAcc(alpha, label_np) / batch_size
        train_loss = train_loss + CalLoss(alpha, label_np) / batch_size

        # Backward pass seeded with the log-loss gradient w.r.t. the scores.
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # Update every bound argument except the input itself.
        for name in arg_names:
            if name == "data":
                continue
            SGD(arg_map[name], grad_map[name],lr)

        nbatch = nbatch + 1
    train_acc = train_acc / nbatch
    train_loss = train_loss / nbatch
    # Clean validation accuracy after each pass.
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9989	 Val Accuracy: 0.9825	 Train Loss: 0.00461
Train Accuracy: 0.9992	 Val Accuracy: 0.9825	 Train Loss: 0.00412
Train Accuracy: 0.9993	 Val Accuracy: 0.9825	 Train Loss: 0.00394
Train Accuracy: 0.9994	 Val Accuracy: 0.9825	 Train Loss: 0.00354
Train Accuracy: 0.9992	 Val Accuracy: 0.9821	 Train Loss: 0.00392
Train Accuracy: 0.9994	 Val Accuracy: 0.9825	 Train Loss: 0.00343
Train Accuracy: 0.9992	 Val Accuracy: 0.9826	 Train Loss: 0.00383
Train Accuracy: 0.9995	 Val Accuracy: 0.9822	 Train Loss: 0.00338
Train Accuracy: 0.9994	 Val Accuracy: 0.9822	 Train Loss: 0.00336
Train Accuracy: 0.9993	 Val Accuracy: 0.9824	 Train Loss: 0.00363
Train Accuracy: 0.9995	 Val Accuracy: 0.9823	 Train Loss: 0.00328
Train Accuracy: 0.9994	 Val Accuracy: 0.9830	 Train Loss: 0.00338
Train Accuracy: 0.9996	 Val Accuracy: 0.9825	 Train Loss: 0.00317
Train Accuracy: 0.9995	 Val Accuracy: 0.9828	 Train Loss: 0.00332
Train Accuracy: 0.9996	 Val Accuracy: 0.9822	 Train Loss: 0.00300
Train Accu



In [34]:
# Clean accuracy, accuracy on the stored perturbed set, then the three
# per-model perturbations, each with scale 1.5.
normal_acc = acc_normal(model,val_iter,arg_map, grad_map)
print('Normal Validation: %.3f' % normal_acc)
fixed_acc = acc_normal(model, perb_iter,arg_map, grad_map)
print('Fixed set perturbation: %.3f' % fixed_acc)
l0_acc = acc_perb_L0(model, val_iter, 1.5,arg_map, grad_map)
print('L0 perturbation: %.3f' % l0_acc)
l2_acc = acc_perb_L2(model, val_iter, 1.5,arg_map, grad_map)
print('L2 perturbation: %.3f' % l2_acc)
alpha_acc = acc_perb_alpha(model, val_iter, 1.5,arg_map, grad_map)
print('Alpha perturbation: %.3f' % alpha_acc)

Normal Validation: 0.982
Fixed set perturbation: 0.446
L0 perturbation: 0.440
L2 perturbation: 0.321




Alpha perturbation: 0.193




# Ian's Method

In [35]:
# Symbolic input placeholder; a concrete array is bound to it in the next cell.
data = mx.symbol.Variable('data')
# Hidden part: flatten the input, then two 500-unit fully connected layers,
# each followed by a ReLU.
flatten = mx.symbol.Flatten(data=data)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
relu1 = mx.symbol.Activation(data=fc1, act_type="relu")
fc2 = mx.symbol.FullyConnected(data=relu1, num_hidden=500)
relu2 = mx.symbol.Activation(data=fc2, act_type="relu")
# Output layer with 10 units. No softmax op is attached to the symbol; the
# notebook applies Softmax() in NumPy to the executor output.
fc3 = mx.symbol.FullyConnected(data=relu2, num_hidden=10)

In [36]:
# Full input shape including the batch axis.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc3.list_arguments() # includes 'data' alongside the layer parameters
arg_shapes, output_shapes, aux_shapes = fc3.infer_shape(data=data_shape)

# One zero-initialised value buffer and one gradient buffer per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# Extra per-argument accumulators: the training loop adds the clean-pass and
# perturbed-pass gradients into these before each update.
sum_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]

# Same gradient request ("write") for every argument, the input included.
reqs = ["write" for name in arg_names]

# Bind the symbol to those buffers. This replaces the previous `model`,
# `arg_map`, `grad_map`, `data_grad` and `out_grad` globals.
model = fc3.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
sum_map = dict(zip(arg_names, sum_arrays))
# Gradient with respect to the input; used to build perturbations.
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(). The acc_perb_* functions
# read this global, so it must belong to the currently bound model.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [37]:
# Randomise only the arguments whose name contains "weight"; everything else
# keeps the value it was allocated with.
for name in arg_names:
    if "weight" not in name:
        continue
    arr = arg_map[name]
    arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)
# Clear every gradient accumulator before training starts.
for name in arg_names:
    accumulator = sum_map[name]
    accumulator[:] = 0.

In [38]:
# Adversarial training: each step sums the parameter gradients of the clean
# batch and of a sign-of-gradient perturbed copy, then applies one SGD update.
num_round = 50
train_acc = 0.
nbatch = 0
coe_pb = 1.5
lr= 0.05
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # Pass 1: clean batch. Statistics are reported on this pass only.
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        # Perturbation: sign of the input gradient, scaled to unit L2 norm.
        noise = np.sign(data_grad.asnumpy())
        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            # An all-zero sign pattern has norm 0; dividing would create NaN
            # inputs and, through the update below, NaN weights. Such a
            # sample is trained on unperturbed instead.
            if norm > 0:
                noise[j] = noise[j] / norm
        pdata = data.asnumpy() + coe_pb * noise
        # Pass 2: perturbed batch.
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        # One update from the summed gradients, then clear every accumulator.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], sum_map[name], lr)
            sum_map[name][:] = 0.

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8632	 Val Accuracy: 0.9369	 Train Loss: 0.56602
Train Accuracy: 0.9440	 Val Accuracy: 0.9562	 Train Loss: 0.24647
Train Accuracy: 0.9593	 Val Accuracy: 0.9658	 Train Loss: 0.17804
Train Accuracy: 0.9665	 Val Accuracy: 0.9718	 Train Loss: 0.14333
Train Accuracy: 0.9716	 Val Accuracy: 0.9741	 Train Loss: 0.12186
Train Accuracy: 0.9752	 Val Accuracy: 0.9758	 Train Loss: 0.10689
Train Accuracy: 0.9777	 Val Accuracy: 0.9772	 Train Loss: 0.09585
Train Accuracy: 0.9797	 Val Accuracy: 0.9784	 Train Loss: 0.08720
Train Accuracy: 0.9810	 Val Accuracy: 0.9794	 Train Loss: 0.08015
Train Accuracy: 0.9825	 Val Accuracy: 0.9808	 Train Loss: 0.07422
Train Accuracy: 0.9837	 Val Accuracy: 0.9815	 Train Loss: 0.06926
Train Accuracy: 0.9847	 Val Accuracy: 0.9824	 Train Loss: 0.06488
Train Accuracy: 0.9857	 Val Accuracy: 0.9827	 Train Loss: 0.06101
Train Accuracy: 0.9864	 Val Accuracy: 0.9831	 Train Loss: 0.05753
Train Accuracy: 0.9870	 Val Accuracy: 0.9837	 Train Loss: 0.05438
Train Accu



In [39]:
# Continue the adversarial training above with a smaller learning rate.
num_round = 30
train_acc = 0.
nbatch = 0
coe_pb = 1.5
lr= 0.005
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # Pass 1: clean batch. Statistics are reported on this pass only.
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        # Perturbation: sign of the input gradient, scaled to unit L2 norm.
        noise = np.sign(data_grad.asnumpy())
        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            # An all-zero sign pattern has norm 0; dividing would create NaN
            # inputs and, through the update below, NaN weights. Such a
            # sample is trained on unperturbed instead.
            if norm > 0:
                noise[j] = noise[j] / norm
        pdata = data.asnumpy() + coe_pb * noise
        # Pass 2: perturbed batch.
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        # One update from the summed gradients, then clear every accumulator.
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], sum_map[name], lr)
            sum_map[name][:] = 0.

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9976	 Val Accuracy: 0.9900	 Train Loss: 0.01189
Train Accuracy: 0.9978	 Val Accuracy: 0.9900	 Train Loss: 0.01148
Train Accuracy: 0.9978	 Val Accuracy: 0.9901	 Train Loss: 0.01133
Train Accuracy: 0.9978	 Val Accuracy: 0.9902	 Train Loss: 0.01123
Train Accuracy: 0.9978	 Val Accuracy: 0.9902	 Train Loss: 0.01115
Train Accuracy: 0.9979	 Val Accuracy: 0.9902	 Train Loss: 0.01108
Train Accuracy: 0.9979	 Val Accuracy: 0.9902	 Train Loss: 0.01101
Train Accuracy: 0.9979	 Val Accuracy: 0.9902	 Train Loss: 0.01095
Train Accuracy: 0.9979	 Val Accuracy: 0.9904	 Train Loss: 0.01090
Train Accuracy: 0.9979	 Val Accuracy: 0.9904	 Train Loss: 0.01084
Train Accuracy: 0.9979	 Val Accuracy: 0.9905	 Train Loss: 0.01078
Train Accuracy: 0.9979	 Val Accuracy: 0.9905	 Train Loss: 0.01073
Train Accuracy: 0.9979	 Val Accuracy: 0.9906	 Train Loss: 0.01068
Train Accuracy: 0.9979	 Val Accuracy: 0.9907	 Train Loss: 0.01063
Train Accuracy: 0.9980	 Val Accuracy: 0.9908	 Train Loss: 0.01058
Train Accu



In [40]:
# Clean accuracy, accuracy on the stored perturbed set, then the three
# per-model perturbations, each with scale 1.5.
normal_acc = acc_normal(model,val_iter,arg_map, grad_map)
print('Normal Validation: %.3f' % normal_acc)
fixed_acc = acc_normal(model, perb_iter,arg_map, grad_map)
print('Fixed set perturbation: %.3f' % fixed_acc)
l0_acc = acc_perb_L0(model, val_iter, 1.5,arg_map, grad_map)
print('L0 perturbation: %.3f' % l0_acc)
l2_acc = acc_perb_L2(model, val_iter, 1.5,arg_map, grad_map)
print('L2 perturbation: %.3f' % l2_acc)
alpha_acc = acc_perb_alpha(model, val_iter, 1.5,arg_map, grad_map)
print('Alpha perturbation: %.3f' % alpha_acc)

Normal Validation: 0.991
Fixed set perturbation: 0.972
L0 perturbation: 0.939
L2 perturbation: 0.844




Alpha perturbation: 0.836




# LWA

In [41]:
# Symbolic input placeholder; a concrete array is bound to it in the next cell.
data = mx.symbol.Variable('data')
# Hidden part: flatten the input, then two 500-unit fully connected layers,
# each followed by a ReLU.
flatten = mx.symbol.Flatten(data=data)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
relu1 = mx.symbol.Activation(data=fc1, act_type="relu")
fc2 = mx.symbol.FullyConnected(data=relu1, num_hidden=500)
relu2 = mx.symbol.Activation(data=fc2, act_type="relu")
# Output layer with 10 units. No softmax op is attached to the symbol; the
# notebook applies Softmax() in NumPy to the executor output.
fc3 = mx.symbol.FullyConnected(data=relu2, num_hidden=10)

In [42]:
# Full input shape including the batch axis.
data_shape = (batch_size, 1, 28, 28)
arg_names = fc3.list_arguments() # includes 'data' alongside the layer parameters
arg_shapes, output_shapes, aux_shapes = fc3.infer_shape(data=data_shape)

# One zero-initialised value buffer and one gradient buffer per argument, on `dev`.
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
# Same gradient request ("write") for every argument, the input included.
reqs = ["write" for name in arg_names]

# Bind the symbol to those buffers. This replaces the previous `model`,
# `arg_map`, `grad_map`, `data_grad` and `out_grad` globals.
model = fc3.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
# Gradient with respect to the input; used to build perturbations.
data_grad = grad_map["data"]
# Head-gradient buffer passed to model.backward(). The acc_perb_* functions
# read this global, so it must belong to the currently bound model.
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [63]:
# Randomise only the arguments whose name contains "weight"; everything else
# keeps the value it was allocated with.
for name in arg_names:
    if "weight" not in name:
        continue
    arr = arg_map[name]
    arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)

In [64]:
# LWA training: the clean pass is used only to build the perturbation and to
# report statistics; the weights are updated from the perturbed pass alone.
num_round = 50
train_acc = 0.
nbatch = 0
coe_pb = 1.7
lr = 0.1
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # Pass 1: clean batch, to obtain d(loss)/d(input).
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # Perturbation: the input gradient scaled to unit L2 norm per sample.
        noise = data_grad.asnumpy()
        for j in range(batch_size):
            norm = np.linalg.norm(noise[j].flatten(), 2)
            # A zero gradient has norm 0; dividing would create NaN inputs
            # and, through the update below, NaN weights. Such a sample is
            # trained on unperturbed instead.
            if norm > 0:
                noise[j] = noise[j] / norm
        pdata = data.asnumpy() + coe_pb * noise
        # Pass 2: perturbed batch; its gradients drive the update.
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                SGD(arg_map[name], grad_map[name], lr)

        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.7834	 Val Accuracy: 0.8810	 Train Loss: 0.85596
Train Accuracy: 0.9085	 Val Accuracy: 0.9376	 Train Loss: 0.42259
Train Accuracy: 0.9372	 Val Accuracy: 0.9543	 Train Loss: 0.30936
Train Accuracy: 0.9501	 Val Accuracy: 0.9608	 Train Loss: 0.25102
Train Accuracy: 0.9582	 Val Accuracy: 0.9660	 Train Loss: 0.21293
Train Accuracy: 0.9639	 Val Accuracy: 0.9692	 Train Loss: 0.18502
Train Accuracy: 0.9679	 Val Accuracy: 0.9728	 Train Loss: 0.16432
Train Accuracy: 0.9703	 Val Accuracy: 0.9736	 Train Loss: 0.14820
Train Accuracy: 0.9729	 Val Accuracy: 0.9754	 Train Loss: 0.13480
Train Accuracy: 0.9750	 Val Accuracy: 0.9760	 Train Loss: 0.12353
Train Accuracy: 0.9769	 Val Accuracy: 0.9762	 Train Loss: 0.11404
Train Accuracy: 0.9783	 Val Accuracy: 0.9771	 Train Loss: 0.10583
Train Accuracy: 0.9794	 Val Accuracy: 0.9781	 Train Loss: 0.09876
Train Accuracy: 0.9806	 Val Accuracy: 0.9789	 Train Loss: 0.09258
Train Accuracy: 0.9815	 Val Accuracy: 0.9793	 Train Loss: 0.08719
Train Accu



In [65]:
# Adversarial training, stage 2: same procedure as the previous training cell,
# continued on the same `model` with a smaller learning rate.
# For every mini-batch:
#   1. forward/backward on the clean batch to record accuracy/loss and to get
#      the gradient of the loss w.r.t. the input (`data_grad`);
#   2. shift each sample by `coe_pb` along its L2-normalised input gradient;
#   3. forward/backward on the shifted batch and take one SGD step using the
#      gradients left by that second backward pass.
num_round = 35   # number of passes over train_iter
train_acc = 0.
nbatch = 0
coe_pb = 1.7     # L2 length of the perturbation added to each sample
lr = 0.02        # SGD learning rate
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        # Copy the labels to host memory once instead of once per use.
        label_np = label.asnumpy()

        # Pass 1: clean batch. Metrics are computed on the clean predictions.
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label_np) / batch_size
        train_loss += CalLoss(alpha, label_np) / batch_size
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])

        # Scale each sample's input gradient to unit L2 norm. When the softmax
        # output is exactly one-hot the gradient is all zeros; dividing by its
        # zero norm would yield NaNs that propagate into the perturbed batch
        # and then into the weights, so such samples are left unperturbed.
        noise = data_grad.asnumpy()
        for j in range(noise.shape[0]):
            sample_norm = np.linalg.norm(noise[j].ravel(), 2)
            if sample_norm > 0:
                noise[j] = noise[j] / sample_norm
        pdata = data.asnumpy() + coe_pb * noise

        # Pass 2: perturbed batch. Its gradients drive the parameter update.
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label_np)
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            # The input is not a trainable parameter.
            if name != "data":
                SGD(arg_map[name], grad_map[name],lr)

        nbatch += 1
    # Per-epoch averages over batches, plus clean validation accuracy.
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9951	 Val Accuracy: 0.9884	 Train Loss: 0.01954
Train Accuracy: 0.9953	 Val Accuracy: 0.9886	 Train Loss: 0.01884
Train Accuracy: 0.9954	 Val Accuracy: 0.9885	 Train Loss: 0.01854
Train Accuracy: 0.9955	 Val Accuracy: 0.9885	 Train Loss: 0.01831
Train Accuracy: 0.9956	 Val Accuracy: 0.9884	 Train Loss: 0.01809
Train Accuracy: 0.9957	 Val Accuracy: 0.9886	 Train Loss: 0.01788
Train Accuracy: 0.9958	 Val Accuracy: 0.9886	 Train Loss: 0.01769
Train Accuracy: 0.9958	 Val Accuracy: 0.9885	 Train Loss: 0.01751
Train Accuracy: 0.9958	 Val Accuracy: 0.9885	 Train Loss: 0.01734
Train Accuracy: 0.9959	 Val Accuracy: 0.9887	 Train Loss: 0.01718
Train Accuracy: 0.9959	 Val Accuracy: 0.9888	 Train Loss: 0.01701
Train Accuracy: 0.9960	 Val Accuracy: 0.9886	 Train Loss: 0.01685
Train Accuracy: 0.9960	 Val Accuracy: 0.9888	 Train Loss: 0.01670
Train Accuracy: 0.9961	 Val Accuracy: 0.9890	 Train Loss: 0.01653
Train Accuracy: 0.9961	 Val Accuracy: 0.9889	 Train Loss: 0.01637
Train Accu



In [66]:
# Accuracy report for the current `model`: clean validation data, the
# pre-built perturbed set, then three perturbations computed on the fly with
# coefficient 1.5. Each row is (format string, metric function, arguments);
# rows are evaluated and printed one at a time, top to bottom.
evaluations = (
    ('Normal Validation: %.3f', acc_normal, (model, val_iter, arg_map, grad_map)),
    ('Fixed set perturbation: %.3f', acc_normal, (model, perb_iter, arg_map, grad_map)),
    ('L0 perturbation: %.3f', acc_perb_L0, (model, val_iter, 1.5, arg_map, grad_map)),
    ('L2 perturbation: %.3f', acc_perb_L2, (model, val_iter, 1.5, arg_map, grad_map)),
    ('Alpha perturbation: %.3f', acc_perb_alpha, (model, val_iter, 1.5, arg_map, grad_map)),
)
for fmt, metric, metric_args in evaluations:
    print(fmt % metric(*metric_args))

Normal Validation: 0.990
Fixed set perturbation: 0.974
L0 perturbation: 0.944
L2 perturbation: 0.867




Alpha perturbation: 0.862


