In [1]:
%matplotlib inline
import numpy as np
import mxnet as mx
import logging
from skimage import io

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [2]:
batch_size = 128
dev = mx.gpu()
train_iter = mx.io.ImageRecordIter(
    shuffle=True,
    path_imgrec="../data/cifar10/train.rec",
    mean_r = 128,
    mean_g = 128,
    mean_b = 128,
    scale = 0.0078125,
    rand_crop=True,
    rand_mirror=True,
    data_shape=(3, 28, 28),
    batch_size=batch_size,
    prefetch_buffer=4,
    preprocess_threads=2)

val_iter = mx.io.ImageRecordIter(
    path_imgrec="../data/cifar10/test.rec",
    mean_r = 128,
    mean_g = 128,
    mean_b = 128,
    scale = 0.0078125,
    rand_crop=False,
    rand_mirror=False,
    data_shape=(3, 28, 28),
    batch_size=batch_size,
    prefetch_buffer=4,
    preprocess_threads=2,
    round_batch=False)

In [3]:
def Softmax(arr):
    max_val = np.max(arr, axis=1, keepdims=True)
    tmp = arr - max_val
    exp = np.exp(tmp)
    norm = np.sum(exp, axis=1, keepdims=True)
    return exp / norm

def SoftmaxGrad(arr, idx):
    grad = np.copy(arr)
    for i in range(arr.shape[0]):
        p = grad[i, idx]
        grad[i, :] *= -p
        grad[i, idx] = p * (1. - p)
    return grad

def SGD(weight, grad, lr=0.1, wd=0.0001, grad_norm=batch_size):
    grad = mx.nd.clip(grad, -5,5)
    weight[:] -= lr * (grad / batch_size + wd*weight)
    
def LogLossGrad(arr, label):
    grad = np.copy(arr)
    for i in range(arr.shape[0]):
        grad[i, label[i]] -= 1.
    return grad

def CalAcc(pred_prob, label):
    pred = np.argmax(pred_prob, axis=1)
    return np.sum(pred == label) * 1.0

def CalLoss(pred_prob, label):
    loss = 0.
    for i in range(pred_prob.shape[0]):
        loss += -np.log(max(pred_prob[i, label[i]], 1e-10))
    return loss

def ConvFactory(data, kernel, pad, num_filter, stride=1):
    conv = mx.sym.Convolution(data=data, kernel=(kernel, kernel), pad=(pad, pad), stride=(stride, stride), num_filter=num_filter)
    act = mx.sym.Activation(data=conv, act_type='relu')
    return act

In [4]:
def acc_normal(model, val_iter, arg_map, grad_map):
    val_iter.reset()
    val_acc = 0.0
    num_samp = 0
    for dbatch in val_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        batch_size = label.asnumpy().shape[0]
        arg_map["data"][:] = data    

        model.forward(is_train=False)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        val_acc += CalAcc(alpha, label.asnumpy()) 
        num_samp += batch_size
    return(val_acc / num_samp)
    
def acc_perb_L0(model, val_iter, coe_pb,arg_map, grad_map):
    val_iter.reset()
    val_acc = 0.0
    num_samp = 0
    nn=0
    for dbatch in val_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        batch_size = label.asnumpy().shape[0]
        arg_map["data"][:] = data    

        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        
        grad = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = grad
        model.backward([out_grad])
        noise = np.sign(grad_map["data"].asnumpy())
        
        for j in range(batch_size):
            if np.linalg.norm(noise[j].flatten(),2) ==0:
                nn+=1
            y = label.asnumpy()[j]
            if (y == np.argmax(alpha[j])):
                noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
            else:
                noise[j] = 0
            
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        raw_output = model.outputs[0].asnumpy()
        pred = Softmax(raw_output)
        
        val_acc += CalAcc(pred, label.asnumpy()) 
        num_samp += batch_size
    if  nn>0:
        print('L0 gradien being 0 :', nn)
    return(val_acc / num_samp)

def acc_perb_L2(model, val_iter, coe_pb, arg_map, grad_map):
    val_iter.reset()
    val_acc = 0.0
    num_batch = 0
    nn=0
    for dbatch in val_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        batch_size = label.asnumpy().shape[0]
        arg_map["data"][:] = data    

        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        
        grad = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = grad
        model.backward([out_grad])
        noise = grad_map["data"].asnumpy()
        
        for j in range(batch_size):
            if np.linalg.norm(noise[j].flatten(),2) ==0:
                nn+=1
            y = label.asnumpy()[j]
            if (y == np.argmax(alpha[j])): 
                noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
            else:
                noise[j] = 0
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        raw_output = model.outputs[0].asnumpy()
        pred = Softmax(raw_output)
        
        val_acc += CalAcc(pred, label.asnumpy()) /  batch_size 
        num_batch += 1
    if  nn>0:
        print('L2 gradien being 0 :', nn)
    return(val_acc / num_batch)

def acc_perb_alpha(model, val_iter, coe_pb,arg_map, grad_map):
    val_iter.reset()
    val_acc = 0.0
    num_samp = 0
    nn=0
    for dbatch in val_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        batch_size = label.asnumpy().shape[0]
        arg_map["data"][:] = data    

        T = np.zeros((10, batch_size, data_shape[1], data_shape[2], data_shape[3]))
        noise = np.zeros(data.shape)
        #===================
        for i in range(10):
            arg_map["data"][:] = data   
            model.forward(is_train=True)
            theta = model.outputs[0].asnumpy()
            alpha = Softmax(theta)
            
            grad = LogLossGrad(alpha, i*np.ones(alpha.shape[0]))
            for j in range(batch_size):
                grad[j] = -alpha[j][i]*grad[j]
            out_grad[:] = grad
            model.backward([out_grad])
            T[i] = grad_map["data"].asnumpy()
        
        for j in range(batch_size):
            y = label.asnumpy()[j]
            if (y == np.argmax(alpha[j])):
                perb_scale = np.zeros(10)
                for i in range(10):
                    if (i == y):
                        perb_scale[i] = np.inf
                    else:
                        perb_scale[i] = (alpha[j][y] - alpha[j][i])/np.linalg.norm((T[i][j]-T[y][j]).flatten(),2)
                noise[j] = T[np.argmin(perb_scale)][j]-T[y][j]
        #====================
        for j in range(batch_size):
            if np.linalg.norm(noise[j].flatten(),2) ==0:
                nn+=1
            else:
                noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=False)
        raw_output = model.outputs[0].asnumpy()
        pred = Softmax(raw_output)
        
        val_acc += CalAcc(pred, label.asnumpy()) /batch_size
        num_samp += 1
    return(val_acc / num_samp)

# Generate Fixed Perturbed Data

In [5]:
data = mx.sym.Variable('data')
conv1 = ConvFactory(data, 3, 1, 64)
conv2 = ConvFactory(conv1, 3, 1, 64)
conv3 = ConvFactory(conv2, 3, 1, 64)
mp1 = mx.sym.Pooling(data=conv3, pool_type="max", kernel=(3,3), stride=(2,2))
conv4 = ConvFactory(mp1, 3, 1, 128)
conv5 = ConvFactory(conv4, 3, 1, 128)
conv6 = ConvFactory(conv5, 3, 1, 128)
mp1 = mx.sym.Pooling(data=conv6, pool_type="max", kernel=(3,3), stride=(2,2))
fl = mx.sym.Flatten(data=mp1)
fc1 = mx.sym.FullyConnected(data=fl, num_hidden=2048)
act1 = mx.sym.Activation(data=fc1, act_type="relu")

fc2 = mx.sym.FullyConnected(data=act1, num_hidden=2048)
act2 = mx.sym.Activation(data=fc2, act_type="relu")

flatten = mx.sym.FullyConnected(data=act2, num_hidden=10)

In [6]:
data_shape = (batch_size, 3, 28, 28)
arg_names = flatten.list_arguments()
arg_shapes, output_shapes, aux_shapes = flatten.infer_shape(data=data_shape)
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_sum = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
aux_states =  [mx.nd.zeros(shape, ctx=dev) for shape in aux_shapes]
pred = mx.nd.zeros(output_shapes[0])
reqs = ["write" for name in arg_names]

model = flatten.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs, aux_states=aux_states)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
sum_map = dict(zip(arg_names, grad_sum))
data_grad = grad_map["data"]
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [7]:
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(6 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    elif "gamma" in name:
        arr = arg_map[name]
        arr[:] = 1.0
    else:
        arr = arg_map[name]
        arr[:] = 0.

In [8]:
num_round = 30
train_acc = 0.
nbatch = 0
lr = 0.03
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.3759	 Val Accuracy: 0.5005	 Train Loss: 1.74886
Train Accuracy: 0.5101	 Val Accuracy: 0.5941	 Train Loss: 1.37674
Train Accuracy: 0.5821	 Val Accuracy: 0.5932	 Train Loss: 1.18168
Train Accuracy: 0.6405	 Val Accuracy: 0.6725	 Train Loss: 1.02275
Train Accuracy: 0.6872	 Val Accuracy: 0.7162	 Train Loss: 0.89907
Train Accuracy: 0.7202	 Val Accuracy: 0.7031	 Train Loss: 0.80818
Train Accuracy: 0.7480	 Val Accuracy: 0.7371	 Train Loss: 0.72595
Train Accuracy: 0.7663	 Val Accuracy: 0.7621	 Train Loss: 0.67368
Train Accuracy: 0.7843	 Val Accuracy: 0.7171	 Train Loss: 0.62520
Train Accuracy: 0.8014	 Val Accuracy: 0.7722	 Train Loss: 0.57382
Train Accuracy: 0.8106	 Val Accuracy: 0.7660	 Train Loss: 0.54033
Train Accuracy: 0.8262	 Val Accuracy: 0.7682	 Train Loss: 0.50015
Train Accuracy: 0.8380	 Val Accuracy: 0.8021	 Train Loss: 0.46571
Train Accuracy: 0.8467	 Val Accuracy: 0.7888	 Train Loss: 0.43971
Train Accuracy: 0.8581	 Val Accuracy: 0.7737	 Train Loss: 0.40931
Train Accu



In [9]:
num_round = 20
train_acc = 0.
nbatch = 0
lr = 0.003
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9705	 Val Accuracy: 0.8359	 Train Loss: 0.08914
Train Accuracy: 0.9750	 Val Accuracy: 0.8360	 Train Loss: 0.07748
Train Accuracy: 0.9781	 Val Accuracy: 0.8394	 Train Loss: 0.06774
Train Accuracy: 0.9791	 Val Accuracy: 0.8398	 Train Loss: 0.06494
Train Accuracy: 0.9811	 Val Accuracy: 0.8384	 Train Loss: 0.05873
Train Accuracy: 0.9821	 Val Accuracy: 0.8402	 Train Loss: 0.05575
Train Accuracy: 0.9831	 Val Accuracy: 0.8404	 Train Loss: 0.05350
Train Accuracy: 0.9837	 Val Accuracy: 0.8394	 Train Loss: 0.05075
Train Accuracy: 0.9841	 Val Accuracy: 0.8395	 Train Loss: 0.04863
Train Accuracy: 0.9845	 Val Accuracy: 0.8414	 Train Loss: 0.04891
Train Accuracy: 0.9862	 Val Accuracy: 0.8419	 Train Loss: 0.04476
Train Accuracy: 0.9864	 Val Accuracy: 0.8400	 Train Loss: 0.04285
Train Accuracy: 0.9861	 Val Accuracy: 0.8427	 Train Loss: 0.04301
Train Accuracy: 0.9869	 Val Accuracy: 0.8419	 Train Loss: 0.04014
Train Accuracy: 0.9886	 Val Accuracy: 0.8404	 Train Loss: 0.03700
Train Accu



In [10]:
val_iter.reset()
val_acc = 0.0
val_acc_pb = 0.0
coe_pb = 0.5
num_samp = 0

perb_data = []
perb_lab = []

for dbatch in val_iter:
    data = dbatch.data[0]
    label = dbatch.label[0]
    arg_map["data"][:] = data    
    batch_size = label.asnumpy().shape[0]
    
    model.forward(is_train=False)
    theta = model.outputs[0].asnumpy()
    alpha = Softmax(theta)
    val_acc += CalAcc(alpha, label.asnumpy()) 
    #########
    grad = LogLossGrad(alpha, label.asnumpy())
    out_grad[:] = grad
    model.backward([out_grad])
    noise = data_grad.asnumpy()
    for j in range(batch_size):
        noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
    pdata = data.asnumpy() + coe_pb * noise
    arg_map["data"][:] = pdata
    model.forward(is_train=True)
    raw_output = model.outputs[0].asnumpy()
    pred = Softmax(raw_output)
    val_acc_pb += CalAcc(pred, label.asnumpy()) 
    num_samp += batch_size
    
    perb_data.append(pdata)
    perb_lab.append(label.asnumpy())
print("Val Batch Accuracy: ", val_acc / num_samp)
print("Val Batch Accuracy after pertubation: ", val_acc_pb / num_samp)
print(acc_normal(model, val_iter,arg_map, grad_map))



Val Batch Accuracy:  0.842266613924
Val Batch Accuracy after pertubation:  0.405557753165
0.842266613924


In [11]:
pdata = np.concatenate(perb_data, axis = 0)
plabel = np.concatenate(perb_lab, axis = 0)
perb_iter = mx.io.NDArrayIter(
    data = pdata,
    label = plabel,
    batch_size = 128,
    shuffle = False    
)

In [12]:
perb_iter.reset()
num_samp = 0
val_acc = 0.0
for dbatch in perb_iter:
    data = dbatch.data[0]
    label = dbatch.label[0]
    arg_map["data"][:] = data    
    
    model.forward(is_train=True)
    theta = model.outputs[0].asnumpy()
    alpha = Softmax(theta)
    val_acc += CalAcc(alpha, label.asnumpy()) 
    num_samp += batch_size
print("Val Batch Accuracy after pertubation: ", val_acc / num_samp)

Val Batch Accuracy after pertubation:  0.405557753165


# Normal Training

In [13]:
data = mx.sym.Variable('data')
conv1 = ConvFactory(data, 3, 1, 64)
conv2 = ConvFactory(conv1, 3, 1, 64)
conv3 = ConvFactory(conv2, 3, 1, 64)
mp1 = mx.sym.Pooling(data=conv3, pool_type="max", kernel=(3,3), stride=(2,2))
conv4 = ConvFactory(mp1, 3, 1, 128)
conv5 = ConvFactory(conv4, 3, 1, 128)
conv6 = ConvFactory(conv5, 3, 1, 128)
mp1 = mx.sym.Pooling(data=conv6, pool_type="max", kernel=(3,3), stride=(2,2))
fl = mx.sym.Flatten(data=mp1)
fc1 = mx.sym.FullyConnected(data=fl, num_hidden=2048)
act1 = mx.sym.Activation(data=fc1, act_type="relu")

fc2 = mx.sym.FullyConnected(data=act1, num_hidden=2048)
act2 = mx.sym.Activation(data=fc2, act_type="relu")

flatten = mx.sym.FullyConnected(data=act2, num_hidden=10)

In [14]:
data_shape = (batch_size, 3, 28, 28)
arg_names = flatten.list_arguments()
arg_shapes, output_shapes, aux_shapes = flatten.infer_shape(data=data_shape)
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_sum = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
aux_states =  [mx.nd.zeros(shape, ctx=dev) for shape in aux_shapes]
pred = mx.nd.zeros(output_shapes[0])
reqs = ["write" for name in arg_names]

model = flatten.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs, aux_states=aux_states)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
sum_map = dict(zip(arg_names, grad_sum))
data_grad = grad_map["data"]
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [15]:
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(6 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    elif "gamma" in name:
        arr = arg_map[name]
        arr[:] = 1.0
    else:
        arr = arg_map[name]
        arr[:] = 0.

In [16]:
num_round = 40
train_acc = 0.
nbatch = 0
lr = 0.03
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.3731	 Val Accuracy: 0.4895	 Train Loss: 1.75075
Train Accuracy: 0.5160	 Val Accuracy: 0.5870	 Train Loss: 1.36211
Train Accuracy: 0.5890	 Val Accuracy: 0.5893	 Train Loss: 1.16560
Train Accuracy: 0.6423	 Val Accuracy: 0.6799	 Train Loss: 1.01737
Train Accuracy: 0.6873	 Val Accuracy: 0.6473	 Train Loss: 0.89851
Train Accuracy: 0.7167	 Val Accuracy: 0.7130	 Train Loss: 0.81132
Train Accuracy: 0.7432	 Val Accuracy: 0.7487	 Train Loss: 0.74035
Train Accuracy: 0.7616	 Val Accuracy: 0.7664	 Train Loss: 0.68549
Train Accuracy: 0.7795	 Val Accuracy: 0.7693	 Train Loss: 0.63441
Train Accuracy: 0.7937	 Val Accuracy: 0.7812	 Train Loss: 0.58743
Train Accuracy: 0.8094	 Val Accuracy: 0.7865	 Train Loss: 0.54928
Train Accuracy: 0.8212	 Val Accuracy: 0.7949	 Train Loss: 0.51527
Train Accuracy: 0.8330	 Val Accuracy: 0.7402	 Train Loss: 0.47993
Train Accuracy: 0.8406	 Val Accuracy: 0.8044	 Train Loss: 0.45415
Train Accuracy: 0.8548	 Val Accuracy: 0.8021	 Train Loss: 0.41732
Train Accu



In [17]:
num_round = 30
train_acc = 0.
nbatch = 0
lr = 0.003
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9823	 Val Accuracy: 0.8415	 Train Loss: 0.05530
Train Accuracy: 0.9873	 Val Accuracy: 0.8454	 Train Loss: 0.04009
Train Accuracy: 0.9893	 Val Accuracy: 0.8450	 Train Loss: 0.03425
Train Accuracy: 0.9891	 Val Accuracy: 0.8467	 Train Loss: 0.03493
Train Accuracy: 0.9908	 Val Accuracy: 0.8470	 Train Loss: 0.03046
Train Accuracy: 0.9902	 Val Accuracy: 0.8469	 Train Loss: 0.03044
Train Accuracy: 0.9914	 Val Accuracy: 0.8472	 Train Loss: 0.02756
Train Accuracy: 0.9927	 Val Accuracy: 0.8487	 Train Loss: 0.02404
Train Accuracy: 0.9928	 Val Accuracy: 0.8478	 Train Loss: 0.02403
Train Accuracy: 0.9929	 Val Accuracy: 0.8479	 Train Loss: 0.02338
Train Accuracy: 0.9934	 Val Accuracy: 0.8487	 Train Loss: 0.02173
Train Accuracy: 0.9933	 Val Accuracy: 0.8478	 Train Loss: 0.02121
Train Accuracy: 0.9933	 Val Accuracy: 0.8497	 Train Loss: 0.02200
Train Accuracy: 0.9938	 Val Accuracy: 0.8485	 Train Loss: 0.02014
Train Accuracy: 0.9943	 Val Accuracy: 0.8505	 Train Loss: 0.01860
Train Accu



In [18]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.851
Fixed set perturbation: 0.747
L0 perturbation: 0.517




L2 gradien being 0 : 16
L2 perturbation: 0.429
Alpha perturbation: 0.416




# Dropout Training

In [19]:
data = mx.sym.Variable('data')
conv1 = ConvFactory(data, 3, 1, 64)
conv2 = ConvFactory(conv1, 3, 1, 64)
conv3 = ConvFactory(conv2, 3, 1, 64)
mp1 = mx.sym.Pooling(data=conv3, pool_type="max", kernel=(3,3), stride=(2,2))
conv4 = ConvFactory(mp1, 3, 1, 128)
conv5 = ConvFactory(conv4, 3, 1, 128)
conv6 = ConvFactory(conv5, 3, 1, 128)
mp1 = mx.sym.Pooling(data=conv6, pool_type="max", kernel=(3,3), stride=(2,2))
fl = mx.sym.Flatten(data=mp1)
fc1 = mx.sym.FullyConnected(data=fl, num_hidden=2048)
act1 = mx.sym.Activation(data=fc1, act_type="relu")
dp1 = mx.sym.Dropout(data=act1, p=0.5)

fc2 = mx.sym.FullyConnected(data=dp1, num_hidden=2048)
act2 = mx.sym.Activation(data=fc2, act_type="relu")
dp2 = mx.sym.Dropout(data=act2, p=0.5)

flatten = mx.sym.FullyConnected(data=dp2, num_hidden=10)

In [20]:
data_shape = (batch_size, 3, 28, 28)
arg_names = flatten.list_arguments()
arg_shapes, output_shapes, aux_shapes = flatten.infer_shape(data=data_shape)
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_sum = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
aux_states =  [mx.nd.zeros(shape, ctx=dev) for shape in aux_shapes]
pred = mx.nd.zeros(output_shapes[0])
reqs = ["write" for name in arg_names]

model = flatten.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs, aux_states=aux_states)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
sum_map = dict(zip(arg_names, grad_sum))
data_grad = grad_map["data"]
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [21]:
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(6 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    elif "gamma" in name:
        arr = arg_map[name]
        arr[:] = 1.0
    else:
        arr = arg_map[name]
        arr[:] = 0.

In [22]:
num_round = 50
train_acc = 0.
nbatch = 0
lr = 0.03
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.3072	 Val Accuracy: 0.4179	 Train Loss: 1.90753
Train Accuracy: 0.4391	 Val Accuracy: 0.5167	 Train Loss: 1.54575
Train Accuracy: 0.5007	 Val Accuracy: 0.5916	 Train Loss: 1.38349
Train Accuracy: 0.5532	 Val Accuracy: 0.5941	 Train Loss: 1.24992
Train Accuracy: 0.5914	 Val Accuracy: 0.6552	 Train Loss: 1.14376
Train Accuracy: 0.6264	 Val Accuracy: 0.6889	 Train Loss: 1.04894
Train Accuracy: 0.6602	 Val Accuracy: 0.7114	 Train Loss: 0.96482
Train Accuracy: 0.6805	 Val Accuracy: 0.7366	 Train Loss: 0.90941
Train Accuracy: 0.6986	 Val Accuracy: 0.7392	 Train Loss: 0.85371
Train Accuracy: 0.7158	 Val Accuracy: 0.7316	 Train Loss: 0.80954
Train Accuracy: 0.7288	 Val Accuracy: 0.7541	 Train Loss: 0.77440
Train Accuracy: 0.7422	 Val Accuracy: 0.7641	 Train Loss: 0.74084
Train Accuracy: 0.7542	 Val Accuracy: 0.7800	 Train Loss: 0.71008
Train Accuracy: 0.7586	 Val Accuracy: 0.7704	 Train Loss: 0.68524
Train Accuracy: 0.7686	 Val Accuracy: 0.7754	 Train Loss: 0.66487
Train Accu



In [23]:
num_round = 30
train_acc = 0.
nbatch = 0
lr = 0.003
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9180	 Val Accuracy: 0.8722	 Train Loss: 0.23360
Train Accuracy: 0.9219	 Val Accuracy: 0.8734	 Train Loss: 0.22317
Train Accuracy: 0.9233	 Val Accuracy: 0.8745	 Train Loss: 0.21908
Train Accuracy: 0.9244	 Val Accuracy: 0.8750	 Train Loss: 0.21594
Train Accuracy: 0.9273	 Val Accuracy: 0.8734	 Train Loss: 0.20904
Train Accuracy: 0.9264	 Val Accuracy: 0.8730	 Train Loss: 0.20780
Train Accuracy: 0.9257	 Val Accuracy: 0.8746	 Train Loss: 0.21051
Train Accuracy: 0.9293	 Val Accuracy: 0.8741	 Train Loss: 0.20336
Train Accuracy: 0.9286	 Val Accuracy: 0.8751	 Train Loss: 0.20496
Train Accuracy: 0.9290	 Val Accuracy: 0.8769	 Train Loss: 0.20252
Train Accuracy: 0.9304	 Val Accuracy: 0.8755	 Train Loss: 0.19973
Train Accuracy: 0.9291	 Val Accuracy: 0.8752	 Train Loss: 0.20103
Train Accuracy: 0.9310	 Val Accuracy: 0.8758	 Train Loss: 0.19991
Train Accuracy: 0.9305	 Val Accuracy: 0.8747	 Train Loss: 0.19617
Train Accuracy: 0.9316	 Val Accuracy: 0.8767	 Train Loss: 0.19446
Train Accu



In [24]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.877
Fixed set perturbation: 0.777
L0 perturbation: 0.593
L2 perturbation: 0.495




Alpha perturbation: 0.479




# Ian's Method

In [25]:
data = mx.sym.Variable('data')
conv1 = ConvFactory(data, 3, 1, 128)
conv2 = ConvFactory(conv1, 3, 1, 128)
conv3 = ConvFactory(conv2, 3, 1, 128)
mp1 = mx.sym.Pooling(data=conv3, pool_type="max", kernel=(3,3), stride=(2,2))
conv4 = ConvFactory(mp1, 3, 1, 256)
conv5 = ConvFactory(conv4, 3, 1, 256)
conv6 = ConvFactory(conv5, 3, 1, 256)
mp1 = mx.sym.Pooling(data=conv6, pool_type="max", kernel=(3,3), stride=(2,2))
fl = mx.sym.Flatten(data=mp1)
fc1 = mx.sym.FullyConnected(data=fl, num_hidden=2048)
act1 = mx.sym.Activation(data=fc1, act_type="relu")
dp1 = mx.sym.Dropout(data=act1, p=0.5)

fc2 = mx.sym.FullyConnected(data=dp1, num_hidden=2048)
act2 = mx.sym.Activation(data=fc2, act_type="relu")
dp2 = mx.sym.Dropout(data=act2, p=0.5)

flatten = mx.sym.FullyConnected(data=dp2, num_hidden=10)

In [26]:
data_shape = (batch_size, 3, 28, 28)
arg_names = flatten.list_arguments()
arg_shapes, output_shapes, aux_shapes = flatten.infer_shape(data=data_shape)
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_sum = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
aux_states =  [mx.nd.zeros(shape, ctx=dev) for shape in aux_shapes]
pred = mx.nd.zeros(output_shapes[0])
reqs = ["write" for name in arg_names]

model = flatten.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs, aux_states=aux_states)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
sum_map = dict(zip(arg_names, grad_sum))
data_grad = grad_map["data"]
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [27]:
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(6 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    elif "gamma" in name:
        arr = arg_map[name]
        arr[:] = 1.0
    else:
        arr = arg_map[name]
        arr[:] = 0.

In [28]:
num_round = 50
train_acc = 0.
nbatch = 0
coe_pb = 0.4
lr= 0.015
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]
        
        noise = np.sign(data_grad.asnumpy())
        for j in range(batch_size):
            noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.2969	 Val Accuracy: 0.4384	 Train Loss: 1.91756
Train Accuracy: 0.4114	 Val Accuracy: 0.4983	 Train Loss: 1.61419
Train Accuracy: 0.4634	 Val Accuracy: 0.5268	 Train Loss: 1.48178
Train Accuracy: 0.5031	 Val Accuracy: 0.5618	 Train Loss: 1.38250
Train Accuracy: 0.5342	 Val Accuracy: 0.5938	 Train Loss: 1.30510
Train Accuracy: 0.5593	 Val Accuracy: 0.5995	 Train Loss: 1.23167
Train Accuracy: 0.5832	 Val Accuracy: 0.6178	 Train Loss: 1.16884
Train Accuracy: 0.6058	 Val Accuracy: 0.6402	 Train Loss: 1.11001
Train Accuracy: 0.6256	 Val Accuracy: 0.6727	 Train Loss: 1.05707
Train Accuracy: 0.6452	 Val Accuracy: 0.6890	 Train Loss: 1.00764
Train Accuracy: 0.6598	 Val Accuracy: 0.6973	 Train Loss: 0.96993
Train Accuracy: 0.6750	 Val Accuracy: 0.7116	 Train Loss: 0.93040
Train Accuracy: 0.6876	 Val Accuracy: 0.7144	 Train Loss: 0.89760
Train Accuracy: 0.6996	 Val Accuracy: 0.7222	 Train Loss: 0.86359
Train Accuracy: 0.7076	 Val Accuracy: 0.7446	 Train Loss: 0.83492
Train Accu



In [29]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.838
Fixed set perturbation: 0.799
L0 perturbation: 0.776
L2 perturbation: 0.731




Alpha perturbation: 0.718




In [30]:
num_round = 20
train_acc = 0.
nbatch = 0
coe_pb = 0.4
lr= 0.0025
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]
        #grad1 = grad_map
        
        noise = np.sign(data_grad.asnumpy())
        for j in range(batch_size):
            noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8746	 Val Accuracy: 0.8461	 Train Loss: 0.36430
Train Accuracy: 0.8808	 Val Accuracy: 0.8478	 Train Loss: 0.35136
Train Accuracy: 0.8818	 Val Accuracy: 0.8459	 Train Loss: 0.34501
Train Accuracy: 0.8835	 Val Accuracy: 0.8504	 Train Loss: 0.34393
Train Accuracy: 0.8856	 Val Accuracy: 0.8485	 Train Loss: 0.33943
Train Accuracy: 0.8844	 Val Accuracy: 0.8515	 Train Loss: 0.33794
Train Accuracy: 0.8846	 Val Accuracy: 0.8517	 Train Loss: 0.33622
Train Accuracy: 0.8877	 Val Accuracy: 0.8515	 Train Loss: 0.33323
Train Accuracy: 0.8887	 Val Accuracy: 0.8505	 Train Loss: 0.32724
Train Accuracy: 0.8883	 Val Accuracy: 0.8516	 Train Loss: 0.32712
Train Accuracy: 0.8893	 Val Accuracy: 0.8515	 Train Loss: 0.32750
Train Accuracy: 0.8872	 Val Accuracy: 0.8508	 Train Loss: 0.32553
Train Accuracy: 0.8895	 Val Accuracy: 0.8510	 Train Loss: 0.32351
Train Accuracy: 0.8910	 Val Accuracy: 0.8497	 Train Loss: 0.32014
Train Accuracy: 0.8921	 Val Accuracy: 0.8509	 Train Loss: 0.31807
Train Accu



In [31]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.854
Fixed set perturbation: 0.817
L0 perturbation: 0.793
L2 perturbation: 0.744




Alpha perturbation: 0.729




In [32]:
num_round = 20
train_acc = 0.
nbatch = 0
coe_pb = 0.4
lr= 0.001
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]
        #grad1 = grad_map
        
        noise = np.sign(data_grad.asnumpy())
        for j in range(batch_size):
            noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        for name in arg_names:
            if name != "data":
                sum_map[name][:] += grad_map[name]

        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.8973	 Val Accuracy: 0.8549	 Train Loss: 0.30192
Train Accuracy: 0.8968	 Val Accuracy: 0.8542	 Train Loss: 0.30178
Train Accuracy: 0.8970	 Val Accuracy: 0.8531	 Train Loss: 0.30111
Train Accuracy: 0.8984	 Val Accuracy: 0.8550	 Train Loss: 0.29923
Train Accuracy: 0.8980	 Val Accuracy: 0.8544	 Train Loss: 0.30101
Train Accuracy: 0.8981	 Val Accuracy: 0.8537	 Train Loss: 0.29952
Train Accuracy: 0.8977	 Val Accuracy: 0.8536	 Train Loss: 0.29809
Train Accuracy: 0.8999	 Val Accuracy: 0.8537	 Train Loss: 0.29744
Train Accuracy: 0.8996	 Val Accuracy: 0.8539	 Train Loss: 0.29531
Train Accuracy: 0.8996	 Val Accuracy: 0.8546	 Train Loss: 0.29555
Train Accuracy: 0.9005	 Val Accuracy: 0.8561	 Train Loss: 0.29381
Train Accuracy: 0.8999	 Val Accuracy: 0.8555	 Train Loss: 0.29446
Train Accuracy: 0.9022	 Val Accuracy: 0.8555	 Train Loss: 0.28851
Train Accuracy: 0.9015	 Val Accuracy: 0.8560	 Train Loss: 0.28932
Train Accuracy: 0.9008	 Val Accuracy: 0.8557	 Train Loss: 0.29056
Train Accu



In [33]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.856
Fixed set perturbation: 0.818
L0 perturbation: 0.794
L2 perturbation: 0.745




Alpha perturbation: 0.734




# LWA

In [34]:
data = mx.sym.Variable('data')
conv1 = ConvFactory(data, 3, 1, 128)
conv2 = ConvFactory(conv1, 3, 1, 128)
conv3 = ConvFactory(conv2, 3, 1, 128)
mp1 = mx.sym.Pooling(data=conv3, pool_type="max", kernel=(3,3), stride=(2,2))
conv4 = ConvFactory(mp1, 3, 1, 256)
conv5 = ConvFactory(conv4, 3, 1, 256)
conv6 = ConvFactory(conv5, 3, 1, 256)
mp1 = mx.sym.Pooling(data=conv6, pool_type="max", kernel=(3,3), stride=(2,2))
fl = mx.sym.Flatten(data=mp1)
fc1 = mx.sym.FullyConnected(data=fl, num_hidden=2048)
act1 = mx.sym.Activation(data=fc1, act_type="relu")
dp1 = mx.sym.Dropout(data=act1, p=0.5)

fc2 = mx.sym.FullyConnected(data=dp1, num_hidden=2048)
act2 = mx.sym.Activation(data=fc2, act_type="relu")
dp2 = mx.sym.Dropout(data=act2, p=0.5)

flatten = mx.sym.FullyConnected(data=dp2, num_hidden=10)

In [35]:
data_shape = (batch_size, 3, 28, 28)
arg_names = flatten.list_arguments()
arg_shapes, output_shapes, aux_shapes = flatten.infer_shape(data=data_shape)
arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
grad_sum = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
aux_states =  [mx.nd.zeros(shape, ctx=dev) for shape in aux_shapes]
pred = mx.nd.zeros(output_shapes[0])
reqs = ["write" for name in arg_names]

model = flatten.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs, aux_states=aux_states)
arg_map = dict(zip(arg_names, arg_arrays))
grad_map = dict(zip(arg_names, grad_arrays))
sum_map = dict(zip(arg_names, grad_sum))
data_grad = grad_map["data"]
out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)

In [68]:
for name in arg_names:
    if "weight" in name:
        arr = arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(6 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    elif "gamma" in name:
        arr = arg_map[name]
        arr[:] = 1.0
    else:
        arr = arg_map[name]
        arr[:] = 0.

In [69]:
num_round = 50
train_acc = 0.
nbatch = 0
coe_pb = 0.3
lr = 0.025
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        noise = data_grad.asnumpy()
        for j in range(batch_size):
            noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.3130	 Val Accuracy: 0.4404	 Train Loss: 1.88494
Train Accuracy: 0.4303	 Val Accuracy: 0.5049	 Train Loss: 1.56248
Train Accuracy: 0.4868	 Val Accuracy: 0.5517	 Train Loss: 1.42107
Train Accuracy: 0.5322	 Val Accuracy: 0.5834	 Train Loss: 1.30721
Train Accuracy: 0.5680	 Val Accuracy: 0.6208	 Train Loss: 1.20917
Train Accuracy: 0.5999	 Val Accuracy: 0.6240	 Train Loss: 1.13124
Train Accuracy: 0.6234	 Val Accuracy: 0.6675	 Train Loss: 1.06487
Train Accuracy: 0.6484	 Val Accuracy: 0.6961	 Train Loss: 1.00828
Train Accuracy: 0.6635	 Val Accuracy: 0.7184	 Train Loss: 0.95761
Train Accuracy: 0.6799	 Val Accuracy: 0.7108	 Train Loss: 0.91757
Train Accuracy: 0.6965	 Val Accuracy: 0.7263	 Train Loss: 0.87111
Train Accuracy: 0.7119	 Val Accuracy: 0.7387	 Train Loss: 0.83575
Train Accuracy: 0.7198	 Val Accuracy: 0.7536	 Train Loss: 0.80711
Train Accuracy: 0.7324	 Val Accuracy: 0.7497	 Train Loss: 0.77855
Train Accuracy: 0.7388	 Val Accuracy: 0.7583	 Train Loss: 0.75184
Train Accu



In [70]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.849
Fixed set perturbation: 0.817
L0 perturbation: 0.786
L2 perturbation: 0.746




Alpha perturbation: 0.730




In [73]:
num_round = 5
train_acc = 0.
nbatch = 0
coe_pb = 0.3
lr = 0.003
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        noise = data_grad.asnumpy()
        for j in range(batch_size):
            noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9218	 Val Accuracy: 0.8611	 Train Loss: 0.23065
Train Accuracy: 0.9233	 Val Accuracy: 0.8623	 Train Loss: 0.22957
Train Accuracy: 0.9240	 Val Accuracy: 0.8614	 Train Loss: 0.22575
Train Accuracy: 0.9260	 Val Accuracy: 0.8612	 Train Loss: 0.22299
Train Accuracy: 0.9233	 Val Accuracy: 0.8616	 Train Loss: 0.22349




In [74]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.862
Fixed set perturbation: 0.828
L0 perturbation: 0.798
L2 perturbation: 0.760




Alpha perturbation: 0.748




In [75]:
num_round = 5
train_acc = 0.
nbatch = 0
coe_pb = 0.3
lr = 0.001
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        noise = data_grad.asnumpy()
        for j in range(batch_size):
            noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9239	 Val Accuracy: 0.8619	 Train Loss: 0.22008
Train Accuracy: 0.9273	 Val Accuracy: 0.8633	 Train Loss: 0.21952
Train Accuracy: 0.9261	 Val Accuracy: 0.8628	 Train Loss: 0.21955
Train Accuracy: 0.9265	 Val Accuracy: 0.8631	 Train Loss: 0.21647
Train Accuracy: 0.9277	 Val Accuracy: 0.8639	 Train Loss: 0.21512




In [76]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.864
Fixed set perturbation: 0.830
L0 perturbation: 0.802
L2 perturbation: 0.762




Alpha perturbation: 0.750




In [80]:
num_round = 5
train_acc = 0.
nbatch = 0
coe_pb = 0.3
lr = 0.0001
for i in range(num_round):
    train_loss = 0.
    train_acc = 0.
    nbatch = 0
    train_iter.reset()
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        arg_map["data"][:] = data
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        train_acc += CalAcc(alpha, label.asnumpy()) / batch_size
        train_loss += CalLoss(alpha, label.asnumpy()) / batch_size
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        
        noise = data_grad.asnumpy()
        for j in range(batch_size):
            noise[j] = noise[j]/np.linalg.norm(noise[j].flatten(),2)
        pdata = data.asnumpy() + coe_pb * noise
        arg_map["data"][:] = pdata
        model.forward(is_train=True)
        theta = model.outputs[0].asnumpy()
        alpha = Softmax(theta)
        losGrad_theta = LogLossGrad(alpha, label.asnumpy())
        out_grad[:] = losGrad_theta
        model.backward([out_grad])
        for name in arg_names:
            if name != "data":
                if name.endswith("weight"):
                    SGD(arg_map[name], grad_map[name], lr)
                else:
                    SGD(arg_map[name], grad_map[name], lr, 0)
        
        nbatch += 1
    train_acc /= nbatch
    train_loss /= nbatch
    val_acc = acc_normal(model, val_iter,arg_map, grad_map)
    print("Train Accuracy: %.4f\t Val Accuracy: %.4f\t Train Loss: %.5f" % (train_acc, val_acc, train_loss))

Train Accuracy: 0.9307	 Val Accuracy: 0.8642	 Train Loss: 0.20556
Train Accuracy: 0.9308	 Val Accuracy: 0.8643	 Train Loss: 0.20587
Train Accuracy: 0.9299	 Val Accuracy: 0.8643	 Train Loss: 0.20876
Train Accuracy: 0.9296	 Val Accuracy: 0.8639	 Train Loss: 0.20961
Train Accuracy: 0.9287	 Val Accuracy: 0.8641	 Train Loss: 0.21030




In [81]:
print('Normal Validation: %.3f' % acc_normal(model,val_iter,arg_map, grad_map))
print('Fixed set perturbation: %.3f' % acc_normal(model, perb_iter,arg_map, grad_map))
print('L0 perturbation: %.3f' % acc_perb_L0(model, val_iter, 0.5,arg_map, grad_map))
print('L2 perturbation: %.3f' % acc_perb_L2(model, val_iter, 0.5,arg_map, grad_map))
print('Alpha perturbation: %.3f' % acc_perb_alpha(model, val_iter, 0.5,arg_map, grad_map))

Normal Validation: 0.864
Fixed set perturbation: 0.831
L0 perturbation: 0.803
L2 perturbation: 0.760




Alpha perturbation: 0.750


