In [1]:
%matplotlib inline
import mxnet as mx
import numpy as np
import logging
import matplotlib.pyplot as plt

In [2]:
dev = mx.gpu()
batch_size = 100
data_shape = (1,28,28)
batch_shape = (100,1,28,28)
train_iter = mx.io.MNISTIter(
        image       = "../mxnet/mnist/train-images-idx3-ubyte",
        label       = "../mxnet/mnist/train-labels-idx1-ubyte",
        input_shape = data_shape,
        batch_size  = batch_size,
        shuffle     = True,
        flat        = False,
        ctx = dev)

val_iter = mx.io.MNISTIter(
        image       = "../mxnet/mnist/t10k-images-idx3-ubyte",
        label       = "../mxnet/mnist/t10k-labels-idx1-ubyte",
        input_shape = data_shape,
        batch_size  = batch_size,
        flat        = False,
        ctx = dev)

# Define Networks

In [3]:
def deconv2d(data, ishape, oshape, kshape, name, stride=(2,2)):
    pad0 = (stride[0] * (ishape[-2] - 1) + kshape[0] - oshape[-2])
    pad1 = (stride[1] * (ishape[-1] - 1) + kshape[1] - oshape[-1])
    assert pad0 >= 0
    assert pad1 >= 0
    assert pad0 % 2 == 0
    assert pad1 % 2 == 0
    net = mx.sym.Deconvolution(data,
                               kernel=kshape,
                               stride=stride,
                               pad=(pad0 / 2, pad1/2),
                               num_filter=oshape[0],
                               no_bias=True,
                               name=name)
    return net

def deconv2d_relu(data, name, **kwargs):
    net = deconv2d(data, name="%s_deconv"%name, **kwargs)
    net = mx.sym.Activation(net, name = "%s_act"%name, act_type = 'relu')
    return net
def deconv2d_act(data, name, act, **kwargs):
    net = deconv2d(data, name="%s_deconv"%name, **kwargs)
    net = mx.sym.Activation(net, name = "%s_act"%name, act_type = act)
    return net

def conv2d_relu(data, name, **kwargs):
    net = mx.sym.Convolution(data, name = "%s_conv"%name, **kwargs)
    net = mx.sym.Activation(net, act_type='relu', name = "%s_act"%name)
    return net
def conv2d_act(data, name, act, **kwargs):
    net = mx.sym.Convolution(data, name = "%s_conv"%name, **kwargs)
    net = mx.sym.Activation(net, act_type=act, name = "%s_act"%name)
    return net

In [4]:
def Decoder(oshape, final_act, data = None, ngf=20):
    assert oshape[-1]==28
    assert oshape[-2]==28
    data = mx.sym.Variable('data') if data is None else data
    net = mx.sym.FullyConnected(data, name = 'DC_fc1', num_hidden = 8*8*50, no_bias=True)
    net = mx.sym.Activation(net, name = 'dc_act1', act_type = 'relu')
    net = mx.sym.Reshape(net, shape = (-1, 50, 8, 8))
    #net = deconv2d_relu(net, name = 'DC2', ishape = (ngf*4,4,4), oshape=(ngf*2,8,8), kshape = (4,4))
    net = deconv2d_relu(net, name = 'DC3', ishape = (50,8,8), oshape=(ngf,14,14), kshape = (4,4))
    net = deconv2d_act(net, act = final_act, name = 'DC4', ishape = (ngf,14,14), oshape=oshape, kshape = (4,4))
    return net

In [5]:
def Encoder(oshape, data=None, ngf=20):
    data = mx.sym.Variable('data') if data is None else data
    net = conv2d_relu(data, kernel = (4,4), stride = (2,2), pad = (1,1), num_filter=ngf, name = 'EC_conv1')
    net = conv2d_relu(net, kernel = (4,4), stride = (2,2), pad = (2,2), num_filter=50, name = 'EC_conv2')
    #net = conv2d_relu(net, kernel = (4,4), stride = (2,2), pad = (1,1), num_filter=ngf*4, name = 'EC_conv3')
    net = mx.sym.Flatten(net)
    net = mx.sym.Activation(net, name = 'EC_act1', act_type = 'relu')
    net = mx.sym.FullyConnected(net, num_hidden = oshape[-1], name = 'EC_fc1')
    net = mx.sym.Activation(net, name = 'EC_act2', act_type = 'relu')
    return net

In [6]:
def Classifier(data = None, num_hidden=32):
    data = mx.sym.Variable('data') if data is None else data
    net = mx.sym.FullyConnected(data, num_hidden = num_hidden, name = "C_fc1")
    net = mx.sym.Activation(net, name = 'C_act1', act_type = 'relu')
    net = mx.sym.FullyConnected(net, num_hidden = 10, name = 'C_fc_2')
    #net = mx.sym.SoftmaxOutput(net, name = 'C_softmax')
    return net

## Initialization

In [7]:
coder_shape = (32,)
batch_coder_shape = (batch_size, 32)
D_net = Decoder(data_shape, final_act = 'sigmoid')
E_net = Encoder(coder_shape)
C_net = Classifier()

In [8]:
D_arg_names = D_net.list_arguments()
D_arg_shapes, D_output_shapes, D_aux_shapes = D_net.infer_shape(data = batch_coder_shape)
D_arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in D_arg_shapes]
D_grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in D_arg_shapes]
D_aux_states =  [mx.nd.zeros(shape, ctx=dev) for shape in D_aux_shapes]
D_reqs = ["write" for name in D_arg_names]

modD = D_net.bind(ctx=dev, args=D_arg_arrays, args_grad = D_grad_arrays, grad_req=D_reqs,  aux_states=D_aux_states)
D_arg_map = dict(zip(D_arg_names, D_arg_arrays))
D_grad_map = dict(zip(D_arg_names, D_grad_arrays))
D_data_grad = D_grad_map["data"]
D_out_grad = mx.nd.zeros(modD.outputs[0].shape, ctx=dev)

In [9]:
C_arg_names = C_net.list_arguments()
C_arg_shapes, C_output_shapes, C_aux_shapes = C_net.infer_shape(data = batch_coder_shape)
C_arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in C_arg_shapes]
C_grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in C_arg_shapes]
C_aux_states =  [mx.nd.zeros(shape, ctx=dev) for shape in C_aux_shapes]
C_reqs = ["write" for name in C_arg_names]

modC = C_net.bind(ctx=dev, args=C_arg_arrays, args_grad = C_grad_arrays, grad_req=C_reqs,  aux_states=C_aux_states)
C_arg_map = dict(zip(C_arg_names, C_arg_arrays))
C_grad_map = dict(zip(C_arg_names, C_grad_arrays))
C_data_grad = C_grad_map["data"]
C_out_grad = mx.nd.zeros(modC.outputs[0].shape, ctx=dev)

In [10]:
E_arg_names = E_net.list_arguments()
E_arg_shapes, E_output_shapes, E_aux_shapes = E_net.infer_shape(data = batch_shape)
E_arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in E_arg_shapes]
E_grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in E_arg_shapes]
E_aux_states =  [mx.nd.zeros(shape, ctx=dev) for shape in E_aux_shapes]
E_reqs = ["write" for name in E_arg_names]

modE = E_net.bind(ctx=dev, args=E_arg_arrays, args_grad = E_grad_arrays, grad_req=E_reqs,  aux_states=E_aux_states)
E_arg_map = dict(zip(E_arg_names, E_arg_arrays))
E_grad_map = dict(zip(E_arg_names, E_grad_arrays))
E_data_grad = C_grad_map["data"]
E_out_grad = mx.nd.zeros(modE.outputs[0].shape, ctx=dev)

In [11]:
print modE.outputs[0].shape
print D_arg_names
print modD.outputs[0].shape

(100L, 32L)
['data', 'DC_fc1_weight', 'DC3_deconv_weight', 'DC4_deconv_weight']
(100L, 1L, 28L, 28L)


## Attack Methods

### Fast Gradient

In [12]:
def Validate_Adv(val_iter, norm=2, coe= 2):
    val_iter.reset()
    val_acc = 0.0
    num_sin = 0
    num_batch = 0
    for dbatch in val_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        E_arg_map['data'][:] = data
        modE.forward(is_train = True)
        coder = modE.outputs[0]
        
        C_arg_map['data'][:] = coder
        modC.forward(is_train=True)
        theta = modC.outputs[0].asnumpy()
        alpha = softmax(theta)
        
        logGrad = logLossGrad(alpha, label.asnumpy())
        C_out_grad[:] = logGrad
        modC.backward([C_out_grad])
        
        modE.backward([C_data_grad])
        if norm==2:
            noise = E_grad_map['data'].asnumpy()
        else:
            noise = np.sign(E_grad_map['data'].asnumpy())
        
        for j in range(batch_size):
            if np.linalg.norm(noise[j].flatten(),2)==0:
                num_sin +=1
                noise[j] = mx.rnd.normal(0,0.07, shape = noise[j].shape).asnumpy()
            if (label.asnumpy()[j] == np.argmax(alpha[j])):
                noise[j] = noise[j]/ np.linalg.norm(noise[j].flatten(), norm)
                
            else:
                noise[j]=0
        
        data_adv = data.asnumpy() + coe * noise
        E_arg_map['data'][:] = data_adv
        modE.forward(is_train = False)
        coder = modE.outputs[0]
        
        C_arg_map['data'][:] = coder
        modC.forward(is_train=True)
        theta = modC.outputs[0].asnumpy()
        alpha = softmax(theta)
        val_acc += CalAcc(alpha, label.asnumpy())
        num_batch += 1
    if num_sin>0: 
        print ('Number of 0 gradient:', num_sin)
    return (val_acc/num_batch)  

## Aux Functions

In [13]:
def CalAcc(prob, label):
    return np.sum(np.argmax(prob, axis=1)==label)*1.0/batch_size
def EDLoss(decoder, data):
    res = 0.0
    
    temp = decoder - data
    for j in range(batch_size):
        res += mx.nd.norm(temp[j])
    return res/batch_size

def CLoss(prob, label):
    res = 0.0
    for j in range(batch_size):
        res -= np.log(prob[j][int(label[j])])
    return res/batch_size

def SGD(weight, grad, lr = 0.05, wd = 0.0001):
    weight[:] -= lr*(grad/batch_size + wd*weight) 
    
def softmax(theta):
    tmp = theta - np.max(theta, axis=1, keepdims = True)
    exp = np.exp(tmp)
    norm = np.sum(exp, axis=1, keepdims = True)
    return exp/norm
    
def logLossGrad(alpha,label):
    res = np.copy(alpha)
    for j in range(alpha.shape[0]):
        res[j][int(label[j])] -= 1
    return res

In [14]:
def Validate_loss(val_iter):
    val_loss = 0.0
    val_iter.reset()
    nbatch = 0
    for dbatch in val_iter:
        data = dbatch.data[0]
        E_arg_map['data'][:] = data
        modE.forward(is_train=False)
        coder = modE.outputs[0]
        
        D_arg_map['data'][:] = coder
        modD.forward(is_train = False)
        decoder = modD.outputs[0]
        
        data_gpu = mx.nd.zeros(shape = data.shape, ctx = dev)
        data.copyto(data_gpu)
        val_loss += EDLoss(decoder, data_gpu)
        nbatch +=1
    return val_loss/nbatch

## Normal Training

In [17]:
mx.rnd.seed(17214)
for name in E_arg_names:
    if "weight" in name:
        arr = E_arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(2.34 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    else:
        arr = E_arg_map[name]
        arr[:] = 0.
for name in C_arg_names:
    if "weight" in name:
        arr = C_arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(2.34 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    else:
        arr = C_arg_map[name]
        arr[:] = 0.



In [18]:
num_epoch = 60
lr = 0.2

Training_normal = np.zeros(shape = (num_epoch))
Validation_normal = np.zeros(shape = (num_epoch))
Adv_normal = np.zeros(shape = (num_epoch))

for i in range(num_epoch):
    if i%30==29: 
        lr = lr/2.0
    train_iter.reset()
    train_acc = 0.0
    loss_total = 0.0
    num_batch = 0
    
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        
        E_arg_map['data'][:] = data
        modE.forward(is_train=True)
        coder = modE.outputs[0]
        
        C_arg_map['data'][:] = coder
        modC.forward(is_train=True)
        theta = modC.outputs[0].asnumpy()
        alpha = softmax(theta)
        #print coder.context, decoder.context, prob.context
        
        loss_total +=  CLoss(alpha, label.asnumpy())
        train_acc += CalAcc(alpha, label.asnumpy())
        logGrad = logLossGrad(alpha, label.asnumpy())
        
        C_out_grad[:] = logGrad
        modC.backward([C_out_grad])
        temp2 = C_grad_map['data']
        E_out_grad[:] = temp2
        modE.backward([E_out_grad])
                
        for name in C_arg_names:
            if name!='data':
                SGD(C_arg_map[name], C_grad_map[name], lr)
                
        for name in E_arg_names:
            if name!='data':
                SGD(E_arg_map[name], E_grad_map[name], lr)
        num_batch +=1
        if num_batch % 300==299:
            print "Training Loss: %.4f\t Training Accuracy: %.4f" %(loss_total/num_batch,train_acc/num_batch)
    Training_normal[i] = train_acc/num_batch
    Validation_normal[i] = Validate_Adv(val_iter, norm=2, coe=0)
    Adv_normal[i] = Validate_Adv(val_iter, norm=2, coe=2)
    print "epoch: %d Validation Accuracy: %.4f\t Adverserial Accuracy: %.4f" \
        %(i, Validation_normal[i], Adv_normal[i])

Training Loss: 0.5195	 Training Accuracy: 0.8312
Training Loss: 0.3245	 Training Accuracy: 0.8959
epoch: 0 Validation Accuracy: 0.9636	 Adverserial Accuracy: 0.3024
Training Loss: 0.0960	 Training Accuracy: 0.9701
Training Loss: 0.0837	 Training Accuracy: 0.9742
epoch: 1 Validation Accuracy: 0.9759	 Adverserial Accuracy: 0.3529
Training Loss: 0.0640	 Training Accuracy: 0.9809
Training Loss: 0.0576	 Training Accuracy: 0.9827
epoch: 2 Validation Accuracy: 0.9810	 Adverserial Accuracy: 0.3875
Training Loss: 0.0467	 Training Accuracy: 0.9861
Training Loss: 0.0433	 Training Accuracy: 0.9871
epoch: 3 Validation Accuracy: 0.9821	 Adverserial Accuracy: 0.3894
Training Loss: 0.0360	 Training Accuracy: 0.9893
Training Loss: 0.0335	 Training Accuracy: 0.9900
epoch: 4 Validation Accuracy: 0.9851	 Adverserial Accuracy: 0.4067
Training Loss: 0.0282	 Training Accuracy: 0.9914
Training Loss: 0.0260	 Training Accuracy: 0.9922
epoch: 5 Validation Accuracy: 0.9852	 Adverserial Accuracy: 0.4427
Training L

## DC Training

In [19]:
mx.rnd.seed(3128)
for name in E_arg_names:
    if "weight" in name:
        arr = E_arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(2.34 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    else:
        arr = E_arg_map[name]
        arr[:] = 0.
for name in D_arg_names:
    if "weight" in name:
        arr = D_arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(2.34 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    else:
        arr = D_arg_map[name]
        arr[:] = 0.
for name in C_arg_names:
    if "weight" in name:
        arr = C_arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(2.34 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    else:
        arr = C_arg_map[name]
        arr[:] = 0.

In [21]:
num_epoch = 60
lr = 0.1
c = 0.01

Training_DC = np.zeros(shape = (num_epoch))
Validation_DC = np.zeros(shape = (num_epoch))
Adv_DC = np.zeros(shape = (num_epoch))

for i in range(num_epoch):
    if i%30==29: 
        lr = lr/2.0
        c = c/1.2
    train_iter.reset()
    train_acc = 0.0
    loss_total = 0.0
    num_batch = 0
    
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        
        E_arg_map['data'][:] = data
        modE.forward(is_train=True)
        coder = modE.outputs[0]
        
        D_arg_map['data'][:] = coder
        modD.forward(is_train=True)
        decoder = modD.outputs[0]
        
        C_arg_map['data'][:] = coder
        modC.forward(is_train=True)
        theta = modC.outputs[0].asnumpy()
        alpha = softmax(theta)
        #print coder.context, decoder.context, prob.context
        
        data_gpu = mx.nd.zeros(shape = data.shape, ctx = dev)
        data.copyto(data_gpu)
        
        loss_total += c* EDLoss(decoder, data_gpu).asnumpy() + CLoss(alpha, label.asnumpy())
        train_acc += CalAcc(alpha, label.asnumpy())
        
        logGrad = logLossGrad(alpha, label.asnumpy())
        D_out_grad[:] = decoder-data_gpu 
        C_out_grad[:] = logGrad
        
        modD.backward([D_out_grad])
        modC.backward([C_out_grad])
        temp1 = D_grad_map['data']
        temp2 = C_grad_map['data']
        E_out_grad[:] = 2*c*temp1+temp2
        modE.backward([E_out_grad])
        
        for name in D_arg_names:
            if name!='data':
                SGD(D_arg_map[name], D_grad_map[name], lr)
                
        for name in C_arg_names:
            if name!='data':
                SGD(C_arg_map[name], C_grad_map[name], lr)
                
        for name in E_arg_names:
            if name!='data':
                SGD(E_arg_map[name], E_grad_map[name], lr)
        num_batch +=1
        if num_batch % 300==299:
            print EDLoss(decoder, data_gpu).asnumpy(), CLoss(alpha, label.asnumpy())
            print "Training Loss: %.4f\t Training Accuracy: %.4f" %(loss_total/num_batch,train_acc/num_batch)
    Training_DC[i] = train_acc/num_batch
    Validation_DC[i] = Validate_Adv(val_iter, norm=2, coe=0)
    Adv_DC[i] = Validate_Adv(val_iter, norm=2, coe=2)
    print "epoch: %d Validation Accuracy: %.4f\t Adverserial Accuracy: %.4f" \
        %(i, Validation_DC[i], Adv_DC[i])

[ 2.50040483] 0.000837528332179
Training Loss: 0.0273	 Training Accuracy: 1.0000




[ 3.54174566] 0.058087625865
Training Loss: inf	 Training Accuracy: 0.9282
epoch: 0 Validation Accuracy: 0.9612	 Adverserial Accuracy: 0.3595
[ 3.08207774] 0.0867063972341
Training Loss: 0.1390	 Training Accuracy: 0.9690
[ 2.79468513] 0.016498919891
Training Loss: 0.1168	 Training Accuracy: 0.9746
epoch: 1 Validation Accuracy: 0.9800	 Adverserial Accuracy: 0.3729
[ 2.83800578] 0.0229289301246
Training Loss: 0.0818	 Training Accuracy: 0.9836
[ 2.62474298] 0.0156534225458
Training Loss: 0.0760	 Training Accuracy: 0.9850
epoch: 2 Validation Accuracy: 0.9841	 Adverserial Accuracy: 0.3698
[ 2.6612916] 0.0189133982289
Training Loss: 0.0640	 Training Accuracy: 0.9881
[ 2.60176301] 0.00579740682569
Training Loss: 0.0608	 Training Accuracy: 0.9892
epoch: 3 Validation Accuracy: 0.9855	 Adverserial Accuracy: 0.3543
[ 2.52420568] 0.0105934825015
Training Loss: 0.0539	 Training Accuracy: 0.9908
[ 2.29876471] 0.00501602858311
Training Loss: 0.0521	 Training Accuracy: 0.9918
epoch: 4 Validation Accur

In [None]:
for name in E_arg_names:
    print np.linalg.norm(E_arg_map[name].asnumpy().flatten(),2)
    print np.linalg.norm(E_grad_map[name].asnumpy().flatten(),2)
    print "="*8
print "="*18
for name in D_arg_names:
    print np.linalg.norm(D_arg_map[name].asnumpy().flatten(),2)
    print np.linalg.norm(D_grad_map[name].asnumpy().flatten(),2)
    print "="*8
print "="*18
for name in C_arg_names:
    print np.linalg.norm(C_arg_map[name].asnumpy().flatten(),2)
    print np.linalg.norm(C_grad_map[name].asnumpy().flatten(),2)
    print "="*8

## Normal Adversarial Training

## DC Adversarial Training

In [None]:
mx.rnd.seed(12229)
for name in E_arg_names:
    if "weight" in name:
        arr = E_arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(2.34 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    else:
        arr = E_arg_map[name]
        arr[:] = 0.
for name in D_arg_names:
    if "weight" in name:
        arr = D_arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(2.34 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    else:
        arr = D_arg_map[name]
        arr[:] = 0.
for name in C_arg_names:
    if "weight" in name:
        arr = C_arg_map[name]
        shape = arr.shape
        fan_in, fan_out = np.prod(shape[1:]), shape[0]
        factor = fan_in
        scale = np.sqrt(2.34 / factor)
        arr[:] = mx.rnd.uniform(-scale, scale, arr.shape)
    else:
        arr = C_arg_map[name]
        arr[:] = 0.

In [None]:
num_epoch = 2
lr = 0.015
c = 0.1

for i in range(num_epoch):
    train_iter.reset()
    train_acc = 0.0
    loss_total = 0.0
    num_batch = 0
    
    for dbatch in train_iter:
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        
        E_arg_map['data'][:] = data
        modE.forward(is_train=True)
        coder = modE.outputs[0]
        
        D_arg_map['data'][:] = coder
        modD.forward(is_train=True)
        decoder = modD.outputs[0]
        
        C_arg_map['data'][:] = coder
        modC.forward(is_train=True)
        theta = modC.outputs[0].asnumpy()
        alpha = softmax(theta)
        #print coder.context, decoder.context, prob.context
        
        data_gpu = mx.nd.zeros(shape = data.shape, ctx = dev)
        data.copyto(data_gpu)
        
        loss_total += c* EDLoss(decoder, data_gpu).asnumpy() + CLoss(alpha, label.asnumpy())
        train_acc += CalAcc(alpha, label.asnumpy())
        
        logGrad = logLossGrad(alpha, label.asnumpy())
        D_out_grad[:] = decoder-data_gpu 
        C_out_grad[:] = logGrad
        
        modD.backward([D_out_grad])
        modC.backward([C_out_grad])
        temp1 = D_grad_map['data']
        temp2 = C_grad_map['data']
        E_out_grad[:] = 2*c*temp1+temp2
        modE.backward([E_out_grad])
        
        for name in D_arg_names:
            if name!='data':
                SGD(D_arg_map[name], D_grad_map[name], lr)
                
        for name in C_arg_names:
            if name!='data':
                SGD(C_arg_map[name], C_grad_map[name], lr)
                
        for name in E_arg_names:
            if name!='data':
                SGD(E_arg_map[name], E_grad_map[name], lr)
        num_batch +=1
        if num_batch % 300==299:
            print EDLoss(decoder, data_gpu).asnumpy(), CLoss(alpha, label.asnumpy())
            print "Training Loss: %.4f\t Training Accuracy: %.4f" %(loss_total/num_batch,train_acc/num_batch)
    
    print "epoch: %d Validation Accuracy: %.4f\t Adverserial Accuracy: %.4f" \
        %(i, Validate_Adv(val_iter, norm=2, coe=0),Validate_Adv(val_iter, norm=2, coe=2))

In [None]:
num_epoch = 200
lr = 0.3
c = 0.1
coe = 0.7

Training_ACC_adv = np.zeros(shape = (num_epoch))
Validation_ACC_adv = np.zeros(shape = (num_epoch))
Adv_ACC_adv = np.zeros(shape = (num_epoch))

for i in range(num_epoch):
    if i%50==49: 
        lr = lr/1.5
        c = c*1.2
    train_iter.reset()
    train_acc = 0.0
    loss_total = 0.0
    num_batch = 0
    
    for dbatch in train_iter:
        ##================Create adv examples
        data = dbatch.data[0]
        label = dbatch.label[0]
        
        E_arg_map['data'][:] = data
        modE.forward(is_train = True)
        coder = modE.outputs[0]
        
        C_arg_map['data'][:] = coder
        modC.forward(is_train=True)
        theta = modC.outputs[0].asnumpy()
        alpha = softmax(theta)
        
        logGrad = logLossGrad(alpha, label.asnumpy())
        C_out_grad[:] = logGrad
        modC.backward([C_out_grad])
        
        modE.backward([C_data_grad])
        noise = E_grad_map['data'].asnumpy()
        
        num_sin=0
        for j in range(batch_size):
            if np.linalg.norm(noise[j].flatten(),2)==0:
                num_sin +=1
                noise[j] = mx.rnd.normal(0, 0.07, shape = noise[j].shape).asnumpy()
            elif (label.asnumpy()[j] == np.argmax(alpha[j])):
                noise[j] = noise[j]/ np.linalg.norm(noise[j].flatten(), 2)
                
            else:
                noise[j]=0
        
        data_adv = data.asnumpy() + coe * noise
        ##===============================================
        
        E_arg_map['data'][:] = data_adv
        modE.forward(is_train=True)
        coder = modE.outputs[0]
        
        D_arg_map['data'][:] = coder
        modD.forward(is_train=True)
        decoder = modD.outputs[0]
        
        C_arg_map['data'][:] = coder
        modC.forward(is_train=True)
        theta = modC.outputs[0].asnumpy()
        alpha = softmax(theta)
        #print coder.context, decoder.context, prob.context
        
        data_gpu = mx.nd.zeros(shape = data.shape, ctx = dev)
        data.copyto(data_gpu)
        
        loss_total += c* EDLoss(decoder, data_gpu).asnumpy() + CLoss(alpha, label.asnumpy())
        train_acc += CalAcc(alpha, label.asnumpy())
        
        logGrad = logLossGrad(alpha, label.asnumpy())
        D_out_grad[:] = decoder-data_gpu 
        C_out_grad[:] = logGrad
        
        modD.backward([D_out_grad])
        modC.backward([C_out_grad])
        temp1 = D_grad_map['data']
        temp2 = C_grad_map['data']
        E_out_grad[:] = 2*c*temp1+temp2
        modE.backward([E_out_grad])
        
        for name in D_arg_names:
            if name!='data':
                SGD(D_arg_map[name], D_grad_map[name], lr)
        for name in C_arg_names:
            if name!='data':
                SGD(C_arg_map[name], C_grad_map[name], lr)
        for name in E_arg_names:
            if name!='data':
                SGD(E_arg_map[name], E_grad_map[name], lr)
        num_batch +=1
        if num_batch % 300==299:
            if num_sin>0: print "Number of 0 Gradient: %d" %num_sin
            print EDLoss(decoder, data_gpu).asnumpy(), CLoss(alpha, label.asnumpy())
            print "Training Loss: %.4f\t Training Accuracy: %.4f" %(loss_total/num_batch,train_acc/num_batch)
    
    Training_ACC_adv[i] = train_acc/num_batch
    Validation_ACC_adv[i] = Validate_Adv(val_iter, norm=2, coe=0)
    Adv_ACC_adv[i] = Validate_Adv(val_iter, norm=2, coe=2)
    print "epoch: %d Validation Accuracy: %.4f\t Adverserial Accuracy: %.4f" \
        %(i, Validation_ACC_adv[i],Adv_ACC_adv[i])
    

In [None]:
for name in E_arg_names:
    print np.linalg.norm(E_arg_map[name].asnumpy().flatten(),2)
    print np.linalg.norm(E_grad_map[name].asnumpy().flatten(),2)
    print "="*8
print "="*18
for name in D_arg_names:
    print np.linalg.norm(D_arg_map[name].asnumpy().flatten(),2)
    print np.linalg.norm(D_grad_map[name].asnumpy().flatten(),2)
    print "="*8
print "="*18
for name in C_arg_names:
    print np.linalg.norm(C_arg_map[name].asnumpy().flatten(),2)
    print np.linalg.norm(C_grad_map[name].asnumpy().flatten(),2)
    print "="*8

## Transferability

In [25]:
tt = np.array([[1,2,3],[2,3,4]])

In [26]:
tt = tt/13.0

In [27]:
tt

array([[ 0.07692308,  0.15384615,  0.23076923],
       [ 0.15384615,  0.23076923,  0.30769231]])

In [28]:
tt = np.floor(tt*16)/16

In [29]:
tt

array([[ 0.0625,  0.125 ,  0.1875],
       [ 0.125 ,  0.1875,  0.25  ]])