In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.autograd import grad
from utils import *

In [2]:
def redim(x):
    """Return a copy of the 2-D array `x` with an extra trailing column of ones.

    The added column has one entry per row of `x` (x.shape[0]).
    """
    ones_column = np.ones((x.shape[0], 1))
    return np.concatenate((x, ones_column), axis=1)

In [3]:
def stand(x, mean, std):
    """Standardise `x`: subtract `mean`, then divide by `std`.

    The statistics are supplied by the caller, so the same pair can be
    applied to several datasets.
    """
    return (x - mean) / std

## Benchmark for regression

In [4]:
# Load the white-wine quality table; the file uses ';' as its field separator.
data = pd.read_csv("data/winequality-white.csv", sep = ";")
# Preview the first rows.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
# Features are every column except the target; the target is `quality`.
X = data.drop(columns=["quality"])
y = data["quality"]

Project the features onto their principal components (with `n_components=11` all components are kept).

In [6]:
# Fit the PCA once and reuse that fit for the projection.
pca = PCA(n_components=11, svd_solver='full')
x = pca.fit_transform(X)
print(pca.explained_variance_ratio_)

# Fixed random_state so the train/test partition (and every number derived
# from it) is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
x_train = np.asarray(x_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.float32).reshape(-1, 1)
# Original note: "a bit of a cheat" -- presumably because the PCA above was
# fitted on all rows, including the ones that end up in the test split.
x_test = np.asarray(x_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32).reshape(-1, 1)
### to torch
# Plain tensors are enough; the legacy Variable wrapper is not needed.
x_train = torch.from_numpy(x_train)
y_train = torch.from_numpy(y_train)
x_test = torch.from_numpy(x_test)
y_test = torch.from_numpy(y_test)

[9.09657344e-01 7.93338631e-02 1.01542742e-02 5.06004450e-04
 3.23409395e-04 8.72769740e-06 6.72986618e-06 5.39060918e-06
 4.07002123e-06 1.86525322e-07 1.49217279e-10]


In [7]:
# Standardise both splits with the TRAINING statistics, computed once.
train_mean = x_train.mean(dim=0)
train_std = x_train.std(dim=0)
x_train_renom = stand(x_train, train_mean, train_std)
x_test_renom = stand(x_test, train_mean, train_std)

### Using torch

Convention: last weight will be the bias.

In [8]:
# Single row of parameters: one entry per input column plus one trailing bias slot.
w = torch.randn(1, x_train.shape[1] + 1, requires_grad=True)
# Regularisation strength; `mse` reads this name as a global.
lmb = 0.0

In [9]:
def model(x, w):
    """Linear prediction `x @ weights^T + bias`.

    `w` is a single-row tensor whose last entry is the bias and whose
    remaining entries are the weights. Returns one column of predictions.
    """
    coeffs = w[0, :-1]
    intercept = w[0, -1]
    # Column vector of weights, equivalent to transposing the (1, d) row.
    return x @ coeffs.view(-1, 1) + intercept

def mse(t1, t2, w):
    """Mean squared difference between `t1` and `t2` plus a ridge penalty.

    The penalty is `lmb * w @ w.t()`, where `lmb` is read from the enclosing
    (global) scope and `w` is the full parameter row, so the bias entry is
    penalised as well. Because the penalty is a 1x1 matrix, so is the result.
    """
    residual = t1 - t2
    data_term = torch.sum(residual * residual) / residual.numel()
    penalty = lmb*w @ w.t()
    return data_term + penalty

In [10]:
# Step size and number of iterations for the gradient-descent fit below.
lr = 0.01
epochs = 1000

In [11]:
# Plain gradient descent on the regularised MSE.
# Epochs are counted from 1 so the progress check needs no manual increment.
for epoch in range(1, epochs + 1):
    preds = model(x_train_renom, w)
    loss = mse(preds, y_train, w)
    loss.backward()
    with torch.no_grad():
        # Manual parameter update, then clear the accumulated gradient.
        w -= w.grad * lr
        w.grad.zero_()

    if epoch % 1000 == 0:
        # .item() extracts the Python number from the single-element loss tensor.
        print('epoch {}, loss {}'.format(epoch, loss.item()))
    

epoch 1000, loss tensor([ 0.5743])


In [12]:
# Fitted parameters; by the notebook's convention the last entry is the bias.
print(w)

tensor([[-0.1249,  0.1581, -0.0413,  0.3557,  0.0124,  0.0770,  0.0880,
         -0.0032, -0.1719, -0.0307, -0.0728,  5.8818]])


In [13]:
def rmse(y, pred):
    """Root of the mean squared difference between `pred` and `y`."""
    squared_error = (pred - y) ** 2
    return torch.sqrt(torch.mean(squared_error))

In [14]:
# RMSE of the fitted model on the standardised (un-attacked) test split.
preds = model(x_test_renom, w)
rmse(y_test, preds)

tensor(0.7360)

## Adversarial attack. Non-Bayesian case

Let $X$ denote the clean dataset, and $X^* = T(X, \beta)$ the attacked dataset obtained when the classifier chooses parameters $\beta$. We try to solve the following Defend-Attack game:

$$
\beta^* = \arg\min_{\beta} \widehat{\theta}_C [\beta, T(X, \beta)] = \arg\min_{\beta} \sum_{i=1}^n \left( T(X, \beta)^{[i]}\beta^{\top} - y_i \right)^2 + \lambda \beta \beta^{\top}
$$

subject to

$$
X^* = T(X, \beta) = \arg\min_{X'} \widehat{\theta}_A [\beta, X'] = \arg\min_{X'} \sum_{i=1}^n c_{i}\left( X'^{[i]}\beta^{\top} - z_i \right)^2 + ||X-X'||^2_{F}
$$

where $y$ are the true labels, $z$ are the attacker's targets and $c$ are instance-specific factors, which are common knowledge here. The attacker's problem can be solved exactly, yielding

$$
X^* = T(X, \beta) = X - \left(\text{diag}(c_d)^{-1} + \beta \beta^{\top} I_n \right)^{-1} (X\beta^{\top} - z)\beta
$$

We could then compute the gradient for the classifier problem using

$$
\nabla \widehat{\theta}_C [\beta, T(X, \beta)] = \nabla_{\beta} \widehat{\theta}_C [\beta, T(X, \beta)] + \nabla_T  \widehat{\theta}_C [\beta, T(X, \beta)] \frac{\partial T(X,\beta)}{\partial \beta}
$$

and use gradient descent to find $\beta^*$.

## Defense - Forward mode

### Attack - Analytic form

In [15]:
# Exact solution to the attacker problem
def attack(w, instance, c_d, z):
    """Closed-form attacked data for a linear model.

    Computes
        instance - (1/c_d + weights weights^T)^(-1) * (instance weights^T + bias - z) weights
    where the first factor is applied per row.

    Parameters
    ----------
    w : single-row tensor; last entry is the bias, the rest are the weights.
    instance : (n, d) tensor of clean rows.
    c_d : per-row factors, shaped (n, 1) or (n,).
    z : per-row targets; must broadcast against the (n, 1) predictions.

    Returns
    -------
    (n, d) tensor of attacked rows.
    """
    weights = w[0, :-1].view(1, -1)
    bias = w[0, -1]
    # Per-row scalar. Multiplying row-wise gives the same result as building
    # an n-by-n diagonal matrix and doing a dense matmul, without the
    # quadratic memory and time.
    scale = (1 / c_d + weights @ weights.t()) ** (-1)
    scale = scale.reshape(-1, 1)
    direction = (instance @ weights.t() - (z - bias)) @ weights
    return instance - scale * direction

In [16]:
value = 0.5 ## Same c_i for every instance
# Column of per-row factors and a column of zero targets, one entry per test row.
c_d = torch.ones([len(y_test), 1])*value
z = torch.zeros([len(y_test),1]) 
# Closed-form attack on the standardised test split against the fitted w.
out = attack(w, x_test_renom, c_d, z)

In [17]:
# Compare test RMSE on clean versus attacked inputs, using the rmse helper
# defined earlier in the notebook.
pred_at = model(out, w)
pred_clean = model(x_test_renom, w)
print("Clean test RMSE: ", rmse(y_test, pred_clean))
print("Attacked test RMSE: ", rmse(y_test, pred_at))

Clean test RMSE:  tensor(0.7360)
Attacked est RMSE:  tensor(0.9263)


### Attack - Using torch

In [18]:
# Settings for the iterative (gradient-descent) attack.
lr = 10e-2  # note: 10e-2 equals 0.1
epochs = 100
value = 0.5
#
# Same per-row factor and zero targets as in the analytic attack.
c_d = torch.ones([len(y_test), 1])*value
z = torch.zeros([len(y_test),1]) 
#

In [19]:
def attacker_cost_flatten(w, x, x_old, c_d, z):
    """Attacker objective for attacked data `x` given as a flat tensor.

    cost = sum_i c_i * (x_i weights^T + bias - z_i)^2 + ||x_old - x||^2

    Parameters
    ----------
    w : single-row tensor; last entry is the bias, the rest are the weights.
    x : attacked data; reshaped to have as many rows as `x_old`.
    x_old : (n, d) clean data.
    c_d : per-row factors, expected to broadcast against (n, 1).
    z : per-row targets, expected to broadcast against (n, 1).

    Returns
    -------
    Scalar tensor.
    """
    weights = w[0, :-1].view(1, -1)
    bias = w[0, -1]
    x = x.view(x_old.shape[0], -1)
    # Distance of the prediction from the attacker's target z. The target has
    # to enter here so this cost agrees with the closed-form `attack`.
    residual = x @ weights.t() + bias - z
    perturbation = x_old - x
    return torch.sum(c_d * residual ** 2) + torch.sum(perturbation ** 2)

In [20]:
instance = x_test_renom
# Closed-form optimum, kept so the iterative result can be compared to it.
out = attack(w, instance, c_d, z)
# Flat random starting point with one entry per element of `instance`.
attacked_instance = torch.randn(instance.numel(), requires_grad=True)
##
for epoch in range(1, epochs + 1):
    loss = attacker_cost_flatten(w, attacked_instance, instance, c_d, z)
    # Differentiate with respect to the attacked data only. loss.backward()
    # would also accumulate into w.grad, because w requires grad.
    grad_x, = torch.autograd.grad(loss, attacked_instance)
    with torch.no_grad():
        attacked_instance -= grad_x * lr

    if epoch % 10 == 0:
        # The loss is a 0-dim tensor; .item() is the supported way to read it.
        print('epoch {}, loss {}'.format(epoch, loss.item()))


epoch 10, loss 23466.92578125
epoch 20, loss 22885.5625
epoch 30, loss 22879.201171875
epoch 40, loss 22879.130859375
epoch 50, loss 22879.12890625
epoch 60, loss 22879.12890625
epoch 70, loss 22879.12890625
epoch 80, loss 22879.12890625
epoch 90, loss 22879.12890625
epoch 100, loss 22879.12890625




In [21]:
# Attacker cost at the gradient-descent solution ...
print(attacker_cost_flatten(w, attacked_instance, instance, c_d, z))
# ... and at the closed-form solution; the two should agree.
print(attacker_cost_flatten(w, out.view(-1,1), instance, c_d, z))

tensor(22879.1289)
tensor(22879.1289)


In [18]:
def learner_cost_flatten(w, x, y, lmb):
    """Learner objective for data `x` given as a flat tensor.

    `x` is reshaped to rows of `w.shape[1] - 1` columns. The cost is the sum
    of squared prediction errors against `y`, plus `lmb` times the squared
    norm of the weights (the bias, i.e. the last entry of `w`, is not
    penalised). The penalty term is a 1x1 matrix, so the result is one too.
    """
    n_features = w.shape[1] - 1
    coeffs = w[0, :-1].view(1, -1)
    intercept = w[0, -1]
    design = x.view(-1, n_features)
    residuals = design @ coeffs.t() + intercept - y
    fit_term = torch.sum(residuals ** 2)
    ridge_term = lmb * coeffs @ coeffs.t()
    return fit_term + ridge_term

def attacker_cost_flatten(w, x, x_old, c_d, z):
    """Attacker objective for attacked data `x` given as a flat tensor.

    cost = sum_i c_i * (x_i weights^T + bias - z_i)^2 + ||x_old - x||^2

    Parameters
    ----------
    w : single-row tensor; last entry is the bias, the rest are the weights.
    x : attacked data; reshaped to have as many rows as `x_old`.
    x_old : (n, d) clean data.
    c_d : per-row factors, expected to broadcast against (n, 1).
    z : per-row targets, expected to broadcast against (n, 1).

    Returns
    -------
    Scalar tensor.
    """
    weights = w[0, :-1].view(1, -1)
    bias = w[0, -1]
    x = x.view(x_old.shape[0], -1)
    # Distance of the prediction from the attacker's target z. The target has
    # to enter here so this cost agrees with the closed-form `attack`.
    residual = x @ weights.t() + bias - z
    perturbation = x_old - x
    return torch.sum(c_d * residual ** 2) + torch.sum(perturbation ** 2)

### Defense Forward Mode

In [25]:
##
def compute_full_second_derivative(vec_func, var):
    """Derivatives of every entry of `vec_func` with respect to `var`.

    Column i of the result holds the gradient of `vec_func[i]` w.r.t. `var`.
    The result has `max(var.shape)` rows and `vec_func.shape[0]` columns.
    One autograd call is made per entry of `vec_func`; the graph is retained
    between calls.
    """
    n_rows = int(np.max(var.shape))
    n_cols = vec_func.shape[0]
    derivs = torch.zeros(n_rows, n_cols)
    for col in range(n_cols):
        (entry_grad,) = torch.autograd.grad(vec_func[col], var, retain_graph=True)
        derivs[:, col] = entry_grad
    return derivs

##
def do_forward_multidim(w, x, x_clean, c_d, z, y_train, lmb, T=100):
    """Outer gradient descent on the learner cost with forward-mode tracking of dx/dw.

    For each of `T` outer iterations, `T` inner gradient steps are taken on the
    attacker cost with respect to the flat attacked data `x`, while a matrix
    `Z` is updated alongside each step. The learner parameters `w` are then
    moved using the learner-cost gradient w.r.t. `w` plus the gradient w.r.t.
    `x` multiplied by `Z`.

    Parameters: `w` learner parameters (single row, bias last); `x` flat
    attacked data; `x_clean`, `c_d`, `z` are forwarded to
    `attacker_cost_flatten`; `y_train`, `lmb` are forwarded to
    `learner_cost_flatten`; `T` is used for BOTH the outer and the inner loop.

    Returns the updated `w`. Prints the learner cost after every outer step.

    NOTE(review): `Z` and `x` are initialised once, before the outer loop, and
    carry over between outer iterations -- confirm this is intended.
    """
    lr = 10e-6 # Outer learning rate (10e-6 equals 1e-5)
    ilr = 0.01 # Inner learning rate
    ##
    # Attacker and learner costs with the fixed data bound in.
    gm = lambda w, x: attacker_cost_flatten(w, x, x_clean, c_d, z)
    fm = lambda w, x: learner_cost_flatten(w, x, y_train, lmb)
    ##
    # One row per entry of the flat x, one column per entry of w.
    Z = torch.zeros(x.shape[0], w.shape[1]) 


    for i in range(T):
        # We need to compute the total derivative of f wrt x
        #y = 0.0

        for j in range(T):
            # One gradient step on the attacker cost; create_graph=True keeps
            # the step itself differentiable.
            grad_x = torch.autograd.grad(gm(w,x), x, create_graph=True)[0]
            new_x = x - ilr*grad_x
            ##
            # Derivatives of the step output w.r.t. x and w. Each call runs
            # one autograd pass per entry of new_x.
            A_tensor = compute_full_second_derivative(new_x, x)
            B_tensor = compute_full_second_derivative(new_x, w)
            ##
            Z = A_tensor @ Z + B_tensor.t()
            #Z = Z @ A_tensor + B_tensor
            # Re-wrap the iterate so the next step differentiates w.r.t. it
            # (presumably this also drops the graph of earlier steps -- TODO confirm).
            x = Variable(new_x, requires_grad=True)

        # Partial derivatives of the learner cost at the final inner iterate.
        grad_w = torch.autograd.grad(fm(w,x), w, retain_graph=True)[0] 
        grad_x = torch.autograd.grad(fm(w,x), x)[0]
        ##
        # print(grad_x.shape, Z.shape, grad_w.shape)
        # Direct term plus the term propagated through Z.
        w = w - lr*(grad_w + grad_x @ Z)
        print(fm(w,x))
    return(w)

In [26]:
# Forward-mode defense on the training split.
value = 0.5
c_d = torch.ones([len(y_train), 1])*value
z = torch.zeros([len(y_train),1]) 
# Random initial learner parameters (bias in the last slot).
w_clean = torch.randn(1, x_train.shape[1] + 1, requires_grad=True)
instance = x_train_renom
# Flat random start for the attacked data, one entry per element of the training matrix.
attacked_instance = torch.randn(x_train_renom.shape[0]*x_train_renom.shape[1], requires_grad=True)
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt.
w_clean_fw = do_forward_multidim(w_clean, attacked_instance, instance, c_d, z, y_train, lmb=0.0, T=10)

KeyboardInterrupt: 

### Defense Backward Mode

In [19]:
def do_backward_multidim(w, x, x_clean, c_d, z, y_train, lmb, T=100):
    """Outer gradient descent on the learner cost with a reverse pass through the inner steps.

    For each of `T` outer iterations, `T` inner gradient steps are taken on the
    attacker cost with respect to the flat attacked data `x`, storing each
    iterate. The stored iterates are then visited in reverse order to
    accumulate `gr`, which is added to the learner-cost gradient w.r.t. `w`
    in the parameter update.

    Parameters: `w` learner parameters (single row, bias last); `x` flat
    attacked data; `x_clean`, `c_d`, `z` are forwarded to
    `attacker_cost_flatten`; `y_train`, `lmb` are forwarded to
    `learner_cost_flatten`; `T` is used for BOTH the outer and the inner loop.

    Returns the updated `w`. Prints the learner cost every 10 outer steps.
    """
    lr = 10e-6 # Outer learning rate (10e-6 equals 1e-5)
    ilr = 0.01 # Inner learning rate
    ##
    # Attacker and learner costs with the fixed data bound in.
    gm = lambda w, x: attacker_cost_flatten(w, x, x_clean, c_d, z)
    fm = lambda w, x: learner_cost_flatten(w, x, y_train, lmb)
    ##
    # Storage for the inner iterates: one row per inner step.
    xt = torch.zeros(int(T), x.shape[0]) 

    for i in range(T):
        # We need to compute the total derivative of f wrt x
        ##
        for j in range(T):
            # One gradient step on the attacker cost.
            grad_x = torch.autograd.grad(gm(w,x), x, create_graph=True)[0]
            new_x = x - ilr*grad_x
            x = Variable(new_x, requires_grad=True)
            # Row j stores the iterate AFTER step j.
            xt[j] = x
        ## CHECK WITH ANALYTICAL SOLUTION
        ###
        # Start the reverse pass from minus the learner-cost gradient at the final iterate.
        alpha = -torch.autograd.grad(fm(w,x), x, retain_graph=True)[0]
        gr = torch.zeros_like(w)
        ###
        for j in range(T-1,-1,-1):
            # NOTE(review): xt[j] is the output of step j, yet it is used here
            # as the point at which step j is re-evaluated -- looks like an
            # off-by-one; confirm against the intended recursion.
            x_tmp = Variable(xt[j], requires_grad=True)
            grad_x, = torch.autograd.grad( gm(w,x_tmp), x_tmp, create_graph=True )
            # Re-evaluate the inner step and contract it with alpha, so that
            # plain gradients of the scalar give vector-Jacobian products.
            loss = x_tmp - ilr*grad_x
            loss = loss@alpha
            aux1 = torch.autograd.grad(loss, w, retain_graph=True)[0]
            aux2 = torch.autograd.grad(loss, x_tmp)[0]
            # alpha carries the minus sign from its initialisation, hence the subtraction.
            gr -= aux1
            alpha = aux2 

        grad_w = torch.autograd.grad(fm(w,x), w)[0] 
        ##
        # Direct term plus the term accumulated in the reverse pass.
        w = w - lr*(grad_w + gr)
        
        if i%10 == 0:
            print( 'epoch {}, loss {}'.format(i,fm(w,x)) )
    return w

In [20]:
# Inputs for the backward-mode defense on the training split.
value = 0.5
c_d = torch.ones([len(y_train), 1])*value
z = torch.zeros([len(y_train),1]) 
# Random initial learner parameters (bias in the last slot).
w_clean = torch.randn(1, x_train.shape[1] + 1, requires_grad=True)
instance = x_train_renom

In [None]:
# Flat random start for the attacked data, then run the backward-mode defense.
attacked_instance = torch.randn(x_train_renom.shape[0]*x_train_renom.shape[1], requires_grad=True)
# NOTE: the recorded output of this cell shows a nan loss at epoch 0.
w_clean_bw = do_backward_multidim(w_clean, attacked_instance, instance, c_d, z, y_train, lmb=0.0, T=450)

epoch 0, loss tensor([[nan.]])


In [None]:
# Parameters returned by the backward-mode defense.
w_clean_bw

#### Test Nash Solution

In [23]:
value = 0.5
c_d = torch.ones([len(y_test), 1])*value
z = torch.zeros([len(y_test),1]) 
##
# The attack is computed against the plainly fitted parameters w ...
out = attack(w, x_test_renom, c_d, z)
# ... and scored with w_clean.
# NOTE(review): w_clean is the random initial vector handed to
# do_backward_multidim; the returned parameters live in w_clean_bw.
# Confirm which of the two was meant to be evaluated here.
preds = model(out, w_clean)
rmse(y_test, preds)

tensor(9.4470)

In [24]:
# Same attacked test data, scored with the plainly fitted parameters w.
preds = model(out, w)
rmse(y_test, preds)

tensor(0.9263)

### Defense Analytical Solution

In [None]:
# Split the fitted parameter row into its weights and its bias.
w_a = w[0][:-1]
b_a = w[0][-1]
##
# Separate random weight row and bias for the analytical defense.
w_nash = torch.randn(1, x_train.shape[1], requires_grad=True)
b_nash = torch.randn(1, requires_grad=True)

In [None]:
def attack_a(w, b, test, c_d, z):
    """Closed-form attacked data, with weights and bias passed separately.

    Computes
        test - (1/c_d + w w^T)^(-1) * (test w^T + b - z) w
    where the first factor is applied per row. This is the diagonal case of
    inverting diag(c_d)^(-1) + (w w^T) I_n.

    Parameters
    ----------
    w : (1, d) weight row.
    b : bias.
    test : (n, d) clean rows.
    c_d : per-row factors, shaped (n,) or (n, 1).
    z : per-row targets; must broadcast against the (n, 1) predictions.

    Returns
    -------
    (n, d) tensor of attacked rows; differentiable w.r.t. `w` and `b`.
    """
    # Per-row scalar, kept under its own name instead of overwriting c_d.
    # Multiplying row-wise gives the same result as an n-by-n diagonal matrix
    # followed by a dense matmul, without the quadratic cost.
    scale = (1 / c_d + w @ w.t()) ** (-1)
    scale = scale.reshape(-1, 1)
    direction = (test @ w.t() + b - z) @ w
    return test - scale * direction

def learner_cost_a(w, b, x, y, lmb, c_d, z):
    """Learner cost evaluated on the closed-form attacked version of `x`.

    The data are first passed through `attack_a(w, b, x, c_d, z)`. The cost
    is the sum of squared prediction errors on the attacked rows plus
    `lmb * w @ w.t()`; the penalty is a 1x1 matrix, so the result is one too.
    """
    attacked = attack_a(w, b, x, c_d, z)
    residuals = attacked @ w.t() + b - y
    fit_term = torch.sum(residuals ** 2)
    ridge_term = lmb * w @ w.t()
    return fit_term + ridge_term

In [None]:
# Gradient descent on the learner cost that already accounts for the
# closed-form attack (learner_cost_a, defined above).
lr = 10e-6  # note: 10e-6 equals 1e-5
epochs = 400
value = 0.5
# attack_a is called here with a 1-D vector of per-row factors.
c_d = torch.ones(len(y_train))*value
z = torch.zeros([len(y_train),1])
print("Initial Cost", learner_cost_a(w_nash, b_nash, x_train_renom, y_train, lmb, c_d, z))
# Epochs are counted from 1 so the progress check needs no manual increment.
for epoch in range(1, epochs + 1):
    loss = learner_cost_a(w_nash, b_nash, x_train_renom, y_train, lmb, c_d, z)
    loss.backward()
    with torch.no_grad():
        # Manual update of both parameters, then clear their gradients.
        w_nash -= w_nash.grad * lr
        b_nash -= b_nash.grad * lr
        w_nash.grad.zero_()
        b_nash.grad.zero_()

    if epoch % 100 == 0:
        # .item() extracts the Python number from the single-element loss tensor.
        print('epoch {}, loss {}'.format(epoch, loss.item()))

In [None]:
# Weights and bias found by the analytical defense.
print(w_nash)
print(b_nash)
      

In [None]:
# Backward-mode parameters again, for comparison with the analytical ones.
w_clean_bw