# k-Anonymity
k -Anonymity is the first formal privacy definition we have seen. The definition of  k -Anonymity is designed to formalize our intuition that a piece of auxiliary information should not narrow down the set of possible records for an individual "too much." Stated another way,  k -Anonymity is designed to ensure that each individual can "blend into the crowd."

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
    'age': [42, 52, 36, 24, 73], 
    'preTestScore': [4, 24, 31, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70]}
#df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df = pd.DataFrame(raw_data, columns = ['age', 'preTestScore', 'postTestScore'])
df

Unnamed: 0,age,preTestScore,postTestScore
0,42,4,25
1,52,24,94
2,36,31,57
3,24,2,62
4,73,3,70


In [3]:
# To implement a function to check whether a dataframe satisfies  k -Anonymity, 
# we loop over the rows; for each row, we query the dataframe to see how many 
# rows match its values for the quasi-identifiers. If the number of rows in any 
# group is less than  k , the dataframe does not satisfy  k -Anonymity for that 
# value of  k , and we return False. Note that in this simple definition, we 
# consider all columns to contain quasi-identifiers; to limit our check to a 
# subset of all columns, we would need to replace the df.columns expression with something else.


def isKAnonymized(df, k):
    for index, row in df.iterrows():
        query = ' & '.join([f'{col} == {row[col]}' for col in df.columns])
        rows = df.query(query)
        if (rows.shape[0] < k):
            return False
        
    return True

In [4]:
isKAnonymized(df, 1)

True

In [5]:
isKAnonymized(df, 2)

False

# Differential Privacy

In [62]:
import torch

def get_parallel_db(db, remove_index):
    return torch.cat((db[0 : remove_index],
                      db[remove_index + 1:]))

def get_parallel_dbs(db):
    parallel_dbs = list()
    
    for i in range(len(db)):
        pdb = get_parallel_db(db, i)
        parallel_dbs.append(pdb)
        
    return parallel_dbs

def create_db_and_parallels(num_entries):
    db = torch.randint(0, 2, [num_entries])
    pdbs = get_parallel_dbs(db)
    return db, pdbs

In [78]:
db, pdbs = create_db_and_parallels(20)
print(pdbs[0])
print(pdbs[1])

tensor([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1])
tensor([0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1])


This definition does not create differential privacy, instead it is a measure of how much privacy is afforded by a query M. Specifically, it's a comparison between running the query M on a database (x) and a parallel database (y). As you remember, parallel databases are defined to be the same as a full database (x) with one entry/person removed.

This definition says that FOR ALL parallel databases, the maximum distance between a query on database (x) and the same query on database (y) will be e^epsilon, but that occasionally this constraint won't hold with probability delta. Thus, this theorem is called "epsilon delta" differential privacy.

In [70]:
def query(db):
    return torch.sum(db.float())

def M(db):
    query(db) + noise

query(db)

tensor(9.)

In [11]:
import numpy as np

epsilon = 0.5

def sum_query(db):
    return db.sum()


def laplacian_mechanism(db, query, sensitivity):
    beta = sensitivity / epsilon
    noise = torch.tensor(np.random.laplace(0, beta, 1))
    return query(db) + noise

In [12]:
db.sum()   # real 

tensor(10)

In [13]:
laplacian_mechanism(db, sum_query, 1)

tensor([11.5615], dtype=torch.float64)

In [14]:
def mean_query(db):
    return torch.mean(db.float())

torch.mean(db.float())

tensor(0.5000)

In [28]:
laplacian_mechanism(db, mean_query, 1/100)

tensor([0.4958], dtype=torch.float64)

# Federated Learning

In [5]:
import torch as th
import syft as sy
hook = sy.TorchHook(th)
th.tensor([1,2,3,4,5])



tensor([1, 2, 3, 4, 5])

In [6]:
bob = sy.VirtualWorker(hook, id="bob")
alice = sy.VirtualWorker(hook, id="alice")
secure_worker = sy.VirtualWorker(hook, id="secure_worker").add_worker(sy.local_worker)



In [81]:
x = th.tensor([1,2,3,4,5]).send(bob)
y = th.tensor([1,1,1,1,1]).send(bob)

In [82]:
z = x + y

In [83]:
z = z.get()
z

tensor([2, 3, 4, 5, 6])

In [84]:
bob.clear_objects()
alice.clear_objects()

<VirtualWorker id:alice #objects:0>

In [86]:
from torch import nn, optim

data = th.tensor([[1.,1],[0,1],[1,0],[0,0]], requires_grad=True)
target = th.tensor([[1.],[1], [0], [0]], requires_grad=True)

model = nn.Linear(2,1)
opt = optim.SGD(params=model.parameters(), lr=0.1)

def train(iterations=20):
    for iter in range(iterations):
        opt.zero_grad()

        pred = model(data)

        loss = ((pred - target)**2).sum()

        loss.backward()

        opt.step()

        print(loss.data)
        
train()

tensor(5.6608)
tensor(0.6240)
tensor(0.1971)
tensor(0.1290)
tensor(0.0966)
tensor(0.0734)
tensor(0.0559)
tensor(0.0427)
tensor(0.0325)
tensor(0.0248)
tensor(0.0190)
tensor(0.0145)
tensor(0.0111)
tensor(0.0084)
tensor(0.0065)
tensor(0.0049)
tensor(0.0038)
tensor(0.0029)
tensor(0.0022)
tensor(0.0017)


In [87]:
data_bob = data[0:2].send(bob)
target_bob = target[0:2].send(bob)

data_alice = data[2:4].send(alice)
target_alice = target[2:4].send(alice)

datasets = [(data_bob, target_bob), (data_alice, target_alice)]

In [88]:
def train(iterations=20):

    model = nn.Linear(2,1)
    opt = optim.SGD(params=model.parameters(), lr=0.1)
    
    for iter in range(iterations):

        for _data, _target in datasets:

            # send model to the data
            model = model.send(_data.location)

            # do normal training
            opt.zero_grad()
            pred = model(_data)
            loss = ((pred - _target)**2).sum()
            loss.backward()
            opt.step()

            # get smarter model back
            model = model.get()

            print(loss.get())

In [89]:
train(iterations=20)

tensor(0.4831, requires_grad=True)
tensor(0.6873, requires_grad=True)
tensor(0.3186, requires_grad=True)
tensor(0.3934, requires_grad=True)
tensor(0.1865, requires_grad=True)
tensor(0.2269, requires_grad=True)
tensor(0.1082, requires_grad=True)
tensor(0.1309, requires_grad=True)
tensor(0.0628, requires_grad=True)
tensor(0.0755, requires_grad=True)
tensor(0.0365, requires_grad=True)
tensor(0.0435, requires_grad=True)
tensor(0.0212, requires_grad=True)
tensor(0.0251, requires_grad=True)
tensor(0.0123, requires_grad=True)
tensor(0.0145, requires_grad=True)
tensor(0.0072, requires_grad=True)
tensor(0.0083, requires_grad=True)
tensor(0.0042, requires_grad=True)
tensor(0.0048, requires_grad=True)
tensor(0.0024, requires_grad=True)
tensor(0.0028, requires_grad=True)
tensor(0.0014, requires_grad=True)
tensor(0.0016, requires_grad=True)
tensor(0.0008, requires_grad=True)
tensor(0.0009, requires_grad=True)
tensor(0.0005, requires_grad=True)
tensor(0.0005, requires_grad=True)
tensor(0.0003, requi

In [90]:
bob.clear_objects()
alice.clear_objects()

<VirtualWorker id:alice #objects:0>

In [91]:
# Problem: We still can discover what was the customers data in some
# cases by just checking the diff between the model sent and the
# model received (reverse engineer)

# Strategy: Train different models in parallel in different workers 
# with different peoples data's and then we average those models 
# together and the only model that comes back to us is the average
# of multiple people's models

# Theft: the A.I. is put at risk.
# Privacy: Gradients reveal information about the data

# Multi-Party Computation: Additive Secret Sharing

In [92]:
import random
import numpy as np

BASE = 10

PRECISION_INTEGRAL = 8
PRECISION_FRACTIONAL = 8
Q = 293973345475167247070445277780365744413

PRECISION = PRECISION_INTEGRAL + PRECISION_FRACTIONAL

assert(Q > BASE**PRECISION)

def encode(rational):
    upscaled = int(rational * BASE**PRECISION_FRACTIONAL)
    field_element = upscaled % Q
    return field_element

def decode(field_element):
    upscaled = field_element if field_element <= Q/2 else field_element - Q
    rational = upscaled / BASE**PRECISION_FRACTIONAL
    return rational

def encrypt(secret):
    first  = random.randrange(Q)
    second = random.randrange(Q)
    third  = (secret - first - second) % Q
    return [first, second, third]

def decrypt(sharing):
    return sum(sharing) % Q

def add(a, b):
    c = list()
    for i in range(len(a)):
        c.append((a[i] + b[i]) % Q)
    return tuple(c)

In [93]:
x = encrypt(encode(5.5))
x

[97725012581601251155038937045756790147,
 76245576288205565300167134540622623789,
 120002756605360430615239206194536330477]

In [94]:
y = encrypt(encode(2.3))
y

[49609452794220575757049732088962291994,
 258021965988245354693477849206667890687,
 280315272167868563690362974265331306144]

In [95]:
z = add(x,y)
z

(147334465375821826912088669134719082141,
 40294196801283672923199705966924770063,
 106344683298061747235156902679501892208)

In [96]:
decode(decrypt(z))

7.79999999

In [7]:
from torch import nn
from torch import optim
import torch.nn.functional as F

# A Toy Dataset
data = th.tensor([[0,0],[0,1],[1,0],[1,1.]], requires_grad=True)
target = th.tensor([[0],[0],[1],[1.]], requires_grad=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(2, 20)
        self.fc2 = nn.Linear(20, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

# A Toy Model
model = Net()

def train():
    # Training Logic
    opt = optim.SGD(params=model.parameters(),lr=0.1)
    for iter in range(20):

        # 1) erase previous gradients (if they exist)
        opt.zero_grad()

        # 2) make a prediction
        pred = model(data)

        # 3) calculate how much we missed
        loss = ((pred - target)**2).sum()

        # 4) figure out which weights caused us to miss
        loss.backward()

        # 5) change those weights
        opt.step()

        # 6) print our progress
        print(loss.data)
        
train()

tensor(4.1855)
tensor(52.4512)
tensor(32.6691)
tensor(1.9737)
tensor(1.0418)
tensor(1.0003)
tensor(0.9933)
tensor(0.9871)
tensor(0.9800)
tensor(0.9718)
tensor(0.9619)
tensor(0.9500)
tensor(0.9357)
tensor(0.9194)
tensor(0.9097)
tensor(0.8943)
tensor(0.8797)
tensor(0.8662)
tensor(0.8466)
tensor(0.8345)


In [8]:
model(data)

tensor([[0.5428],
        [0.3483],
        [0.5552],
        [0.5552]], grad_fn=<AddmmBackward>)

## Encrypt the Model and Data

In [9]:
encrypted_model = model.fix_precision().share(alice, bob, crypto_provider=secure_worker)
list(encrypted_model.parameters())

[Parameter containing:
 (Wrapper)>FixedPrecisionTensor>[AdditiveSharingTensor]
 	-> [PointerTensor | me:46718819900 -> alice:95308790420]
 	-> [PointerTensor | me:36089668581 -> bob:75658340913]
 	*crypto provider: secure_worker*, Parameter containing:
 (Wrapper)>FixedPrecisionTensor>[AdditiveSharingTensor]
 	-> [PointerTensor | me:70337796323 -> alice:53282148552]
 	-> [PointerTensor | me:34719338819 -> bob:6028168956]
 	*crypto provider: secure_worker*, Parameter containing:
 (Wrapper)>FixedPrecisionTensor>[AdditiveSharingTensor]
 	-> [PointerTensor | me:6370518541 -> alice:28927344300]
 	-> [PointerTensor | me:12924592588 -> bob:12293015043]
 	*crypto provider: secure_worker*, Parameter containing:
 (Wrapper)>FixedPrecisionTensor>[AdditiveSharingTensor]
 	-> [PointerTensor | me:47408194528 -> alice:49409608368]
 	-> [PointerTensor | me:10249571741 -> bob:29748783833]
 	*crypto provider: secure_worker*]

In [10]:
encrypted_data = data.fix_precision().share(alice, bob, crypto_provider=secure_worker)
encrypted_data

(Wrapper)>FixedPrecisionTensor>[AdditiveSharingTensor]
	-> [PointerTensor | me:46170896006 -> alice:77834496713]
	-> [PointerTensor | me:39315272593 -> bob:45306838610]
	*crypto provider: secure_worker*

In [11]:
encrypted_prediction = encrypted_model(encrypted_data)
encrypted_prediction.get().float_precision()

tensor([[0.5430],
        [0.3480],
        [0.5550],
        [0.5550]])

In [12]:
# Keras

In [1]:
from __future__ import print_function
import tensorflow.keras as keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, AveragePooling2D
from tensorflow.keras.layers import Activation

batch_size = 128
num_classes = 10
epochs = 2

# input image dimensions
img_rows, img_cols = 28, 28

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
input_shape = (img_rows, img_cols, 1)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()

model.add(Conv2D(10, (3, 3), input_shape=input_shape))
model.add(AveragePooling2D((2, 2)))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(AveragePooling2D((2, 2)))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(AveragePooling2D((2, 2)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Train on 60000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Test loss: 2.3019429817199706
Test accuracy: 0.0978


In [2]:
## Save your model's weights for future private prediction
model.save('short-conv-mnist.h5')

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import AveragePooling2D, Conv2D, Dense, Activation, Flatten, ReLU, Activation

import syft as sy
hook = sy.KerasHook(tf.keras)

num_classes = 10
input_shape = (1, 28, 28, 1)

model = Sequential()

model.add(Conv2D(10, (3, 3), batch_input_shape=input_shape))
model.add(AveragePooling2D((2, 2)))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(AveragePooling2D((2, 2)))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(AveragePooling2D((2, 2)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(num_classes, name="logit"))

pre_trained_weights = 'short-conv-mnist.h5'
model.load_weights(pre_trained_weights)

Falling back to insecure randomness since the required custom op could not be found for the installed version of TensorFlow. Fix this by compiling custom ops. Missing file was '/home/sarai/.virtualenvs/pytorch/lib/python3.7/site-packages/tf_encrypted/operations/secure_random/secure_random_module_tf_1.15.2.so'





In [4]:
AUTO = False

alice = sy.TFEWorker(host='localhost:4000', auto_managed=AUTO)
bob = sy.TFEWorker(host='localhost:4001', auto_managed=AUTO)
carol = sy.TFEWorker(host='localhost:4002', auto_managed=AUTO)

cluster = sy.TFECluster(alice, bob, carol)
cluster.start()

If `AUTO = False` then you now need to launch 3 servers:

```
python -m tf_encrypted.player --config /tmp/tfe.config server0
python -m tf_encrypted.player --config /tmp/tfe.config server1
python -m tf_encrypted.player --config /tmp/tfe.config server2
```

In [5]:
model.share(cluster)         # Transforms the model in a TF Encrypted Keras model.




Instructions for updating:
ksizes is deprecated, use sizes instead

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





In [None]:
model.serve()

In [1]:
model.stop()
cluster.stop()

if not AUTO:
    process_ids = !ps aux | grep '[p]ython -m tf_encrypted.player --config' | awk '{print $2}'
    for process_id in process_ids:
        !kill {process_id}
        print("Process ID {id} has been killed.".format(id=process_id))

Process ID 27155 has been killed.
Process ID 27245 has been killed.
Process ID 27346 has been killed.
