# k-Anonymity
k-Anonymity is the first formal privacy definition we have seen. The definition of k-Anonymity is designed to formalize our intuition that a piece of auxiliary information should not narrow down the set of possible records for an individual "too much." Stated another way, k-Anonymity is designed to ensure that each individual can "blend into the crowd."

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so try the new name first and fall
# back to the legacy name on older installations.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')

# Toy records. The two name columns are defined here but are not carried into
# the DataFrame below; only the three numeric columns are selected.
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    age=[42, 52, 36, 24, 73],
    preTestScore=[4, 24, 31, 2, 3],
    postTestScore=[25, 94, 57, 62, 70],
)
df = pd.DataFrame(
    raw_data,
    columns=['age', 'preTestScore', 'postTestScore'],
)
df

Unnamed: 0,age,preTestScore,postTestScore
0,42,4,25
1,52,24,94
2,36,31,57
3,24,2,62
4,73,3,70


In [3]:
# To check whether a dataframe satisfies k-Anonymity, we group its rows by
# their quasi-identifier values and look at the smallest group. If any group
# holds fewer than k rows, the dataframe does not satisfy k-Anonymity for that
# value of k, and we return False. By default we consider all columns to
# contain quasi-identifiers; to limit the check to a subset of the columns,
# pass them as `quasi_identifiers`.


def isKAnonymized(df, k, quasi_identifiers=None):
    """Return True if every quasi-identifier combination occurs at least k times.

    Args:
        df: pandas DataFrame to check.
        k: minimum number of rows that must share each combination of
            quasi-identifier values.
        quasi_identifiers: optional iterable of column labels to group on.
            Defaults to all columns of ``df``.

    Returns:
        bool: True when the smallest group has at least ``k`` rows. A frame
        with no rows is vacuously k-anonymous.
    """
    if len(df) == 0:
        return True

    columns = list(df.columns if quasi_identifiers is None else quasi_identifiers)
    if not columns:
        # With no column to tell rows apart, all rows form a single group.
        return len(df) >= k

    # groupby compares the stored values themselves, so string values and
    # column labels containing spaces work. dropna=False keeps rows with
    # missing values as their own group, and observed=True avoids empty
    # groups for unused categories of categorical columns.
    group_sizes = df.groupby(
        columns, dropna=False, observed=True, sort=False
    ).size()
    return bool(group_sizes.min() >= k)

In [4]:
# k = 1 is the weakest setting: each row only needs to match itself.
isKAnonymized(df, 1)

True

In [5]:
# Every row of df is unique (e.g. all ages differ), so no group reaches size 2.
isKAnonymized(df, 2)

False

# Differential Privacy

In [62]:
import torch

def get_parallel_db(db, remove_index):
    """Return a new tensor equal to ``db`` without the entry at ``remove_index``."""
    before = db[:remove_index]
    after = db[remove_index + 1:]
    return torch.cat([before, after])

def get_parallel_dbs(db):
    """Return a list with one parallel database per entry of ``db``.

    Element ``i`` of the result is ``db`` with entry ``i`` removed, as built
    by ``get_parallel_db``.
    """
    return [get_parallel_db(db, position) for position in range(len(db))]

def create_db_and_parallels(num_entries):
    """Create a random 0/1 database and all of its parallel databases.

    Returns:
        A ``(db, pdbs)`` tuple: ``db`` is a tensor of ``num_entries`` integers
        drawn from {0, 1}; ``pdbs`` is the list returned by
        ``get_parallel_dbs(db)``.
    """
    database = torch.randint(0, 2, [num_entries])
    return database, get_parallel_dbs(database)

In [78]:
# Build a random 20-entry database and its leave-one-out variants, then show
# the first two variants (each has 19 entries).
db, pdbs = create_db_and_parallels(20)
print(pdbs[0])
print(pdbs[1])

tensor([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1])
tensor([0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1])


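A randomized query M satisfies (epsilon, delta)-differential privacy if, for every pair of parallel databases x and y and every set S of possible outputs, $\Pr[M(x) \in S] \le e^{\epsilon} \, \Pr[M(y) \in S] + \delta$.

This definition does not create differential privacy; instead, it is a measure of how much privacy is afforded by a query M. Specifically, it's a comparison between running the query M on a database (x) and on a parallel database (y). As you remember, a parallel database is defined to be the same as the full database (x) with one entry/person removed.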

This definition says that FOR ALL parallel databases, the probability of any outcome of a query on database (x) is at most e^epsilon times the probability of that outcome for the same query on database (y), except that this constraint is allowed to fail with probability at most delta. Thus, this definition is called "epsilon delta" differential privacy.

In [70]:
def query(db):
    """Return the sum of ``db``'s entries, computed in floating point."""
    as_float = db.float()
    return as_float.sum()

def M(db, noise=0.0):
    """Sketch of a randomized mechanism: the query result plus additive noise.

    Args:
        db: database tensor passed to ``query``.
        noise: value added to the true query result. It defaults to 0.0,
            which returns the un-noised query; callers must supply suitably
            calibrated random noise for the result to offer any privacy.

    Returns:
        ``query(db) + noise``.
    """
    return query(db) + noise

# Noise-free result of the sum query on the full database.
query(db)

tensor(9.)

In [11]:
import numpy as np

# Module-level privacy parameter read by laplacian_mechanism, which uses
# sensitivity / epsilon as the Laplace noise scale (smaller epsilon => more noise).
epsilon = 0.5

def sum_query(db):
    """Return ``db.sum()``, the sum over all entries of ``db``."""
    total = db.sum()
    return total


def laplacian_mechanism(db, query, sensitivity):
    """Return ``query(db)`` with Laplace noise added.

    The noise scale is ``sensitivity / epsilon``, where ``epsilon`` is read
    from module scope. A single sample is drawn with ``np.random.laplace``
    (location 0) and wrapped in a tensor before being added to the result.
    """
    scale = sensitivity / epsilon
    sample = np.random.laplace(0, scale, 1)
    noise = torch.tensor(sample)
    return query(db) + noise

In [12]:
db.sum()   # real (noise-free) sum, for comparison with the mechanism's output

tensor(10)

In [13]:
# db holds 0/1 entries, so removing one entry changes the sum by at most 1:
# the sensitivity passed here is 1.
laplacian_mechanism(db, sum_query, 1)

tensor([11.5615], dtype=torch.float64)

In [14]:
def mean_query(db):
    """Return the mean of ``db``'s entries, computed in floating point."""
    as_float = db.float()
    return as_float.mean()

# Real (noise-free) mean, for comparison with the mechanism's output.
torch.mean(db.float())

tensor(0.5000)

In [28]:
# The mean of len(db) entries in {0, 1} moves by at most 1 / len(db) when one
# entry is removed, so derive the sensitivity from the database size rather
# than hard-coding a value that only fits a 100-entry database.
laplacian_mechanism(db, mean_query, 1 / len(db))

tensor([0.4958], dtype=torch.float64)

# Federated Learning

In [None]:
import torch as th
import syft as sy
# Create a TorchHook around the torch module. Later cells call .send()/.get()
# on tensors, which presumably relies on this hook - confirm for the syft
# version in use.
hook = sy.TorchHook(th)
th.tensor([1,2,3,4,5])

In [None]:
# Two VirtualWorker instances built from the same hook, with ids "bob" and "alice".
bob = sy.VirtualWorker(hook, id="bob")
alice = sy.VirtualWorker(hook, id="alice")

In [None]:
# Both operands are sent to bob; x and y hold whatever .send() returns
# (presumably pointers to the remote tensors - confirm).
x = th.tensor([1,2,3,4,5]).send(bob)
y = th.tensor([1,1,1,1,1]).send(bob)

In [None]:
# x and y were both sent to bob.
# NOTE(review): the addition presumably executes on bob and z refers to the
# remote result - confirm.
z = x + y

In [None]:
# Replace z with the value returned by .get() (presumably the result pulled
# back from bob - confirm), then display it.
z = z.get()
z

In [None]:
# Reset both workers before the next example (presumably drops the objects
# they still hold - confirm).
bob.clear_objects()
alice.clear_objects()

In [None]:
from torch import nn, optim

# Four 2-feature samples and their targets; in this toy set each target equals
# the sample's second feature.
# NOTE(review): requires_grad=True is not normally needed on inputs/targets -
# presumably required by this syft version for .send(); confirm.
data = th.tensor([[1.,1],[0,1],[1,0],[0,0]], requires_grad=True)
target = th.tensor([[1.],[1], [0], [0]], requires_grad=True)

# One linear layer (2 inputs -> 1 output) optimised with SGD at lr=0.1.
model = nn.Linear(2,1)
opt = optim.SGD(params=model.parameters(), lr=0.1)

def train(iterations=20):
    """Run ``iterations`` full-batch optimisation steps on the module-level model.

    Uses the module-level ``model``, ``opt``, ``data`` and ``target``. Each
    step prints the summed squared error that was computed before the
    parameter update of that step.
    """
    for _step in range(iterations):
        opt.zero_grad()

        prediction = model(data)
        residual = prediction - target
        loss = (residual ** 2).sum()

        loss.backward()
        opt.step()

        print(loss.data)
        
# Runs the default 20 iterations, printing one loss value per iteration.
train()

In [None]:
# Split the toy set in two: rows 0-1 are sent to bob, rows 2-3 to alice.
data_bob = data[0:2].send(bob)
target_bob = target[0:2].send(bob)

data_alice = data[2:4].send(alice)
target_alice = target[2:4].send(alice)

# One (data, target) pair per worker, iterated by the federated train() below.
datasets = [(data_bob, target_bob), (data_alice, target_alice)]

In [None]:
def train(iterations=20):
    """Train a fresh linear model by visiting each worker's data in turn.

    For every iteration and every ``(data, target)`` pair in the module-level
    ``datasets``, the model is sent to the location of that data, one SGD
    step is taken there, and the model is fetched back. The loss of each
    step is printed.

    Args:
        iterations: number of passes over ``datasets``.

    Returns:
        The trained model. It is created inside this function, so returning
        it is the only way for the caller to keep the training result.
    """
    model = nn.Linear(2,1)
    opt = optim.SGD(params=model.parameters(), lr=0.1)

    # The loop counter is unused; `_` avoids shadowing the builtin `iter`.
    for _ in range(iterations):

        for _data, _target in datasets:

            # send model to the data
            model = model.send(_data.location)

            # do normal training
            opt.zero_grad()
            pred = model(_data)
            loss = ((pred - _target)**2).sum()
            loss.backward()
            opt.step()

            # get smarter model back
            model = model.get()

            print(loss.get())

    return model
            
            
# Problem: In some cases we can still discover what a customer's data was
# just by checking the diff between the model we sent and the model we
# received (i.e. by reverse-engineering the update).

# Strategy: Train different models in parallel on different workers, each
# with a different person's data, and then average those models together,
# so that the only model that comes back to us is the average of multiple
# people's models.

In [None]:
# Reset both workers once the federated example is done (presumably drops the
# objects they still hold - confirm).
bob.clear_objects()
alice.clear_objects()