<a href="https://colab.research.google.com/github/reallygooday/60daysofudacity/blob/master/Differential_Privacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://github.com/udacity/private-ai/blob/master/.ipynb_checkpoints/Section%201%20-%20Differential%20Privacy-checkpoint.ipynb

https://colab.research.google.com/drive/1k9bw4NASkfE61Id-9jnyBpCzVuVp0Zfj#scrollTo=70PZvKXbIceY

Read:

- Algorithmic Foundations of Differential Privacy: https://www.cis.upenn.edu/~aaroth/Papers/privacybook.pdf
- Deep Learning with Differential Privacy: https://arxiv.org/pdf/1607.00133.pdf
- The Ethical Algorithm: https://www.amazon.com/Ethical-Algorithm-Science-Socially-Design/dp/0190948205

Topics:

- The Exponential Mechanism
- The Moment's Accountant
- Differentially Private Stochastic Gradient Descent

Advice:

- For deployments - stick with public frameworks!
- Join the Differential Privacy Community
- Don't get ahead of yourself - DP is still in the early days

# Toy Differential Privacy - Simple Database Queries

In [1]:
# creating a simple database

import torch

# the number of entries in our database
num_entries = 5000

db = torch.rand(num_entries) > 0.5
db

tensor([0, 1, 0,  ..., 1, 0, 0], dtype=torch.uint8)

# Generate Parallel Databases

In [2]:
# creating parallel database
db = torch.rand(num_entries) > 0.5
db

tensor([0, 0, 1,  ..., 1, 0, 1], dtype=torch.uint8)

In [0]:
def get_parallel_db(db, remove_index):

    return torch.cat((db[0:remove_index], 
                      db[remove_index+1:]))

In [4]:
get_parallel_db(db, 52352)

tensor([0, 0, 1,  ..., 1, 0, 1], dtype=torch.uint8)

In [0]:
def get_parallel_dbs(db):

    parallel_dbs = list()

    for i in range(len(db)):
        pdb = get_parallel_db(db, i)
        parallel_dbs.append(pdb)
    
    return parallel_dbs

In [0]:

pdbs = get_parallel_dbs(db)

In [0]:

def create_db_and_parallels(num_entries):
    
    db = torch.rand(num_entries) > 0.5
    pdbs = get_parallel_dbs(db)
    
    return db, pdbs

In [0]:
db, pdbs = create_db_and_parallels(20)


# Towards Evaluating The Differential Privacy of a Function

In [0]:
db, pdbs = create_db_and_parallels(5000)

In [0]:
def query(db):
    return db.sum()

In [0]:
full_db_result = query(db)

In [0]:
sensitivity = 0
for pdb in pdbs:
    pdb_result = query(pdb)
    
    db_distance = torch.abs(pdb_result - full_db_result)
    
    if(db_distance > sensitivity):
        sensitivity = db_distance

In [14]:

sensitivity


tensor(1)

# Evaluating the Privacy of a Function

In [0]:

def sensitivity(query, n_entries=1000):

    db, pdbs = create_db_and_parallels(n_entries)
    
    full_db_result = query(db)
    
    max_distance = 0
    for pdb in pdbs:
        pdb_result = query(pdb)

        db_distance = torch.abs(pdb_result - full_db_result)

        if(db_distance > max_distance):
            max_distance = db_distance
            
    return max_distance

In [0]:

def query(db):
    return db.float().mean()

In [17]:
sensitivity(query)

tensor(0.0005)

In [0]:

db, pdbs = create_db_and_parallels(20)

In [19]:
db

tensor([0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1],
       dtype=torch.uint8)

# Calculate L1 Sensitivity For Threshold


In [0]:
def query(db, threshold=5):
    return (db.sum() > threshold).float()

In [21]:
for i in range(10):
    sens_f = sensitivity(query, n_entries=10)
    print(sens_f)

0
0
tensor(1.)
tensor(1.)
0
0
0
tensor(1.)
0
tensor(1.)


# A Basic Differencing Attack

In [0]:
db, _ = create_db_and_parallels(100)

# Perform a Differencing Attack on Row 10


In [0]:

pdb = get_parallel_db(db, remove_index=10)

In [0]:
db[10]

In [24]:
sum(db)

tensor(41, dtype=torch.uint8)

In [25]:
# differencing attack using sum query

sum(db) - sum(pdb)

tensor(1, dtype=torch.uint8)

In [26]:
# differencing attack using mean query

(sum(db).float() / len(db)) - (sum(pdb).float() / len(pdb))

tensor(0.0060)

In [27]:

# differencing attack using threshold

(sum(db).float() > 49) - (sum(pdb).float()  > 49)

tensor(0, dtype=torch.uint8)

# Local Differential Privacy

In [28]:
# actual values from people
db

tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
        0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
        1, 1, 0, 0], dtype=torch.uint8)

In [0]:
def query(db):

    true_result = torch.mean(db.float())
    
    first_coin_flip = (torch.rand(len(db)) < 0.5).float()
    second_coin_flip = (torch.rand(len(db)) < 0.5).float()

    augmented_database = db.float() * first_coin_flip + (1 - first_coin_flip) * second_coin_flip

    db_result = torch.mean(augmented_database.float()) * 2 - 0.5
    
    return db_result, true_result

In [31]:
db, pdbs = create_db_and_parallels(10)
private_result, true_result = query(db)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(0.9000)
Without Noise:tensor(0.7000)


In [32]:
db, pdbs = create_db_and_parallels(100)
private_result, true_result = query(db)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(0.4600)
Without Noise:tensor(0.5300)


In [33]:

db, pdbs = create_db_and_parallels(1000)
private_result, true_result = query(db)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(0.4560)
Without Noise:tensor(0.4490)


In [34]:

db, pdbs = create_db_and_parallels(10000)
private_result, true_result = query(db)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(0.4930)
Without Noise:tensor(0.4951)


# Varying Amounts of Noise

In [0]:
def query(db, noise=0.2):
    
    true_result = torch.mean(db.float())

    first_coin_flip = (torch.rand(len(db)) < noise).float()
    second_coin_flip = (torch.rand(len(db)) < 0.5).float()

    augmented_database = db.float() * first_coin_flip + (1 - first_coin_flip) * second_coin_flip

    sk_result = augmented_database.float().mean()

    private_result = ((sk_result / noise) - 0.5) * noise / (1 - noise)

    return private_result, true_result

In [36]:
db, pdbs = create_db_and_parallels(100)
private_result, true_result = query(db, noise=0.1)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(0.4333)
Without Noise:tensor(0.4600)


In [37]:
db, pdbs = create_db_and_parallels(100)
private_result, true_result = query(db, noise=0.2)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(0.5000)
Without Noise:tensor(0.4900)


In [38]:

db, pdbs = create_db_and_parallels(100)
private_result, true_result = query(db, noise=0.4)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(0.5500)
Without Noise:tensor(0.4900)


In [40]:

db, pdbs = create_db_and_parallels(100)
private_result, true_result = query(db, noise=0.8)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(1.1000)
Without Noise:tensor(0.5900)


In [41]:
db, pdbs = create_db_and_parallels(10000)
private_result, true_result = query(db, noise=0.8)
print("With Noise:" + str(private_result))
print("Without Noise:" + str(true_result))

With Noise:tensor(0.5130)
Without Noise:tensor(0.4970)


# The Formal Definition of Differential Privacy, Epsilon, Delta

In [42]:
db, pdbs = create_db_and_parallels(100)

def query(db):
    return torch.sum(db.float())

def M(db):
    query(db) + noise

query(db)

tensor(52.)

# Adding Noise for Global Differential Privacy

# Differentially Private Query

In [0]:
epsilon = 0.0001

In [0]:
import numpy as np

In [0]:
db, pdbs = create_db_and_parallels(100)

In [0]:
def sum_query(db):
    return db.sum()

In [0]:
def laplacian_mechanism(db, query, sensitivity):
    
    beta = sensitivity / epsilon
    noise = torch.tensor(np.random.laplace(0, beta, 1))
    
    return query(db) + noise

In [0]:
def mean_query(db):
    return torch.mean(db.float())

In [51]:
laplacian_mechanism(db, sum_query, 1)

tensor([21613.4938], dtype=torch.float64)

In [52]:
laplacian_mechanism(db, mean_query, 1/100)

tensor([349.8273], dtype=torch.float64)

# Differential Privacy for Deep Learning

# An Example Scenario: A Health Neural Network

In [0]:
import numpy as np

In [0]:

num_teachers = 10 # we're working with 10 partner hospitals
num_examples = 10000 # the size of OUR dataset
num_labels = 10 # number of lablels for our classifier

In [0]:
preds = (np.random.rand(num_teachers, num_examples) * num_labels).astype(int).transpose(1,0) # fake predictions

In [0]:
new_labels = list()
for an_image in preds:

    label_counts = np.bincount(an_image, minlength=num_labels)

    epsilon = 0.1
    beta = 1 / epsilon

    for i in range(len(label_counts)):
        label_counts[i] += np.random.laplace(0, beta, 1)

    new_label = np.argmax(label_counts)
    
    new_labels.append(new_label)

# PATE Analysis

In [57]:
labels = np.array([9, 9, 3, 6, 9, 9, 9, 9, 8, 2])
counts = np.bincount(labels, minlength=10)
query_result = np.argmax(counts)
query_result

9

In [60]:
!pip install syft

Collecting syft
[?25l  Downloading https://files.pythonhosted.org/packages/38/2e/16bdefc78eb089e1efa9704c33b8f76f035a30dc935bedd7cbb22f6dabaa/syft-0.1.21a1-py3-none-any.whl (219kB)
[K     |████████████████████████████████| 225kB 2.8MB/s 
[?25hCollecting flask-socketio>=3.3.2 (from syft)
  Downloading https://files.pythonhosted.org/packages/33/31/f779e69e59f528684d8c9925b3c82a9303d148655d9671ba2975ab8c3894/Flask_SocketIO-4.2.0-py2.py3-none-any.whl
Collecting msgpack>=0.6.1 (from syft)
[?25l  Downloading https://files.pythonhosted.org/packages/92/7e/ae9e91c1bb8d846efafd1f353476e3fd7309778b582d2fb4cea4cc15b9a2/msgpack-0.6.1-cp36-cp36m-manylinux1_x86_64.whl (248kB)
[K     |████████████████████████████████| 256kB 41.8MB/s 
[?25hCollecting tf-encrypted>=0.5.4 (from syft)
[?25l  Downloading https://files.pythonhosted.org/packages/55/ff/7dbd5fc77fcec0df1798268a6b72a2ab0150b854761bc39c77d566798f0b/tf_encrypted-0.5.7-py3-none-manylinux1_x86_64.whl (2.1MB)
[K     |████████████████████████

In [62]:
!pip install --upgrade --force-reinstall zstd

Collecting zstd
Installing collected packages: zstd
  Found existing installation: zstd 1.4.1.0
    Uninstalling zstd-1.4.1.0:
      Successfully uninstalled zstd-1.4.1.0
Successfully installed zstd-1.4.1.0


In [0]:

from syft.frameworks.torch.differential_privacy import pate

In [64]:
num_teachers, num_examples, num_labels = (100, 100, 10)
preds = (np.random.rand(num_teachers, num_examples) * num_labels).astype(int) #fake preds
indices = (np.random.rand(num_examples) * num_labels).astype(int) # true answers

preds[:,0:10] *= 0

data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=preds, indices=indices, noise_eps=0.1, delta=1e-5)

assert data_dep_eps < data_ind_eps



In [65]:
data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=preds, indices=indices, noise_eps=0.1, delta=1e-5)
print("Data Independent Epsilon:", data_ind_eps)
print("Data Dependent Epsilon:", data_dep_eps)

Data Independent Epsilon: 11.756462732485115
Data Dependent Epsilon: 1.52655213289881


In [0]:
preds[:,0:50] *= 0

In [67]:

data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=preds, indices=indices, noise_eps=0.1, delta=1e-5, moments=20)
print("Data Independent Epsilon:", data_ind_eps)
print("Data Dependent Epsilon:", data_dep_eps)

Data Independent Epsilon: 11.756462732485115
Data Dependent Epsilon: 0.9029013677789843
