<a href="https://colab.research.google.com/github/silviutroscot/Machine-Learning-Playground/blob/master/DifferentialPrivacy101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch

# we will build a simple database, with one column, which can be either 0 or 1
num_entries = 5000

db = torch.rand(num_entries) > 0.5


In [0]:
# build a list of databases which contain all the elements of the initial db
# excluding the element on the i-th position (i in range [0, num_entries - 1])

def generate_parallel_dbs(db):
    db_list = []
    for i in range(0, len(db)):
        db_list.append(torch.cat([db[0:i], db[i+1:]]))
    
    return db_list

# function which creates a database with size given as param, 
# and all the parallel dbs from it

def create_db_and_parallels(num_entries):
    db = torch.rand(num_entries) > 0.5
    pdb = generate_parallel_dbs(db)
    return (db, pdb)

Compute **L1 sensitivity** for a function, which is the biggest difference between the output of a query on a DB and the output of that function on the same DB from where we remove one element.

In [0]:
def compute_l1_sensitivity(function, db, parallel_dbs):
    max_difference = 0
    res = function(db)
    for parallel_db in parallel_dbs:
        temp = function(parallel_db)
        if torch.abs(res - temp) > max_difference:
            max_difference = res - temp
    
    return max_difference

def test_l1_sensitivity_on_random_db(function, num_entries=3000):
    (db, pdb) = create_db_and_parallels(num_entries)
    res = compute_l1_sensitivity(function, db, pdb)
    return res


In [9]:
test_l1_sensitivity_on_random_db(torch.sum, 30)

tensor(1)

## Run a test where we have the query as "is the sum of elements larger than 5" with 10 randomly generated databases of size 10

In [0]:
def query(db):
    return torch.sum(db) > 5

def test_dbs(no_of_dbs, size_of_db):
    result = []
    for i in range(0, no_of_dbs):
        (db, pdb) = create_db_and_parallels(10)
        res = compute_l1_sensitivity(query, db, pdb)
        result.append(res)
    return result


In [11]:
test_dbs(10, 10)

[0, 0, 0, 0, 0, 0, 0, 0, 0, tensor(1, dtype=torch.uint8)]

## Perform differential attacks against <u>sum</u>, <u>mean</u> and <u> threshold sum</u> queries. 

In [0]:
def sum_query(db):
    return torch.sum(db)

def attack_sum_function(no_of_entries, entry_id):
    
    if (entry_id > no_of_entries):
            return "invalid call"
    (db, pdb) = create_db_and_parallels(no_of_entries)
    print (db)
    parallel_db = torch.cat([db[0:entry_id], db[entry_id+1:]])
    # if the sum of the db without that entry is equal with the sum of the full
    # db, it means that this value is 0; otherwise it is 1
    if (sum_query(db) == sum_query(parallel_db)):
        return 0
    else:
        return 1
    

In [13]:
attack_sum_function(10, 3)

tensor([0, 1, 0, 1, 1, 0, 1, 0, 0, 1], dtype=torch.uint8)


1

In [0]:
def mean_query(db):
    return torch.mean(db.type(torch.FloatTensor))

def attack_mean_function(no_of_entries, entry_id):
    
    if (entry_id > no_of_entries):
            return "invalid call"
    (db, pdb) = create_db_and_parallels(no_of_entries)
    print (db)
    parallel_db = torch.cat([db[0:entry_id], db[entry_id+1:]])
    if (mean_query(db) > mean_query(parallel_db)):
        return 1
    else:
        return 0

In [15]:
attack_mean_function(10, 3)

tensor([1, 0, 0, 1, 1, 0, 0, 1, 0, 1], dtype=torch.uint8)


1

In [0]:
def threshold_query(db):
    return torch.sum(db) > 5

def attack_threshold_function(no_of_entries, entry_id):
    
    if (entry_id > no_of_entries):
            return "invalid call"
    (db, pdb) = create_db_and_parallels(no_of_entries)
    print (db)
    parallel_db = torch.cat([db[0:entry_id], db[entry_id+1:]])
    if (threshold_query(db) > threshold_query(parallel_db)):
        return 1
    else:
        return "unknown"

In [17]:
attack_threshold_function(10, 2)

tensor([1, 0, 0, 0, 0, 0, 1, 1, 1, 0], dtype=torch.uint8)


'unknown'

# Implement local privacy by adding the noise to the input, following this pattern:
## For each entry in the DB we toss a coin (with possible outcomes 0 or 1); 
## If the value is 1, we let the value unaltered;
## If the value is 0, we toss one more time and we put the value of the toss as the value of that entry

In [0]:
def generate_locally_private_db(db, randomness_probability):
    result = []
    for entry in db:
        first_toss = (torch.rand(1) > randomness_probability).float()
        if first_toss is 1:
            result.append(entry)
        else:
            second_toss = (torch.rand(1) > 0.5).float()
            result.append(second_toss)
    return torch.IntTensor(result)

def compare_locally_private_and_raw_db_l1_sensitivity(function, db_size, randomness_probability):
    (raw_db, parallel_raw_dbs) = create_db_and_parallels(db_size)
    raw_db_function_value = function(raw_db)
    randomized_db = generate_locally_private_db(raw_db, randomness_probability)
    noisy_db_function_value = function(randomized_db)
    parallel_randomized_dbs = generate_parallel_dbs(randomized_db)
    raw_sensitivity = compute_l1_sensitivity(function, raw_db, parallel_raw_dbs)
    randomized_sensitivity = compute_l1_sensitivity(function, randomized_db, parallel_randomized_dbs)
    
    return [raw_sensitivity, randomized_sensitivity, raw_db_function_value, noisy_db_function_value]

def display_local_privacy_result(results):
    print ("sensitivity of unaltered data: " + str(results[0]))
    print ("sensitivity of noisy data: " + str(results[1]))
    print ("query result on the unaltered data: " + str(results[2]))
    print ("query result of the noisy data: " + str(results[3]))

In [163]:
print(compare_locally_private_and_raw_db_l1_sensitivity(mean_query, 10, 0.5))

[tensor(-0.0778), tensor(0.0778), tensor(0.7000), tensor(0.3000)]


In [99]:
display_local_privacy_result (compare_locally_private_and_raw_db_l1_sensitivity(mean_query, 100, 0.5))

sensitivity of unaltered data: tensor(0.0056, dtype=torch.float64)
sensitivity of noisy data: tensor(0.0052, dtype=torch.float64)
query result on the unaltered data: tensor(0.4500, dtype=torch.float64)
query result of the noisy data: tensor(0.4900, dtype=torch.float64)


In [100]:
display_local_privacy_result (compare_locally_private_and_raw_db_l1_sensitivity(mean_query, 1000, 0.5))

sensitivity of unaltered data: tensor(0.0005, dtype=torch.float64)
sensitivity of noisy data: tensor(-0.0005, dtype=torch.float64)
query result on the unaltered data: tensor(0.4900, dtype=torch.float64)
query result of the noisy data: tensor(0.5010, dtype=torch.float64)


In [165]:
display_local_privacy_result (compare_locally_private_and_raw_db_l1_sensitivity(mean_query, 10000, 0.5))

sensitivity of unaltered data: tensor(-5.0962e-05)
sensitivity of noisy data: tensor(5.0962e-05)
query result on the unaltered data: tensor(0.5097)
query result of the noisy data: tensor(0.4903)


## The approach above showed how the query behaves when we add noise with a probability of 0.5. 
### Below we will analyze what happens when the proability of adding noise is not 0.5 but it is changeable, and we will provide a function which takes the result of the mean function applied on the noisy data and apply a "reverse" statistical function to get the correct result, if the randomness follows the expected distribution.
For the mean function, the value is 
$$ probabilityPersonDidTheEvent = \frac{noisyMean - 0.5noiseProbability}{1-noiseProbability} $$

In [0]:
def get_mean_result_from_nosy_data_mean_result(noisy_result, noise_probability):
    return float(noisy_result - 0.5*noise_probability)/float(1-noise_probability)

In [0]:
def display_all_local_privacy_data(db_size, noise_probability):
    db_result = compare_locally_private_and_raw_db_l1_sensitivity(mean_query, db_size, noise_probability)
    display_local_privacy_result(db_result)
    print (get_mean_result_from_nosy_data_mean_result(db_result[3], noise_probability))

In [175]:
display_all_local_privacy_data(100, 0.5)

sensitivity of unaltered data: tensor(0.0052)
sensitivity of noisy data: tensor(-0.0053)
query result on the unaltered data: tensor(0.4900)
query result of the noisy data: tensor(0.5200)
0.5399999618530273


In [176]:
display_all_local_privacy_data(10000, 0.5)

sensitivity of unaltered data: tensor(4.9710e-05)
sensitivity of noisy data: tensor(5.0008e-05)
query result on the unaltered data: tensor(0.5033)
query result of the noisy data: tensor(0.5001)
0.5002000331878662
