In [4]:
import psycopg2 as pg
import pandas as pd

# Database setup
host = "localhost"
database = "cdm"
user = "postgres"
password = %env PGPASSWORD
connection_string = "host={} dbname={} user={} password={}".format(host, database, user, password)

db = pg.connect(connection_string)

# Differential Privacy to a GROUP BY query
In previous experiments, we showed that perturbing a single count is enough to ensure differential privacy.
In this next experiment, we apply differential privacy to the results of a GROUP BY SQL query.


In [8]:
# The subset of SynPUF data we use only has these two concepts for gender
female_concept_id = 8532
male_concept_id = 8507

# Count the rows of `person` in each gender group
gender_count_query = """
SELECT p.gender_concept_id, COUNT(*)
FROM person p
GROUP BY p.gender_concept_id;"""

# Last expression of the cell, so the resulting frame is shown as its output
pd.read_sql(gender_count_query, con=db)

Unnamed: 0,gender_concept_id,count
0,8532,64347
1,8507,52005


So in our dataset, the counts are 64347 females and 52005 males.
Let's apply a simple perturbation to these counts using differential privacy.

In [18]:
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np

def run(button):
    """Button callback: run the perturbed gender count query and show it.

    Reads the current epsilon from ``slider``, fetches the two noisy counts
    and replaces the children of ``box`` with labels describing them.
    """
    noisy_female, noisy_male = multi_value_differential_privacy(
        gender_count_query, slider.value
    )
    labels = [
        widgets.Label("Female Count: "),
        widgets.Label(value=str(noisy_female)),
        widgets.Label("\nMale Count: "),
        widgets.Label(value=str(noisy_male)),
    ]
    box.children = labels
    

# Container whose children `run` replaces with the result labels
box = widgets.Box()

# Clicking the button invokes `run`
button = widgets.Button(description="Run Query")
button.on_click(run)

def multi_value_differential_privacy(query=gender_count_query, epsilon=1):
    """Run a two-group count query and perturb each count independently.

    Parameters
    ----------
    query : str
        SQL returning a ``count`` column with at least two rows. The first
        row is reported as the female count and the second as the male count.
    epsilon : float
        Privacy parameter; must be strictly positive. Each count receives
        Laplace noise with scale ``1 / epsilon``.

    Returns
    -------
    tuple
        ``(noisy_female_count, noisy_male_count)``, each rounded to a whole
        number.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive.
    """
    if epsilon <= 0:
        raise ValueError("epsilon must be positive, got {!r}".format(epsilon))

    # Run the query
    results = pd.read_sql(query, con=db)
    # NOTE(review): rows are taken by position. The default query has no
    # ORDER BY, so "row 0 is female" relies on the database's row order.
    counts = results['count']
    count_female = counts.iloc[0]
    count_male = counts.iloc[1]

    # Apply Laplacian randomness with $\lambda = \frac{1}{\epsilon}$
    # see https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.laplace.html
    # Draw one independent sample per group: reusing a single sample for both
    # counts would release their difference exactly. ``1.0`` forces true
    # division on Python 2, where ``1 / 2`` is 0 and would disable the noise.
    noise_female, noise_male = np.random.laplace(scale=1.0 / epsilon, size=2)

    # Round the number since having a floating point count doesn't really make sense
    return np.round(count_female + noise_female), np.round(count_male + noise_male)

# Epsilon selector read by `run` when the button is clicked
slider = widgets.FloatSlider(
    min=0.001, max=10, value=1, step=0.001, description='Epsilon'
)

# Show the controls in order: slider first, then the button
display(slider, button)

box

We have now applied differential privacy to both counts, adding uncertainty to each result.

## Groups on Groups on Groups

Now that we can apply differential privacy to two results in a GROUP BY, can we do the same thing for more groups, and for groups within those groups? How deep can we go? Can we narrow our query down to groups containing a single person?

In [39]:
# Group on gender and full date of birth and keep only the groups with fewer
# than three people. NOTE: p.time_of_birth is part of the GROUP BY but is not
# selected, so rows that look alike in the output can be separate groups.
query = """
    SELECT p.gender_concept_id, p.year_of_birth, p.month_of_birth, p.day_of_birth, COUNT(*)
    FROM person p
    GROUP BY p.year_of_birth, p.gender_concept_id, p.month_of_birth, p.day_of_birth, p.time_of_birth
    HAVING count(*) < 3
    ORDER BY p.year_of_birth;
"""

results = pd.read_sql(query, con=db)
# Parenthesised single-argument print gives the same output on Python 2 and
# also runs on Python 3, where the bare `print x` statement is a syntax error.
print(results)

   gender_concept_id  year_of_birth  month_of_birth  day_of_birth  count
0               8532           1974               4             1      2
1               8507           1975               5             1      2
2               8532           1976              10             1      2
3               8532           1977              12             1      2
4               8507           1979               1             1      2
5               8507           1979               6             1      1
6               8532           1982               8             1      2
7               8507           1982               9             1      2
8               8532           1982               7             1      1
9               8532           1982               1             1      2


We have been able to narrow the results down to a very precise set of individuals. How does perturbation hide these individuals?

In [75]:
def query_differential_privacy(query, epsilon=1):
    """Run a grouped count query and perturb every group's count.

    Parameters
    ----------
    query : str
        SQL whose result has ``gender_concept_id``, ``year_of_birth`` and
        ``count`` columns (any other columns are ignored).
    epsilon : float
        Privacy parameter; must be strictly positive. Each count receives an
        independent Laplace sample with scale ``1 / epsilon``.

    Returns
    -------
    list of tuple
        One ``(gender_concept_id, year_of_birth, noisy_count)`` tuple per
        result row, in result order. ``noisy_count`` is a non-negative int.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive.
    """
    if epsilon <= 0:
        raise ValueError("epsilon must be positive, got {!r}".format(epsilon))

    results = pd.read_sql(query, con=db)

    # 1.0 forces true division: on Python 2, ``1 / epsilon`` with an integer
    # epsilon > 1 evaluates to 0, i.e. a zero scale and no noise at all.
    scale = 1.0 / epsilon

    perturbed = []
    rows = zip(results['gender_concept_id'], results['year_of_birth'], results['count'])
    for gender, year, count in rows:
        noise = np.random.laplace(scale=scale)
        # Round to a whole number and fold negative values back to >= 0,
        # since a count can be neither fractional nor negative.
        new_count = int(np.abs(np.round(count + noise)))
        perturbed.append((gender, year, new_count))

    return perturbed


# Same SQL text as `query` above: group on gender and full date of birth and
# keep only the groups with fewer than three people (HAVING count(*) < 3).
# p.time_of_birth is grouped on but not selected.
query1 = """
    SELECT p.gender_concept_id, p.year_of_birth, p.month_of_birth, p.day_of_birth, COUNT(*)
    FROM person p
    GROUP BY p.year_of_birth, p.gender_concept_id, p.month_of_birth, p.day_of_birth, p.time_of_birth
    HAVING count(*) < 3
    ORDER BY p.year_of_birth;
"""

        
def run(button):
    """Button callback: run the perturbed multi-column query and print it.

    Reads the current epsilon from ``slider`` and prints one
    ``(gender_concept_id, year_of_birth, noisy_count)`` tuple per line.
    """
    results = query_differential_privacy(query1, slider.value)
    for row in results:
        # Parenthesised single-argument print gives the same output on
        # Python 2 and also runs on Python 3.
        print(row)

# Controls for the multi-column experiment
slider = widgets.FloatSlider(
    min=0.001, max=10, value=1, step=0.001, description='Epsilon'
)
box = widgets.Box()

# Clicking the button invokes `run`
button = widgets.Button(description="Run Query")
button.on_click(run)

# Show the controls in order: slider first, then the button
display(slider, button)

box

(8532, 1974, 2)
(8507, 1975, 4)
(8532, 1976, 2)
(8532, 1977, 4)
(8507, 1979, 1)
(8507, 1979, 3)
(8532, 1982, 5)
(8507, 1982, 2)
(8532, 1982, 1)
(8532, 1982, 1)


Using perturbation, we have adjusted the counts to mask the true values.

The takeaway from this experiment is that we can combine multiple columns to narrow the results down to a very small number of people, and differential privacy should still be able to protect their privacy, because the Laplace noise is calibrated to the L1 sensitivity of the count.

## Privacy Budget with multi-column GROUP BY

Can we protect these individuals, who are at risk of multi-variate discovery?

In [82]:
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go

# Initialise plotly for offline use inside the notebook
init_notebook_mode(connected=True)

iframe = None

# Number of query runs (n)
n = widgets.BoundedIntText(
    value=100,
    min=1,
    max=1000,
    description='n:',
)

# Privacy budget (\epsilon_{total})
budget = widgets.BoundedFloatText(
    value=10,
    min=1,
    description=r'$\epsilon_{total}$:',
)

# Progress bar, on a 0-100 scale
progress = widgets.FloatProgress(
    min=0,
    max=100,
    step=1,
    description='Progress:',
)

def multi_variate_differential_privacy(query, epsilon=1):
    """Run a narrowly targeted count query and return one perturbed count.

    The query is expected to isolate a single group. If it returns several
    rows, the count of the last row is the one that is perturbed and returned.

    Parameters
    ----------
    query : str
        SQL whose result has a ``count`` column.
    epsilon : float
        Privacy parameter for this single release; must be strictly positive.
        The Laplace noise has scale ``1 / epsilon``.

    Returns
    -------
    int
        The non-negative, rounded, noise-perturbed count.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive, or if the query returns no
        rows (there is then no count to perturb).
    """
    if epsilon <= 0:
        raise ValueError("epsilon must be positive, got {!r}".format(epsilon))

    results = pd.read_sql(query, con=db)
    if len(results) == 0:
        raise ValueError("query returned no rows; there is no count to perturb")

    true_count = results['count'].iloc[-1]

    # 1.0 forces true division: on Python 2, ``1 / epsilon`` with an integer
    # epsilon > 1 evaluates to 0, i.e. a zero scale and no noise at all.
    noise = np.random.laplace(scale=1.0 / epsilon)

    # Round to a whole number and fold negative values back to >= 0.
    return int(np.abs(np.round(true_count + noise)))

# Targeted query: the WHERE clause pins gender, year, month and day of birth
# to one specific combination, and HAVING keeps only groups with fewer than
# three people.
# NOTE(review): p.time_of_birth is grouped on but not selected, so this may
# still return more than one row -- confirm against the data.
query2 = """
    SELECT p.gender_concept_id, p.year_of_birth, p.month_of_birth, p.day_of_birth, COUNT(*)
    FROM person p
    WHERE p.year_of_birth=1979 and p.month_of_birth=6 and p.day_of_birth=1 and p.gender_concept_id=8507
    GROUP BY p.year_of_birth, p.gender_concept_id, p.month_of_birth, p.day_of_birth, p.time_of_birth
    HAVING count(*) < 3
    ORDER BY p.year_of_birth;
"""

def run(button):
    """Button callback: repeat the targeted query under a total privacy budget.

    The total budget from ``budget`` is split evenly over the ``n`` runs, so
    each run is perturbed with ``epsilon_i = budget / n``. The noisy counts
    are collected and plotted as a histogram, and ``progress`` is updated
    after every run.
    """
    runs = n.value
    epsilon_i = float(budget.value) / runs

    results = []
    for i in range(runs):
        results.append(multi_variate_differential_privacy(query2, epsilon_i))
        # 100.0 forces float arithmetic: on Python 2, ``(i + 1) / runs`` is
        # integer division and would stay at 0 until the final iteration.
        progress.value = 100.0 * (i + 1) / runs

    data = [go.Histogram(x=results, name="n = {}, budget = {}".format(runs, budget.value))]
    layout = go.Layout(
        title='Specific Count Attack Results',
        xaxis={'title':'Specific Count', 'tickangle': 300, 'exponentformat': 'none'},
        yaxis={'title':'Occurences'},
        showlegend=True,
        bargap=0.1)

    iplot({"data": data, "layout": layout})

# Clicking the button invokes `run`
button = widgets.Button(description="Run Attack")
button.on_click(run)

display(n)
display(budget)
display(button)
# `run` updates progress.value on every iteration, so the bar has to be on
# screen for that feedback to be visible.
display(progress)

This experiment shows that even for very specific attacks, which combine multiple columns to narrow the results down to individuals, a privacy budget can be applied to protect the people being targeted.