In [3]:
import os

import pandas as pd
import psycopg2 as pg

# Database setup
# Connection parameters for the local PostgreSQL server and the "cdm" database.
host = "localhost"
database = "cdm"
user = "postgres"
# Read the password from the environment with plain Python (instead of the
# IPython-only `%env` magic) so the secret never appears in the notebook and
# the cell also runs as a regular script. Raises KeyError if PGPASSWORD is unset.
password = os.environ["PGPASSWORD"]
# Kept only for any later cell that may still reference it; values are NOT
# escaped here, so it is not used to open the connection below.
connection_string = "host={} dbname={} user={} password={}".format(host, database, user, password)

# Pass the parameters as keywords so psycopg2 builds and quotes the DSN itself;
# a password containing spaces or quotes would break the hand-formatted string.
db = pg.connect(host=host, dbname=database, user=user, password=password)

## Diabetes and Differential Privacy

We have shown that single column counts can be perturbed to L1 norms.
Now we will look at diabetes as a case study for the usefulness of differential privacy. We will explore the effects of perturbing results, and how perturbing results affects the statistical analysis.

##### Main Question: Does differentially private data still prove useful for accurate statistical analysis?

### Collecting diabetics

In [4]:
# Age histogram of people with a diabetes-related condition.
# Inner query: for every person with at least one condition whose concept name
# contains 'Diabetes' or 'diabetes', take the smallest value of
# (year of condition_start_date - year_of_birth) over those conditions.
# Outer query: count how many people share each resulting age.
diabetics_query = """
    SELECT ages.age, COUNT(*)
    FROM (
    SELECT 
        p.person_id,
        min((EXTRACT (YEAR from con.condition_start_date)) - p.year_of_birth) as age
    FROM concept c, person p, condition_occurrence con
    WHERE
        con.person_id=p.person_id AND
        con.condition_concept_id=c.concept_id AND
        (c.concept_name LIKE '%Diabetes%' OR
        c.concept_name LIKE '%diabetes%')
    GROUP BY
        p.person_id) as ages
    GROUP BY ages.age;
    
"""
# Two-column frame: `age` and `count` (the unaliased COUNT(*)), as used by the
# plotting and perturbation cells further down.
diabetics = pd.read_sql(diabetics_query, con=db)

### Collecting the non-diabetics

In [5]:
# Age histogram of people who have NO diabetes-related condition.
# Inner query: for every person with at least one recorded condition and whose
# person_id is not returned by the diabetes sub-select, take the smallest value
# of (year of condition_start_date - year_of_birth).
# Outer query: count how many people share each resulting age.
not_diabetics_query = """
    SELECT ages.age, count(*)
    FROM (SELECT 
        p.person_id,
        min((EXTRACT (YEAR from con.condition_start_date)) - p.year_of_birth) as age
    FROM concept c, person p, condition_occurrence con
    WHERE
        (con.person_id=p.person_id AND
        con.condition_concept_id=c.concept_id) AND p.person_id NOT IN
    (SELECT p.person_id
    FROM concept c, person p, condition_occurrence con
    WHERE
        con.person_id=p.person_id AND
        con.condition_concept_id=c.concept_id AND
        (c.concept_name LIKE '%Diabetes%' OR
        c.concept_name LIKE '%diabetes%')
        )
    GROUP BY
        p.person_id) as ages
    GROUP BY ages.age;
"""

# Collect each age and the number of non-diabetic people at that age.
non_diabetics = pd.read_sql(sql=not_diabetics_query, con=db)

### Diabetic vs non-Diabetic age distributions

In [6]:
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode so figures render inline.
init_notebook_mode(connected=True)

iframe = None

# Original (unperturbed) histograms: one bar trace per cohort, semi-transparent
# so both remain visible when overlaid.
diab_data = go.Bar(x=diabetics['age'], y=diabetics['count'], name="diabetics", marker=dict(
        color='#FFA500'), opacity=0.75)
non_diab_data = go.Bar(x=non_diabetics['age'], y=non_diabetics['count'], name="non-diabetics", marker=dict(
        color='#0000FF'), opacity=0.75)

layout = go.Layout(
    title='Age Distribution of Diabetics vs Non-Diabetics', 
    xaxis={'title':'Age', 'tickangle': 300, 'exponentformat': 'none'}, 
    # Axis title spelling corrected (was "Occurences").
    yaxis={'title':'Occurrences'},
    showlegend=True,
    bargap=0.1,
    barmode='overlay')

data_all = [diab_data, non_diab_data]

fig = go.Figure(data=data_all, layout=layout)

iplot(fig, filename='overlaid histogram')

Now that we've shown the age distributions for the diabetic and non-diabetic populations, let's look at the differentially private versions of these distributions.

## Differentially Private Diabetics

In [8]:
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np
from collections import Counter

def mean(dataframe):
    """Return the count-weighted mean age of an age histogram.

    Parameters
    ----------
    dataframe : mapping of column name to sequence (e.g. pandas.DataFrame)
        Must provide an ``'age'`` column and a ``'count'`` column of equal
        length; each row means "``count`` people have this ``age``".

    Returns
    -------
    float
        ``sum(age * count) / sum(count)``.

    Raises
    ------
    ValueError
        If the counts sum to zero, since the mean is then undefined.
    """
    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator, so
    # iterating it a second time would silently yield nothing.
    pairs = list(zip(dataframe['age'], dataframe['count']))
    weighted_sum = sum(age * count for age, count in pairs)
    total_count = sum(count for _, count in pairs)

    if total_count == 0:
        raise ValueError("cannot compute the mean of a histogram whose counts sum to zero")

    # Explicit floats avoid floor division on integer inputs under Python 2.
    return float(weighted_sum) / float(total_count)

def perturb_age(counts, epsilon=1):
    """Return a noisy copy of an age histogram using Laplace noise.

    Independent Laplace noise with scale ``1 / epsilon`` is added to every
    count and the result is rounded to the nearest whole number. Noisy counts
    are not clamped, so they may be negative.

    Parameters
    ----------
    counts : mapping of column name to sequence (e.g. pandas.DataFrame)
        Must provide ``'age'`` and ``'count'`` columns of equal length.
    epsilon : float, optional
        Privacy parameter; smaller values add more noise. Must be > 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``'age'`` (unchanged) and ``'count'`` (perturbed and rounded),
        with a fresh default index.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive.
    """
    # ``not epsilon > 0`` also rejects NaN.
    if not epsilon > 0:
        raise ValueError("epsilon must be positive, got {!r}".format(epsilon))

    ages = np.asarray(counts['age'])
    true_counts = np.asarray(counts['count'], dtype=float)

    # 1.0 forces true division: under Python 2 an integer epsilon > 1 would
    # otherwise yield scale 0, i.e. no noise at all.
    noise = np.random.laplace(scale=1.0 / epsilon, size=true_counts.shape)
    noisy_counts = np.round(true_counts + noise)

    return pd.DataFrame({'age': ages, 'count': noisy_counts}, columns=['age', 'count'])

def run(button):
    """Button callback: overlay the true and a freshly perturbed diabetics histogram.

    Reads the current epsilon from the module-level ``slider``, perturbs the
    module-level ``diabetics`` histogram with ``perturb_age`` and renders both
    as overlaid bar charts.

    Parameters
    ----------
    button : ipywidgets.Button
        The clicked button, supplied by ``Button.on_click``; not used.
    """
    # A new noisy histogram is drawn on every click.
    DP_diabetics = perturb_age(diabetics, slider.value)
    
    diab_data = go.Bar(x=diabetics['age'], y=diabetics['count'], name="Diabetics", marker=dict(
        color='#FFA500'), opacity=0.75)

    DP_diab_data = go.Bar(x=DP_diabetics['age'], y=DP_diabetics['count'], name="Diff Private Diabetics", marker=dict(
        color='#0000FF'), opacity=0.75)

    layout = go.Layout(
        title='Age Distribution of Diabetics vs Perturbed Diabetics', 
        xaxis={'title':'Age', 'tickangle': 300, 'exponentformat': 'none'}, 
        # Axis title spelling corrected (was "Occurences").
        yaxis={'title':'Occurrences'},
        showlegend=True,
        bargap=0.1,
        barmode='overlay')

    data_all = [diab_data, DP_diab_data]

    fig = go.Figure(data=data_all, layout=layout)
    
    iplot(fig, filename='overlaid histogram')
    

# Epsilon selector read by `run` each time the button is clicked.
slider = widgets.FloatSlider(
    min=0.001,
    max=10,
    value=0.01,
    step=0.001,
    description='Epsilon',
)

# Clicking the button draws a new perturbed histogram via `run`.
button = widgets.Button(description="Run Query")
button.on_click(run)

box = widgets.Box()

# Show the controls in order: slider first, then the button.
display(slider, button)

box

## Statistical Comparison

So visually, these two distributions seem extremely similar. Even when the epsilon is very small (0.01), the perturbations don't seem to make a huge difference. But how low can we go?

Let's look at different values of epsilon and their effect on the distribution.

We will compare the perturbed distributions to the expected distribution using a two-tailed Student's t-test. We will assume that both distributions have the same population variance.

We will run simulations of the perturbations at decreasing epsilon values, running 100 simulations at each epsilon value and calculating the t-test score and p-value for each simulated distribution.


In [None]:
from scipy.stats import ttest_ind

def age_array(df):
    """Expand an age histogram into a flat list of individual ages.

    Each ``age`` is repeated ``int(count)`` times, in row order. Counts are
    truncated toward zero by ``int``; rows whose truncated count is zero or
    negative contribute nothing.

    Parameters
    ----------
    df : mapping of column name to sequence (e.g. pandas.DataFrame)
        Must provide ``'age'`` and ``'count'`` columns.

    Returns
    -------
    list
        One entry per counted individual.
    """
    expanded = []

    for age_value, occurrences in zip(df['age'], df['count']):
        # List repetition with a non-positive factor yields an empty list,
        # matching an empty ``range``.
        expanded.extend([age_value] * int(occurrences))

    return expanded

# Expand the unperturbed histogram once; it is the reference sample for every t-test.
diab = age_array(diabetics)

# Number of perturbed histograms drawn per epsilon value.
N_SIMULATIONS = 100

# Starting at epsilon 10, decrease epsilon by a factor of 10 for each series of
# simulations. Literal values keep the dictionary keys exact (0.1 / 0.1**2
# evaluates to 9.999999999999998, not 10).
EPSILONS = [10.0, 1.0, 0.1, 0.01, 0.001]

# epsilon -> list of (t_score, p_value) tuples, one tuple per simulation.
ttests = {}

for epsilon in EPSILONS:
    ttests[epsilon] = []

    for _ in range(N_SIMULATIONS):
        DP_diabetics = perturb_age(diabetics, epsilon)

        DP_diab = age_array(DP_diabetics)

        # equal_var=True: Student's t-test assuming equal population variances.
        t_score, p_value = ttest_ind(DP_diab, diab, equal_var=True)
        ttests[epsilon].append((t_score, p_value))

    # One summary line per epsilon instead of one line per simulation.
    p_values = [p for _, p in ttests[epsilon]]
    print("epsilon={}: median p-value={:.4f}, min p-value={:.4f}".format(
        epsilon, np.median(p_values), np.min(p_values)))
    
    
    