In [1]:
import psycopg2 as pg
import pandas as pd
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np
from collections import Counter

# Database setup
host = "localhost"
database = "cdm"
user = "postgres"
password = %env PGPASSWORD
connection_string = "host={} dbname={} user={} password={}".format(host, database, user, password)

db = pg.connect(connection_string)

## Diabetes and Differential Privacy

We have shown that single column counts can be perturbed to L1 norms.
Now we will look at diabetes as a case study for the usefulness of differential privacy. We will explore the effects of perturbing query results and how that perturbation affects statistical analysis.

##### Main Question: Does differentially private data still prove useful for accurate statistical analysis?

### Collecting diabetics

In [2]:
# Age histogram of people with a diabetes-related condition.
#
# Inner query ("ages"): one row per person, joining concept, person and
# condition_occurrence. It keeps conditions whose concept name contains
# 'Diabetes' or 'diabetes', with domain 'Condition' and class
# 'Clinical Finding', and takes the minimum over those occurrences of
# (year of condition_start_date - year_of_birth), i.e. the youngest age at
# which a matching condition was recorded for that person.
# Outer query: number of people per such age.
diabetics_query = """
    SELECT ages.age, COUNT(*)
    FROM (
    SELECT 
        p.person_id,
        min((EXTRACT (YEAR from con.condition_start_date)) - p.year_of_birth) as age
    FROM concept c, person p, condition_occurrence con
    WHERE
        con.person_id=p.person_id AND
        con.condition_concept_id=c.concept_id AND
        (c.concept_name LIKE '%Diabetes%' OR
        c.concept_name LIKE '%diabetes%')
        AND c.domain_id = 'Condition'
        AND c.concept_class_id = 'Clinical Finding'
    GROUP BY
        p.person_id) as ages
    GROUP BY ages.age;
    
"""
# Later cells read the columns 'age' and 'count'. COUNT(*) has no alias in the
# query, so 'count' is presumably the name the database assigns by default --
# NOTE(review): confirm, or alias it explicitly.
# There is no ORDER BY, so row order is whatever the database returns.
diabetics = pd.read_sql(diabetics_query, con=db)

In [8]:
# Plot the unperturbed age histogram using plotly in offline mode.
def age_of_diabetics():
    """Draw a bar chart of the original (unperturbed) diabetic age counts.

    Reads the module-level ``diabetics`` frame (columns ``'age'`` and
    ``'count'``) and renders the figure inline with ``iplot``.
    Returns nothing.
    """
    # Offline plotting has to be initialised in the notebook before iplot.
    init_notebook_mode(connected=True)

    # Original data
    diab_data = go.Bar(
        x=diabetics['age'],
        y=diabetics['count'],
        name="diabetics",
        marker=dict(color='#FFA500'),
        opacity=0.75)

    layout = go.Layout(
        # Only the diabetic cohort is plotted, so the title no longer
        # promises a comparison with non-diabetics.
        title='Age Distribution of Diabetics',
        xaxis={'title': 'Age', 'tickangle': 300, 'exponentformat': 'none'},
        yaxis={'title': 'Occurrences'},
        showlegend=True,
        bargap=0.1,
        barmode='overlay')

    fig = go.Figure(data=[diab_data], layout=layout)

    iplot(fig, filename='overlaid histogram')

In [23]:
def mean(dataframe):
    """Return the count-weighted mean age of an age histogram.

    Parameters
    ----------
    dataframe : pandas.DataFrame or mapping of column name -> sequence
        Must provide an ``'age'`` and a ``'count'`` column of equal length;
        each row states how many people have that age.

    Returns
    -------
    float
        ``sum(age * count) / sum(count)``.

    Raises
    ------
    ZeroDivisionError
        If the counts sum to zero (this includes empty input).
    """
    # Materialise the pairs once. On Python 3 ``zip`` is a one-shot iterator,
    # so walking it a second time would yield nothing and the total count
    # would silently come out as 0.
    pairs = list(zip(dataframe['age'], dataframe['count']))

    weighted_total = sum(age * count for age, count in pairs)
    total_count = sum(count for _, count in pairs)

    # Converting both operands to float forces true division on Python 2 even
    # for all-integer input, and makes a zero total raise instead of
    # producing inf/nan.
    return float(weighted_total) / float(total_count)

def perturb_age(counts, epsilon=1):
    """Return a copy of an age histogram with Laplace noise added to each count.

    Every count gets an independent draw from a Laplace distribution with
    scale ``1 / epsilon``; the noisy value is rounded to the nearest whole
    number and negative results are raised to 0. The input is not modified.

    Parameters
    ----------
    counts : pandas.DataFrame or mapping of column name -> sequence
        Must provide ``'age'`` and ``'count'`` columns of equal length.
    epsilon : float, optional
        Positive privacy parameter; smaller values mean more noise.

    Returns
    -------
    pandas.DataFrame
        Columns ``'age'`` (unchanged) and ``'count'`` (perturbed, float).

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive.
    """
    if not epsilon > 0:
        raise ValueError("epsilon must be a positive number, got %r" % (epsilon,))

    ages = list(counts['age'])
    true_counts = np.asarray(list(counts['count']), dtype=float)

    # 1.0 / epsilon keeps this a true division on Python 2: with an integer
    # epsilon > 1, ``1/epsilon`` would floor to 0 and add no noise at all.
    noise = np.random.laplace(scale=1.0 / epsilon, size=len(true_counts))

    # Counts cannot be fractional or negative, so round and clamp at zero.
    noisy_counts = np.maximum(np.round(true_counts + noise), 0)

    return pd.DataFrame({'age': ages, 'count': noisy_counts},
                        columns=['age', 'count'])

def run(epsilon):
    """Overlay the original age histogram with one perturbed copy of it.

    Draws a fresh perturbed frame via ``perturb_age(diabetics, epsilon)`` and
    plots it (blue) on top of the module-level ``diabetics`` counts (orange)
    with ``iplot``. Returns nothing.

    Parameters
    ----------
    epsilon : float
        Privacy parameter forwarded to ``perturb_age``.
    """
    # Initialise offline plotting here as well, so the function does not rely
    # on age_of_diabetics() having been run earlier in the same kernel.
    init_notebook_mode(connected=True)

    DP_diabetics = perturb_age(diabetics, epsilon)

    diab_data = go.Bar(
        x=diabetics['age'],
        y=diabetics['count'],
        name="Diabetics",
        marker=dict(color='#FFA500'),
        opacity=0.75)

    DP_diab_data = go.Bar(
        x=DP_diabetics['age'],
        y=DP_diabetics['count'],
        name="Diff Private Diabetics",
        marker=dict(color='#0000FF'),
        opacity=0.75)

    layout = go.Layout(
        title='Age Distribution of Diabetics vs Perturbed Diabetics',
        xaxis={'title': 'Age', 'tickangle': 300, 'exponentformat': 'none'},
        yaxis={'title': 'Occurrences'},
        showlegend=True,
        bargap=0.1,
        # 'overlay' draws both traces on the same bars; the 0.75 opacity keeps
        # the lower trace visible.
        barmode='overlay')

    fig = go.Figure(data=[diab_data, DP_diab_data], layout=layout)

    iplot(fig, filename='overlaid histogram')
    
def perturbed_ages():
    """Prompt repeatedly for an epsilon and plot the perturbed histogram.

    Each positive number entered is passed to ``run``. Entering ``stop`` ends
    the loop. Anything that is not a positive number is reported and the
    prompt is shown again instead of aborting the cell with a traceback.
    Returns nothing.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3, where raw_input was renamed to input

    while True:
        answer = read_line("Test value of Epsilon: ").strip()
        if answer == 'stop':
            break

        try:
            epsilon = float(answer)
        except ValueError:
            print("Please enter a positive number, or 'stop' to finish.")
            continue

        # Zero would divide by zero when the noise scale 1/epsilon is
        # computed; negative values and NaN are not meaningful either.
        if not epsilon > 0:
            print("Epsilon must be greater than zero.")
            continue

        run(epsilon)

In [24]:
from scipy.stats import pearsonr

# Fix the NumPy seed so that Restart & Run All redraws exactly the same noise
# and therefore the same box plots.
SIMULATION_SEED = 0
np.random.seed(SIMULATION_SEED)

# Number of perturbed copies drawn per epsilon; lower it on slower machines.
simulations = 50

# pearsonr reports p == 0 once the p-value underflows, and log(0) is -inf.
# Clamp such values to the smallest positive normal float (-log of it is
# about 708) so they plot at a fixed, explainable ceiling. Earlier this case
# was filled with a random integer between 600 and 690, which put made-up
# numbers into the figure.
P_VALUE_FLOOR = np.finfo(float).tiny

p_value_data = []
PearsonsR_data = []

# Epsilon sweep, dividing by 10 each round: 10, 1, 0.1, 0.01, 0.001, 0.0001.
# Powers of ten are computed directly so the box labels do not pick up
# floating-point residue from a running product.
for exponent in range(1, -5, -1):
    epsilon = 10.0 ** exponent
    R_coeffs = []
    p_values = []

    for _ in range(simulations):
        DP_diabetics = perturb_age(diabetics, epsilon)

        # Correlate the original counts with the perturbed counts.
        R_coeff, p_value = pearsonr(diabetics['count'], DP_diabetics['count'])

        R_coeffs.append(R_coeff ** 2)

        if p_value == 0:
            p_value = P_VALUE_FLOOR
        p_values.append(-np.log(p_value))

    trace_name = "ϵ: %s" % (epsilon)

    # One box per epsilon for each statistic.
    p_value_data.append(go.Box(y=p_values, name=trace_name, boxpoints='outliers'))
    PearsonsR_data.append(go.Box(y=R_coeffs, name=trace_name, boxpoints='outliers'))

In [25]:
def cor_simulation():
    """Plot the simulated -log(p-value) boxes, one per epsilon.

    Reads the module-level ``p_value_data`` list of box traces, draws them on
    a log-scaled y axis together with a horizontal line at ``-log(0.05)`` and
    a text label for it, and renders the figure with ``iplot``.
    Returns nothing.
    """
    # Initialise offline plotting here as well, so the function does not rely
    # on another plotting function having been run first.
    init_notebook_mode(connected=True)

    significance_threshold = -np.log(0.05)

    layout = go.Layout(
        title="<b>The Effects of a Decreasing Epsilon on Data Integrity</b><br>Evaluating with Pearson's Correlation<br>-Log(p-values) vs Epsilon",
        # No explicit `range`: the previous one-element list was not a valid
        # [min, max] pair, so the axis is left to autorange.
        yaxis=dict(
            type='log',
            title="Log Scale<br>-Log(p-value) from Pearson's R"
        ),
        xaxis=dict(
            title="Epsilon"
        ),
        shapes=[dict(
            type='line',
            # x runs from -1 to 6 so the line extends past both ends of the
            # boxes -- presumably positions 0..5 on the category axis.
            x0=-1,
            x1=6.0,
            y0=significance_threshold,
            y1=significance_threshold,
            line=dict(dash='solid', width=1, color="rgb(128, 0, 128)"))],
        annotations=[
            dict(
                x=0,
                # NOTE(review): plotly's annotation reference says positions
                # on a log axis are given as log10 of the value, which would
                # put y=.5 at roughly 3.16, just above the threshold line --
                # confirm against the rendered figure.
                y=.5,
                xref='x',
                yref='y',
                text='Significant P-value 0.05'
            )
        ]
    )

    fig = go.Figure(data=p_value_data, layout=layout)
    iplot(fig, filename="Epsilon P value comparison")

# Experiment Two
What is the effect of applying differential privacy on data quality?

In [39]:
# Baseline: bar chart of the original, unperturbed counts per age.
age_of_diabetics()

## Perturb the Data
We can draw random noise from the Laplace distribution and add it to the counts, perturbing the data to protect the people in the dataset.

In [44]:
# Interactive: keeps prompting for an epsilon and plotting the perturbed
# histogram until 'stop' is entered.
perturbed_ages()

Test value of Epsilon: 0.1


Test value of Epsilon: 0.01


Test value of Epsilon: 0.0001


Test value of Epsilon: stop


## How dissimilar is perturbed data?
We can put statistics to these visual differences. By comparing the distributions of the perturbed data to the original dataset using Pearson's Correlation, we can put a number on how different the perturbed data is.

In [45]:
# Box plots of -log(p-value) per epsilon, built from the traces that the
# simulation cell collected in p_value_data.
cor_simulation()