# CS211 Final Project 
Stanhope Nwosu and Clasby Chope 

Preforming workload queries and comparing methods of differential privacy on dataset of 911 calls in Montgomery Country PA

In [1]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from datetime import datetime

from scipy import stats
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def laplace_mech_vec(vec, sensitivity, epsilon):
    return [v + np.random.laplace(loc=0, scale=sensitivity / epsilon) for v in vec]

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

def gaussian_mech_RDP_vec(vec, sensitivity, alpha, epsilon):
    sigma = np.sqrt((sensitivity**2 * alpha) / (2 * epsilon))
    return [v + np.random.normal(loc=0, scale=sigma) for v in vec]

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

call = pd.read_csv('https://raw.githubusercontent.com/stan0fHope/CS211Projecct/main/911.csv', low_memory=False)
calls = call.dropna() #removing N/A from csv
times = [d.split()[1] for d in calls['timeStamp']]
minutes = [int(d.split(':')[0])*60 + int(d.split(':')[1]) for d in times]
calls.insert(9, 'mnt', minutes) #converted time to minutes and added to calls



## Range Queries

A *range query* counts the number of rows in the dataset which have a value lying in a given range. For example, "how many participants are between the ages of 21 and 33?" is a range query. A *workload* of range queries is just a list of range queries. The code below generates 100 random range queries over ages in the adult dataset.

In [4]:
def range_query(df, col, a, b):
    return len(df[(df[col] >= a) & (df[col] < b)])

#can cahange for more ranges (18036, 19525) outlier 36107, 77316
random_zip_bounds = [random.randint(18000, 19000) for _ in range(100)]

#are float types tho so i cut 3 from top/bot
random_lat_bounds = [random.randint(25, 40) for _ in range(100)]

#same for lat, but neg (-95, -74), make neg
random_lng_bounds = [random.randint(-100, -70) for _ in range(100)]

#for minutes the call was made from 0-1000
random_mnt_bounds = [random.randint(0, 1000) for _ in range(100)]

zip_workload = [(lb, random.randint(lb, 20000)) for lb in random_zip_bounds]
lat_workload = [(lb, random.randint(lb, 50)) for lb in random_lat_bounds]
lng_workload = [(lb, random.randint(lb, -65)) for lb in random_lng_bounds]
mnt_workload = [(lb, random.randint(lb, 1500)) for lb in random_lng_bounds]

print('First 5 queries: ', zip_workload[:5])
real_zip = [range_query(calls, 'zip', lb, ub) for (lb, ub) in zip_workload]
real_lat = [range_query(calls, 'lat', lb, ub) for (lb, ub) in lat_workload]
real_lng = [range_query(calls, 'lng', lb, ub) for (lb, ub) in lng_workload]
real_mnt = [range_query(calls, 'mnt', lb, ub) for (lb, ub) in mnt_workload]


First 5 queries:  [(18645, 19404), (18240, 18548), (18735, 18804), (18845, 18884), (18498, 19145)]


In [14]:
def workload_laplace(workload, epsilon, col):
    ans_list = []#apply range query
    for work in workload: #seq comp
        a, b = work
        rng = range_query(calls, col, a, b)
        noised = laplace_mech(rng, len(workload), epsilon)
        ans_list.append(noised)
    return ans_list

print('First 4 answers:', workload_laplace(zip_workload, 1.0, 'zip')[:4])
print('First 4 answers:', workload_laplace(lat_workload, 1.0, 'lat')[:4])
print('First 4 answers:', workload_laplace(lng_workload, 1.0, 'lng')[:4])
print('First 4 answers:', workload_laplace(mnt_workload, 1.0, 'mnt')[:4])

First 4 answers: [40931.92040302613, 201.0283079179485, -119.19377151572057, -44.77093669031614]
First 4 answers: [75823.46569109162, -100.55182117006771, 75638.2127884444, -321.5158702355202]
First 4 answers: [-61.27166804626181, -73.04413976615689, 75712.14708511179, 75736.53232025153]
First 4 answers: [75666.3332243104, -117.19095142979894, 47200.805788547696, 221.68826768360935]


In [7]:
##Our errors testing
zip_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_zip, workload_laplace(zip_workload, 1.0, 'zip'))]
lat_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_lat, workload_laplace(lat_workload, 1.0, 'lat'))]
lng_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_lng, workload_laplace(lng_workload, 1.0, 'lng'))]
mnt_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_mnt, workload_laplace(mnt_workload, 1.0, 'mnt'))]


print('Average absolute error:', np.mean(zip_errors))
assert np.mean(zip_errors) > 70
assert np.mean(zip_errors) < 150

print('Average absolute error:', np.mean(lat_errors))
assert np.mean(lat_errors) > 70
assert np.mean(lat_errors) < 150

print('Average absolute error:', np.mean(lng_errors))
assert np.mean(lng_errors) > 70
assert np.mean(lng_errors) < 150

print('Average absolute error:', np.mean(mnt_errors))
assert np.mean(lng_errors) > 70
assert np.mean(lng_errors) < 150


Average absolute error: 96.40110837593751
Average absolute error: 93.63748162680842
Average absolute error: 117.57134866281355
Average absolute error: 88.82004825615105


In [8]:
def workload_laplace_vec(workload, epsilon, col):
    L1 = len(workload)
    #L1 sens is sum of vector sens
    rng = [range_query(calls, col, work[0], work[1]) for work in workload] #1st of tuple
    noise = laplace_mech_vec(rng, L1, epsilon)
    return noise

print('First 4 answers:', workload_laplace_vec(zip_workload, 1.0, 'zip')[:4])
print('First 4 answers:', workload_laplace_vec(lat_workload, 1.0, 'lat')[:4])
print('First 4 answers:', workload_laplace_vec(lng_workload, 1.0, 'lng')[:4])
print('First 4 answers:', workload_laplace_vec(mnt_workload, 1.0, 'mnt')[:4])

First 4 answers: [41208.94792892701, -61.10692887059906, 235.6187204989514, 95.0833452331183]
First 4 answers: [75544.0556209569, 150.73767925777605, 75696.18692518199, -261.59274964147954]
First 4 answers: [19.725250611053266, -129.10371333695034, 75696.9625141613, 75763.0881600042]
First 4 answers: [75680.83781523265, 102.54716412002844, 47116.167965726345, -143.99319181018885]


In [9]:
##Our errors testing
zip_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_zip, workload_laplace_vec(zip_workload, 1.0, 'zip'))]
lat_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_lat, workload_laplace_vec(lat_workload, 1.0, 'lat'))]
lng_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_lng, workload_laplace_vec(lng_workload, 1.0, 'lng'))]
mnt_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_mnt, workload_laplace_vec(mnt_workload, 1.0, 'mnt'))]

print('Average absolute error:', np.mean(zip_errors))
assert np.mean(zip_errors) > 70
assert np.mean(zip_errors) < 150

print('Average absolute error:', np.mean(lat_errors))
assert np.mean(lat_errors) > 70
assert np.mean(lat_errors) < 150

print('Average absolute error:', np.mean(lng_errors))
assert np.mean(lng_errors) > 70
assert np.mean(lng_errors) < 150

print('Average absolute error:', np.mean(mnt_errors))
assert np.mean(mnt_errors) > 70
assert np.mean(mnt_errors) < 150

Average absolute error: 89.21495931071681
Average absolute error: 91.10213549022494
Average absolute error: 110.90083301247262
Average absolute error: 122.33553958806291


In [10]:
def workload_gaussian_vec(workload, epsilon, delta, col):
    L2 = np.sqrt(len(workload))
    rng = [range_query(calls, col, work[0], work[1]) for work in workload] #1st of tuple
    noise = gaussian_mech_vec(rng, L2, epsilon, delta)
    return noise

print('First 4 answers:', workload_gaussian_vec(zip_workload, 1.0, 1e-5, 'zip')[:4])
print('First 4 answers:', workload_gaussian_vec(lat_workload, 1.0, 1e-5, 'lat')[:4])
print('First 4 answers:', workload_gaussian_vec(lng_workload, 1.0, 1e-5, 'lng')[:4])
print('First 4 answers:', workload_gaussian_vec(mnt_workload, 1.0, 1e-5, 'mnt')[:4])

First 4 answers: [41022.055251158046, -36.23016500784305, 29.275208073918222, -93.70486115391753]
First 4 answers: [75654.48505436491, 27.493887659649328, 75661.27792410192, -95.46958567376066]
First 4 answers: [9.846390107584169, 1.0433528446407714, 75693.1166493789, 75732.9208377977]
First 4 answers: [75783.05959529048, 25.86907875010975, 47150.65167019612, 91.27608485078548]


In [11]:
zip_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_zip, workload_gaussian_vec(zip_workload, 1.0, 1e-5,'zip'))]
lat_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_lat, workload_gaussian_vec(lat_workload, 1.0, 1e-5,'lat'))]
lng_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_lng, workload_gaussian_vec(lng_workload, 1.0, 1e-5,'lng'))]
mnt_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_mnt, workload_gaussian_vec(mnt_workload, 1.0, 1e-5,'mnt'))]

print('Average absolute error:', np.mean(zip_errors))
assert np.mean(zip_errors) > 10
assert np.mean(zip_errors) < 100

print('Average absolute error:', np.mean(lat_errors))
assert np.mean(lat_errors) > 10
assert np.mean(lat_errors) < 100

print('Average absolute error:', np.mean(lng_errors))
assert np.mean(lng_errors) > 10
assert np.mean(lng_errors) < 100

print('Average absolute error:', np.mean(mnt_errors))
assert np.mean(mnt_errors) > 10
assert np.mean(mnt_errors) < 100

Average absolute error: 39.84022705230033
Average absolute error: 40.57695660966696
Average absolute error: 36.968889266718826
Average absolute error: 43.369395976052154
