# CS211 Final Project 
Stanhope Nwosu and Clasby Chope 

Performing workload queries and comparing differential privacy mechanisms on a dataset of 911 calls in Montgomery County, PA

In [18]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from datetime import datetime

from scipy import stats
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever whitegrid style this
# installation provides instead of hard-coding the legacy name.
plt.style.use(next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
))

def laplace_mech(v, sensitivity, epsilon):
    """Laplace mechanism for a single value.

    Returns ``v`` plus one sample of zero-centred Laplace noise whose scale
    is ``sensitivity / epsilon``.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def laplace_mech_vec(vec, sensitivity, epsilon):
    """Laplace mechanism applied element-wise to a vector.

    Each element of ``vec`` receives its own independent Laplace sample with
    scale ``sensitivity / epsilon``. The result is a new list in the same
    order as ``vec``.
    """
    noise_scale = sensitivity / epsilon
    noisy = []
    for value in vec:
        # One fresh draw per element, taken in input order.
        noisy.append(value + np.random.laplace(loc=0, scale=noise_scale))
    return noisy

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Gaussian mechanism for a single value.

    Returns ``v`` plus one zero-mean normal sample with standard deviation
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
    """
    sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Gaussian mechanism applied element-wise to a vector.

    Every element of ``vec`` gets an independent zero-mean normal sample with
    standard deviation ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
    The result is a new list in the same order as ``vec``.
    """
    # The standard deviation is the same for every element, so compute it once.
    sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    noisy = []
    for value in vec:
        noisy.append(value + np.random.normal(loc=0, scale=sigma))
    return noisy

def gaussian_mech_RDP_vec(vec, sensitivity, alpha, epsilon):
    """Gaussian mechanism for a vector, parameterised by (alpha, epsilon).

    The noise variance is ``sensitivity**2 * alpha / (2 * epsilon)``; each
    element of ``vec`` gets an independent zero-mean normal sample with that
    variance. The result is a new list in the same order as ``vec``.
    """
    variance = (sensitivity**2 * alpha) / (2 * epsilon)
    sigma = np.sqrt(variance)
    noisy = []
    for value in vec:
        noisy.append(value + np.random.normal(loc=0, scale=sigma))
    return noisy

def pct_error(orig, priv):
    """Percent error of a private answer relative to the true answer.

    Computes ``|orig - priv| / |orig| * 100``. The denominator uses the
    magnitude of ``orig`` so the result is non-negative even when the true
    value is negative. Works on scalars and on numpy arrays (element-wise).

    ``orig == 0`` is not special-cased: numpy inputs yield ``inf``/``nan``
    and plain Python numbers raise ``ZeroDivisionError``.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Download the raw 911 call log and keep only rows without missing values.
CALLS_CSV_URL = 'https://raw.githubusercontent.com/stan0fHope/CS211Projecct/main/911.csv'
call = pd.read_csv(CALLS_CSV_URL, low_memory=False)
calls = call.dropna()

# The second whitespace-separated token of each timeStamp is the clock time.
times = [stamp.split()[1] for stamp in calls['timeStamp']]

# Convert the first two ':'-separated fields of each clock time into
# first * 60 + second (minutes, when the fields are hours and minutes).
minutes = [
    int(fields[0]) * 60 + int(fields[1])
    for fields in (clock.split(':') for clock in times)
]

# Store the derived values as a new 'mnt' column at position 9.
calls.insert(loc=9, column='mnt', value=minutes)



## Range Queries

A *range query* counts the number of rows in the dataset which have a value lying in a given range. For example, "how many calls were made between minute 480 and minute 540 of the day?" is a range query. A *workload* of range queries is just a list of range queries. The code below generates 100 random range queries for each of the latitude (`lat`), longitude (`lng`), and minute-of-day (`mnt`) columns of the 911 calls dataset.

In [29]:
def range_query(df, col, a, b):
    """Count the rows of ``df`` whose ``col`` value lies in ``[a, b)``.

    The lower bound ``a`` is inclusive and the upper bound ``b`` is exclusive.
    """
    column = df[col]
    at_least_a = column >= a
    below_b = column < b
    return len(df.loc[at_least_a & below_b])

# Each workload is 100 random (lower, upper) range queries on one column.
# A lower bound is drawn first; the matching upper bound is then drawn from
# [lower, column maximum used below], so upper >= lower always holds.

# lat values are floats; integer lower bounds are drawn from [25, 40]
random_lat_bounds = [random.randint(25, 40) for _ in range(100)]

# lng: same idea, but with negative bounds; lower bounds come from [-100, -70]
random_lng_bounds = [random.randint(-100, -70) for _ in range(100)]

# mnt: minute of the day the call was made; lower bounds come from [0, 1000]
random_mnt_bounds = [random.randint(0, 1000) for _ in range(100)]

lat_workload = [(lb, random.randint(lb, 50)) for lb in random_lat_bounds]
lng_workload = [(lb, random.randint(lb, -65)) for lb in random_lng_bounds]
# The minute workload must use the minute lower bounds. Building it from the
# longitude bounds made every query start at a negative value.
mnt_workload = [(lb, random.randint(lb, 1500)) for lb in random_mnt_bounds]

# True (non-private) answers, used later as the baseline for measuring error.
real_lat = [range_query(calls, 'lat', lb, ub) for (lb, ub) in lat_workload]
real_lng = [range_query(calls, 'lng', lb, ub) for (lb, ub) in lng_workload]
real_mnt = [range_query(calls, 'mnt', lb, ub) for (lb, ub) in mnt_workload]


In [30]:
def workload_laplace(workload, epsilon, col):
    """Answer a workload of range queries on ``calls`` one query at a time.

    Every ``(lower, upper)`` pair in ``workload`` is answered with
    ``range_query`` on column ``col`` and then noised separately with
    ``laplace_mech``. ``len(workload)`` is passed as the sensitivity, so each
    answer gets Laplace noise of scale ``len(workload) / epsilon``. That is
    the scale obtained by splitting ``epsilon`` evenly across the queries
    under sequential composition, with each count having sensitivity 1.

    Returns the list of noisy counts in workload order.
    """
    n_queries = len(workload)
    return [
        laplace_mech(range_query(calls, col, lower, upper), n_queries, epsilon)
        for (lower, upper) in workload
    ]

In [31]:
## Error check for the per-query Laplace workload
# Absolute error of every noisy answer against the true count, per column.
lat_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_lat, workload_laplace(lat_workload, 1.0, 'lat'))]
lng_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_lng, workload_laplace(lng_workload, 1.0, 'lng'))]
mnt_errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_mnt, workload_laplace(mnt_workload, 1.0, 'mnt'))]

# With 100 queries and epsilon = 1.0 the noise scale is 100, and the mean
# absolute value of Laplace noise equals its scale, so each average should
# land around 100; the 70-150 window allows for sampling variation.
print('Average absolute error:', np.mean(lat_errors))
assert np.mean(lat_errors) > 70
assert np.mean(lat_errors) < 150

print('Average absolute error:', np.mean(lng_errors))
assert np.mean(lng_errors) > 70
assert np.mean(lng_errors) < 150

# Check the minute workload's own errors (not the longitude errors again).
print('Average absolute error:', np.mean(mnt_errors))
assert np.mean(mnt_errors) > 70
assert np.mean(mnt_errors) < 150


Average absolute error: 105.06688636869605
Average absolute error: 96.73449170430092
Average absolute error: 82.9742425092313


In [32]:
def workload_laplace_vec(workload, epsilon, col):
    """Answer a workload of range queries on ``calls`` as one noisy vector.

    All true counts on column ``col`` are computed first, then noised in a
    single ``laplace_mech_vec`` call. ``len(workload)`` is passed as the L1
    sensitivity of the answer vector (one per query, summed over the vector).

    Returns the list of noisy counts in workload order.
    """
    l1_sensitivity = len(workload)
    true_counts = []
    for query in workload:
        # query[0] is the inclusive lower bound, query[1] the exclusive upper.
        true_counts.append(range_query(calls, col, query[0], query[1]))
    return laplace_mech_vec(true_counts, l1_sensitivity, epsilon)

In [33]:
## Error check for the vectorised Laplace workload
# Absolute error of every noisy answer against the true count, per column.
lat_errors = [abs(truth - noisy)
              for truth, noisy in zip(real_lat, workload_laplace_vec(lat_workload, 1.0, 'lat'))]
lng_errors = [abs(truth - noisy)
              for truth, noisy in zip(real_lng, workload_laplace_vec(lng_workload, 1.0, 'lng'))]
mnt_errors = [abs(truth - noisy)
              for truth, noisy in zip(real_mnt, workload_laplace_vec(mnt_workload, 1.0, 'mnt'))]

# Report each column's mean absolute error and require it to fall in (70, 150).
for column_errors in (lat_errors, lng_errors, mnt_errors):
    mean_abs_error = np.mean(column_errors)
    print('Average absolute error:', mean_abs_error)
    assert 70 < mean_abs_error < 150

Average absolute error: 105.59964772119372
Average absolute error: 105.37124006950991
Average absolute error: 90.22419594796342


In [34]:
def workload_gaussian_vec(workload, epsilon, delta, col):
    """Answer a workload of range queries on ``calls`` with Gaussian noise.

    All true counts on column ``col`` are computed first, then noised in a
    single ``gaussian_mech_vec`` call. ``sqrt(len(workload))`` is passed as
    the L2 sensitivity of the answer vector.

    Returns the list of noisy counts in workload order.
    """
    l2_sensitivity = np.sqrt(len(workload))
    true_counts = []
    for query in workload:
        # query[0] is the inclusive lower bound, query[1] the exclusive upper.
        true_counts.append(range_query(calls, col, query[0], query[1]))
    return gaussian_mech_vec(true_counts, l2_sensitivity, epsilon, delta)

In [35]:
# Absolute error of every Gaussian-noised answer against the true count.
lat_errors = [abs(truth - noisy)
              for truth, noisy in zip(real_lat, workload_gaussian_vec(lat_workload, 1.0, 1e-5, 'lat'))]
lng_errors = [abs(truth - noisy)
              for truth, noisy in zip(real_lng, workload_gaussian_vec(lng_workload, 1.0, 1e-5, 'lng'))]
mnt_errors = [abs(truth - noisy)
              for truth, noisy in zip(real_mnt, workload_gaussian_vec(mnt_workload, 1.0, 1e-5, 'mnt'))]

# Report each column's mean absolute error and require it to fall in (10, 100).
for column_errors in (lat_errors, lng_errors, mnt_errors):
    mean_abs_error = np.mean(column_errors)
    print('Average absolute error:', mean_abs_error)
    assert 10 < mean_abs_error < 100

Average absolute error: 39.40203285481954
Average absolute error: 38.5722074568039
Average absolute error: 38.85490573592798
