In [1]:
from utils import get_sensitive_preprocessed_data, get_public_histogram
from dpsu_policy import run_policy
from dpsu_gw import run_gw
import numpy as np

# Load sensitive datasets

In [2]:
# Holds the result of get_sensitive_preprocessed_data for each sensitive
# dataset, keyed by dataset name; starts empty and is filled by the next cell.
sensitive_data_dict = dict()

In [3]:
# Attempt to load each sensitive dataset. The name is printed before the load
# so that any message or progress bar emitted during loading appears beneath it.
for sensitive_dataset_name in ('reddit', 'twitter', 'finance'):
    print(sensitive_dataset_name)
    sensitive_data_dict[sensitive_dataset_name] = get_sensitive_preprocessed_data(
        sensitive_dataset_name
    )

reddit
File data/reddit_cleaned.csv does NOT exist. Skipping...

twitter
File data/twitter_cleaned.csv does NOT exist. Skipping...

finance


100%|██████████| 1400465/1400465 [03:05<00:00, 7560.21it/s]


# Load public datasets

In [4]:
# Holds the result of get_public_histogram for each public dataset, keyed by
# dataset name; starts empty and is filled by the next cell.
public_data_dict = dict()

In [5]:
# Attempt to load each public dataset. The name is printed before the load so
# that any message or progress bar emitted during loading appears beneath it.
for public_dataset_name in ('imdb', 'covid', 'songs', 'wikipedia', 'enron'):
    print(public_dataset_name)
    public_data_dict[public_dataset_name] = get_public_histogram(
        public_dataset_name
    )

imdb


100%|██████████| 50000/50000 [00:12<00:00, 3917.62it/s]


covid
File data/covid_cleaned.csv does NOT exist. Skipping...

songs
File data/songs_cleaned.csv does NOT exist. Skipping...

wikipedia
File data/wikipedia_cleaned.csv does NOT exist. Skipping...

enron
File data/enron_cleaned.csv does NOT exist. Skipping...



# Experiments

## Settings

In [6]:
# Sensitive dataset every experiment below runs on.
# Valid names: 'reddit', 'twitter', 'finance'.
main_dataset = 'finance'

# Privacy parameters forwarded to both run_gw and run_policy
# (presumably the (epsilon, delta) differential-privacy budget -- confirm
# against those modules).
dp_epsilon = 3
dp_delta = np.exp(-10)

# NOTE(review): alpha is passed through unchanged to run_gw and run_policy;
# its meaning is defined there, not in this notebook.
alpha = 3

In [7]:
# Public dataset selection -- only the GW-KT run uses it.
# Valid names: 'imdb', 'covid', 'songs', 'wikipedia', 'enron'.
public_dataset = 'imdb' # must be a key of public_data_dict

In [8]:
# Only the Policy runs (LAPLACE and GAUSSIAN) use this value; it is passed as
# the second positional argument of run_policy.
# NOTE(review): the meaning of Delta_0 is defined in dpsu_policy -- confirm there.
Delta_0 = 100

In [9]:
# Resolve the configured dataset names to the objects loaded earlier.
# A name that was never loaded into the corresponding dict raises KeyError here.
# NOTE(review): the load cells report missing CSV files as "Skipping..."; what
# the loaders store for a skipped dataset is not visible in this notebook, so
# confirm the selected datasets actually loaded before trusting the results.
input_df = sensitive_data_dict[main_dataset]
public_df = public_data_dict[public_dataset]

## Run GW

In [10]:
# Plain GW run on the sensitive dataset: 'ci' mode, with None in the argument
# position where the GW-KT run supplies the public dataset.
# Left as the cell's final expression so the notebook shows whatever it returns.
run_gw(
    input_df,
    alpha,
    dp_epsilon,
    dp_delta,
    'ci',
    None,
)

100%|██████████| 1400465/1400465 [00:14<00:00, 96237.64it/s] 


49864

## Run GW-KT

In [11]:
# GW-KT run: same call as the plain GW run, but in 'kt-ci' mode and with the
# selected public dataset passed as the last argument.
# Left as the cell's final expression so the notebook shows whatever it returns.
run_gw(
    input_df,
    alpha,
    dp_epsilon,
    dp_delta,
    'kt-ci',
    public_df,
)

100%|██████████| 1400465/1400465 [00:15<00:00, 88381.95it/s] 


50470

## Run Policy Laplace

In [12]:
# Policy run with the 'LAPLACE' option, using the same privacy settings as the
# GW runs plus the Policy-only Delta_0.
# Left as the cell's final expression so the notebook shows whatever it returns.
run_policy(
    input_df,
    Delta_0,
    'LAPLACE',
    alpha,
    dp_epsilon,
    dp_delta,
)

100%|██████████| 1400465/1400465 [00:13<00:00, 104644.17it/s]


43774

## Run Policy Gaussian

In [13]:
# Policy run with the 'GAUSSIAN' option, using the same privacy settings as the
# GW runs plus the Policy-only Delta_0.
# Left as the cell's final expression so the notebook shows whatever it returns.
run_policy(
    input_df,
    Delta_0,
    'GAUSSIAN',
    alpha,
    dp_epsilon,
    dp_delta,
)

100%|██████████| 1400465/1400465 [00:31<00:00, 44622.06it/s]


41403