Data Privacy Final Project by Sage Hahn
Implementation based on "Plausible Deniability for Privacy-Preserving Data Synthesis" (https://arxiv.org/pdf/1708.07975.pdf)

Using data from the NIST 2018 Differential Privacy Synthetic Data Challenge: https://www.nist.gov/ctl/pscr/funding-opportunities/prizes-challenges/2018-differential-privacy-synthetic-data-challenge

Project goal: to generate differentially private synthetic data from an original dataset.

In [1]:
from utils import load_data, flip, convert
from sklearn.model_selection import train_test_split

import numpy as np
import pickle

from config import config
from structure_learn import learn_structure
from param_learn import learn_cond_marginals
from generate_data import generate_fake_data

In [2]:
data, names, encoders = load_data()

# Collect the distinct values found along each entry of the first axis of
# `data`. This is done on the full dataset, before the train/test split, so
# that values appearing only in the held-out portion are still included.
# NOTE(review): assumes the first axis of `data` indexes attributes (columns)
# at this point -- confirm against load_data.
unique_vals = [
    list(np.unique(data[attr_idx]))
    for attr_idx in range(len(data))
]

# Swap the first two axes before splitting, so the split is taken along what
# was originally the second axis.
data = np.swapaxes(data, 0, 1)
train, test = train_test_split(
    data,
    test_size=config['test_size'],
    random_state=config['ran_state'],
)

# NOTE(review): `flip` comes from utils; presumably it restores the layout
# expected by the learning steps -- confirm.
train = flip(train)

In [3]:
# Learn the dependency structure from the training split. The two results,
# `parents` and `order`, are passed on to learn_cond_marginals and
# generate_fake_data in the following cells.
# NOTE(review): presumably `parents` gives each attribute's parent set and
# `order` the attribute ordering used for sampling -- confirm in structure_learn.
parents, order = learn_structure(train, unique_vals)

In [4]:
# Learn the model parameters from the training split, given the structure
# (`parents`, `order`) found above. The result is passed to generate_fake_data.
# NOTE(review): presumably `count_dicts` holds per-attribute conditional
# counts keyed by parent values -- confirm in param_learn.
count_dicts = learn_cond_marginals(train, parents, unique_vals, order)

In [5]:
# Generate the synthetic records from the learned structure and parameters.
# `to_release` is what gets measured with len() and pickled to disk below.
# NOTE(review): the captured output suggests this call prints a progress
# counter every 1000 samples -- confirm in generate_data.
to_release = generate_fake_data(train, order, parents, count_dicts, unique_vals)

generating samples


  probs = probs / np.sum(probs)
  new_val = np.random.choice(unique_vals[i], p=list(probs))


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [6]:
# Sanity check: show how many items were generated.
len(to_release)

10000

In [7]:
# Persist the generated records to disk so they can be reloaded later
# without rerunning the generation step.
with open('generated.pkl', 'wb') as out_file:
    pickle.dump(to_release, out_file)