In [26]:
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import jensenshannon
import os

# Reference data that every generated dataset is compared against.
original_data = pd.read_csv('../data/acs-2019-nist/national/national2019.csv')

# os.listdir() returns names in arbitrary order; sort them so the dataset
# order (and everything derived from it: the printed per-dataset counts, the
# dataset used as the size reference) is the same on every run and machine.
generated_data_files = sorted(f for f in os.listdir('output_data') if f.endswith('.csv'))

# Columns holding continuous values (not referenced in this part of the file).
continuous_columns = ['AGEP', 'PINCP', 'DENSITY']

# Parameters of the Laplace noise term (scale = sensitivity / epsilon) that is
# currently commented out in the scoring loop.
epsilon = 0.1
sensitivity = 1.0 # TODO: adjust, although I think it is fine

# (file name, DataFrame) pairs, one per generated CSV, in sorted file order.
generated_datasets = [(file, pd.read_csv(f'output_data/{file}')) for file in generated_data_files]

def jensen_shannon_distance(original_series, generated_series):
    """Return the Jensen-Shannon distance between two columns' value frequencies.

    Each series is reduced to its table of relative frequencies
    (``value_counts(normalize=True)``, which by pandas' default does not count
    NaN entries). The two tables are aligned on the union of observed values,
    a value absent from one side getting probability 0 there, and are then
    passed to ``scipy.spatial.distance.jensenshannon``.

    The result is 0 for identical frequency tables. With SciPy's default
    (natural-log) base, completely disjoint tables give sqrt(ln 2) ~= 0.83,
    so the value always lies within [0, 1].
    """
    freq_original, freq_generated = (
        series.value_counts(normalize=True)
        for series in (original_series, generated_series)
    )

    # Outer-align so both probability vectors are indexed by the same values.
    freq_original, freq_generated = freq_original.align(freq_generated, fill_value=0)

    return jensenshannon(freq_original, freq_generated)

def calculate_total_distance(original_df, generated_df):
    """Sum the per-column Jensen-Shannon distances between two DataFrames.

    The columns of ``generated_df`` drive the comparison: each one is looked
    up by name in ``original_df`` and the pair is scored with
    ``jensen_shannon_distance``. Returns 0 when ``generated_df`` has no
    columns.
    """
    return sum(
        jensen_shannon_distance(original_df[column], generated_df[column])
        for column in generated_df.columns
    )

# Floor applied to every distance so the inverse-distance weight (1 / distance)
# computed below can never divide by zero (a perfect match has distance 0).
min_noisy_distance = 1e-6

# Score each generated dataset against the original data.
# Entries are (file name, DataFrame, clamped distance).
distances = []
for file, gen_data in generated_datasets:
    try:
        true_distance = calculate_total_distance(original_data, gen_data)
    except ValueError as e:
        print(f"skip {file} due to error: {e}")
        continue

    # The Laplace noise term is currently commented out, so epsilon and
    # sensitivity have no effect on the result.
    # NOTE(review): if it is re-enabled, noisy distances <= 0 are clamped to
    # min_noisy_distance and therefore receive by far the largest weight.
    noisy_distance = true_distance # + np.random.laplace(loc=0, scale=sensitivity / epsilon)

    # A NaN/inf distance would turn every normalized weight into NaN and make
    # the whole allocation fail, so drop the offending dataset instead.
    if not np.isfinite(noisy_distance):
        print(f"skip {file} due to error: non-finite distance {noisy_distance}")
        continue

    distances.append((file, gen_data, max(noisy_distance, min_noisy_distance)))

if not distances:
    raise ValueError("no generated dataset could be scored; nothing to sample from")

# Inverse-distance weights, normalized to sum to 1: closer datasets contribute
# more rows.
weights = np.array([1 / distance for _, _, distance in distances])
weights /= weights.sum()

# Target size of the combined sample: the row count of the first generated dataset.
sample_size = len(generated_datasets[0][1])

# Largest-remainder allocation: give every dataset the floor of its expected
# share, then hand the rows still missing to the datasets with the largest
# fractional parts. Unlike rounding each share independently, the counts are
# guaranteed to add up to exactly sample_size.
expected_counts = weights * sample_size
sample_counts = np.floor(expected_counts).astype(int)
shortfall = sample_size - int(sample_counts.sum())
if shortfall > 0:
    remainders = expected_counts - sample_counts
    top_up = np.argsort(-remainders, kind="stable")[:shortfall]
    sample_counts[top_up] += 1

# Draw each dataset's share with replacement, so a share may exceed the
# number of rows the dataset actually has.
sampled_data = [
    gen_data.sample(n=int(num_samples), replace=True)
    for (_, gen_data, _), num_samples in zip(distances, sample_counts)
]

final_sampled_dataset = pd.concat(sampled_data, ignore_index=True)

print("final sampled dataset size:", len(final_sampled_dataset))
print("dist of samples per dataset:", [len(data) for data in sampled_data])

# Observation: the per-dataset sample counts come out close to uniform most of the time.

final sampled dataset size: 1000
dist of samples per dataset: [79, 73, 79, 85, 88, 91, 71, 81, 84, 84, 102, 83]


In [24]:
# Persist the combined sample as CSV in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
final_sampled_dataset.to_csv('final_sampled_dataset.csv', index=False)