# Analysis of Conditional Means Release

In [1]:
import json
import os
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
from diffprivlib.mechanisms import Laplace

sys.path.insert(0, "..")

from synthflow.__main__ import generate
from synthflow.birth import (
    BOUNDARIES,
    CONDITIONAL_MEAN_EPSILONS,
    EVALUATION_COLUMN_BINS,
    THRESHOLD_ALL_K_WAYS_MAX_ABS_DIFF,
    USER_ANALYSIS,
    USER_ANALYSIS_BY_METRIC,
)
from synthflow.evaluation import numerify
from synthflow.evaluation.utility.analysis import _bin_column, _get_groupby_2d
from synthflow.evaluation.utility.metrics import _compute_mean_resized_by_second
from tests.utils import GEN_CONFIG_PATH, REAL_DATA_200K_PATH, REAL_DATASET_SCHEMA

# Keep Weights & Biases local and quiet while this notebook runs.
os.environ["WANDB_MODE"] = "offline"
# Fixed spelling: the variable is WANDB_SILENT. The previous "WANB_SILENT" key
# is not a wandb setting, so the silence flag never took effect.
os.environ["WANDB_SILENT"] = "true"
# Path handed to the project through PRIVBAYES_BIN (presumably where the
# PrivBayes binary lives — confirm). A value already exported in the
# environment wins, so the notebook is not tied to one machine's layout; the
# previous hardcoded location remains the fallback.
os.environ.setdefault("PRIVBAYES_BIN", "/Users/shlomi/projects/synthflow/privbayes")
!wandb offline

W&B offline. Running your script from this directory will only write metadata locally. Use wandb disabled to completely turn off W&B.


In [2]:
# Load the real dataset and validate it against the project schema.
real_df = pd.read_csv(REAL_DATA_200K_PATH)
real_df = REAL_DATASET_SCHEMA.validate(real_df)

# Draw the working subsample without replacement. The seed pins which rows are
# drawn, so Restart & Run All starts from the same input every time.
# NOTE(review): `generate` may still be stochastic on its own (not visible from
# this notebook), so downstream numbers can still vary between runs.
SAMPLE_SIZE = 160000
SAMPLE_SEED = 0
real_df = real_df.sample(SAMPLE_SIZE, replace=False, random_state=SAMPLE_SEED)

with open(GEN_CONFIG_PATH) as f:
    gen_config = json.load(f)

# `generate` returns six values; only the 1st (`synth_df`) and the 4th
# (`processed_df`) are used in this notebook.
synth_df, _, _, processed_df, _, _ = generate(
    real_df, gen_config, REAL_DATASET_SCHEMA, [], ubersampling=1.1
)

# Numerified versions of both frames, used by the binning/group-by cells below.
numerified_processed_df = numerify(processed_df)
numerified_synth_df = numerify(synth_df)

real
30	


In [3]:
# Per-column bin counts of the processed data: bin every column with the
# evaluation bins and keep only the counts (in value_counts order).
# Column names are taken from synth_df.
processed_one_way_eval_bin_counts = {}
for column_name in synth_df.columns:
    binned_column = _bin_column(
        numerified_processed_df, column_name, EVALUATION_COLUMN_BINS
    )
    processed_one_way_eval_bin_counts[column_name] = binned_column.value_counts().values

In [4]:
# Row-count margin: the all-k-ways threshold scaled by the number of processed
# rows, rounded up, plus one.
scaled_threshold = THRESHOLD_ALL_K_WAYS_MAX_ABS_DIFF * len(processed_df)
margin = int(np.ceil(scaled_threshold) + 1)

In [5]:
# For every column, the percentage of each bin that remains after taking
# `margin` rows away from it, rounded to whole percents.
{
    column_name: np.round(100 * (1 - margin / bin_counts)).astype(int)
    for column_name, bin_counts in processed_one_way_eval_bin_counts.items()
}

{'mother_age': array([97, 97, 96, 95]),
 'parity': array([98, 97, 92]),
 'gestation_week': array([99, 90]),
 'is_female': array([98, 98]),
 'date_of_birth': array([89, 89, 88, 88, 88, 88, 88, 88, 88, 88, 87, 86]),
 'birth_weight': array([99, 87, 85])}

In [6]:
# Per-column bin counts of the synthetic data (in value_counts order).
synth_one_way_eval_bin_counts = {}
for column_name in synth_df.columns:
    binned_column = _bin_column(
        numerified_synth_df, column_name, EVALUATION_COLUMN_BINS
    )
    synth_one_way_eval_bin_counts[column_name] = binned_column.value_counts().values

# Smallest bin count of every column.
min_one_way_eval_bin_counts = {}
for column_name, bin_counts in synth_one_way_eval_bin_counts.items():
    min_one_way_eval_bin_counts[column_name] = bin_counts.min()

# Map every target of a "mean" analysis to the columns it is conditioned on.
conditioning = defaultdict(list)
for analysis in USER_ANALYSIS:
    if analysis.metric != "mean":
        continue
    conditioning[analysis.target].append(analysis.by)

# Per target: the smallest bin count over all of its conditioning columns.
mean_by_second_sensitivities = {}
for target, by_columns in conditioning.items():
    mean_by_second_sensitivities[target] = min(
        min_one_way_eval_bin_counts[by] for by in by_columns
    )

In [7]:
# Display the mapping built above: target column -> list of its "by" columns.
conditioning

defaultdict(list,
            {'birth_weight': ['is_female',
              'parity',
              'gestation_week',
              'mother_age'],
             'gestation_week': ['parity', 'mother_age'],
             'parity': ['mother_age']})

In [8]:
# Every column that is used as a conditioning ("by") column by at least one
# target. A set comprehension flattens the per-target lists in a single pass
# instead of repeatedly concatenating lists with sum(..., []).
conditioning_by_columns = {
    by_column for by_columns in conditioning.values() for by_column in by_columns
}

conditioning_by_columns

{'gestation_week', 'is_female', 'mother_age', 'parity'}

In [9]:
# Take entry 2 of USER_ANALYSIS_BY_METRIC and fail fast if its first analysis
# is not a "mean-resize-by-second" one.
_MEAN_RESIZE_BY_SECOND_ENTRY = 2
USER_ANALYSIS_MEAN_RESIZE_BY_SECOND = USER_ANALYSIS_BY_METRIC[
    _MEAN_RESIZE_BY_SECOND_ENTRY
]
_first_analysis = USER_ANALYSIS_MEAN_RESIZE_BY_SECOND[0]
assert _first_analysis.metric == "mean-resize-by-second"

In [10]:
# max_diff_results[target][by] -> list of "max/diff" values, one per repetition.
max_diff_results = defaultdict(lambda: defaultdict(list))
# sizes_results[target] -> one "min/size_by_second" value per analysis.
sizes_results = defaultdict(list)

for an in USER_ANALYSIS_MEAN_RESIZE_BY_SECOND:
    # Group both frames the same way: target by conditioning column, using the
    # evaluation bins.
    processed_grp2d, synth_grp2d = (
        _get_groupby_2d(frame, an.target, an.by, EVALUATION_COLUMN_BINS)
        for frame in (numerified_processed_df, numerified_synth_df)
    )

    for repetition in range(100):
        result = _compute_mean_resized_by_second(processed_grp2d, synth_grp2d)
        max_diff_results[an.target][an.by].append(result["max/diff"])

        # The size is recorded once per analysis, from the first repetition.
        if repetition == 0:
            sizes_results[an.target].append(result["min/size_by_second"])

# Per target: the smallest recorded size over all of its analyses.
sensitivity_results = {}
for column, sizes in sizes_results.items():
    sensitivity_results[column] = min(sizes)

In [11]:
# Smallest conditioning-bin size per target, taken straight from the synthetic
# one-way bin counts, i.e. WITHOUT taking into account the limit imposed by the
# 1st acceptance criterion.
print(f"{mean_by_second_sensitivities=}")

# Smallest "min/size_by_second" per target as reported by
# `_compute_mean_resized_by_second`, i.e. WITH that limit taken into account.
print(f"{sensitivity_results=}")

mean_by_second_sensitivities={'birth_weight': 16600, 'gestation_week': 20460, 'parity': 29873}
sensitivity_results={'birth_weight': 14999, 'gestation_week': 18859, 'parity': 28272}


In [12]:
# Standard deviation, per target, of the "max/diff" values pooled over all of
# the target's conditioning columns and repetitions. The nested comprehension
# flattens the per-column lists in one pass (same values, same order) instead
# of repeatedly concatenating lists with sum(..., []).
{
    target: np.std([diff for diffs in by_results.values() for diff in diffs])
    for target, by_results in max_diff_results.items()
}

{'birth_weight': 16.104435883644555,
 'gestation_week': 0.02190815584289972,
 'parity': 0.0012355979318248163}

In [13]:
# For every column with a conditional-mean epsilon, print the variance of a
# Laplace mechanism whose sensitivity is the column's value range divided by
# the smallest group size recorded for that column.
# NOTE(review): this prints a variance, while the earlier cell reports np.std
# of the max diffs — confirm the two are being compared in the same units.
for column, epsilon in CONDITIONAL_MEAN_EPSILONS.items():
    column_bounds = BOUNDARIES[column]
    value_range = column_bounds[1] - column_bounds[0]
    sensitivity = value_range / sensitivity_results[column]

    mech = Laplace(epsilon=epsilon, delta=0, sensitivity=sensitivity)
    variance = mech.variance(None)

    print(column, variance)

parity 0.0025021706831110115
birth_weight 3.149977845406782
gestation_week 0.0031631166006232756
