# Environment Setup
環境設定

In [14]:
import os
import requests
from pathlib import Path


# Branch of the nics-tw/petsard repository to pull demo assets from; default is main.
branch = "main"

# Treat the runtime as Google Colab when the COLAB_GPU environment variable is set.
is_colab = "COLAB_GPU" in os.environ

if is_colab:
    # Download the demo helper module (utils.py) for the chosen branch so that
    # the next cell can import it.
    utils_url = (
        f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    )
    # Bound the request: without a timeout, requests may wait indefinitely on a
    # stalled connection and hang the notebook cell.
    response = requests.get(utils_url, timeout=30)

    # Fail loudly on any non-200 answer instead of saving an error page as utils.py.
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download utils.py. Status code: {response.status_code}"
        )

    # Save the module with an explicit encoding so non-ASCII characters in the
    # source do not depend on the platform's default locale encoding.
    Path("utils.py").write_text(response.text, encoding="utf-8")

    # Create an empty __init__.py next to the downloaded module.
    Path("__init__.py").touch()

In [None]:
# Now import and run the setup
from utils import (
    get_yaml_path,
    setup_environment,
)


# Run the demo's environment setup with the Colab flag and branch chosen above,
# requesting the single benchmark dataset "adult-income".
setup_environment(is_colab, branch, benchmark_data=["adult-income"])

In [16]:
from petsard import Executor

# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

## Default Synthesis and Default Evaluation
預設合成與預設評測

In [17]:
# File name of the YAML configuration used for this case.
yaml_file_case: str = "default-synthesis-default-evaluation.yaml"

# Ask the demo helper for the path of that YAML file; it is given the Colab
# flag and the branch so it can locate the file for the current runtime
# (presumably local checkout vs. downloaded copy -- confirm in utils.py).
yaml_path_case: str = get_yaml_path(
    branch=branch,
    yaml_file=yaml_file_case,
    is_colab=is_colab,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Splitter:
  demo:
    num_samples: 1
    train_split_ratio: 0.8
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  demo:
    method: 'default'
Postprocessor:
  demo:
    method: 'default'
Evaluator:
  demo-diagnostic:
    method: 'sdmetrics-diagnosticreport'
  demo-quality:
    method: 'sdmetrics-qualityreport'
  demo-singlingout:
    method: 'anonymeter-singlingout'
  demo-linkability:
    method: 'anonymeter-linkability'
    aux_cols:
      -
        - 'age'
        - 'marital-status'
        - 'relationship'
        - 'gender'
      -
        - 'workclass'
        - 'educational-num'
        - 'occupation'
        - 'income'
  demo-inference:
    method: 'anonymeter-inference'
    secret: 'income'
  demo-classification:
    method: 'mlutility-classification'
    target: 'income'
Reporter:
  output:
    method: 'save_data'
    source: 'Synthesizer'
  save_report_global:
    method: 'save_rep

### Execution and Results
執行與結果

In [18]:
# Build a PETsARD Executor from the YAML configuration path and run it.
# Later cells read the results from `exec_case`.
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 1154.23it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 1328.15it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 132.11it/s]|
Column Shapes Score: 68.53%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 332.24it/s]|
Column Pair Trends Score: 55.3%

Overall Score (Average): 61.91%

Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...


Found 1659 failed queries out of 2000. Check DEBUG messages for more details.
Reached maximum number of attempts 500000 when generating singling out queries. Returning 103 instead of the requested 2000.To avoid this, increase the number of attempts or set it to ``None`` to disable The limitation entirely.
Attack `multivariate` could generate only 103 singling out queries out of the requested 2000. This can probably lead to an underestimate of the singling out risk.


Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...


In [19]:
# Display the report object kept on the executor's status after the run.
# NOTE(review): the displayed value is a dict whose entries include full
# DataFrames, which makes the cell output very long -- consider showing only
# the keys or a .head() of each entry.
exec_case.status.get_report()

{'Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo]':        age     workclass  fnlwgt     education  educational-num  \
 0       38             ?  152837  Some-college               12   
 1       64             ?  160293          11th               10   
 2       25  Self-emp-inc  255388          12th                8   
 3       22     State-gov  145029  Some-college               12   
 4       25     Local-gov  208350  Some-college                5   
 ...    ...           ...     ...           ...              ...   
 39068   33     Local-gov  193139  Some-college               13   
 39069   25     Local-gov  370843  Some-college               12   
 39070   59  Self-emp-inc  100188  Some-college                6   
 39071   71     Local-gov   67851     Bachelors                8   
 39072   47             ?  112905     Preschool                9   
 
            marital-status         occupation    relationship   race  gender  \
 0      Married-civ-spouse  

In [20]:
# Display the result of the Evaluator operator stored on the executor's status.
# NOTE(review): this indexes the status object's internal `status` mapping
# directly; presumably it holds the most recently executed evaluator -- confirm
# against the petsard Status implementation.
exec_case.status.status["Evaluator"]["operator"].get_result()

{'global':    ori_mean  ori_std  syn_mean  syn_std  diff
 0      0.86     0.01      0.79     0.05 -0.07,
 'details': {'ori': {'logistic_regression': 0.8567918927218753,
   'svc': 0.8551540587572934,
   'random_forest': 0.8601699252738254,
   'gradient_boosting': 0.8690756474562391},
  'syn': {'logistic_regression': 0.7058040741119869,
   'svc': 0.8155389497389702,
   'random_forest': 0.8115467294503019,
   'gradient_boosting': 0.8205548162555021}}}

In [21]:
# Full experiment name identifying the Reporter[save_report_global] entry of
# the run that ends with the demo-classification evaluator.
report_key = (
    "Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Evaluator[demo-classification]_Reporter[save_report_global]"
)

# Pick the "[global]" table out of that entry and show it as the cell output.
global_report = exec_case.get_result()[report_key]["[global]"]
global_report

Unnamed: 0,full_expt_name,Loader,Splitter,Preprocessor,Synthesizer,Postprocessor,Evaluator,demo-diagnostic_Score,demo-diagnostic_Data Validity,demo-diagnostic_Data Structure,...,demo-inference_attack_rate_err,demo-inference_baseline_rate,demo-inference_baseline_rate_err,demo-inference_control_rate,demo-inference_control_rate_err,demo-classification_ori_mean,demo-classification_ori_std,demo-classification_syn_mean,demo-classification_syn_std,demo-classification_diff
0,Loader[data]_Splitter[demo_[1-1]]_Preprocessor...,data,demo_[1-1],demo,demo,demo,[global],1.0,,,...,0.02,0.6,0.02,0.66,0.02,0.86,0.01,0.79,0.05,-0.07
