# Environment setting
環境設定

In [1]:
import os
import requests
import sys
from pathlib import Path
from pprint import pprint
from typing import Optional

import pandas as pd


# Git branch of the PETsARD repository that demo assets are fetched from
# (default: main).
branch: str = "main"

# Demo subfolder this notebook belongs to; None means the demo root
# (petsard/demo/).
subfolder: Optional[str] = "use-cases"


# Treat the presence of COLAB_GPU in the environment as "running in Google
# Colab". In that case utils.py is downloaded from GitHub into the working
# directory so that it can be imported by the next cell.
is_colab: bool = "COLAB_GPU" in os.environ
if is_colab:
    utils_url = (
        f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    )
    # Bound the request: without a timeout an unreachable host would hang
    # this cell indefinitely.
    response = requests.get(utils_url, timeout=30)

    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download utils.py. Status code: {response.status_code}"
        )

    # Write explicitly as UTF-8 so the saved module does not depend on the
    # platform's default text encoding.
    with open("utils.py", "w", encoding="utf-8") as f:
        f.write(response.text)

    Path("__init__.py").touch()

# Not on Colab and the notebook sits in a subfolder: make the parent of the
# current working directory importable (presumably where utils.py lives).
else:
    if subfolder:
        parent_dir = os.path.dirname(os.getcwd())
        # Guard against stacking duplicate entries when the cell is re-run.
        if parent_dir not in sys.path:
            sys.path.append(parent_dir)

In [None]:
# Import the shared demo helpers from utils.py.
from utils import get_yaml_path, setup_environment

# Run the shared setup helper with the settings chosen above; the only
# benchmark dataset requested for this notebook is "adult-income".
setup_environment(
    is_colab, branch, benchmark_data=["adult-income"], subfolder=subfolder
)

In [3]:
from petsard import Executor

# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

## Data Constraining
資料約束

In [4]:
# File name of the YAML configuration demonstrated in this section.
yaml_file_case: str = "data-constraining.yaml"

# Resolve the configuration's path through the shared helper, reusing the
# Colab / branch / subfolder settings defined at the top of the notebook.
yaml_path_case: str = get_yaml_path(
    yaml_file=yaml_file_case,
    is_colab=is_colab,
    branch=branch,
    subfolder=subfolder,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  demo:
    method: 'default'
Postprocessor:
  demo:
    method: 'default'
Constrainer:
  demo:
    nan_groups:
      # Delete entire row when workclass is NA
      workclass: 'delete'
      # Set income to NA if occupation is NA
      occupation:
        'erase':
          - 'income'
      # Copy educational-num value to age when educational-num exists but age is NA
      age:
        'copy':
          'educational-num'
    field_constraints:
      - "age >= 18 & age <= 65"
      - "hours-per-week >= 20 & hours-per-week <= 60"
    field_combinations:
      -
        - education: income
        - Doctorate: ['>50K']
          Masters: ['>50K', '<=50K']
    field_proportions:
      # Maintain education distribution with default tolerance (10%)
      - fields: 'education'
        mode: 'all'
        # tolerance 使用預設值 0.1 (10%)
      # Maintain inc

### Execution and Result
執行與結果

In [5]:
# Build an Executor from the YAML configuration path resolved above, then
# run it. The result is read back from `exec_case` in the following cell.
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Now is petsard_Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo] save to csv...


In [6]:
# The result mapping is indexed twice: first by the full pipeline key ending
# in the Reporter step, then by the same pipeline key without that suffix.
reporter_key = "Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]_Reporter[output]"
experiment_key = "Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]"

# Bare last expression so the notebook renders the selected result.
exec_case.get_result()[reporter_key][experiment_key]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,41.620393,Private,234024.684688,Assoc-acdm,15.193738,Divorced,Exec-managerial,Not-in-family,Black,Male,0.0,0.0,41.085963,United-States,>50K
1,47.542178,Private,121589.788252,11th,12.563175,Married-civ-spouse,Sales,Not-in-family,White,Male,0.0,0.0,38.673017,United-States,>50K
2,48.251812,Private,206989.912542,HS-grad,13.533538,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,35.606627,United-States,<=50K
3,29.143875,Private,69550.968183,Some-college,14.547483,Never-married,Craft-repair,Wife,White,Male,0.0,0.0,36.499117,United-States,<=50K
4,22.702130,Local-gov,327252.363876,HS-grad,8.532593,Married-civ-spouse,Farming-fishing,Own-child,White,Female,0.0,0.0,39.684126,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28553,27.935229,Private,237078.144824,HS-grad,11.576392,Never-married,Farming-fishing,Husband,Amer-Indian-Eskimo,Male,0.0,0.0,34.686904,South,<=50K
28554,26.010563,Private,107642.289270,HS-grad,8.935934,Married-civ-spouse,Exec-managerial,Not-in-family,White,Male,0.0,0.0,45.789854,United-States,<=50K
28555,27.260095,State-gov,371555.500327,Some-college,9.719323,Married-civ-spouse,Sales,Husband,White,Male,0.0,0.0,48.229105,United-States,<=50K
28556,26.693570,Federal-gov,114947.524136,Some-college,12.001313,Never-married,Prof-specialty,Husband,White,Female,0.0,0.0,33.310773,United-States,>50K
