# Environment Setup
環境設定

In [1]:
import os
import requests
from pathlib import Path


# Git branch of the PETsARD repository that demo assets are fetched from;
# defaults to 'main'.
branch = 'main'

# Treat the session as Google Colab when the COLAB_GPU environment variable
# is present.
is_colab = 'COLAB_GPU' in os.environ

if is_colab:
    # Fetch the demo helper module (utils.py) for the selected branch
    # directly from GitHub so that it can be imported in the next cell.
    utils_url = f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    # A timeout keeps this cell from hanging forever on a stalled connection.
    response = requests.get(utils_url, timeout=30)

    if response.status_code == 200:
        # Write as UTF-8 explicitly so any non-ASCII text in the downloaded
        # file is saved correctly regardless of the platform default encoding.
        with open('utils.py', 'w', encoding='utf-8') as f:
            f.write(response.text)

        # Create an empty __init__.py next to the downloaded module.
        Path('__init__.py').touch()
    else:
        # Fail loudly: the following cells cannot work without utils.py.
        raise RuntimeError(f"Failed to download utils.py. Status code: {response.status_code}")

In [2]:
# Now import and run the setup
from utils import (
    get_yaml_path,
    setup_environment,
)


# Run the demo helper's environment setup for this session, requesting the
# 'adult-income' benchmark dataset used by the configuration below.
setup_environment(is_colab, branch, benchmark_data=['adult-income'])

Looking in links: /var/folders/lb/lwjvbr314wj7bt1k63plw4bw0000gn/T/tmpsve1yfqq
Obtaining file:///Users/justyn.chen/Dropbox/310_Career_%E5%B7%A5%E4%BD%9C/20231016_NICS_%E8%B3%87%E5%AE%89%E9%99%A2/41_PETsARD/petsard
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: petsard
  Building editable for petsard (pyproject.toml): started
  Building editable for petsard (pyproject.toml): finished with status 'done'


In [3]:
from petsard import Executor

# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

## Data Constraining
資料約束

In [4]:
# Name of the demo configuration for this case, and the path resolved for it
# by the demo helper (for the current runtime and branch).
yaml_file_case: str = 'data-constraining.yaml'
yaml_path_case: str = get_yaml_path(is_colab=is_colab, yaml_file=yaml_file_case, branch=branch)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  demo:
    method: 'default'
Postprocessor:
  demo:
    method: 'default'
Constrainer:
  demo:
    nan_groups:
      # Delete entire row when workclass is NA
      workclass: 'delete'
      # Set income to NA if occupation is NA
      occupation:
        'erase':
          - 'income'
      # Copy educational-num value to age when educational-num exists but age is NA
      age:
        'copy':
          'educational-num'
    field_constraints:
      - "age >= 18 & age <= 65"
      - "hours-per-week >= 20 & hours-per-week <= 60"
    field_combinations:
      -
        - {'education': 'income'}
        - {'Doctorate': ['>50K'], 'Masters': ['>50K', '<=50K']}
Reporter:
  output:
    method: 'save_data'
    source: 'Constrainer'
...


### Execution and Results
執行與結果

In [5]:
# Build an Executor from the YAML configuration resolved above and run it.
# `exec_case` is kept so that the results can be retrieved in a later cell.
exec_case = Executor(config=yaml_path_case)
exec_case.run()



Synthesizer (SDV): Fitting GaussianCopula.
Synthesizer (SDV): Fitting GaussianCopula spent 2.1887 sec.


INFO:root:age changes data dtype from float64 to int8 for metadata alignment.
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:fnlwgt changes data dtype from float64 to int32 for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:educational-num changes data dtype from float64 to int8 for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:gender changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:capi

Synthesizer (SDV): Sampling GaussianCopula # 48842 rows (same as Loader data) in 0.6137 sec.
Now is petsard_Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo] save to csv...


In [6]:
# The result mapping is indexed twice: first by the full experiment name
# (ending in the Reporter step), then by the pipeline name up to Constrainer.
reporter_key = 'Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]_Reporter[output]'
constrained_key = 'Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]'

# Leave the lookup as the last expression so the notebook displays it.
exec_case.get_result()[reporter_key][constrained_key]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,43,Private,84678,Some-college,16,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,45,United-States,<=50K
1,21,Private,82688,HS-grad,11,Married-civ-spouse,Transport-moving,Not-in-family,White,Male,0,0,42,United-States,>50K
2,32,Self-emp-not-inc,38359,11th,7,Never-married,Machine-op-inspct,Own-child,White,Female,0,0,38,United-States,<=50K
3,47,Private,208752,Some-college,10,Never-married,Machine-op-inspct,Husband,White,Male,0,0,43,United-States,<=50K
4,51,State-gov,235015,Bachelors,11,Separated,Prof-specialty,Not-in-family,White,Female,0,0,34,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47501,39,Private,278804,Some-college,8,Married-civ-spouse,?,Own-child,White,Male,0,0,39,United-States,<=50K
47502,18,Self-emp-inc,164515,Some-college,13,Married-civ-spouse,Transport-moving,Not-in-family,White,Female,0,0,33,United-States,<=50K
47503,27,Self-emp-not-inc,329545,HS-grad,10,Never-married,Craft-repair,Husband,White,Male,0,0,47,Mexico,<=50K
47504,53,State-gov,139329,Some-college,12,Married-civ-spouse,?,Husband,White,Male,0,0,37,United-States,>50K
