# Environment Setup / 環境設定
Specify Data Schema / 指定資料表架構

In [18]:
import os
import sys
from pathlib import Path

# Auto-load the demo `utils` helpers, then run the quick setup.
if "COLAB_GPU" in os.environ:
    # Colab: utils.py is not on disk, so fetch it from GitHub.
    # The built-in open() only accepts filesystem paths and raises
    # FileNotFoundError for a URL, so the download goes through urllib.
    from urllib.request import urlopen

    url = "https://raw.githubusercontent.com/nics-tw/petsard/main/demo/utils.py"
    with urlopen(url, timeout=30) as response:
        utils_source = response.read().decode("utf-8")
    # NOTE(review): exec() runs remotely hosted code inside this kernel, and
    # the URL tracks the `main` branch; consider pinning to a tag or commit.
    exec(utils_source)
else:
    # Silently search the working directory and up to four of its ancestors
    # for utils.py, and put the first directory that has it on sys.path.
    current = Path.cwd()
    for _ in range(5):
        if (current / "utils.py").exists():
            sys.path.insert(0, str(current))
            break
        current = current.parent

    # Import the utils module (raises ModuleNotFoundError if the search
    # above found nothing and utils is not otherwise importable).
    from utils import quick_setup

# Quick setup: returns the Colab flag, the branch, and the YAML paths
# (indexed per file in the later cells).
is_colab, branch, yaml_path = quick_setup(
    yaml_file=[
        "specify-schema.yaml",
        "specify-schema-external.yaml",
        "specify-schema-comprehensive.yaml",
    ],
    benchmark_data=[
        "adult-income",
        "adult-income_ori",
        "adult-income_control",
        "adult-income_syn",
    ],
    branch="main",  # optional; defaults to "main"
)

from petsard import Executor

🚀 PETsARD v1.5.1
📅 2025-08-04 11:50:11 UTC+8
📁 Subfolder: tutorial/use-cases
📄 YAML path (1/3): petsard/demo/tutorial/use-cases/specify-schema.yaml
📄 YAML path (2/3): petsard/demo/tutorial/use-cases/specify-schema-external.yaml
📄 YAML path (3/3): petsard/demo/tutorial/use-cases/specify-schema-comprehensive.yaml
⚙️ Configuration content (1/3) - specify-schema.yaml:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
  data-w-schema:
    filepath: 'benchmark/adult-income.csv'
    schema:
      # 全域參數
      optimize_dtypes: true
      nullable_int: 'force'

      # 欄位參數
      fields:
        # 數值型欄位
        age:
          type: 'int'
        fnlwgt:
          type: 'int'
        # 字串型欄位
        gender:
          type: 'str'
          category_method: 'force'
        # 含自訂缺失值的欄位
        native-country:
          type: 'str'
          na_values: '?'
        workclass:
          type: 'str'
          na_values: '?'
        occupation:
          type: 'str'
          na_values: '?'

# Execution and Results / 執行與結果

In [19]:
import pandas as pd


def specify_schema_loader_demo(exec_case: Executor) -> pd.DataFrame:
    """Compare column-wise summaries loaded without and with a schema.

    Reads the ``[columnwise]`` report of the ``data`` and ``data-w-schema``
    experiments from ``exec_case.get_result()`` and left-joins them on
    ``column``. Only the with-schema report is filtered down to rows whose
    ``Loader`` equals ``"data-w-schema"``; the plain report is used as is.

    Args:
        exec_case: An executor that has already been run, so that
            ``get_result()`` holds both experiment entries.

    Returns:
        A DataFrame with ``column`` plus ``summary_na_count`` and
        ``summary_nunique`` for each side, suffixed ``_data`` and
        ``_data_w_schema`` respectively.
    """
    compared_columns = ["column", "summary_na_count", "summary_nunique"]
    plain_key = "Loader[data]_Describer[summary]_Reporter[save_report_columnwise]"
    schema_key = (
        "Loader[data-w-schema]_Describer[summary]_Reporter[save_report_columnwise]"
    )

    plain_report = exec_case.get_result()[plain_key]["[columnwise]"].copy()
    schema_report = exec_case.get_result()[schema_key]["[columnwise]"].copy()

    # Keep only the rows produced by the schema-aware loader.
    is_schema_row = schema_report["Loader"] == "data-w-schema"
    schema_report = schema_report.loc[is_schema_row, :]

    # Left join: every column of the plain report is kept.
    return plain_report[compared_columns].merge(
        schema_report[compared_columns],
        on=["column"],
        how="left",
        suffixes=("_data", "_data_w_schema"),
    )


## specify-schema.yaml

In [20]:
# Print the file name of the first YAML config, then build an Executor from
# that config and run it. `exec_case` is reused by the comparison cell below.
print(f"YAML Path: {Path(yaml_path[0]).name}")
exec_case = Executor(config=yaml_path[0])
exec_case.run()

YAML Path: specify-schema.yaml
Now is petsard[Report]_[columnwise] save to csv...
Now is petsard[Report]_[columnwise] save to csv...


In [21]:
# Show NA counts and unique-value counts per column, without vs. with the schema.
specify_schema_loader_demo(exec_case)

Unnamed: 0,column,summary_na_count_data,summary_nunique_data,summary_na_count_data_w_schema,summary_nunique_data_w_schema
0,age,0.0,,0.0,
1,fnlwgt,0.0,,0.0,
2,educational-num,0.0,,0.0,
3,capital-gain,0.0,,0.0,
4,capital-loss,0.0,,0.0,
5,hours-per-week,0.0,,0.0,
6,workclass,0.0,9.0,2799.0,8.0
7,education,0.0,16.0,0.0,16.0
8,marital-status,0.0,7.0,0.0,7.0
9,occupation,0.0,15.0,2809.0,14.0


## specify-schema-external.yaml

In [22]:
# Print the file name of the second YAML config, then build an Executor from
# that config and run it. This rebinds `exec_case` from the previous section.
print(f"YAML Path: {Path(yaml_path[1]).name}")
exec_case = Executor(config=yaml_path[1])
exec_case.run()

YAML Path: specify-schema-external.yaml
Now is petsard[Report]_[columnwise] save to csv...
Now is petsard[Report]_[columnwise] save to csv...


In [23]:
# Same comparison as above, now for the run driven by the second YAML config.
specify_schema_loader_demo(exec_case)

Unnamed: 0,column,summary_na_count_data,summary_nunique_data,summary_na_count_data_w_schema,summary_nunique_data_w_schema
0,age,0.0,,0.0,
1,fnlwgt,0.0,,0.0,
2,educational-num,0.0,,0.0,
3,capital-gain,0.0,,0.0,
4,capital-loss,0.0,,0.0,
5,hours-per-week,0.0,,0.0,
6,workclass,0.0,9.0,2799.0,8.0
7,education,0.0,16.0,0.0,16.0
8,marital-status,0.0,7.0,0.0,7.0
9,occupation,0.0,15.0,2809.0,14.0


## specify-schema-comprehensive.yaml

In [24]:
# Print the file name of the third YAML config, then build an Executor from
# that config and run it. This rebinds `exec_case` once more.
print(f"YAML Path: {Path(yaml_path[2]).name}")
exec_case = Executor(config=yaml_path[2])
exec_case.run()

YAML Path: specify-schema-comprehensive.yaml
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 442.78it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 1355.19it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Now is petsard[Report]_[global] save to csv...
Now is petsard[Report]_[columnwise] save to csv...
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 94.39it/s]|
Column Shapes Score: 95.27%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 140.25it/s]|
Column Pair Trends Score: 61.56%

Overall Score (Average): 78.42%

Now is petsard[Report]_[global] save to csv...
Now is petsard[Report]_[columnwise] save to csv...


In [25]:
# Display the "[global]" report table stored under the demo-quality
# experiment key of the run above.
exec_case.get_result()[
    "Splitter[custom_[1-1]]_Synthesizer[custom]_Evaluator[demo-quality]_Reporter[rpt_global]"
]["[global]"]

Unnamed: 0,full_expt_name,Splitter,Synthesizer,Evaluator,demo-diagnostic_Score,demo-diagnostic_Data Validity,demo-diagnostic_Data Structure,demo-quality_Score,demo-quality_Column Shapes,demo-quality_Column Pair Trends
0,Splitter[custom_[1-1]]_Synthesizer[custom]_Eva...,custom_[1-1],custom,[global],1.0,1.0,1.0,0.78,0.95,0.62
