# Environment setup / 環境設定
Best practices: Categorical variables - Uniform encoding / 最佳實踐：類別變項 - 均勻編碼

In [1]:
import os
import sys
from pathlib import Path

# Auto-load the demo helper module (utils.py).
if "COLAB_GPU" in os.environ:
    # On Colab, fetch utils.py over HTTPS. The built-in open() only accepts
    # local file paths, so the download has to go through urlopen().
    from urllib.request import urlopen

    url = "https://raw.githubusercontent.com/nics-tw/petsard/main/demo/utils.py"
    with urlopen(url) as response:
        utils_source = response.read().decode("utf-8")
    # NOTE(review): this executes remotely fetched code in the notebook
    # namespace; consider pinning the URL to a commit instead of `main`.
    exec(utils_source)
else:
    # Silently search the working directory and up to four of its parents
    # for utils.py, and make the first match importable.
    current = Path.cwd()
    for _ in range(5):
        if (current / "utils.py").exists():
            sys.path.insert(0, str(current))
            break
        current = current.parent

    # Import the helper from the utils module located above.
    from utils import quick_setup

# Quick setup: returns the Colab flag, the branch in use and the YAML path.
is_colab, branch, yaml_path = quick_setup(
    yaml_file="categorical.yaml",
    benchmark_data=[
        "best-practices_categorical_high-cardinality",
    ],
    branch="main",  # optional; defaults to "main"
)

# NOTE(review): imported after quick_setup(), presumably because the setup
# step prepares the environment petsard is loaded from — confirm before moving.
from petsard import Executor

🚀 PETsARD v1.5.1
📅 2025-07-31 13:42:43 UTC+8
📁 Subfolder: best-practices
📄 YAML path: petsard/demo/best-practices/categorical.yaml
⚙️ Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/best-practices_categorical_high-cardinality.csv'
    column_types:
      category:
        - university_code
        - college_code
        - department_code
Preprocessor:
  encoding_uniform:
    sequence:
      - 'encoder'
    encoder:
      birth_year: 'encoding_uniform'
      birth_month: 'encoding_uniform'
      birth_day: 'encoding_uniform'
      zodiac: 'encoding_uniform'
      university_code: 'encoding_uniform'
      university: 'encoding_uniform'
      college_code: 'encoding_uniform'
      college: 'encoding_uniform'
      deparment_code: 'encoding_uniform'
      department_name: 'encoding_uniform'
      admission_type_code: 'encoding_uniform'
      admission_type: 'encoding_uniform'
      disabled_code: 'encoding_uniform'
      disabled_type: 'encoding_uniform'
      nationali

In [2]:
from pprint import pprint

import pandas as pd


# Load the raw benchmark table exactly as pandas infers it.
students = pd.read_csv("benchmark/best-practices_categorical_high-cardinality.csv")


# Summarise every column: number of distinct values and the inferred dtype.
students_columns = {
    name: {
        "cardinality": students[name].nunique(),
        "dtype": str(students[name].dtype),
    }
    for name in students.columns
}

print("columns: ")
pprint(students_columns)

columns: 
{'admission_type': {'cardinality': 11, 'dtype': 'object'},
 'admission_type_code': {'cardinality': 11, 'dtype': 'object'},
 'birth_day': {'cardinality': 31, 'dtype': 'int64'},
 'birth_month': {'cardinality': 12, 'dtype': 'int64'},
 'birth_year': {'cardinality': 7, 'dtype': 'int64'},
 'college': {'cardinality': 4, 'dtype': 'object'},
 'college_code': {'cardinality': 6, 'dtype': 'object'},
 'department_code': {'cardinality': 24, 'dtype': 'int64'},
 'department_name': {'cardinality': 20, 'dtype': 'object'},
 'disabled_code': {'cardinality': 2, 'dtype': 'int64'},
 'disabled_type': {'cardinality': 2, 'dtype': 'object'},
 'identity': {'cardinality': 14, 'dtype': 'object'},
 'identity_code': {'cardinality': 14, 'dtype': 'int64'},
 'nationality': {'cardinality': 63, 'dtype': 'object'},
 'nationality_code': {'cardinality': 62, 'dtype': 'object'},
 'sex': {'cardinality': 2, 'dtype': 'object'},
 'university': {'cardinality': 2, 'dtype': 'object'},
 'university_code': {'cardinality': 2, 

In [3]:
# Show the first five records transposed, so each column reads as a row.
students.head(5).transpose()

Unnamed: 0,0,1,2,3,4
birth_year,2005,2003,2002,2002,2000
birth_month,4,1,11,12,10
birth_day,9,16,7,24,7
zodiac,牡羊座,摩羯座,天蠍座,摩羯座,天秤座
university_code,2,1,1,1,1
university,國立政治大學,國立臺灣大學,國立臺灣大學,國立臺灣大學,國立臺灣大學
college_code,700,2000,1000,9000,1000
college,理學院,理學院,文學院,電機資訊學院,文學院
department_code,702,2080,1070,9020,1040
department_name,心理學系,地理環境資源學系,日本語文學系,資訊工程學系,哲學系


# Execution and Results / 執行與結果

In [4]:
# Build the executor from the YAML path returned by quick_setup, then run the
# configured workflow. `exec_case` is reused by the result-inspection cell.
exec_case = Executor(config=yaml_path)
exec_case.run()

Now is petsard_Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo] save to csv...


In [5]:
# The result mapping is indexed twice: first by the full step chain ending in
# the Reporter step, then by the experiment name without the Reporter suffix.
experiment_name = (
    "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]"
)
report_name = experiment_name + "_Reporter[output]"

# Preview the first five rows, transposed so each column reads as a row.
exec_case.get_result()[report_name][experiment_name].head(5).T

Unnamed: 0,0,1,2,3,4
birth_year,2004,2001,2004,2004,2003
birth_month,5,12,2,4,5
birth_day,16,20,21,6,3
zodiac,摩羯座,天秤座,雙子座,射手座,金牛座
university_code,002,002,002,001,002
university,國立臺灣大學,國立政治大學,國立政治大學,國立臺灣大學,國立政治大學
college_code,100,100,2000,2000,2000
college,資訊學院,資訊學院,理學院,文學院,電機資訊學院
department_code,102,9010,702,2090,9010
department_name,哲學系,應用數學系,資訊科學系,電機工程學系,電機工程學系
