# Environment setting / 環境設定
Best practices: High-cardinality variables - Constraints / 最佳實踐：高基數變項 - 約束條件

In [1]:
import os
import sys
from pathlib import Path

# Auto-load utils
if "COLAB_GPU" in os.environ:
    # No local checkout is expected in this branch, so utils.py is fetched from
    # the repository. open() only reads local paths and raises on a URL, hence
    # the HTTP download through urllib.
    from urllib.request import urlopen

    url = "https://raw.githubusercontent.com/nics-tw/petsard/main/demo/utils.py"
    with urlopen(url) as response:
        utils_source = response.read().decode("utf-8")
    # NOTE(review): this executes code downloaded at run time. It defines
    # quick_setup in the notebook namespace; consider pinning the URL to a
    # commit hash instead of the moving `main` branch.
    exec(utils_source)
else:
    # Silent search for utils.py: check the working directory and then up to
    # four parent directories, adding the first match to sys.path.
    current = Path.cwd()
    for _ in range(5):
        if (current / "utils.py").exists():
            sys.path.insert(0, str(current))
            break
        current = current.parent

    # Import utils module (raises ModuleNotFoundError if the search found nothing
    # and utils is not otherwise importable).
    from utils import quick_setup

# Quick setup: yaml_path is indexed in the same order as yaml_file below
# (yaml_path[0] -> categorical.yaml, yaml_path[1] -> high-cardinality.yaml).
is_colab, branch, yaml_path = quick_setup(
    yaml_file=[
        "categorical.yaml",
        "high-cardinality.yaml",
    ],
    benchmark_data=[
        "best-practices_categorical_high-cardinality",
    ],
    branch="main",  # Optional; defaults to "main"
    example_files=["high-cardinality.py"],
)

from petsard import Executor

🚀 PETsARD v1.5.1
📅 2025-07-31 14:07:15 UTC+8
📁 Example files specified (local environment):
✅ Found locally: high-cardinality.py
📁 Subfolder: best-practices
📄 YAML path (1/2): petsard/demo/best-practices/categorical.yaml
📄 YAML path (2/2): petsard/demo/best-practices/high-cardinality.yaml
⚙️ Configuration content (1/2) - categorical.yaml:
---
Loader:
  data:
    filepath: 'benchmark/best-practices_categorical_high-cardinality.csv'
    column_types:
      category:
        - university_code
        - college_code
        - department_code
Preprocessor:
  encoding_uniform:
    sequence:
      - 'encoder'
    encoder:
      birth_year: 'encoding_uniform'
      birth_month: 'encoding_uniform'
      birth_day: 'encoding_uniform'
      zodiac: 'encoding_uniform'
      university_code: 'encoding_uniform'
      university: 'encoding_uniform'
      college_code: 'encoding_uniform'
      college: 'encoding_uniform'
      deparment_code: 'encoding_uniform'
      department_name: 'encoding_uniform

In [2]:
import importlib
from pprint import pprint

import pandas as pd


# Load the helper module shipped with the best-practices examples.
# Its package and module names contain hyphens, so a plain `import` statement
# cannot name them; importlib accepts the dotted path as a string instead.
high_cardinality_utils = importlib.import_module("best-practices.high-cardinality")

# Read the benchmark data with pandas' default dtype inference.
students = pd.read_csv("benchmark/best-practices_categorical_high-cardinality.csv")

# Summarise every column: number of distinct values and the inferred dtype.
students_columns = {
    name: {"cardinality": series.nunique(), "dtype": str(series.dtype)}
    for name, series in students.items()
}

print("columns: ")
pprint(students_columns)

columns: 
{'admission_type': {'cardinality': 11, 'dtype': 'object'},
 'admission_type_code': {'cardinality': 11, 'dtype': 'object'},
 'birth_day': {'cardinality': 31, 'dtype': 'int64'},
 'birth_month': {'cardinality': 12, 'dtype': 'int64'},
 'birth_year': {'cardinality': 7, 'dtype': 'int64'},
 'college': {'cardinality': 4, 'dtype': 'object'},
 'college_code': {'cardinality': 6, 'dtype': 'object'},
 'department_code': {'cardinality': 24, 'dtype': 'int64'},
 'department_name': {'cardinality': 20, 'dtype': 'object'},
 'disabled_code': {'cardinality': 2, 'dtype': 'int64'},
 'disabled_type': {'cardinality': 2, 'dtype': 'object'},
 'identity': {'cardinality': 14, 'dtype': 'object'},
 'identity_code': {'cardinality': 14, 'dtype': 'int64'},
 'nationality': {'cardinality': 63, 'dtype': 'object'},
 'nationality_code': {'cardinality': 62, 'dtype': 'object'},
 'sex': {'cardinality': 2, 'dtype': 'object'},
 'university': {'cardinality': 2, 'dtype': 'object'},
 'university_code': {'cardinality': 2, 

In [3]:
# Preview the first five students, transposed so that each field becomes a row.
students.head(5).transpose()

Unnamed: 0,0,1,2,3,4
birth_year,2005,2003,2002,2002,2000
birth_month,4,1,11,12,10
birth_day,9,16,7,24,7
zodiac,牡羊座,摩羯座,天蠍座,摩羯座,天秤座
university_code,2,1,1,1,1
university,國立政治大學,國立臺灣大學,國立臺灣大學,國立臺灣大學,國立臺灣大學
college_code,700,2000,1000,9000,1000
college,理學院,理學院,文學院,電機資訊學院,文學院
department_code,702,2080,1070,9020,1040
department_name,心理學系,地理環境資源學系,日本語文學系,資訊工程學系,哲學系


# Execution and Result / 執行與結果

In [4]:
# Run the first configuration (yaml_path[0], i.e. categorical.yaml): uniform
# encoding in the Preprocessor and no Constrainer step in the pipeline.
exec_case = Executor(config=yaml_path[0])
exec_case.run()

Now is petsard_Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo] save to csv...


In [5]:
# The result is a nested mapping: the outer key is the full experiment name
# including the Reporter step, the inner key is the experiment being reported.
plain_report_key = "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Reporter[output]"
plain_data_key = "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]"

plain_results = exec_case.get_result()
# Work on a copy so later inspection is independent of the stored result.
output = plain_results[plain_report_key][plain_data_key].copy()

output.head(5).transpose()

Unnamed: 0,0,1,2,3,4
birth_year,2004,2001,2004,2004,2003
birth_month,5,12,2,4,5
birth_day,16,20,21,6,3
zodiac,摩羯座,天秤座,雙子座,射手座,金牛座
university_code,002,002,002,001,002
university,國立臺灣大學,國立政治大學,國立政治大學,國立臺灣大學,國立政治大學
college_code,100,100,2000,2000,2000
college,資訊學院,資訊學院,理學院,文學院,電機資訊學院
department_code,1070,9010,102,701,9010
department_name,哲學系,應用數學系,資訊科學系,電機工程學系,電機工程學系


# Constraint Violations / 約束條件違反

## Field Constraints / 欄位約束 (`field_constraints`)
Birthday and Zodiac Sign / 生日與星座

In [6]:
# Report synthetic rows whose birth year, month and day do not form a real
# calendar date (e.g. June 31st).
high_cardinality_utils.check_invalid(output, ["birthday"])

# 152 invalid birthday dates found

   index                                            reason  year  month  day
0     18    該年6月沒有31日 Day 31 does not exist in 6 that year  2001      6   31
1     24  該年11月沒有31日 Day 31 does not exist in 11 that year  2002     11   31
2     70    該年6月沒有31日 Day 31 does not exist in 6 that year  2002      6   31
3    120    該年2月沒有30日 Day 30 does not exist in 2 that year  2006      2   30
4    148    該年6月沒有31日 Day 31 does not exist in 6 that year  2004      6   31

Counts by reason:
reason
該年11月沒有31日 Day 31 does not exist in 11 that year    21
該年2月沒有29日 Day 29 does not exist in 2 that year      30
該年2月沒有30日 Day 30 does not exist in 2 that year      31
該年2月沒有31日 Day 31 does not exist in 2 that year      19
該年4月沒有31日 Day 31 does not exist in 4 that year      14
該年6月沒有31日 Day 31 does not exist in 6 that year      18
該年9月沒有31日 Day 31 does not exist in 9 that year      19
Name: index, dtype: int64

Total invalid records: 152
Unique invalid records: 152


In [7]:
# Report synthetic rows whose zodiac sign does not match the sign expected
# from the birth month and day.
high_cardinality_utils.check_invalid(output, ["zodiac"])

# 9204 invalid zodiac signs found

   index                      reason  month  day zodiac expected_zodiac
0      0  星座與出生日期不匹配 Zodiac mismatch      5   16    摩羯座             金牛座
1      1  星座與出生日期不匹配 Zodiac mismatch     12   20    天秤座             射手座
2      2  星座與出生日期不匹配 Zodiac mismatch      2   21    雙子座             雙魚座
3      3  星座與出生日期不匹配 Zodiac mismatch      4    6    射手座             牡羊座
4      5  星座與出生日期不匹配 Zodiac mismatch      1   28    摩羯座             水瓶座

Counts by reason:
reason
星座與出生日期不匹配 Zodiac mismatch    9204
Name: index, dtype: int64

Total invalid records: 9204
Unique invalid records: 9204


## Field Combination Constraints / 欄位組合約束 (`field_combinations`)
University, College, and Department / 大學、學院、與系所

In [8]:
# Report synthetic rows whose university code and university name disagree.
high_cardinality_utils.check_invalid(output, ["university"])

# 3213 invalid universities found

   index university_code university  \
0      0             002     國立臺灣大學   
1      6             002     國立臺灣大學   
2      7             002     國立臺灣大學   
3      8             002     國立臺灣大學   
4     11             002     國立臺灣大學   

                                              reason  
0  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
1  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
2  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
3  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
4  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  

Counts by reason:
reason
大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and name mismatch)    3213
Name: index, dtype: int64

Total invalid records: 3213
Unique invalid records: 3213


In [9]:
# Report synthetic rows whose college code and college name disagree; the
# report also lists invalid university-college relationships.
high_cardinality_utils.check_invalid(output, ["college"])

# 6478 invalid colleges found

   index college_code college  \
0      0          100    資訊學院   
1      1          100    資訊學院   
2      3         2000     文學院   
3      4         2000  電機資訊學院   
4      5         9000    資訊學院   

                                              reason  
0  學院代碼與名稱不符，學院代碼應對應 文學院 (College code and name m...  
1  學院代碼與名稱不符，學院代碼應對應 文學院 (College code and name m...  
2  學院代碼與名稱不符，學院代碼應對應 理學院 (College code and name m...  
3  學院代碼與名稱不符，學院代碼應對應 理學院 (College code and name m...  
4  學院代碼與名稱不符，學院代碼應對應 電機資訊學院 (College code and nam...  

Counts by reason:
reason
學院代碼與名稱不符，學院代碼應對應 文學院 (College code and name mismatch)       1717
學院代碼與名稱不符，學院代碼應對應 理學院 (College code and name mismatch)       4280
學院代碼與名稱不符，學院代碼應對應 資訊學院 (College code and name mismatch)       127
學院代碼與名稱不符，學院代碼應對應 電機資訊學院 (College code and name mismatch)     354
Name: index, dtype: int64

# 8284 invalid university-college relationships found

   index university_code college_code  \
0      2             002   

In [10]:
# Report synthetic rows whose department code and department name disagree.
high_cardinality_utils.check_invalid(output, ["department"])

# 9207 invalid departments found

   index department_code department_name  \
0      0            1070             哲學系   
1      1            9010           應用數學系   
2      2             102           資訊科學系   
3      3             701          電機工程學系   
4      5             701          外國語文學系   

                                              reason  
0  系所代碼與名稱不符，系所代碼應對應 日本語文學系 (Department code and ...  
1  系所代碼與名稱不符，系所代碼應對應 電機工程學系 (Department code and ...  
2  系所代碼與名稱不符，系所代碼應對應 教育學系 (Department code and na...  
3  系所代碼與名稱不符，系所代碼應對應 應用數學系 (Department code and n...  
4  系所代碼與名稱不符，系所代碼應對應 應用數學系 (Department code and n...  

Counts by reason:
reason
系所代碼與名稱不符，系所代碼應對應 中國文學系 (Department code and name mismatch)        549
系所代碼與名稱不符，系所代碼應對應 人類學系 (Department code and name mismatch)         207
系所代碼與名稱不符，系所代碼應對應 化學系 (Department code and name mismatch)          226
系所代碼與名稱不符，系所代碼應對應 哲學系 (Department code and name mismatch)          615
系所代碼與名稱不符，系所代碼應對應 圖書資訊學系 (Department code and name mismatch) 

## Missing Value Group Constraints / 遺失值群組約束 (`nan_group`)
Nationality and Nationality Code / 國籍與國籍代碼

In [11]:
# Report synthetic rows where an ROC nationality carries a nationality code,
# which the check treats as invalid.
high_cardinality_utils.check_invalid(output, ["nationality"])

# 1370 invalid nationalities found

   index nationality_code nationality  \
0     19              113        中華民國   
1     20              113        中華民國   
2     21              113        中華民國   
3     23              009        中華民國   
4     25              019        中華民國   

                                              reason  
0  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
1  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
2  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
3  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
4  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  

Counts by reason:
reason
中華民國國籍不應有國籍代碼 (ROC nationality should not have nationality code)    1370
Name: index, dtype: int64

Total invalid records: 1370
Unique invalid records: 1370


# Execution and Result 2 / 執行與結果 2

In [12]:
# Run the second configuration (yaml_path[1], i.e. high-cardinality.yaml),
# whose pipeline adds a Constrainer step. The constrainer resamples until
# enough rows satisfy the constraints, so this cell logs many trials.
exec_case_w_const = Executor(config=yaml_path[1])
exec_case_w_const.run()

Trial 10: Got 1 rows, need 9999 more
Trial 20: Got 5 rows, need 9995 more
Trial 30: Got 6 rows, need 9994 more
Trial 40: Got 10 rows, need 9990 more
Trial 50: Got 15 rows, need 9985 more
Trial 60: Got 19 rows, need 9981 more
Trial 70: Got 21 rows, need 9979 more
Trial 80: Got 26 rows, need 9974 more
Trial 90: Got 29 rows, need 9971 more
Trial 100: Got 35 rows, need 9965 more
Trial 110: Got 38 rows, need 9962 more
Trial 120: Got 40 rows, need 9960 more
Trial 130: Got 46 rows, need 9954 more
Trial 140: Got 48 rows, need 9952 more
Trial 150: Got 52 rows, need 9948 more
Trial 160: Got 58 rows, need 9942 more
Trial 170: Got 60 rows, need 9940 more
Trial 180: Got 66 rows, need 9934 more
Trial 190: Got 68 rows, need 9932 more
Trial 200: Got 72 rows, need 9928 more
Trial 210: Got 75 rows, need 9925 more
Trial 220: Got 81 rows, need 9919 more
Trial 230: Got 86 rows, need 9914 more
Trial 240: Got 89 rows, need 9911 more
Trial 250: Got 94 rows, need 9906 more
Trial 260: Got 97 rows, need 9903 mor

  self.constrained_data = self.constrainer.resample_until_satisfy(


In [13]:
# Same nested lookup as for the unconstrained run, but the experiment names
# now include the Constrainer step.
constrained_report_key = "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]_Reporter[output]"
constrained_data_key = "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]"

constrained_results = exec_case_w_const.get_result()
# Work on a copy so later inspection is independent of the stored result.
output_w_const = constrained_results[constrained_report_key][constrained_data_key].copy()

output_w_const.head(5).transpose()

Unnamed: 0,0,1,2,3,4
birth_year,2005,2002,2004,2003,2001
birth_month,6,3,3,4,3
birth_day,23,29,5,1,11
zodiac,巨蟹座,牡羊座,雙魚座,牡羊座,雙魚座
university_code,002,002,002,001,002
university,國立政治大學,國立政治大學,國立政治大學,國立臺灣大學,國立政治大學
college_code,ZA0,ZA0,ZA0,9000,ZA0
college,資訊學院,資訊學院,資訊學院,電機資訊學院,資訊學院
department_code,703,703,703,9010,703
department_name,資訊科學系,資訊科學系,資訊科學系,電機工程學系,資訊科學系


# Constraint Violations: After Setting Constraints / 約束條件違反：在設定約束條件之後

In [14]:
# Called without a check list: the report covers every check (birthday,
# zodiac, university, college, department, nationality and the relationship
# checks) on the constrained output.
high_cardinality_utils.check_invalid(output_w_const)

# No invalid colleges found

# No invalid birthday dates found

# No invalid nationalities found

# No invalid departments found

# No invalid college-department relationships found

# No invalid universities found

# No invalid university-college relationships found

# No invalid zodiac signs found

Total invalid records: 0
Unique invalid records: 0
