# Environment Setup
環境設定

In [1]:
import os
import requests
import sys
from pathlib import Path
from pprint import pprint
from typing import Optional

import pandas as pd


# Git branch of the PETsARD repository that demo assets are fetched from (default: "main")
branch: str = "main"

# Demo subfolder this notebook belongs to; None would mean the demo root (petsard/demo/)
subfolder: Optional[str] = "best-practices"


# The presence of COLAB_GPU in the environment is used as the Google Colab marker.
# On Colab, utils.py is not available locally, so download it from GitHub.
is_colab: bool = "COLAB_GPU" in os.environ
if is_colab:
    utils_url = (
        f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    )
    # A timeout keeps the cell from hanging forever if GitHub is unreachable
    response = requests.get(utils_url, timeout=30)

    if response.status_code == 200:
        # Explicit encoding so the saved module does not depend on the platform default
        with open("utils.py", "w", encoding="utf-8") as f:
            f.write(response.text)

        Path("__init__.py").touch()
    else:
        raise RuntimeError(
            f"Failed to download utils.py. Status code: {response.status_code}"
        )

# Not on Colab and running from a subfolder: make the parent directory importable
# so that `from utils import ...` can find utils.py there.
elif subfolder:
    parent_dir = os.path.dirname(os.getcwd())
    # Guard against piling up duplicate entries when the cell is re-run
    if parent_dir not in sys.path:
        sys.path.append(parent_dir)

In [None]:
import importlib

# Import the demo helpers from utils.py. This import has to come after the
# environment cell, which either downloads utils.py (Colab) or adds its
# directory to sys.path (local run).
from utils import (
    get_yaml_path,
    setup_environment,
)


# Set up the environment for this notebook.
# NOTE(review): presumably this prepares the working directory and fetches the
# listed benchmark dataset(s); confirm against utils.setup_environment.
setup_environment(
    is_colab,
    branch,
    benchmark_data=[
        "best-practices_categorical_high-cardinality",
    ],
    subfolder=subfolder,
)

# Load the helper module that lives in the best-practices folder. The folder
# name contains a hyphen, which is not a valid Python identifier, so a plain
# `import` statement cannot be used; importlib accepts the dotted name as a string.
best_practices_utils = importlib.import_module("best-practices.best_practices_utils")

In [3]:
# Import PETsARD
from petsard import Executor

# Best practices: High-cardinality variables - Constraints
最佳實踐：高基數變項 - 約束條件

In [4]:
# Load the benchmark dataset used throughout this notebook
students = pd.read_csv("benchmark/best-practices_categorical_high-cardinality.csv")

# Per-column profile: number of distinct values and the pandas dtype name
students_columns = {
    name: {
        "cardinality": series.nunique(),
        "dtype": str(series.dtype),
    }
    for name, series in students.items()
}

print("columns: ")
pprint(students_columns)

columns: 
{'admission_type': {'cardinality': 11, 'dtype': 'object'},
 'admission_type_code': {'cardinality': 11, 'dtype': 'object'},
 'birth_day': {'cardinality': 31, 'dtype': 'int64'},
 'birth_month': {'cardinality': 12, 'dtype': 'int64'},
 'birth_year': {'cardinality': 7, 'dtype': 'int64'},
 'college': {'cardinality': 4, 'dtype': 'object'},
 'college_code': {'cardinality': 6, 'dtype': 'object'},
 'department_code': {'cardinality': 24, 'dtype': 'int64'},
 'department_name': {'cardinality': 20, 'dtype': 'object'},
 'disabled_code': {'cardinality': 2, 'dtype': 'int64'},
 'disabled_type': {'cardinality': 2, 'dtype': 'object'},
 'identity': {'cardinality': 14, 'dtype': 'object'},
 'identity_code': {'cardinality': 14, 'dtype': 'int64'},
 'nationality': {'cardinality': 63, 'dtype': 'object'},
 'nationality_code': {'cardinality': 62, 'dtype': 'object'},
 'sex': {'cardinality': 2, 'dtype': 'object'},
 'university': {'cardinality': 2, 'dtype': 'object'},
 'university_code': {'cardinality': 2, 

In [5]:
# Preview the first five records, transposed so each column reads as a row
students.head(5).T

Unnamed: 0,0,1,2,3,4
birth_year,2005,2003,2002,2002,2000
birth_month,4,1,11,12,10
birth_day,9,16,7,24,7
zodiac,牡羊座,摩羯座,天蠍座,摩羯座,天秤座
university_code,2,1,1,1,1
university,國立政治大學,國立臺灣大學,國立臺灣大學,國立臺灣大學,國立臺灣大學
college_code,700,2000,1000,9000,1000
college,理學院,理學院,文學院,電機資訊學院,文學院
department_code,702,2080,1070,9020,1040
department_name,心理學系,地理環境資源學系,日本語文學系,資訊工程學系,哲學系


# YAML Configuration of Categorical variables for PETsARD
PETsARD 的類別變項 YAML 設定

In [6]:
# Configuration file for the baseline run
yaml_file_case: str = "categorical.yaml"

# Resolve the YAML location for the current runtime. The subfolder comes from
# the notebook-level `subfolder` setting (also passed to setup_environment)
# rather than a repeated literal, so the two cannot drift apart.
yaml_path_case: str = get_yaml_path(
    is_colab=is_colab,
    yaml_file=yaml_file_case,
    branch=branch,
    subfolder=subfolder,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/best-practices_categorical_high-cardinality.csv'
    column_types:
      category:
        - university_code
        - college_code
        - department_code
Preprocessor:
  encoding_uniform:
    sequence:
      - 'encoder'
    encoder:
      birth_year: 'encoding_uniform'
      birth_month: 'encoding_uniform'
      birth_day: 'encoding_uniform'
      zodiac: 'encoding_uniform'
      university_code: 'encoding_uniform'
      university: 'encoding_uniform'
      college_code: 'encoding_uniform'
      college: 'encoding_uniform'
      deparment_code: 'encoding_uniform'
      department_name: 'encoding_uniform'
      admission_type_code: 'encoding_uniform'
      admission_type: 'encoding_uniform'
      disabled_code: 'encoding_uniform'
      disabled_type: 'encoding_uniform'
      nationality_code: 'encoding_uniform'
      nationality: 'encoding_uniform'
      identity_code: 'encoding_uniform'
      identity: 'encoding_un

### Execution and Result
執行與結果

In [7]:
# Build the executor from the resolved YAML config and run the whole pipeline.
# The experiment name logged by this run lists Loader, Preprocessor, Synthesizer
# and Postprocessor steps and no Constrainer step.
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Now is petsard_Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo] save to csv...


In [8]:
# Keys into the executor's nested result mapping: the reporter entry first,
# then the experiment stored under it.
report_key = "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Reporter[output]"
experiment_key = "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]"

# Work on a copy rather than on the object held by the executor
results = exec_case.get_result()
output = results[report_key][experiment_key].copy()

output.head(5).T

Unnamed: 0,0,1,2,3,4
birth_year,2004,2001,2004,2004,2003
birth_month,5,12,2,4,5
birth_day,16,20,21,6,3
zodiac,摩羯座,天秤座,雙子座,射手座,金牛座
university_code,002,002,002,001,002
university,國立臺灣大學,國立政治大學,國立政治大學,國立臺灣大學,國立政治大學
college_code,100,100,2000,2000,2000
college,資訊學院,資訊學院,理學院,文學院,電機資訊學院
department_code,1070,9010,104,2090,702
department_name,哲學系,應用數學系,資訊科學系,電機工程學系,電機工程學系


# Constraint Violations
約束條件違反

## Birthday and Zodiac Sign: Field Constraints (`field_constraints`)
生日與星座：欄位約束 (`field_constraints`)

In [9]:
# Birthday check on the unconstrained synthetic data: reports year/month/day
# combinations that are not real calendar dates.
best_practices_utils.check_invalid(output, ["birthday"])

# 152 invalid birthday dates found

   index                                            reason  year  month  day
0     18    該年6月沒有31日 Day 31 does not exist in 6 that year  2001      6   31
1     24  該年11月沒有31日 Day 31 does not exist in 11 that year  2002     11   31
2     70    該年6月沒有31日 Day 31 does not exist in 6 that year  2002      6   31
3    120    該年2月沒有30日 Day 30 does not exist in 2 that year  2006      2   30
4    148    該年6月沒有31日 Day 31 does not exist in 6 that year  2004      6   31

Counts by reason:
reason
該年11月沒有31日 Day 31 does not exist in 11 that year    21
該年2月沒有29日 Day 29 does not exist in 2 that year      29
該年2月沒有30日 Day 30 does not exist in 2 that year      31
該年2月沒有31日 Day 31 does not exist in 2 that year      19
該年4月沒有31日 Day 31 does not exist in 4 that year      15
該年6月沒有31日 Day 31 does not exist in 6 that year      18
該年9月沒有31日 Day 31 does not exist in 9 that year      19
Name: index, dtype: int64

Total invalid records: 152
Unique invalid records: 152


In [10]:
# Zodiac check: reports rows whose zodiac sign does not match the birth month and day
best_practices_utils.check_invalid(output, ["zodiac"])

# 9199 invalid zodiac signs found

   index                      reason  month  day zodiac expected_zodiac
0      0  星座與出生日期不匹配 Zodiac mismatch      5   16    摩羯座             金牛座
1      1  星座與出生日期不匹配 Zodiac mismatch     12   20    天秤座             射手座
2      2  星座與出生日期不匹配 Zodiac mismatch      2   21    雙子座             雙魚座
3      3  星座與出生日期不匹配 Zodiac mismatch      4    6    射手座             牡羊座
4      5  星座與出生日期不匹配 Zodiac mismatch      1   28    摩羯座             水瓶座

Counts by reason:
reason
星座與出生日期不匹配 Zodiac mismatch    9199
Name: index, dtype: int64

Total invalid records: 9199
Unique invalid records: 9199


## University, College, and Department: Field Combination Constraints (`field_combinations`)
大學、學院、與系所：欄位組合約束 (`field_combinations`)

In [11]:
# University check: reports rows where the university code and university name disagree
best_practices_utils.check_invalid(output, ["university"])

# 3213 invalid universities found

   index university_code university  \
0      0             002     國立臺灣大學   
1      6             002     國立臺灣大學   
2      7             002     國立臺灣大學   
3      8             002     國立臺灣大學   
4     11             002     國立臺灣大學   

                                              reason  
0  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
1  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
2  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
3  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
4  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  

Counts by reason:
reason
大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and name mismatch)    3213
Name: index, dtype: int64

Total invalid records: 3213
Unique invalid records: 3213


In [12]:
# College check: reports college code/name mismatches and invalid
# university-college relationships
best_practices_utils.check_invalid(output, ["college"])

# 6477 invalid colleges found

   index college_code college  \
0      0          100    資訊學院   
1      1          100    資訊學院   
2      3         2000     文學院   
3      4         2000  電機資訊學院   
4      5         9000    資訊學院   

                                              reason  
0  學院代碼與名稱不符，學院代碼應對應 文學院 (College code and name m...  
1  學院代碼與名稱不符，學院代碼應對應 文學院 (College code and name m...  
2  學院代碼與名稱不符，學院代碼應對應 理學院 (College code and name m...  
3  學院代碼與名稱不符，學院代碼應對應 理學院 (College code and name m...  
4  學院代碼與名稱不符，學院代碼應對應 電機資訊學院 (College code and nam...  

Counts by reason:
reason
學院代碼與名稱不符，學院代碼應對應 文學院 (College code and name mismatch)       1716
學院代碼與名稱不符，學院代碼應對應 理學院 (College code and name mismatch)       4280
學院代碼與名稱不符，學院代碼應對應 資訊學院 (College code and name mismatch)       127
學院代碼與名稱不符，學院代碼應對應 電機資訊學院 (College code and name mismatch)     354
Name: index, dtype: int64

# 8284 invalid university-college relationships found

   index university_code college_code  \
0      2             002   

In [13]:
# Department check: reports departments that do not belong to the row's college
best_practices_utils.check_invalid(output, ["department"])

# 8411 invalid college-department relationships found

   index college_code department_code  \
0      0          100            1070   
1      1          100            9010   
2      2         2000             104   
3      4         2000             702   
4      5         9000             703   

                                              reason  
0  系所 1070 不屬於該學院 (Department does not belong to ...  
1  系所 9010 不屬於該學院 (Department does not belong to ...  
2  系所 104 不屬於該學院 (Department does not belong to c...  
3  系所 702 不屬於該學院 (Department does not belong to c...  
4  系所 703 不屬於該學院 (Department does not belong to c...  

Counts by reason:
reason
系所 101 不屬於該學院 (Department does not belong to college)      354
系所 1010 不屬於該學院 (Department does not belong to college)     167
系所 102 不屬於該學院 (Department does not belong to college)      384
系所 1020 不屬於該學院 (Department does not belong to college)     218
系所 103 不屬於該學院 (Department does not belong to college)      389
系所 1030 不屬於該學院 (Department 

## Nationality and Nationality Code: Missing Value Group Constraints (`nan_groups`)
國籍與國籍代碼：遺失值群組約束 (`nan_groups`)

In [14]:
# Nationality check: reports rows with ROC nationality that still carry a nationality code
best_practices_utils.check_invalid(output, ["nationality"])

# 1370 invalid nationalities found

   index nationality_code nationality  \
0     19              113        中華民國   
1     20              113        中華民國   
2     21              113        中華民國   
3     23              009        中華民國   
4     25              019        中華民國   

                                              reason  
0  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
1  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
2  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
3  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
4  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  

Counts by reason:
reason
中華民國國籍不應有國籍代碼 (ROC nationality should not have nationality code)    1370
Name: index, dtype: int64

Total invalid records: 1370
Unique invalid records: 1370


# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

In [15]:
# Configuration file for the run with constraints.
# NOTE: this rebinds yaml_file_case / yaml_path_case from the baseline run above.
yaml_file_case: str = "high-cardinality.yaml"

# Resolve the YAML location for the current runtime. The subfolder comes from
# the notebook-level `subfolder` setting (also passed to setup_environment)
# rather than a repeated literal, so the two cannot drift apart.
yaml_path_case: str = get_yaml_path(
    is_colab=is_colab,
    yaml_file=yaml_file_case,
    branch=branch,
    subfolder=subfolder,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/best-practices_categorical_high-cardinality.csv'
    column_types:
      category:
        - university_code
        - college_code
        - department_code
Preprocessor:
  encoding_uniform:
    sequence:
      - 'encoder'
    encoder:
      birth_year: 'encoding_uniform'
      birth_month: 'encoding_uniform'
      birth_day: 'encoding_uniform'
      zodiac: 'encoding_uniform'
      university_code: 'encoding_uniform'
      university: 'encoding_uniform'
      college_code: 'encoding_uniform'
      college: 'encoding_uniform'
      deparment_code: 'encoding_uniform'
      department_name: 'encoding_uniform'
      admission_type_code: 'encoding_uniform'
      admission_type: 'encoding_uniform'
      disabled_code: 'encoding_uniform'
      disabled_type: 'encoding_uniform'
      nationality_code: 'encoding_uniform'
      nationality: 'encoding_uniform'
      identity_code: 'encoding_uniform'
      identity: 'encoding_un

### Execution and Result
執行與結果

In [16]:
# Build a new executor from the constraint-enabled YAML config and run it.
# This rebinds exec_case; the experiment name logged by this run ends with a
# Constrainer step.
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Now is petsard_Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo] save to csv...


In [17]:
# Keys into the executor's nested result mapping for the constrained run:
# the reporter entry first, then the experiment stored under it.
report_key_w_const = "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]_Reporter[output]"
experiment_key_w_const = "Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]"

# Work on a copy rather than on the object held by the executor
results_w_const = exec_case.get_result()
output_w_const = results_w_const[report_key_w_const][experiment_key_w_const].copy()

output_w_const.head(5).T

Unnamed: 0,0,1,2,3,4
birth_year,2004,2001,2004,2003,2003
birth_month,6,12,2,8,4
birth_day,19,10,14,27,30
zodiac,巨蟹座,雙子座,天秤座,牡羊座,摩羯座
university_code,002,002,002,001,002
university,國立臺灣大學,國立政治大學,國立政治大學,國立臺灣大學,國立政治大學
college_code,100,100,2000,2000,2000
college,資訊學院,資訊學院,理學院,文學院,電機資訊學院
department_code,101,1090,703,9010,703
department_name,哲學系,應用數學系,資訊科學系,歷史學系,電機工程學系


# Constraint Violations: After Applying Constraints
約束條件違反：在設定約束條件之後

In [18]:
# Re-check the constrained output. No check list is passed here;
# NOTE(review): the helper appears to run every available check in that case
# (the output covers nationality and birthday, among others). Confirm against
# best_practices_utils.check_invalid.
best_practices_utils.check_invalid(output_w_const)

# 1382 invalid nationalities found

   index nationality_code nationality  \
0     19              113        中華民國   
1     20              113        中華民國   
2     21              113        中華民國   
3     23              024        中華民國   
4     25              321        中華民國   

                                              reason  
0  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
1  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
2  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
3  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
4  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  

Counts by reason:
reason
中華民國國籍不應有國籍代碼 (ROC nationality should not have nationality code)    1382
Name: index, dtype: int64

# 137 invalid birthday dates found

   index                                            reason  year  month  day
0     65  該年11月沒有31日 Day 31 does not exist in 11 that year  2005     11   31
1     92    該年2月沒有30日 Day 30 does not exist in 2 that year  2004      2