# Environment setting
環境設定

In [1]:
import os
import requests
import sys
from pathlib import Path
from pprint import pprint
from typing import Optional

import pandas as pd


# Branch of the PETsARD repository to fetch demo assets from; default is main.
branch: str = "main"

# Demo subfolder used by this notebook; None means petsard/demo/ itself.
subfolder: Optional[str] = "best-practices"


# Colab is detected through the COLAB_GPU environment variable.
is_colab: bool = "COLAB_GPU" in os.environ
if is_colab:
    # On Colab, download utils.py from GitHub into the working directory.
    utils_url = (
        f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    )
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(utils_url, timeout=30)

    if response.status_code == 200:
        # Write as UTF-8 explicitly so the file does not depend on the locale.
        with open("utils.py", "w", encoding="utf-8") as f:
            f.write(response.text)

        Path("__init__.py").touch()
    else:
        raise RuntimeError(
            f"Failed to download utils.py. Status code: {response.status_code}"
        )
elif subfolder:
    # Not on Colab and running from a subfolder: make the parent directory
    # importable so that `utils.py` can be found there.
    parent_dir = os.path.dirname(os.getcwd())
    # Guard against duplicate entries when the cell is executed more than once.
    if parent_dir not in sys.path:
        sys.path.append(parent_dir)

In [None]:
# Import the utils module
# (the `utils` module is the demo helper file: downloaded to the working
# directory on Colab, otherwise resolved through the sys.path entry added in
# the environment cell).
from utils import (
    get_yaml_path,
    setup_environment,
)

# Setup the environment
# NOTE(review): setup_environment is defined in utils.py, which is not shown
# here; presumably it makes the three listed benchmark datasets available
# under benchmark/ (later cells read CSV files with these names) -- confirm.
setup_environment(
    is_colab,
    branch,
    benchmark_data=[
        "best-practices_multi-table_companies",
        "best-practices_multi-table_applications",
        "best-practices_multi-table_tracking",
    ],
    subfolder=subfolder,
)

In [3]:
# Import PETsARD
from petsard import Executor

# Best practices: Multi-Table data - Denormalization
最佳實踐：多表格資料 - 反正規化

In [4]:
# Load the three benchmark tables from the benchmark/ folder.
companies = pd.read_csv("benchmark/best-practices_multi-table_companies.csv")
applications = pd.read_csv("benchmark/best-practices_multi-table_applications.csv")
tracking = pd.read_csv("benchmark/best-practices_multi-table_tracking.csv")

# Print each table's column names under its own heading.
for heading, table in (
    ("Companies columns: ", companies),
    ("\nApplications columns:", applications),
    ("\nTracking columns:", tracking),
):
    print(heading)
    pprint(table.columns.tolist())

Companies columns: 
['company_id',
 'industry',
 'sub_industry',
 'city',
 'district',
 'established_date',
 'capital']

Applications columns:
['application_id',
 'company_id',
 'loan_type',
 'apply_date',
 'approval_date',
 'status',
 'amount_requested',
 'amount_approved']

Tracking columns:
['application_id',
 'profit_ratio_avg_profit_ratio',
 'profit_ratio_min_profit_ratio',
 'profit_ratio_profit_ratio_std',
 'profit_ratio_negative_profit_count',
 'tracking_date_tracking_months',
 'tracking_date_last_tracking_date',
 'revenue_avg_revenue',
 'revenue_revenue_growth',
 'risk_level_last_risk',
 'risk_level_second_last_risk']


In [5]:
# Preview the first five rows of the companies table.
companies.head(5)

Unnamed: 0,company_id,industry,sub_industry,city,district,established_date,capital
0,C000001,營建工程,環保工程,新北市,板橋區,2019-11-03,19899000
1,C000002,營建工程,建築工程,臺北市,內湖區,2017-01-02,17359000
2,C000003,製造業,金屬加工,臺北市,內湖區,2012-05-29,5452000
3,C000004,營建工程,環保工程,桃園市,中壢區,2010-09-24,20497000
4,C000005,批發零售,零售,臺北市,內湖區,2010-07-24,17379000


In [6]:
# Preview the first five rows of the applications table.
applications.head(5)

Unnamed: 0,application_id,company_id,loan_type,apply_date,approval_date,status,amount_requested,amount_approved
0,A00000001,C000001,廠房擴充,2022-01-21,2022-03-19,approved,12848000,12432000.0
1,A00000002,C000001,營運週轉金,2025-01-05,2025-02-11,approved,2076000,1516000.0
2,A00000003,C000001,創新研發,2025-01-05,2025-01-30,approved,11683000,10703000.0
3,A00000004,C000002,營運週轉金,2020-12-12,,rejected,5533000,
4,A00000005,C000002,廠房擴充,2026-02-14,,rejected,1433000,


In [7]:
# Preview the first five rows of the tracking table.
tracking.head(5)

Unnamed: 0,application_id,profit_ratio_avg_profit_ratio,profit_ratio_min_profit_ratio,profit_ratio_profit_ratio_std,profit_ratio_negative_profit_count,tracking_date_tracking_months,tracking_date_last_tracking_date,revenue_avg_revenue,revenue_revenue_growth,risk_level_last_risk,risk_level_second_last_risk
0,A00000001,0.033225,-0.096496,0.084001,4,30.0,2024-09-04,18404860.0,-0.026405,high_risk,normal
1,A00000002,-0.002636,-0.08058,0.074297,6,30.0,2027-07-31,19263500.0,1.284445,normal,warning
2,A00000003,0.009984,-0.087006,0.084297,6,30.0,2027-07-19,24701240.0,1.561825,attention,severe
3,A00000007,0.002074,-0.091077,0.093598,4,21.0,2024-09-26,23880200.0,0.090593,attention,normal
4,A00000008,0.038045,-0.033057,0.053279,3,30.0,2018-12-16,23902150.0,-0.516376,high_risk,normal


# Database Denormalization Preprocessing
資料庫反正規化前處理

> 注意: 這段前處理需在 PETsARD 合成流程之前執行
> (Note: This preprocessing must be executed before PETsARD synthesis process)

In [8]:
def _flag_group_extreme(
    frame: pd.DataFrame, group_col: str, order_cols: list, how: str
) -> pd.Series:
    """Flag, within each group, the rows holding the smallest or largest key.

    A row's ordering key is the tuple of its values in ``order_cols``, so rows
    are compared on the first column and ties fall through to the next one.

    Args:
        frame: Table to inspect; it is not modified.
        group_col: Column whose values define the groups.
        order_cols: Column names, in priority order, that form the key.
        how: ``"min"`` to flag the smallest key per group, ``"max"`` for the
            largest.

    Returns:
        Boolean Series aligned to ``frame.index``.
    """
    # Keep the tuples in a standalone Series instead of a temporary column, so
    # nothing has to be added to and then dropped from the caller's frame.
    sort_key = pd.Series(
        list(zip(*(frame[col] for col in order_cols))),
        index=frame.index,
        dtype=object,
    )
    group_extreme = sort_key.groupby(frame[group_col]).transform(how)
    return sort_key == group_extreme


def _attach_flagged(
    base: pd.DataFrame,
    detail: pd.DataFrame,
    flag_col: str,
    prefix: str,
    helper_cols: list,
) -> pd.DataFrame:
    """Left-join the rows of ``detail`` flagged by ``flag_col`` onto ``base``.

    Args:
        base: Table with a ``company_id`` column; every row is kept.
        detail: Table with a ``company_id`` column and boolean flag columns.
        flag_col: Boolean column selecting the rows of ``detail`` to attach.
        prefix: Prefix added to every attached column name.
        helper_cols: Flag columns of ``detail`` that must not be attached.

    Returns:
        ``base`` extended with the prefixed columns of the selected rows; the
        duplicated (prefixed) company id column is removed.
    """
    selected = detail.loc[detail[flag_col]].drop(columns=helper_cols).add_prefix(prefix)
    joined = base.merge(
        selected,
        how="left",
        left_on="company_id",
        right_on=f"{prefix}company_id",
    )
    return joined.drop(columns=[f"{prefix}company_id"])


# Mark each company's first and latest application. Applications are ordered
# by apply date, with the application id breaking ties on the same date.
application_order = ["apply_date", "application_id"]
applications["is_first_application"] = _flag_group_extreme(
    applications, "company_id", application_order, "min"
)
applications["is_latest_application"] = _flag_group_extreme(
    applications, "company_id", application_order, "max"
)
application_flags = ["is_first_application", "is_latest_application"]


# Join the financial tracking data to the applications to obtain the company id.
tracking_w_company = tracking.merge(
    applications[["company_id", "application_id"]],
    how="left",
    on="application_id",
)

# Mark each company's latest financial tracking record.
tracking_w_company["is_latest_tracking"] = _flag_group_extreme(
    tracking_w_company,
    "company_id",
    ["tracking_date_last_tracking_date", "application_id"],
    "max",
)


# Merge company and application data: first application, then latest one.
denorm_data: pd.DataFrame = _attach_flagged(
    companies, applications, "is_first_application", "first_apply_", application_flags
)
denorm_data = _attach_flagged(
    denorm_data,
    applications,
    "is_latest_application",
    "latest_apply_",
    application_flags,
)

# Add summarized tracking data (the latest tracking record per company).
denorm_data = _attach_flagged(
    denorm_data,
    tracking_w_company,
    "is_latest_tracking",
    "latest_track_",
    ["is_latest_tracking"],
)

# Every left join above should match at most one record per company; a larger
# row count means a company was silently duplicated.
assert len(denorm_data) == len(companies), (
    "Denormalization must keep exactly one row per row of `companies`; "
    "a company matched more than one first/latest record."
)


# Review results
print("資料形狀 (Data shape):", denorm_data.shape)
print("\n欄位清單 (Column list):")
pprint(denorm_data.columns.tolist())

資料形狀 (Data shape): (1000, 32)

欄位清單 (Column list):
['company_id',
 'industry',
 'sub_industry',
 'city',
 'district',
 'established_date',
 'capital',
 'first_apply_application_id',
 'first_apply_loan_type',
 'first_apply_apply_date',
 'first_apply_approval_date',
 'first_apply_status',
 'first_apply_amount_requested',
 'first_apply_amount_approved',
 'latest_apply_application_id',
 'latest_apply_loan_type',
 'latest_apply_apply_date',
 'latest_apply_approval_date',
 'latest_apply_status',
 'latest_apply_amount_requested',
 'latest_apply_amount_approved',
 'latest_track_application_id',
 'latest_track_profit_ratio_avg_profit_ratio',
 'latest_track_profit_ratio_min_profit_ratio',
 'latest_track_profit_ratio_profit_ratio_std',
 'latest_track_profit_ratio_negative_profit_count',
 'latest_track_tracking_date_tracking_months',
 'latest_track_tracking_date_last_tracking_date',
 'latest_track_revenue_avg_revenue',
 'latest_track_revenue_revenue_growth',
 'latest_track_risk_level_last_risk',
 

In [9]:
# Print a heading, then the first rows of the denormalized table transposed,
# so that each column name is shown as a row label.
print("\n前五筆資料 (First five rows):")
print(denorm_data.head().T)


前五筆資料 (First five rows):
                                                               0  \
company_id                                               C000001   
industry                                                    營建工程   
sub_industry                                                環保工程   
city                                                         新北市   
district                                                     板橋區   
established_date                                      2019-11-03   
capital                                                 19899000   
first_apply_application_id                             A00000001   
first_apply_loan_type                                       廠房擴充   
first_apply_apply_date                                2022-01-21   
first_apply_approval_date                             2022-03-19   
first_apply_status                                      approved   
first_apply_amount_requested                          12848000.0   
first_apply_amount_app

In [10]:
# Save the denormalized result as CSV, without writing the index column.
# This is the same path that the YAML configuration's Loader section reads.
denorm_data.to_csv("benchmark/best-practices_multi-table.csv", index=False)

# Now you can proceed with PETsARD data synthesis.

# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

In [11]:
# File name of the YAML configuration used for this case.
yaml_file_case: str = "multi-table.yaml"

# Resolve the YAML path with the branch/subfolder settings chosen in the
# environment cell. Reusing `subfolder` (instead of repeating the literal)
# keeps this call in step with setup_environment if the setting is changed.
yaml_path_case: str = get_yaml_path(
    is_colab=is_colab,
    yaml_file=yaml_file_case,
    branch=branch,
    subfolder=subfolder,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/best-practices_multi-table.csv'
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  demo:
    method: 'default'
Postprocessor:
  demo:
    method: 'default'
Reporter:
  output:
    method: 'save_data'
    source: 'Postprocessor'
...


### Execution and Result
執行與結果

In [12]:
# Initialize and run executor
# The executor is built from the YAML path resolved above; run() executes the
# configured workflow and its results are read back later via get_result().
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Now is petsard_Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo] save to csv...


In [13]:
# The result mapping is indexed first by the full experiment name (which ends
# with the Reporter step) and then by the name of the steps before it.
steps_key = "Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]"
report_key = f"{steps_key}_Reporter[output]"

# Keep the lookup as the cell's last expression so the notebook renders it.
exec_case.get_result()[report_key][steps_key]

Unnamed: 0,company_id,industry,sub_industry,city,district,established_date,capital,first_apply_application_id,first_apply_loan_type,first_apply_apply_date,...,latest_track_profit_ratio_avg_profit_ratio,latest_track_profit_ratio_min_profit_ratio,latest_track_profit_ratio_profit_ratio_std,latest_track_profit_ratio_negative_profit_count,latest_track_tracking_date_tracking_months,latest_track_tracking_date_last_tracking_date,latest_track_revenue_avg_revenue,latest_track_revenue_revenue_growth,latest_track_risk_level_last_risk,latest_track_risk_level_second_last_risk
0,C000850,批發零售,專業諮詢,桃園市,桃園區,2010-04-20 00:00:00,3.114868e+07,A00002103,數位轉型,2016-11-12 00:00:00,...,0.067532,-0.086573,0.017894,3.297509,30.720440,2025-02-13 00:00:00,22199038.0,0.223145,normal,severe
1,C000834,製造業,進出口貿易,臺北市,信義區,2013-10-14 00:00:00,2.635636e+07,A00002047,創新研發,2014-10-25 00:00:00,...,-0.024975,-0.097724,0.017894,4.160503,25.692762,2016-11-09 00:00:00,26238032.0,1.988463,attention,normal
2,C000084,批發零售,進出口貿易,臺北市,龜山區,2018-10-26 00:00:00,1.697725e+07,A00000217,數位轉型,2017-09-06 00:00:00,...,0.062152,-0.088433,0.017894,2.237628,18.443953,2026-11-28 00:00:00,25265614.0,-0.821225,normal,warning
3,C000465,製造業,食品,桃園市,內湖區,2013-04-14 00:00:00,3.190299e+07,A00001124,廠房擴充,2018-08-14 00:00:00,...,0.072189,-0.054672,0.017894,1.969288,13.410726,2023-12-08 00:00:00,25491372.0,-0.957805,attention,normal
4,C000725,批發零售,食品,桃園市,三重區,2015-09-15 00:00:00,4.653879e+07,A00001810,數位轉型,2017-03-08 00:00:00,...,0.062610,-0.086922,0.017894,2.971905,21.720663,2019-04-01 00:00:00,31187308.0,-0.917853,normal,high_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,C000380,批發零售,電子商務,桃園市,龜山區,2013-01-19 00:00:00,3.691596e+07,A00000944,營運週轉金,2027-12-28 00:00:00,...,0.077108,-0.069288,0.017894,1.550973,28.489460,2028-12-29 00:00:00,24169968.0,0.328769,high_risk,normal
487,C000920,製造業,民生用品,桃園市,新莊區,2017-08-09 00:00:00,2.513753e+07,A00002291,創新研發,2020-12-01 00:00:00,...,0.037465,-0.089207,0.017894,3.336019,23.421486,2025-10-21 00:00:00,23727142.0,0.249100,attention,normal
488,C000012,營建工程,民生用品,新北市,三重區,2015-10-29 00:00:00,8.201464e+06,,,,...,,,,,,,,,,
489,C000667,服務業,食品,臺北市,新莊區,2012-11-10 00:00:00,1.653587e+07,A00001640,廠房擴充,2016-06-23 00:00:00,...,0.096563,-0.038651,0.017894,3.594633,29.958788,2019-06-13 00:00:00,22584166.0,4.100326,severe,normal
