# Environment setting
環境設定

In [1]:
import os
import requests
import sys
from pathlib import Path
from pprint import pprint
from typing import Optional

import pandas as pd


# Branch of the PETsARD repository to fetch demo assets from (default: main)
branch: str = "main"

# Demo subfolder of this notebook; None means the notebook sits in petsard/demo/
subfolder: Optional[str] = "best-practices"


# Check if running in Google Colab; if so, download utils.py from GitHub
is_colab: bool = "COLAB_GPU" in os.environ
if is_colab:
    utils_url = (
        f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    )
    # Bound the request so a stalled connection cannot hang the notebook forever
    response = requests.get(utils_url, timeout=30)

    if response.status_code == 200:
        # Write with an explicit encoding so the result does not depend on the
        # platform's default text encoding
        with open("utils.py", "w", encoding="utf-8") as f:
            f.write(response.text)

        Path("__init__.py").touch()
    else:
        raise RuntimeError(
            f"Failed to download utils.py. Status code: {response.status_code}"
        )


# Not on Colab: when the notebook lives in a subfolder, make the parent
# directory importable so that `from utils import ...` can find utils.py
else:
    if subfolder:
        parent_dir = os.path.dirname(os.getcwd())
        # Re-running this cell must not keep appending the same entry
        if parent_dir not in sys.path:
            sys.path.append(parent_dir)

In [None]:
# Bring in the demo helper functions from the utils module
from utils import get_yaml_path, setup_environment

# Prepare the runtime environment for this notebook, requesting the
# multi-table benchmark dataset that later cells read
setup_environment(
    is_colab, branch, benchmark_data=["best-practices_multi-table"], subfolder=subfolder
)

In [3]:
# Import PETsARD
from petsard import Executor

# Best practices: Multi-Timestamp data - Time Anchoring
最佳實踐：多時間點資料 - 時間定錨

In [4]:
# Load the multi-table benchmark CSV into a DataFrame
multitable = pd.read_csv("benchmark/best-practices_multi-table.csv")

# Pretty-print the full list of column names
print("Multi-table columns: ")
pprint(list(multitable.columns))

Multi-table columns: 
['company_id',
 'industry',
 'sub_industry',
 'city',
 'district',
 'established_date',
 'capital',
 'first_apply_application_id',
 'first_apply_loan_type',
 'first_apply_apply_date',
 'first_apply_approval_date',
 'first_apply_status',
 'first_apply_amount_requested',
 'first_apply_amount_approved',
 'latest_apply_application_id',
 'latest_apply_loan_type',
 'latest_apply_apply_date',
 'latest_apply_approval_date',
 'latest_apply_status',
 'latest_apply_amount_requested',
 'latest_apply_amount_approved',
 'latest_track_application_id',
 'latest_track_profit_ratio_avg_profit_ratio',
 'latest_track_profit_ratio_min_profit_ratio',
 'latest_track_profit_ratio_profit_ratio_std',
 'latest_track_profit_ratio_negative_profit_count',
 'latest_track_tracking_date_tracking_months',
 'latest_track_tracking_date_last_tracking_date',
 'latest_track_revenue_avg_revenue',
 'latest_track_revenue_revenue_growth',
 'latest_track_risk_level_last_risk',
 'latest_track_risk_level_seco

In [5]:
# Preview the first five rows, transposed so each column reads as a row
multitable.iloc[:5].T

Unnamed: 0,0,1,2,3,4
company_id,C000001,C000002,C000003,C000004,C000005
industry,營建工程,營建工程,製造業,營建工程,批發零售
sub_industry,環保工程,建築工程,金屬加工,環保工程,零售
city,新北市,臺北市,臺北市,桃園市,臺北市
district,板橋區,內湖區,內湖區,中壢區,內湖區
established_date,2019-11-03,2017-01-02,2012-05-29,2010-09-24,2010-07-24
capital,19899000,17359000,5452000,20497000,17379000
first_apply_application_id,A00000001,A00000006,A00000008,,A00000014
first_apply_loan_type,廠房擴充,數位轉型,廠房擴充,,疫後紓困
first_apply_apply_date,2022-01-21,2020-03-17,2016-05-08,,2014-01-04


# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

In [6]:
# File name of the YAML configuration used for this case
yaml_file_case: str = "multi-timestamp.yaml"

# Resolve the configuration path through the demo helper. The notebook-level
# `subfolder` setting is reused here instead of repeating the literal, so the
# environment setup and the YAML lookup always point at the same subfolder.
yaml_path_case: str = get_yaml_path(
    is_colab=is_colab,
    yaml_file=yaml_file_case,
    branch=branch,
    subfolder=subfolder,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/best-practices_multi-table.csv'
Preprocessor:
  time-anchoring:
    method: 'default'
    config:
      scaler:
        'established_date':
          # 以公司成立日為錨點，計算與申請、核准、追蹤等重要時間點的天數差異
          # Using company establishment date as anchor to calculate day differences
          #  with application, approval and tracking dates
          method: 'scaler_timeanchor'
          reference:
            - 'apply_date'
            - 'approval_date'
            - 'tracking_date_last_tracking_date'
          unit: 'D' # D 代表以天為單位 D represents measurement in days
Synthesizer:
  demo:
    method: 'default'
Postprocessor:
  demo:
    method: 'default'
Reporter:
  output:
    method: 'save_data'
    source: 'Postprocessor'
...


### Execution and Results
執行與結果

In [7]:
# Build an Executor from the YAML configuration path resolved earlier, then
# run the workflow that configuration describes
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Now is petsard_Loader[data]_Preprocessor[time-anchoring]_Synthesizer[demo]_Postprocessor[demo] save to csv...


In [8]:
# Key of the full experiment (through the Reporter step) in the result mapping
experiment_key = "Loader[data]_Preprocessor[time-anchoring]_Synthesizer[demo]_Postprocessor[demo]_Reporter[output]"
# Key of the saved dataset (through the Postprocessor step) inside that entry
dataset_key = "Loader[data]_Preprocessor[time-anchoring]_Synthesizer[demo]_Postprocessor[demo]"

# Preview the first five rows of the result, transposed for readability
exec_case.get_result()[experiment_key][dataset_key].head(5).T

Unnamed: 0,0,1,2,3,4
company_id,C000850,C000827,C000085,C000464,C000724
industry,批發零售,服務業,批發零售,服務業,營建工程
sub_industry,電子零組件,餐飲,進出口貿易,食品,機電工程
city,臺北市,臺北市,臺北市,桃園市,臺北市
district,龜山區,板橋區,三重區,內湖區,大安區
established_date,2018-06-15 00:00:00,2015-06-20 00:00:00,2013-04-01 00:00:00,2018-01-11 00:00:00,2018-11-09 00:00:00
capital,30608865.641198,24391511.078634,17253441.59153,31436983.257465,46467931.09333
first_apply_application_id,A00002110,A00002055,A00000217,A00001141,A00001815
first_apply_loan_type,數位轉型,創新研發,創新研發,數位轉型,疫後紓困
first_apply_apply_date,2012-04-09 00:00:00,2022-08-28 00:00:00,2024-03-28 00:00:00,2018-04-11 00:00:00,2017-03-08 00:00:00
