In [17]:
import os
import sys

if "google.colab" in sys.modules:
    # Running in Colab

    !git clone https://github.com/pthengtr/kcw-analytics.git
    !cd /content/kcw-analytics && git pull origin main

    from google.colab import drive
    drive.mount("/content/drive")

    BASE_FOLDER = "/content/drive/Shareddrives"
    BASE_FOLDER_GIT = "/content"
else:
    # Running in local Jupyter
    BASE_FOLDER = r"G:\Shared drives"
    BASE_FOLDER_GIT = r"C:\Users\Windows 11\Notebook"

print("Using folder:", BASE_FOLDER)

fatal: destination path 'kcw-analytics' already exists and is not an empty directory.
From https://github.com/pthengtr/kcw-analytics
 * branch            main       -> FETCH_HEAD
Already up to date.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using folder: /content/drive/Shareddrives


In [18]:
folder = f"{BASE_FOLDER}/KCW-Data/kcw_analytics/01_raw"

In [19]:
import os
import pandas as pd

data = {}

for file in os.listdir(folder):
    if file.endswith(".csv"):
        path = os.path.join(folder, file)
        data[file] = pd.read_csv(
            path,
            dtype={
              "BCODE": "string",
              "MOBILE": "string",
              "BILLNO": "string",
            },
            encoding="utf-8-sig",
            low_memory=False   # stops chunk guessing
        )
        print(f"Loaded: {file} -> {data[file].shape}")

Loaded: raw_inventory_hq_2024.csv -> (4983, 8)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2924, 49)
Loaded: raw_syp_pidet_purchase_lines.csv -> (27348, 41)
Loaded: raw_syp_simas_sales_bills.csv -> (12426, 49)
Loaded: raw_syp_sidet_sales_lines.csv -> (36682, 38)
Loaded: raw_hq_icmas_products.csv -> (114926, 94)
Loaded: raw_hq_pimas_purchase_bills.csv -> (50248, 49)
Loaded: raw_hq_pidet_purchase_lines.csv -> (153882, 41)
Loaded: raw_hq_sidet_sales_lines.csv -> (733249, 38)
Loaded: raw_hq_apmas_payable.csv -> (976, 20)
Loaded: raw_hq_pvmas_notes_vouchers.csv -> (13768, 32)
Loaded: raw_hq_armas_receivable.csv -> (2223, 20)
Loaded: raw_hq_simas_sales_bills.csv -> (276028, 49)


In [20]:
import pandas as pd
import numpy as np

def build_bill_summary_by_taxic(
    df: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    billdate_col: str = "BILLDATE",
    detail_col: str = "DETAIL",
    amount_col: str = "AMOUNT",
    taxic_col: str = "TAXIC",
    tax_rate: float = 0.07,
    tax_id_value: str = "0000000000000",
    taxic_yes: str = "Y",
):
    """
    VAT logic:
      TAXIC == Y : AMOUNT includes VAT
      TAXIC == N : AMOUNT excludes VAT
    """

    out = df.copy()
    out[billno_col] = out[billno_col].astype("string")
    out[amount_col] = pd.to_numeric(out[amount_col], errors="coerce").fillna(0)

    # normalize TAXIC
    if taxic_col not in out.columns:
        out[taxic_col] = ""

    out[taxic_col] = out[taxic_col].astype("string").str.strip().str.upper()

    # ===== pick DETAIL from highest AMOUNT row =====
    idx_max_amt = out.groupby(billno_col)[amount_col].idxmax()
    detail_pick = (
        out.loc[idx_max_amt, [billno_col, detail_col]]
        .set_index(billno_col)[detail_col]
    )

    # ===== group totals =====
    totals = (
        out.groupby(billno_col, as_index=False)
        .agg(
            TOTAL_AMOUNT=(amount_col, "sum"),
            BILLDATE=(billdate_col, "first"),
            TAXIC=(taxic_col, "first"),
        )
    )

    totals[detail_col] = totals[billno_col].map(detail_pick)

    # ===== VAT calculation based on TAXIC =====
    divisor = 1 + tax_rate
    is_vat_inclusive = totals["TAXIC"].eq(taxic_yes)

    totals["BEFORE_VAT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] / divisor).round(2),   # inclusive case
        totals["TOTAL_AMOUNT"].round(2)               # exclusive case
    )

    totals["VAT_AMOUNT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] - totals["BEFORE_VAT"]).round(2),
        (totals["TOTAL_AMOUNT"] * tax_rate).round(2)
    )

    # optional: total including VAT (very useful downstream)
    totals["TOTAL_INCL_VAT"] = (totals["BEFORE_VAT"] + totals["VAT_AMOUNT"]).round(2)

    # TAX ID
    totals["TAX_ID"] = str(tax_id_value).zfill(13)[:13]

    # SEQ
    totals = totals.sort_values(billno_col).reset_index(drop=True)
    totals["SEQ"] = np.arange(1, len(totals) + 1)

    return totals

import pandas as pd
import numpy as np

def map_simas_bill_fields(
    df: pd.DataFrame,
    df_simas: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    fields: tuple = ("DEDUCT", "TAX", "AFTERTAX"),
    copy: bool = True,
    verbose: bool = True,
):
    """
    Join DEDUCT, TAX, AFTERTAX from df_simas into df using BILLNO.
    """

    if billno_col not in df.columns:
        raise ValueError(f"{billno_col} not found in df")

    if billno_col not in df_simas.columns:
        raise ValueError(f"{billno_col} not found in df_simas")

    # --- normalize join keys (VERY important for legacy POS)
    left = df.copy()
    right = df_simas.copy()

    left["_JOIN_KEY"] = left[billno_col].astype("string").str.strip().str.upper()
    right["_JOIN_KEY"] = right[billno_col].astype("string").str.strip().str.upper()

    # --- build lookup table (avoid duplicate explosion)
    cols = ["_JOIN_KEY"] + [c for c in fields if c in right.columns]

    lookup = (
        right[cols]
        .drop_duplicates(subset=["_JOIN_KEY"], keep="first")
    )

    # --- merge
    result = left.merge(
        lookup,
        on="_JOIN_KEY",
        how="left"
    ).drop(columns=["_JOIN_KEY"])

    if copy:
        result = result.copy()

    if verbose:
        matched = result[fields[0]].notna().sum()
        print(f"[map_simas_bill_fields] matched rows: {matched:,}/{len(result):,}")

    return result

def filter_year_month(df, year, month, date_col="BILLDATE"):
    return df[pd.to_datetime(df[date_col]).dt.to_period("M") == f"{year}-{month:02d}"]

In [21]:
def map_armas_mobile(
    df: pd.DataFrame,
    df_armas: pd.DataFrame,
    *,
    acctno_col: str = "ACCTNO",
    mobile_col: str = "MOBILE",
    verbose: bool = True,
):
    if acctno_col not in df.columns:
        raise ValueError(f"{acctno_col} not found in df (did you map it from SIMAS?)")
    if acctno_col not in df_armas.columns:
        raise ValueError(f"{acctno_col} not found in df_armas")
    if mobile_col not in df_armas.columns:
        raise ValueError(f"{mobile_col} not found in df_armas")

    left = df.copy()
    right = df_armas[[acctno_col, mobile_col]].copy()

    # normalize keys
    left["_ACCTKEY"] = left[acctno_col].astype("string").str.strip().str.upper()
    right["_ACCTKEY"] = right[acctno_col].astype("string").str.strip().str.upper()

    lookup = right[["_ACCTKEY", mobile_col]].drop_duplicates("_ACCTKEY", keep="first")

    out = left.merge(lookup, on="_ACCTKEY", how="left").drop(columns=["_ACCTKEY"])

    if verbose:
        matched = out[mobile_col].notna().sum()
        print(f"[map_armas_mobile] matched MOBILE: {matched:,}/{len(out):,}")

    return out

In [22]:
import pandas as pd

# 1) Config: doc type prefixes per source
BILLTYPE_RULES = {
    "hq": {
        "TD":  ("TD",),
        "TAD": ("TAD",),
        "TR":  ("TR",),
        "CN":  ("CN",),
    },
    "syp": {
        "TD":  ("3TD",),
        "TAD": ("3TAD",),
        "TR":  ("3TR",),
        "CN":  ("3CN",),
    },
}

def build_monthly_doc_summaries(
    df_sidet: pd.DataFrame,
    df_simas: pd.DataFrame,
    df_armas: pd.DataFrame,
    *,
    source: str,     # "hq" or "syp"
    year: int,
    month: int,
    billno_col: str = "BILLNO",
    date_col: str = "BILLDATE",
    verbose: bool = True,
):
    """
    Clean pipeline:
      1) filter month
      2) remove TF bills
      3) split TD/TAD/TR/CN
      4) build summary (TAXIC logic)
      5) map SIMAS fields
    """

    # ---- 1) filter month
    df_m = filter_year_month(df_sidet, year, month)

    # normalize billno once
    s = df_m[billno_col].astype("string").str.strip().str.upper()

    # ---- 2) remove TF bills
    df_m = df_m.loc[~s.str.contains("TF", na=False)].copy()
    s = df_m[billno_col].astype("string").str.strip().str.upper()

    # ---- 3) prefix rules (simple, no config dict needed)
    if source == "hq":
        rules = {
            "TD":  ("TD",),
            "TAD": ("TAD",),
            "TR":  ("TR",),
            "CN":  ("CN",),
        }
    else:  # syp
        rules = {
            "TD":  ("3TD",),
            "TAD": ("3TAD",),
            "TR":  ("3TR",),
            "CN":  ("3CN",),
        }

    out = {}

    for doc_type, prefixes in rules.items():

        mask = s.str.startswith(prefixes, na=False)
        df_type = df_m.loc[mask].copy()

        if df_type.empty:
            out[doc_type] = df_type
            if verbose:
                print(f"[{source}] {doc_type}: 0 rows")
            continue

        # ---- 4) build summary (VAT logic by TAXIC)
        summ = build_bill_summary_by_taxic(
            df_type,
            billno_col=billno_col,
            billdate_col=date_col,
        )

        # ---- 5) map SIMAS fields
        summ = map_simas_bill_fields(
            summ,
            df_simas,
            billno_col=billno_col,
            fields=("ACCTNO", "ACCTNAME", "DEDUCT","BEFORETAX", "TAX", "AFTERTAX", "REMARKS", "PO"),
            verbose=verbose,
        )

        # ---- 6) map ARMAS MOBILE by ACCTNO
        summ = map_armas_mobile(
            summ,
            df_armas,
            acctno_col="ACCTNO",
            mobile_col="MOBILE",
            verbose=verbose,
        )

        out[doc_type] = summ

        if verbose:
            print(f"[{source}] {doc_type}: {len(df_type):,} rows -> {len(summ):,} bills")

    return out

In [23]:
# df_lines2 = map_simas_and_armas_fields(df_lines, df_simas, df_armas)

# summary = build_bill_summary_by_taxic(df_lines2)

In [24]:
df_hq_sidet = data["raw_hq_sidet_sales_lines.csv"].copy()
df_syp_sidet = data["raw_syp_sidet_sales_lines.csv"].copy()

df_hq_simas = data["raw_hq_simas_sales_bills.csv"].copy()
df_syp_simas = data["raw_syp_simas_sales_bills.csv"].copy()

df_armas = data["raw_hq_armas_receivable.csv"].copy()
df_armas = data["raw_hq_apmas_payable.csv"].copy()

In [25]:
df_hq_simas.columns


Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'JOURNO', 'JOURTIME',
       'DEPTNO', 'BOOKNO', 'BILLTYPE', 'BILLDATE', 'BILLTIME', 'BILLNO',
       'LINES', 'TAXIC', 'DISCOUNT', 'DEDUCT', 'BEFORETAX', 'VAT', 'TAX',
       'AFTERTAX', 'EXEMPT', 'SVCCHG', 'WITHHOLD', 'PAID', 'CASHED', 'CASHAMT',
       'CHKAMT', 'DUEAMT', 'PAYSTAT', 'ACCTNO', 'ACCTNAME', 'ADDR1', 'ADDR2',
       'PO', 'SALE', 'RE', 'TERM', 'DUEDATE', 'NOTEDATE', 'NOTENO',
       'VOUCDATE1', 'VOUCNO1', 'VOUCDATE2', 'VOUCNO2', 'POSTED1', 'POSTED2',
       'REMARKS', 'CANCELED', 'DONE'],
      dtype='object')

In [26]:
from datetime import datetime
import pytz

# Singapore timezone
tz = pytz.timezone("Asia/Bangkok")

today = datetime.now(tz)

YEAR = today.year
MONTH = today.month

print(YEAR, MONTH)

2026 2


In [27]:
hq_summaries = build_monthly_doc_summaries(
    df_hq_sidet, df_hq_simas, df_armas,
    source="hq", year=YEAR, month=MONTH
)

syp_summaries = build_monthly_doc_summaries(
    df_syp_sidet, df_syp_simas, df_armas,
    source="syp", year=YEAR, month=MONTH
)

# Access:
df_hq_td_summary  = hq_summaries["TD"]
df_hq_tad_summary = hq_summaries["TAD"]
df_hq_tr_summary  = hq_summaries["TR"]
df_hq_cn_summary  = hq_summaries["CN"]

df_syp_td_summary  = syp_summaries["TD"]
df_syp_tad_summary = syp_summaries["TAD"]
df_syp_tr_summary  = syp_summaries["TR"]
df_syp_cn_summary  = syp_summaries["CN"]

[map_simas_bill_fields] matched rows: 154/154
[map_armas_mobile] matched MOBILE: 9/154
[hq] TD: 333 rows -> 154 bills
[map_simas_bill_fields] matched rows: 600/600
[map_armas_mobile] matched MOBILE: 21/600
[hq] TAD: 657 rows -> 600 bills
[map_simas_bill_fields] matched rows: 33/33
[map_armas_mobile] matched MOBILE: 4/33
[hq] TR: 74 rows -> 33 bills
[map_simas_bill_fields] matched rows: 35/35
[map_armas_mobile] matched MOBILE: 1/35
[hq] CN: 38 rows -> 35 bills
[syp] TD: 0 rows
[syp] TAD: 0 rows
[map_simas_bill_fields] matched rows: 9/9
[map_armas_mobile] matched MOBILE: 0/9
[syp] TR: 37 rows -> 9 bills
[syp] CN: 0 rows


In [28]:
df_hq_tr_summary["REMARKS"] = (
    df_hq_tr_summary["REMARKS"]
    .astype("string")
    .str.extract(r"##(.{13})", expand=False)
)

df_syp_tr_summary["REMARKS"] = (
    df_syp_tr_summary["REMARKS"]
    .astype("string")
    .str.extract(r"##(.{13})", expand=False)
)

In [29]:
def clean_and_select(df):
    keep_cols = [
        "BILLNO", "BILLDATE", "ACCTNO", "ACCTNAME",
        "REMARKS", "MOBILE", "BEFORETAX", "TAX", "AFTERTAX", "DETAIL", "PO"
    ]

    # Return if no rows
    if df.empty:
        return df

    # Return if required columns missing
    if not set(keep_cols).issubset(df.columns):
        return df

    # Clean REMARKS column
    df["REMARKS"] = df["REMARKS"].astype(str).str.replace(r"^##", "", regex=True)

    df["REMARKS"] = '="' + df["REMARKS"].astype(str) + '"'
    df["MOBILE"] = '="' + df["MOBILE"].astype(str) + '"'

    # Keep only selected columns
    return df[keep_cols]

In [30]:
import os

kcwdir = os.path.join(BASE_FOLDER, "KCW-Data")
print(kcwdir)

/content/drive/Shareddrives/KCW-Data


In [31]:
import os
from pathlib import Path

out_dir = os.path.join(
    kcwdir,
    "kcw_analytics",
    "02_staging",
    "VAT_Sales",
    f"{YEAR}_{MONTH:02d}"
)
os.makedirs(out_dir, exist_ok=True)

exports = [
    (clean_and_select(df_hq_td_summary),  "TD"),
    (clean_and_select(df_hq_tad_summary), "TAD"),
    (clean_and_select(df_hq_tr_summary),  "TR"),
    (clean_and_select(df_hq_cn_summary),  "CN"),
    (clean_and_select(df_syp_td_summary),  "3TD"),
    (clean_and_select(df_syp_tad_summary), "3TAD"),
    (clean_and_select(df_syp_tr_summary),  "3TR"),
    (clean_and_select(df_syp_cn_summary),  "3CN"),
]

for df, name in exports:
    path = os.path.join(out_dir, f"{name}.csv")
    df.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"Saved -> {path}")

Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_02/TD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_02/TAD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_02/TR.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_02/CN.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_02/3TD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_02/3TAD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_02/3TR.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_02/3CN.csv


In [32]:
df_pidet = data["raw_hq_pidet_purchase_lines.csv"].copy()
df_pimas = data["raw_hq_pimas_purchase_bills.csv"].copy()

df_apmas = data["raw_hq_apmas_payable.csv"].copy()

In [33]:
def filter_year_month(df, year, month, date_col="BILLDATE"):
    d = df.copy()
    d[date_col] = pd.to_datetime(d[date_col], errors="coerce")
    return d[d[date_col].dt.to_period("M") == f"{year}-{month:02d}"]

In [34]:
df_pimas_m = filter_year_month(df_pimas, YEAR, MONTH)
df_pidet_m = filter_year_month(df_pidet, YEAR, MONTH)

In [35]:
# --- copy first (safe practice)
pidet_join = df_pidet_m.copy()
pimas_join = df_pimas_m.copy()

# --- normalize BILLNO (VERY important for old POS)
pidet_join["_JOIN_KEY"] = pidet_join["BILLNO"].astype("string").str.strip().str.upper()
pimas_join["_JOIN_KEY"] = pimas_join["BILLNO"].astype("string").str.strip().str.upper()

# --- build lookup from pimas (avoid duplicate explosion)
lookup = (
    pimas_join[["_JOIN_KEY", "ACCTNO" ,"ACCTNAME","BOOKNO", "BEFORETAX","TAX", "AFTERTAX", "REMARKS", "PO"]]
    .drop_duplicates(subset=["_JOIN_KEY"], keep="first")
)

# --- LEFT JOIN (pidet is base)
pidet_join = pidet_join.merge(
    lookup,
    on="_JOIN_KEY",
    how="left"
).drop(columns=["_JOIN_KEY"])

print("rows after join:", len(pidet_join))
pidet_join.head()

rows after join: 2696


Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,CANCELED,DONE,ACCTNO_y,ACCTNAME,BOOKNO,BEFORETAX,TAX,AFTERTAX,REMARKS,PO
0,287444,2,PJ,2026-02-01,1,2026-02-01,152.0,10.0,1.0,12051971,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,9600.0,0.0,9600.0,19/1/69##0103532025557,6901-358
1,287445,2,PJ,2026-02-01,1,2026-02-01,152.0,20.0,2.0,12051621,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,9600.0,0.0,9600.0,19/1/69##0103532025557,6901-358
2,287635,2,PJ,2026-02-01,1,2026-02-01,169.0,10.0,1.0,12051545,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,2450.0,0.0,2450.0,21/1/69##0103532025557,6901-390
3,287636,2,PJ,2026-02-01,1,2026-02-01,169.0,20.0,2.0,30051472,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,2450.0,0.0,2450.0,21/1/69##0103532025557,6901-390
4,287793,2,PJ,2026-02-01,1,2026-02-01,193.0,10.0,1.0,18012060,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,12200.0,0.0,12200.0,22/1/69##0103532025557,69001-430


In [36]:
pidet_join = pidet_join[pidet_join["ISVAT"] == "Y"]

In [37]:
pidet_join["REMARKS_EXTRACT"] = pidet_join["REMARKS"].astype(str).str[-13:]

In [38]:
pidet_join = pidet_join.merge(df_apmas[["ACCTNO", "MOBILE"]],
              left_on="ACCTNO_x",
              right_on="ACCTNO",
              how="left")

In [39]:
pidet_join.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'BILLTYPE', 'BILLDATE',
       'BILLNO', 'LINE', 'ITEMNO', 'BCODE', 'PCODE', 'MCODE', 'DETAIL',
       'WHNUMBER', 'LOCATION1', 'STATUS', 'SERIAL', 'TAXIC', 'EXMPT', 'ISVAT',
       'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'CHGAMT', 'ACCTNO_x', 'PAID',
       'SALEDATE', 'SALENO', 'SALEPRICE', 'ACCT_NO', 'CANCELED', 'DONE',
       'ACCTNO_y', 'ACCTNAME', 'BOOKNO', 'BEFORETAX', 'TAX', 'AFTERTAX',
       'REMARKS', 'PO', 'REMARKS_EXTRACT', 'ACCTNO', 'MOBILE'],
      dtype='object')

In [40]:
import pandas as pd
import numpy as np

def build_purchase_bill_summary_by_taxic(
    df: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    billdate_col: str = "BILLDATE",
    detail_col: str = "DETAIL",
    amount_col: str = "AMOUNT",
    taxic_col: str = "TAXIC",
    bookno_col: str = "BOOKNO",
    beforetax_col: str = "BEFORETAX",
    tax_col: str = "TAX",
    aftertax_col: str = "AFTERTAX",
    tax_rate: float = 0.07,
    tax_id_value: str = "0000000000000",
    taxic_yes: str = "Y",
):
    """
    Purchase summary:
      - AMOUNT summed from pidet lines
      - KEEP BOOKNO, TAX, AFTERTAX from input (pimas)
      - VAT split based on TAXIC
    """

    out = df.copy()

    out[billno_col] = out[billno_col].astype("string").str.strip().str.upper()
    out[amount_col] = pd.to_numeric(out[amount_col], errors="coerce").fillna(0)

    if taxic_col not in out.columns:
        out[taxic_col] = ""

    out[taxic_col] = out[taxic_col].astype("string").str.strip().str.upper()

    # ===== pick DETAIL from highest AMOUNT row =====
    idx_max_amt = out.groupby(billno_col)[amount_col].idxmax()
    detail_pick = (
        out.loc[idx_max_amt, [billno_col, detail_col]]
        .set_index(billno_col)[detail_col]
    )

    # ===== group totals (KEEP BOOKNO / TAX / AFTERTAX) =====
    totals = (
        out.groupby(billno_col, as_index=False)
        .agg(
            TOTAL_AMOUNT=(amount_col, "sum"),
            BILLDATE=(billdate_col, "first"),
            TAXIC=(taxic_col, "first"),
            BOOKNO=(bookno_col, "first"),
            BEFORETAX=(beforetax_col, "first"),
            TAX=(tax_col, "first"),
            AFTERTAX=(aftertax_col, "first"),
            ACCTNO=("ACCTNO", "first"),
            ACCTNAME=("ACCTNAME", "first"),
            REMARKS=("REMARKS", "first"),
            REMARKS_EXTRACT=("REMARKS_EXTRACT","first"),
            MOBILE=("MOBILE","first"),
            PO = ("PO", "first"),
        )
    )

    totals[detail_col] = totals[billno_col].map(detail_pick)

    # ===== VAT calculation based on TAXIC =====
    divisor = 1 + tax_rate
    is_vat_inclusive = totals["TAXIC"].eq(taxic_yes)

    totals["BEFORE_VAT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] / divisor).round(2),
        totals["TOTAL_AMOUNT"].round(2)
    )

    totals["VAT_AMOUNT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] - totals["BEFORE_VAT"]).round(2),
        (totals["TOTAL_AMOUNT"] * tax_rate).round(2)
    )

    totals["TOTAL_INCL_VAT"] = (totals["BEFORE_VAT"] + totals["VAT_AMOUNT"]).round(2)

    totals["TAX_ID"] = str(tax_id_value).zfill(13)[:13]

    totals = totals.sort_values(billno_col).reset_index(drop=True)
    totals["SEQ"] = np.arange(1, len(totals) + 1)

    return totals

In [41]:
purchase_summary = build_purchase_bill_summary_by_taxic(pidet_join)

In [42]:
def clean_and_select_purchase(df):
    keep_cols = [
        "BILLNO", "BOOKNO", "BILLDATE", "ACCTNO", "ACCTNAME",
        "REMARKS", "REMARKS_EXTRACT", "MOBILE", "BEFORETAX", "TAX", "AFTERTAX", "DETAIL", "PO"
    ]

    # Return if no rows
    if df.empty:
        return df

    # Return if required columns missing
    if not set(keep_cols).issubset(df.columns):
        return df

    # Clean REMARKS column
    df["REMARKS"] = df["REMARKS"].astype(str).str.replace(r"^##", "", regex=True)

    df["REMARKS"] = '="' + df["REMARKS"].astype(str) + '"'
    df["REMARKS_EXTRACT"] = '="' + df["REMARKS_EXTRACT"].astype(str) + '"'
    df["MOBILE"] = '="' + df["MOBILE"].astype(str) + '"'

    # Keep only selected columns
    return df[keep_cols]

In [43]:
purchase_summary = clean_and_select_purchase(purchase_summary)

In [44]:
purchase_summary

Unnamed: 0,BILLNO,BOOKNO,BILLDATE,ACCTNO,ACCTNAME,REMARKS,REMARKS_EXTRACT,MOBILE,BEFORETAX,TAX,AFTERTAX,DETAIL,PO
0,-1003861-2026,1.0,2026-02-23,7PACO,บมจ. เพรสซิเด้นท์ ออโตโมบิล อินดัสทรีส์ (สาขา0...,"=""24/2/69##0107563000231""","=""0107563000231""","=""0107563000231""",14336.45,1003.55,15340.00,น้ำยาแอร์ R134a 134A ถังเล็ก 3K PACO,6902-421
1,-202602/00074,1.0,2026-02-02,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""4/2/69##0275555000306""","=""0275555000306""","=""0275555000306""",5800.00,406.00,6206.00,"แขนลากผานคูโบต้ารุ่นดาม 35"" M5000-M6040 (RH ...",6902-010
2,-202602/00078,1.0,2026-02-02,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""4/2/69##0275555000306""","=""0275555000306""","=""0275555000306""",3400.00,238.00,3638.00,เฟืองบายศรี+เดือยหมูเพลาหน้า M5000-6040 8Tx1 ...,6902-010
3,-202602/00082,1.0,2026-02-02,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""4/2/69##0275555000306""","=""0275555000306""","=""0275555000306""",5600.00,392.00,5992.00,กระบอกช่วยยก กระบอกใหญ่(ฝาเกลียว) M8540-M9540...,
4,-202602/00301,1.0,2026-02-04,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""6/2/69##0275555000306""","=""0275555000306""","=""0275555000306""",13425.00,939.75,14364.75,แขนกลาง จัมโบ้ 6640 F/5000-6600 ย.1 DT,6902-069
...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,VV0025447,1.0,2026-02-21,7333,บริษัท พี.เจ.บี.เอ็นเตอร์ไพรส์ จำกัด (สำนักงาน...,"=""23/2/69##0105537035315""","=""0105537035315""","=""0105537035315""",1280.00,89.60,1369.60,"ลูกหมากปีกนกล่าง T/T VIGO,REVO2W 333",
490,VVA6900338,1.0,2026-02-05,7HOD,บริษัท เอชโอดี อิมปอร์ต จำกัด (สำนักงานใหญ่),"=""6/2/69##0125547001162""","=""0125547001162""","=""0125547001162""",4107.01,287.49,4394.50,แหวนลูกสูบ NPR 6D16-6D17,
491,WC-690200828,2.0,2026-02-12,7WFC,บริษัท เวิลด์ ฟิลเตอร์ จำกัด (สำนักงานใหญ่),"=""13/2/69##0125539007769""","=""0125539007769""","=""0125539007769""",270.00,18.90,288.90,หม้อกรองเครื่อง BOBCAT K-FLO,
492,WC-690201513,2.0,2026-02-24,7WFC,บริษัท เวิลด์ ฟิลเตอร์ จำกัด (สำนักงานใหญ่),"=""25/2/69##0125539007769""","=""0125539007769""","=""0125539007769""",2270.00,158.90,2428.90,กรองโซล่า (ลูกคู่) M/F 4708 (รถนอก K-FLO,6902-481


In [45]:
import os

# your existing folder
out_dir = os.path.join(
    kcwdir,
    "kcw_analytics",
    "02_staging",
    "VAT_Purchases",
    f"{YEAR}_{MONTH:02d}"
)
os.makedirs(out_dir, exist_ok=True)

# --- export loop
for bookno, g in purchase_summary.groupby("BOOKNO", dropna=False):

    # safe filename
    bookno_str = "UNKNOWN" if pd.isna(bookno) else str(bookno).strip()
    safe_bookno = "".join(c if c.isalnum() or c in ("-","_") else "_" for c in bookno_str)

    file_path =  os.path.join(out_dir, f"VAT_PURCHASE_{YEAR}_{MONTH:02d}_BOOK_{safe_bookno}.csv")

    g.to_csv(file_path, index=False, encoding="utf-8-sig")

    print("saved:", file_path)

saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_02/VAT_PURCHASE_2026_02_BOOK_1_0.csv
saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_02/VAT_PURCHASE_2026_02_BOOK_2_0.csv
saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_02/VAT_PURCHASE_2026_02_BOOK_5_0.csv
saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_02/VAT_PURCHASE_2026_02_BOOK_6_0.csv
saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_02/VAT_PURCHASE_2026_02_BOOK_UNKNOWN.csv
