In [1]:
import os
import sys

if "google.colab" in sys.modules:
    # Running in Colab

    !git clone https://github.com/pthengtr/kcw-analytics.git
    !cd /content/kcw-analytics && git pull origin main

    from google.colab import drive
    drive.mount("/content/drive")

    BASE_FOLDER = "/content/drive/Shareddrives"
    BASE_FOLDER_GIT = "/content"
else:
    # Running in local Jupyter
    BASE_FOLDER = r"G:\Shared drives"
    BASE_FOLDER_GIT = r"C:\Users\Windows 11\Notebook"

print("Using folder:", BASE_FOLDER)

Cloning into 'kcw-analytics'...
remote: Enumerating objects: 455, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 455 (delta 9), reused 4 (delta 3), pack-reused 438 (from 1)[K
Receiving objects: 100% (455/455), 380.01 KiB | 2.09 MiB/s, done.
Resolving deltas: 100% (293/293), done.
From https://github.com/pthengtr/kcw-analytics
 * branch            main       -> FETCH_HEAD
Already up to date.
Mounted at /content/drive
Using folder: /content/drive/Shareddrives


In [2]:
folder = f"{BASE_FOLDER}/KCW-Data/kcw_analytics/01_raw"

In [3]:
import os
import pandas as pd

data = {}

for file in os.listdir(folder):
    if file.endswith(".csv"):
        path = os.path.join(folder, file)
        data[file] = pd.read_csv(
            path,
            dtype={
              "BCODE": "string",
              "MOBILE": "string",
              "BILLNO": "string",
            },
            encoding="utf-8-sig",
            low_memory=False   # stops chunk guessing
        )
        print(f"Loaded: {file} -> {data[file].shape}")

Loaded: raw_inventory_hq_2024.csv -> (4983, 8)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2924, 49)
Loaded: raw_syp_pidet_purchase_lines.csv -> (27348, 41)
Loaded: raw_syp_simas_sales_bills.csv -> (12426, 49)
Loaded: raw_syp_sidet_sales_lines.csv -> (36682, 38)
Loaded: raw_hq_icmas_products.csv -> (114925, 94)
Loaded: raw_hq_pidet_purchase_lines.csv -> (153771, 41)
Loaded: raw_hq_pimas_purchase_bills.csv -> (50217, 49)
Loaded: raw_hq_sidet_sales_lines.csv -> (733138, 38)
Loaded: raw_hq_armas_receivable.csv -> (2223, 20)
Loaded: raw_hq_apmas_payable.csv -> (976, 20)
Loaded: raw_hq_pvmas_notes_vouchers.csv -> (13751, 32)
Loaded: raw_hq_simas_sales_bills.csv -> (275961, 49)


In [4]:
import pandas as pd
import numpy as np

def build_bill_summary_by_taxic(
    df: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    billdate_col: str = "BILLDATE",
    detail_col: str = "DETAIL",
    amount_col: str = "AMOUNT",
    taxic_col: str = "TAXIC",
    tax_rate: float = 0.07,
    tax_id_value: str = "0000000000000",
    taxic_yes: str = "Y",
):
    """
    VAT logic:
      TAXIC == Y : AMOUNT includes VAT
      TAXIC == N : AMOUNT excludes VAT
    """

    out = df.copy()
    out[billno_col] = out[billno_col].astype("string")
    out[amount_col] = pd.to_numeric(out[amount_col], errors="coerce").fillna(0)

    # normalize TAXIC
    if taxic_col not in out.columns:
        out[taxic_col] = ""

    out[taxic_col] = out[taxic_col].astype("string").str.strip().str.upper()

    # ===== pick DETAIL from highest AMOUNT row =====
    idx_max_amt = out.groupby(billno_col)[amount_col].idxmax()
    detail_pick = (
        out.loc[idx_max_amt, [billno_col, detail_col]]
        .set_index(billno_col)[detail_col]
    )

    # ===== group totals =====
    totals = (
        out.groupby(billno_col, as_index=False)
        .agg(
            TOTAL_AMOUNT=(amount_col, "sum"),
            BILLDATE=(billdate_col, "first"),
            TAXIC=(taxic_col, "first"),
        )
    )

    totals[detail_col] = totals[billno_col].map(detail_pick)

    # ===== VAT calculation based on TAXIC =====
    divisor = 1 + tax_rate
    is_vat_inclusive = totals["TAXIC"].eq(taxic_yes)

    totals["BEFORE_VAT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] / divisor).round(2),   # inclusive case
        totals["TOTAL_AMOUNT"].round(2)               # exclusive case
    )

    totals["VAT_AMOUNT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] - totals["BEFORE_VAT"]).round(2),
        (totals["TOTAL_AMOUNT"] * tax_rate).round(2)
    )

    # optional: total including VAT (very useful downstream)
    totals["TOTAL_INCL_VAT"] = (totals["BEFORE_VAT"] + totals["VAT_AMOUNT"]).round(2)

    # TAX ID
    totals["TAX_ID"] = str(tax_id_value).zfill(13)[:13]

    # SEQ
    totals = totals.sort_values(billno_col).reset_index(drop=True)
    totals["SEQ"] = np.arange(1, len(totals) + 1)

    return totals

import pandas as pd
import numpy as np

def map_simas_bill_fields(
    df: pd.DataFrame,
    df_simas: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    fields: tuple = ("DEDUCT", "TAX", "AFTERTAX"),
    copy: bool = True,
    verbose: bool = True,
):
    """
    Join DEDUCT, TAX, AFTERTAX from df_simas into df using BILLNO.
    """

    if billno_col not in df.columns:
        raise ValueError(f"{billno_col} not found in df")

    if billno_col not in df_simas.columns:
        raise ValueError(f"{billno_col} not found in df_simas")

    # --- normalize join keys (VERY important for legacy POS)
    left = df.copy()
    right = df_simas.copy()

    left["_JOIN_KEY"] = left[billno_col].astype("string").str.strip().str.upper()
    right["_JOIN_KEY"] = right[billno_col].astype("string").str.strip().str.upper()

    # --- build lookup table (avoid duplicate explosion)
    cols = ["_JOIN_KEY"] + [c for c in fields if c in right.columns]

    lookup = (
        right[cols]
        .drop_duplicates(subset=["_JOIN_KEY"], keep="first")
    )

    # --- merge
    result = left.merge(
        lookup,
        on="_JOIN_KEY",
        how="left"
    ).drop(columns=["_JOIN_KEY"])

    if copy:
        result = result.copy()

    if verbose:
        matched = result[fields[0]].notna().sum()
        print(f"[map_simas_bill_fields] matched rows: {matched:,}/{len(result):,}")

    return result

def filter_year_month(df, year, month, date_col="BILLDATE"):
    return df[pd.to_datetime(df[date_col]).dt.to_period("M") == f"{year}-{month:02d}"]

In [5]:
def map_armas_mobile(
    df: pd.DataFrame,
    df_armas: pd.DataFrame,
    *,
    acctno_col: str = "ACCTNO",
    mobile_col: str = "MOBILE",
    verbose: bool = True,
):
    if acctno_col not in df.columns:
        raise ValueError(f"{acctno_col} not found in df (did you map it from SIMAS?)")
    if acctno_col not in df_armas.columns:
        raise ValueError(f"{acctno_col} not found in df_armas")
    if mobile_col not in df_armas.columns:
        raise ValueError(f"{mobile_col} not found in df_armas")

    left = df.copy()
    right = df_armas[[acctno_col, mobile_col]].copy()

    # normalize keys
    left["_ACCTKEY"] = left[acctno_col].astype("string").str.strip().str.upper()
    right["_ACCTKEY"] = right[acctno_col].astype("string").str.strip().str.upper()

    lookup = right[["_ACCTKEY", mobile_col]].drop_duplicates("_ACCTKEY", keep="first")

    out = left.merge(lookup, on="_ACCTKEY", how="left").drop(columns=["_ACCTKEY"])

    if verbose:
        matched = out[mobile_col].notna().sum()
        print(f"[map_armas_mobile] matched MOBILE: {matched:,}/{len(out):,}")

    return out

In [6]:
import pandas as pd

# 1) Config: doc type prefixes per source
BILLTYPE_RULES = {
    "hq": {
        "TD":  ("TD",),
        "TAD": ("TAD",),
        "TR":  ("TR",),
        "CN":  ("CN",),
    },
    "syp": {
        "TD":  ("3TD",),
        "TAD": ("3TAD",),
        "TR":  ("3TR",),
        "CN":  ("3CN",),
    },
}

def build_monthly_doc_summaries(
    df_sidet: pd.DataFrame,
    df_simas: pd.DataFrame,
    df_armas: pd.DataFrame,
    *,
    source: str,     # "hq" or "syp"
    year: int,
    month: int,
    billno_col: str = "BILLNO",
    date_col: str = "BILLDATE",
    verbose: bool = True,
):
    """
    Clean pipeline:
      1) filter month
      2) remove TF bills
      3) split TD/TAD/TR/CN
      4) build summary (TAXIC logic)
      5) map SIMAS fields
    """

    # ---- 1) filter month
    df_m = filter_year_month(df_sidet, year, month)

    # normalize billno once
    s = df_m[billno_col].astype("string").str.strip().str.upper()

    # ---- 2) remove TF bills
    df_m = df_m.loc[~s.str.contains("TF", na=False)].copy()
    s = df_m[billno_col].astype("string").str.strip().str.upper()

    # ---- 3) prefix rules (simple, no config dict needed)
    if source == "hq":
        rules = {
            "TD":  ("TD",),
            "TAD": ("TAD",),
            "TR":  ("TR",),
            "CN":  ("CN",),
        }
    else:  # syp
        rules = {
            "TD":  ("3TD",),
            "TAD": ("3TAD",),
            "TR":  ("3TR",),
            "CN":  ("3CN",),
        }

    out = {}

    for doc_type, prefixes in rules.items():

        mask = s.str.startswith(prefixes, na=False)
        df_type = df_m.loc[mask].copy()

        if df_type.empty:
            out[doc_type] = df_type
            if verbose:
                print(f"[{source}] {doc_type}: 0 rows")
            continue

        # ---- 4) build summary (VAT logic by TAXIC)
        summ = build_bill_summary_by_taxic(
            df_type,
            billno_col=billno_col,
            billdate_col=date_col,
        )

        # ---- 5) map SIMAS fields
        summ = map_simas_bill_fields(
            summ,
            df_simas,
            billno_col=billno_col,
            fields=("ACCTNO", "ACCTNAME", "DEDUCT","BEFORETAX", "TAX", "AFTERTAX", "REMARKS"),
            verbose=verbose,
        )

        # ---- 6) map ARMAS MOBILE by ACCTNO
        summ = map_armas_mobile(
            summ,
            df_armas,
            acctno_col="ACCTNO",
            mobile_col="MOBILE",
            verbose=verbose,
        )

        out[doc_type] = summ

        if verbose:
            print(f"[{source}] {doc_type}: {len(df_type):,} rows -> {len(summ):,} bills")

    return out

In [7]:
# df_lines2 = map_simas_and_armas_fields(df_lines, df_simas, df_armas)

# summary = build_bill_summary_by_taxic(df_lines2)

In [8]:
df_hq_sidet = data["raw_hq_sidet_sales_lines.csv"].copy()
df_syp_sidet = data["raw_syp_sidet_sales_lines.csv"].copy()

df_hq_simas = data["raw_hq_simas_sales_bills.csv"].copy()
df_syp_simas = data["raw_syp_simas_sales_bills.csv"].copy()

df_armas = data["raw_hq_armas_receivable.csv"].copy()
df_armas = data["raw_hq_apmas_payable.csv"].copy()

In [9]:
df_hq_simas.columns


Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'JOURNO', 'JOURTIME',
       'DEPTNO', 'BOOKNO', 'BILLTYPE', 'BILLDATE', 'BILLTIME', 'BILLNO',
       'LINES', 'TAXIC', 'DISCOUNT', 'DEDUCT', 'BEFORETAX', 'VAT', 'TAX',
       'AFTERTAX', 'EXEMPT', 'SVCCHG', 'WITHHOLD', 'PAID', 'CASHED', 'CASHAMT',
       'CHKAMT', 'DUEAMT', 'PAYSTAT', 'ACCTNO', 'ACCTNAME', 'ADDR1', 'ADDR2',
       'PO', 'SALE', 'RE', 'TERM', 'DUEDATE', 'NOTEDATE', 'NOTENO',
       'VOUCDATE1', 'VOUCNO1', 'VOUCDATE2', 'VOUCNO2', 'POSTED1', 'POSTED2',
       'REMARKS', 'CANCELED', 'DONE'],
      dtype='object')

In [10]:
YEAR = 2026
MONTH = 1

In [11]:
hq_summaries = build_monthly_doc_summaries(
    df_hq_sidet, df_hq_simas, df_armas,
    source="hq", year=YEAR, month=MONTH
)

syp_summaries = build_monthly_doc_summaries(
    df_syp_sidet, df_syp_simas, df_armas,
    source="syp", year=YEAR, month=MONTH
)

# Access:
df_hq_td_summary  = hq_summaries["TD"]
df_hq_tad_summary = hq_summaries["TAD"]
df_hq_tr_summary  = hq_summaries["TR"]
df_hq_cn_summary  = hq_summaries["CN"]

df_syp_td_summary  = syp_summaries["TD"]
df_syp_tad_summary = syp_summaries["TAD"]
df_syp_tr_summary  = syp_summaries["TR"]
df_syp_cn_summary  = syp_summaries["CN"]

[map_simas_bill_fields] matched rows: 137/137
[map_armas_mobile] matched MOBILE: 14/137
[hq] TD: 257 rows -> 137 bills
[map_simas_bill_fields] matched rows: 648/648
[map_armas_mobile] matched MOBILE: 24/648
[hq] TAD: 757 rows -> 648 bills
[map_simas_bill_fields] matched rows: 40/40
[map_armas_mobile] matched MOBILE: 5/40
[hq] TR: 93 rows -> 40 bills
[map_simas_bill_fields] matched rows: 28/28
[map_armas_mobile] matched MOBILE: 0/28
[hq] CN: 33 rows -> 28 bills
[syp] TD: 0 rows
[syp] TAD: 0 rows
[map_simas_bill_fields] matched rows: 18/18
[map_armas_mobile] matched MOBILE: 0/18
[syp] TR: 60 rows -> 18 bills
[syp] CN: 0 rows


In [12]:
def clean_and_select(df):
    keep_cols = [
        "BILLNO", "BILLDATE", "ACCTNO", "ACCTNAME",
        "REMARKS", "MOBILE", "BEFORETAX", "TAX", "AFTERTAX", "DETAIL"
    ]

    # Return if no rows
    if df.empty:
        return df

    # Return if required columns missing
    if not set(keep_cols).issubset(df.columns):
        return df

    # Clean REMARKS column
    df["REMARKS"] = df["REMARKS"].astype(str).str.replace(r"^##", "", regex=True)

    df["REMARKS"] = '="' + df["REMARKS"].astype(str) + '"'
    df["MOBILE"] = '="' + df["MOBILE"].astype(str) + '"'

    # Keep only selected columns
    return df[keep_cols]

In [13]:
import os

kcwdir = os.path.join(BASE_FOLDER, "KCW-Data")
print(kcwdir)

/content/drive/Shareddrives/KCW-Data


In [14]:
import os
from pathlib import Path

out_dir = os.path.join(
    kcwdir,
    "kcw_analytics",
    "02_staging",
    "VAT_Sales",
    f"{YEAR}_{MONTH:02d}"
)
os.makedirs(out_dir, exist_ok=True)

exports = [
    (clean_and_select(df_hq_td_summary),  "TD"),
    (clean_and_select(df_hq_tad_summary), "TAD"),
    (clean_and_select(df_hq_tr_summary),  "TR"),
    (clean_and_select(df_hq_cn_summary),  "CN"),
    (clean_and_select(df_syp_td_summary),  "3TD"),
    (clean_and_select(df_syp_tad_summary), "3TAD"),
    (clean_and_select(df_syp_tr_summary),  "3TR"),
    (clean_and_select(df_syp_cn_summary),  "3CN"),
]

for df, name in exports:
    path = os.path.join(out_dir, f"{name}.csv")
    df.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"Saved -> {path}")

Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_01/TD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_01/TAD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_01/TR.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_01/CN.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_01/3TD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_01/3TAD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_01/3TR.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Sales/2026_01/3CN.csv


In [15]:
df_pidet = data["raw_hq_pidet_purchase_lines.csv"].copy()
df_pimas = data["raw_hq_pimas_purchase_bills.csv"].copy()

df_apmas = data["raw_hq_apmas_payable.csv"].copy()

In [16]:
def filter_year_month(df, year, month, date_col="BILLDATE"):
    d = df.copy()
    d[date_col] = pd.to_datetime(d[date_col], errors="coerce")
    return d[d[date_col].dt.to_period("M") == f"{year}-{month:02d}"]

In [17]:
df_pimas_m = filter_year_month(df_pimas, YEAR, MONTH)
df_pidet_m = filter_year_month(df_pidet, YEAR, MONTH)

In [18]:
# --- copy first (safe practice)
pidet_join = df_pidet_m.copy()
pimas_join = df_pimas_m.copy()

# --- normalize BILLNO (VERY important for old POS)
pidet_join["_JOIN_KEY"] = pidet_join["BILLNO"].astype("string").str.strip().str.upper()
pimas_join["_JOIN_KEY"] = pimas_join["BILLNO"].astype("string").str.strip().str.upper()

# --- build lookup from pimas (avoid duplicate explosion)
lookup = (
    pimas_join[["_JOIN_KEY", "ACCTNO" ,"ACCTNAME","BOOKNO", "BEFORETAX","TAX", "AFTERTAX", "REMARKS"]]
    .drop_duplicates(subset=["_JOIN_KEY"], keep="first")
)

# --- LEFT JOIN (pidet is base)
pidet_join = pidet_join.merge(
    lookup,
    on="_JOIN_KEY",
    how="left"
).drop(columns=["_JOIN_KEY"])

print("rows after join:", len(pidet_join))
pidet_join.head()

rows after join: 3030


Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,ACCT_NO,CANCELED,DONE,ACCTNO_y,ACCTNAME,BOOKNO,BEFORETAX,TAX,AFTERTAX,REMARKS
0,274656,2,PJ,2026-01-01,1,2026-01-01,214.0,10.0,1.0,30051174,...,,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,2820.0,0.0,2820.0,19/12/68##0103532025557
1,274657,2,PJ,2026-01-01,1,2026-01-01,214.0,20.0,2.0,12051178,...,,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,2820.0,0.0,2820.0,19/12/68##0103532025557
2,274658,2,PJ,2026-01-01,1,2026-01-01,214.0,30.0,3.0,12051649,...,,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,2820.0,0.0,2820.0,19/12/68##0103532025557
3,274659,2,PJ,2026-01-01,1,2026-01-01,214.0,40.0,4.0,12052736,...,,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,2820.0,0.0,2820.0,19/12/68##0103532025557
4,274802,2,PJ,2026-01-01,1,2026-01-01,175.0,10.0,1.0,12053152,...,,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,11800.0,0.0,11800.0,20/12/68##0103532025557


In [19]:
pidet_join = pidet_join[pidet_join["ISVAT"] == "Y"]

In [20]:
pidet_join["REMARKS_EXTRACT"] = pidet_join["REMARKS"].astype(str).str[-13:]

In [21]:
pidet_join = pidet_join.merge(df_apmas[["ACCTNO", "MOBILE"]],
              left_on="ACCTNO_x",
              right_on="ACCTNO",
              how="left")

In [22]:
import pandas as pd
import numpy as np

def build_purchase_bill_summary_by_taxic(
    df: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    billdate_col: str = "BILLDATE",
    detail_col: str = "DETAIL",
    amount_col: str = "AMOUNT",
    taxic_col: str = "TAXIC",
    bookno_col: str = "BOOKNO",
    beforetax_col: str = "BEFORETAX",
    tax_col: str = "TAX",
    aftertax_col: str = "AFTERTAX",
    tax_rate: float = 0.07,
    tax_id_value: str = "0000000000000",
    taxic_yes: str = "Y",
):
    """
    Purchase summary:
      - AMOUNT summed from pidet lines
      - KEEP BOOKNO, TAX, AFTERTAX from input (pimas)
      - VAT split based on TAXIC
    """

    out = df.copy()

    out[billno_col] = out[billno_col].astype("string").str.strip().str.upper()
    out[amount_col] = pd.to_numeric(out[amount_col], errors="coerce").fillna(0)

    if taxic_col not in out.columns:
        out[taxic_col] = ""

    out[taxic_col] = out[taxic_col].astype("string").str.strip().str.upper()

    # ===== pick DETAIL from highest AMOUNT row =====
    idx_max_amt = out.groupby(billno_col)[amount_col].idxmax()
    detail_pick = (
        out.loc[idx_max_amt, [billno_col, detail_col]]
        .set_index(billno_col)[detail_col]
    )

    # ===== group totals (KEEP BOOKNO / TAX / AFTERTAX) =====
    totals = (
        out.groupby(billno_col, as_index=False)
        .agg(
            TOTAL_AMOUNT=(amount_col, "sum"),
            BILLDATE=(billdate_col, "first"),
            TAXIC=(taxic_col, "first"),
            BOOKNO=(bookno_col, "first"),
            BEFORETAX=(beforetax_col, "first"),
            TAX=(tax_col, "first"),
            AFTERTAX=(aftertax_col, "first"),
            ACCTNO=("ACCTNO", "first"),
            ACCTNAME=("ACCTNAME", "first"),
            REMARKS=("REMARKS", "first"),
            REMARKS_EXTRACT=("REMARKS_EXTRACT","first"),
            MOBILE=("MOBILE","first"),
        )
    )

    totals[detail_col] = totals[billno_col].map(detail_pick)

    # ===== VAT calculation based on TAXIC =====
    divisor = 1 + tax_rate
    is_vat_inclusive = totals["TAXIC"].eq(taxic_yes)

    totals["BEFORE_VAT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] / divisor).round(2),
        totals["TOTAL_AMOUNT"].round(2)
    )

    totals["VAT_AMOUNT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] - totals["BEFORE_VAT"]).round(2),
        (totals["TOTAL_AMOUNT"] * tax_rate).round(2)
    )

    totals["TOTAL_INCL_VAT"] = (totals["BEFORE_VAT"] + totals["VAT_AMOUNT"]).round(2)

    totals["TAX_ID"] = str(tax_id_value).zfill(13)[:13]

    totals = totals.sort_values(billno_col).reset_index(drop=True)
    totals["SEQ"] = np.arange(1, len(totals) + 1)

    return totals

In [23]:
purchase_summary = build_purchase_bill_summary_by_taxic(pidet_join)

In [24]:
def clean_and_select_purchase(df):
    keep_cols = [
        "BILLNO", "BOOKNO", "BILLDATE", "ACCTNO", "ACCTNAME",
        "REMARKS", "REMARKS_EXTRACT", "MOBILE", "BEFORETAX", "TAX", "AFTERTAX", "DETAIL"
    ]

    # Return if no rows
    if df.empty:
        return df

    # Return if required columns missing
    if not set(keep_cols).issubset(df.columns):
        return df

    # Clean REMARKS column
    df["REMARKS"] = df["REMARKS"].astype(str).str.replace(r"^##", "", regex=True)

    df["REMARKS"] = '="' + df["REMARKS"].astype(str) + '"'
    df["REMARKS_EXTRACT"] = '="' + df["REMARKS_EXTRACT"].astype(str) + '"'
    df["MOBILE"] = '="' + df["MOBILE"].astype(str) + '"'

    # Keep only selected columns
    return df[keep_cols]

In [25]:
purchase_summary = clean_and_select_purchase(purchase_summary)

In [26]:
purchase_summary

Unnamed: 0,BILLNO,BOOKNO,BILLDATE,ACCTNO,ACCTNAME,REMARKS,REMARKS_EXTRACT,MOBILE,BEFORETAX,TAX,AFTERTAX,DETAIL
0,-202601/00009,1.0,2026-01-03,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""06/01/69##0275555000306""","=""0275555000306""","=""0275555000306""",1320.00,92.40,1412.40,ไฟท้าย (RH) F/6640 ขาว -เหล หัวสิงห์
1,-202601/00215,1.0,2026-01-05,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""7/1/69##0275555000306""","=""0275555000306""","=""0275555000306""",3370.00,235.90,3605.90,"ไส้กรองเครื่อง เหล็ก ก.16NF L4708,L L02,M50-M..."
2,-202601/00359,1.0,2026-01-06,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""8/1/69##0275555000306""","=""0275555000306""","=""0275555000306""",1400.00,98.00,1498.00,"ปั้มน้ำ + ปะเก็น L3218,L3208,L40 แท้สั่ง"
3,-202601/00371,1.0,2026-01-07,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""10/1/69##0275555000306""","=""0275555000306""","=""0275555000306""",12112.50,847.88,12960.38,หลังคาไฟเบอร์ 152x46.5x43 EF393T DT แท้
4,-202601/00392,1.0,2026-01-07,7PB,บริษัท เดชณรงค์ แทรคเตอร์ จำกัด (สำนักงานใหญ่),"=""8/1/69##0275555000306""","=""0275555000306""","=""0275555000306""",26500.00,1855.00,28355.00,ข้อเหวี่ยง F/6640 5.0 GENMOT
...,...,...,...,...,...,...,...,...,...,...,...,...
564,VV0025305,1.0,2026-01-28,7333,บริษัท พี.เจ.บี.เอ็นเตอร์ไพรส์ จำกัด (สำนักงาน...,"=""29/1/69##0105537035315""","=""0105537035315""","=""0105537035315""",5150.00,360.50,5510.50,"ลูกหมากปีกนกล่าง 2WD-4WD D-MAX,MU7.04-13 333"
565,VVA6900006,1.0,2026-01-05,7HOD,บริษัท เอชโอดี อิมปอร์ต จำกัด (สำนักงานใหญ่),"=""27/12/68##0125547001162""","=""0125547001162""","=""0125547001162""",3024.00,211.68,3235.68,ชาพอก (STD) อีเซกิ 4FA-4FB DAIDO
566,VVA6900010,1.0,2026-01-05,7HOD,บริษัท เอชโอดี อิมปอร์ต จำกัด (สำนักงานใหญ่),"=""27/12/68##0125547001162""","=""0125547001162""","=""0125547001162""",700.93,49.07,750.00,แหวนลูกสูบ 86m 4แหวน S250-C190-KBD อ NPR
567,VVA6900046,1.0,2026-01-07,7HOD,บริษัท เอชโอดี อิมปอร์ต จำกัด (สำนักงานใหญ่),"=""8/1/69##0125547001162""","=""0125547001162""","=""0125547001162""",1247.67,87.34,1335.01,แหวนลูกสูบ 76 มิล STD อิเซกิ 4FE1-4FA OEM


In [27]:
import os

# your existing folder
out_dir = os.path.join(
    kcwdir,
    "kcw_analytics",
    "02_staging",
    "VAT_Purchases",
    f"{YEAR}_{MONTH:02d}"
)
os.makedirs(out_dir, exist_ok=True)

# --- export loop
for bookno, g in purchase_summary.groupby("BOOKNO", dropna=False):

    # safe filename
    bookno_str = "UNKNOWN" if pd.isna(bookno) else str(bookno).strip()
    safe_bookno = "".join(c if c.isalnum() or c in ("-","_") else "_" for c in bookno_str)

    file_path =  os.path.join(out_dir, f"VAT_PURCHASE_{YEAR}_{MONTH:02d}_BOOK_{safe_bookno}.csv")

    g.to_csv(file_path, index=False, encoding="utf-8-sig")

    print("saved:", file_path)

saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_01/VAT_PURCHASE_2026_01_BOOK_1_0.csv
saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_01/VAT_PURCHASE_2026_01_BOOK_2_0.csv
saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_01/VAT_PURCHASE_2026_01_BOOK_5_0.csv
saved: /content/drive/Shareddrives/KCW-Data/kcw_analytics/02_staging/VAT_Purchases/2026_01/VAT_PURCHASE_2026_01_BOOK_6_0.csv
