In [1]:
import os
import sys

# Choose data / repo base folders depending on where the notebook is running.
if "google.colab" in sys.modules:
    # Running in Colab

    # Clone the analytics repo (into the current working directory), then pull
    # the latest `main` inside /content/kcw-analytics.
    !git clone https://github.com/pthengtr/kcw-analytics.git
    !cd /content/kcw-analytics && git pull origin main

    # Mount Google Drive at /content/drive.
    from google.colab import drive
    drive.mount("/content/drive")

    BASE_FOLDER = "/content/drive/Shareddrives"
    BASE_FOLDER_GIT = "/content"
else:
    # Running in local Jupyter
    # NOTE(review): machine-specific absolute paths; adjust when running elsewhere.
    BASE_FOLDER = r"G:\Shared drives"
    BASE_FOLDER_GIT = r"C:\Users\Windows 11\Notebook"

print("Using folder:", BASE_FOLDER)

Using folder: G:\Shared drives


In [2]:
# Raw CSV exports are read from <BASE_FOLDER>/KCW-Data/kcw_analytics/01_raw.
folder = BASE_FOLDER + "/KCW-Data/kcw_analytics/01_raw"

In [3]:
import os
import pandas as pd

# Load every CSV found in `folder` into `data`, keyed by file name.
# BCODE / MOBILE / BILLNO are read as text (presumably to keep leading zeros
# and avoid numeric coercion of identifiers -- confirm against the raw files).
data = {}

for file in os.listdir(folder):
    # Skip anything that is not a CSV export.
    if not file.endswith(".csv"):
        continue

    path = os.path.join(folder, file)
    data[file] = pd.read_csv(
        path,
        encoding="utf-8-sig",
        low_memory=False,   # stops chunk guessing
        dtype={
            "BCODE": "string",
            "MOBILE": "string",
            "BILLNO": "string",
        },
    )
    print(f"Loaded: {file} -> {data[file].shape}")

Loaded: raw_inventory_hq_2024.csv -> (4983, 8)
Loaded: raw_syp_simas_sales_bills.csv -> (12558, 49)
Loaded: raw_hq_pidet_purchase_lines.csv -> (154002, 41)
Loaded: raw_syp_sidet_sales_lines.csv -> (37081, 38)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2938, 49)
Loaded: raw_hq_simas_sales_bills.csv -> (275882, 49)
Loaded: raw_hq_pimas_purchase_bills.csv -> (50286, 49)
Loaded: raw_hq_sidet_sales_lines.csv -> (733120, 38)
Loaded: raw_syp_pidet_purchase_lines.csv -> (27470, 41)
Loaded: raw_hq_icmas_products.csv -> (114941, 94)
Loaded: raw_hq_pvmas_notes_vouchers.csv -> (13829, 32)
Loaded: raw_hq_armas_receivable.csv -> (2224, 20)
Loaded: raw_hq_apmas_payable.csv -> (976, 20)


In [4]:
import pandas as pd
import numpy as np

def build_bill_summary_by_taxic(
    df: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    billdate_col: str = "BILLDATE",
    detail_col: str = "DETAIL",
    amount_col: str = "AMOUNT",
    taxic_col: str = "TAXIC",
    tax_rate: float = 0.07,
    tax_id_value: str = "0000000000000",
    taxic_yes: str = "Y",
) -> pd.DataFrame:
    """
    Collapse bill lines into one summary row per bill and split out VAT.

    VAT logic:
      TAXIC == Y : AMOUNT includes VAT
      TAXIC == N : AMOUNT excludes VAT
    Any TAXIC value other than `taxic_yes` (including a missing TAXIC column,
    which is filled with "") is treated as VAT-exclusive.

    Parameters
    ----------
    df : line-level frame; must contain `billno_col`, `billdate_col`,
        `detail_col` and `amount_col`. It is not modified.
    billno_col, billdate_col, detail_col, amount_col, taxic_col : column names
        to read from `df`.
    tax_rate : VAT rate used for the split (0.07 = 7%).
    tax_id_value : value written to TAX_ID, left-padded with zeros and cut to
        13 characters.
    taxic_yes : TAXIC value that marks a VAT-inclusive bill.

    Returns
    -------
    DataFrame sorted by bill number with columns: `billno_col`, TOTAL_AMOUNT,
    BILLDATE, TAXIC, `detail_col` (taken from the line with the highest
    AMOUNT), BEFORE_VAT, VAT_AMOUNT, TOTAL_INCL_VAT, TAX_ID and SEQ (1-based).
    Bill numbers are returned stripped and upper-cased.
    """

    # A fresh 0..n-1 index guarantees that the labels returned by idxmax()
    # identify exactly one row each; with a duplicated input index the
    # DETAIL lookup below would contain repeated bill numbers and .map() fails.
    out = df.copy().reset_index(drop=True)

    # Normalize the bill number so " tr001" and "TR001" land in the same group
    # (same normalization the purchase summary and the SIMAS join use).
    out[billno_col] = out[billno_col].astype("string").str.strip().str.upper()
    out[amount_col] = pd.to_numeric(out[amount_col], errors="coerce").fillna(0)

    # normalize TAXIC
    if taxic_col not in out.columns:
        out[taxic_col] = ""

    out[taxic_col] = out[taxic_col].astype("string").str.strip().str.upper()

    # ===== pick DETAIL from highest AMOUNT row =====
    idx_max_amt = out.groupby(billno_col)[amount_col].idxmax()
    detail_pick = (
        out.loc[idx_max_amt, [billno_col, detail_col]]
        .set_index(billno_col)[detail_col]
    )

    # ===== group totals =====
    totals = (
        out.groupby(billno_col, as_index=False)
        .agg(
            TOTAL_AMOUNT=(amount_col, "sum"),
            BILLDATE=(billdate_col, "first"),
            TAXIC=(taxic_col, "first"),
        )
    )

    totals[detail_col] = totals[billno_col].map(detail_pick)

    # ===== VAT calculation based on TAXIC =====
    divisor = 1 + tax_rate
    is_vat_inclusive = totals["TAXIC"].eq(taxic_yes)

    totals["BEFORE_VAT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] / divisor).round(2),   # inclusive case
        totals["TOTAL_AMOUNT"].round(2)               # exclusive case
    )

    totals["VAT_AMOUNT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] - totals["BEFORE_VAT"]).round(2),
        (totals["TOTAL_AMOUNT"] * tax_rate).round(2)
    )

    # optional: total including VAT (very useful downstream)
    totals["TOTAL_INCL_VAT"] = (totals["BEFORE_VAT"] + totals["VAT_AMOUNT"]).round(2)

    # TAX ID
    totals["TAX_ID"] = str(tax_id_value).zfill(13)[:13]

    # SEQ
    totals = totals.sort_values(billno_col).reset_index(drop=True)
    totals["SEQ"] = np.arange(1, len(totals) + 1)

    return totals

import pandas as pd
import numpy as np

def map_simas_bill_fields(
    df: pd.DataFrame,
    df_simas: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    fields: tuple = ("DEDUCT", "TAX", "AFTERTAX"),
    copy: bool = True,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Left-join selected bill-header columns from `df_simas` onto `df` by bill number.

    The join key is the bill number stripped and upper-cased on both sides.
    Only the first `df_simas` row per key is used, so the result always has
    exactly one row per row of `df`. Rows whose bill number is missing never
    match anything.

    Parameters
    ----------
    df : frame to enrich (not modified).
    df_simas : bill-header frame providing the extra columns.
    billno_col : bill-number column present in both frames.
    fields : columns to bring over; names not present in `df_simas` are
        skipped. A name that already exists in `df` gets pandas' default
        `_x` / `_y` merge suffixes.
    copy : kept for backward compatibility; the result is always a new frame.
    verbose : print how many rows of `df` found a matching bill.

    Raises
    ------
    ValueError : if `billno_col` is missing from either frame.
    """

    if billno_col not in df.columns:
        raise ValueError(f"{billno_col} not found in df")

    if billno_col not in df_simas.columns:
        raise ValueError(f"{billno_col} not found in df_simas")

    # --- normalize join keys (VERY important for legacy POS)
    left = df.copy()
    right = df_simas.copy()

    left["_JOIN_KEY"] = left[billno_col].astype("string").str.strip().str.upper()
    right["_JOIN_KEY"] = right[billno_col].astype("string").str.strip().str.upper()

    # --- build lookup table (avoid duplicate explosion)
    present_fields = [c for c in fields if c in right.columns]
    lookup = right[["_JOIN_KEY"] + present_fields]

    # pandas matches null keys with each other; drop them so a header row
    # without a bill number cannot be attached to lines without one.
    lookup = (
        lookup[lookup["_JOIN_KEY"].notna()]
        .drop_duplicates(subset=["_JOIN_KEY"], keep="first")
    )

    # Count matches on the key itself: the previous `result[fields[0]]` lookup
    # raised when that field was absent from df_simas (or `fields` was empty).
    matched = int(left["_JOIN_KEY"].isin(lookup["_JOIN_KEY"]).sum())

    # --- merge
    result = left.merge(
        lookup,
        on="_JOIN_KEY",
        how="left"
    ).drop(columns=["_JOIN_KEY"])

    if copy:
        result = result.copy()

    if verbose:
        print(f"[map_simas_bill_fields] matched rows: {matched:,}/{len(result):,}")

    return result

def filter_year_month(df, year, month, date_col="BILLDATE"):
    """
    Return the rows of `df` whose `date_col` falls in the given year and month.

    Dates are parsed with errors="coerce", so a malformed or missing date is
    simply excluded instead of aborting the whole filter. The returned rows
    keep the original values/dtype of `date_col`; `df` is not modified.
    """
    parsed = pd.to_datetime(df[date_col], errors="coerce")
    target = pd.Period(year=year, month=month, freq="M")
    # NaT never equals a Period, so unparseable rows drop out here.
    return df[parsed.dt.to_period("M") == target]

In [5]:
def map_armas_mobile(
    df: pd.DataFrame,
    df_armas: pd.DataFrame,
    *,
    acctno_col: str = "ACCTNO",
    mobile_col: str = "MOBILE",
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Left-join `mobile_col` from `df_armas` onto `df` by account number.

    Account numbers are compared stripped and upper-cased. Only the first
    `df_armas` row per account is used, so the row count of `df` is preserved.
    Rows whose account number is missing or blank never receive a value.

    Parameters
    ----------
    df : frame to enrich (not modified); must contain `acctno_col`.
    df_armas : account master frame; must contain `acctno_col` and `mobile_col`.
    verbose : print how many rows ended up with a non-null `mobile_col`.

    Raises
    ------
    ValueError : if a required column is missing from either frame.

    Note: if `df` already has a `mobile_col` column, pandas' merge adds
    `_x` / `_y` suffixes instead of overwriting it.
    """
    if acctno_col not in df.columns:
        raise ValueError(f"{acctno_col} not found in df (did you map it from SIMAS?)")
    if acctno_col not in df_armas.columns:
        raise ValueError(f"{acctno_col} not found in df_armas")
    if mobile_col not in df_armas.columns:
        raise ValueError(f"{mobile_col} not found in df_armas")

    left = df.copy()
    right = df_armas[[acctno_col, mobile_col]].copy()

    # normalize keys
    left["_ACCTKEY"] = left[acctno_col].astype("string").str.strip().str.upper()
    right["_ACCTKEY"] = right[acctno_col].astype("string").str.strip().str.upper()

    # pandas matches null keys with each other (and "" equals ""), so an
    # account row with no account number would otherwise hand its MOBILE to
    # every bill that has no account. Keep only usable keys in the lookup.
    has_key = right["_ACCTKEY"].fillna("").ne("")
    lookup = (
        right.loc[has_key, ["_ACCTKEY", mobile_col]]
        .drop_duplicates("_ACCTKEY", keep="first")
    )

    out = left.merge(lookup, on="_ACCTKEY", how="left").drop(columns=["_ACCTKEY"])

    if verbose:
        matched = out[mobile_col].notna().sum()
        print(f"[map_armas_mobile] matched MOBILE: {matched:,}/{len(out):,}")

    return out

In [6]:
import pandas as pd

# 1) Config: doc type prefixes per source
# Bill-number prefixes per source and document type.
# "hq" bills use the bare document code; "syp" bills carry a leading "3".
BILLTYPE_RULES = {
    source: {
        doc_type: (prefix + doc_type,)
        for doc_type in ("TD", "TAD", "TR", "CN")
    }
    for source, prefix in (("hq", ""), ("syp", "3"))
}

def build_monthly_doc_summaries(
    df_sidet: pd.DataFrame,
    df_simas: pd.DataFrame,
    df_armas: pd.DataFrame,
    *,
    source: str,     # "hq" or "syp"
    year: int,
    month: int,
    billno_col: str = "BILLNO",
    date_col: str = "BILLDATE",
    verbose: bool = True,
) -> dict:
    """
    Build one bill-level summary per document type for a single month.

    Clean pipeline:
      1) filter month (on `date_col`)
      2) remove TF bills
      3) split TD/TAD/TR/CN by bill-number prefix (BILLTYPE_RULES[source])
      4) build summary (TAXIC logic)
      5) map SIMAS fields
      6) map ARMAS MOBILE by ACCTNO

    Parameters
    ----------
    df_sidet : sales lines.
    df_simas : sales bill headers used in step 5.
    df_armas : account master used in step 6.
    source : key of BILLTYPE_RULES ("hq" or "syp"); case and surrounding
        whitespace are ignored.
    year, month : period to keep.
    billno_col, date_col : column names in `df_sidet`.
    verbose : print row/bill counts per document type.

    Returns
    -------
    dict mapping document type -> DataFrame. A type with no lines maps to the
    empty slice of the filtered lines (line-level columns), not to an empty
    summary.

    Raises
    ------
    ValueError : if `source` is not a key of BILLTYPE_RULES. (Previously any
        value other than "hq" silently used the "syp" prefixes.)
    """

    # ---- 0) resolve prefix rules from the shared config
    source_key = str(source).strip().lower()
    if source_key not in BILLTYPE_RULES:
        raise ValueError(
            f"unknown source {source!r}; expected one of {sorted(BILLTYPE_RULES)}"
        )
    rules = BILLTYPE_RULES[source_key]

    # ---- 1) filter month (honor the caller's date column)
    df_m = filter_year_month(df_sidet, year, month, date_col=date_col)

    # normalize billno once
    s = df_m[billno_col].astype("string").str.strip().str.upper()

    # ---- 2) remove TF bills
    # NOTE(review): this drops any bill number that contains "TF" anywhere,
    # not only as a prefix -- confirm that is intended.
    df_m = df_m.loc[~s.str.contains("TF", na=False)].copy()
    s = df_m[billno_col].astype("string").str.strip().str.upper()

    out = {}

    # ---- 3) split by document-type prefix
    for doc_type, prefixes in rules.items():

        mask = s.str.startswith(prefixes, na=False)
        df_type = df_m.loc[mask].copy()

        if df_type.empty:
            out[doc_type] = df_type
            if verbose:
                print(f"[{source}] {doc_type}: 0 rows")
            continue

        # ---- 4) build summary (VAT logic by TAXIC)
        summ = build_bill_summary_by_taxic(
            df_type,
            billno_col=billno_col,
            billdate_col=date_col,
        )

        # ---- 5) map SIMAS fields
        summ = map_simas_bill_fields(
            summ,
            df_simas,
            billno_col=billno_col,
            fields=("ACCTNO", "ACCTNAME", "DEDUCT","BEFORETAX", "TAX", "AFTERTAX", "REMARKS", "PO"),
            verbose=verbose,
        )

        # ---- 6) map ARMAS MOBILE by ACCTNO
        summ = map_armas_mobile(
            summ,
            df_armas,
            acctno_col="ACCTNO",
            mobile_col="MOBILE",
            verbose=verbose,
        )

        out[doc_type] = summ

        if verbose:
            print(f"[{source}] {doc_type}: {len(df_type):,} rows -> {len(summ):,} bills")

    return out

In [7]:
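# Earlier draft of the pipeline, kept for reference only (never executed).
# NOTE: `map_simas_and_armas_fields`, `df_lines` and `df_lines2` are not defined
# in the cells shown here.
# df_lines2 = map_simas_and_armas_fields(df_lines, df_simas, df_armas)

# summary = build_bill_summary_by_taxic(df_lines2)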

In [8]:
# Sales lines (SIDET) per branch.
df_hq_sidet = data["raw_hq_sidet_sales_lines.csv"].copy()
df_syp_sidet = data["raw_syp_sidet_sales_lines.csv"].copy()

# Sales bill headers (SIMAS) per branch.
df_hq_simas = data["raw_hq_simas_sales_bills.csv"].copy()
df_syp_simas = data["raw_syp_simas_sales_bills.csv"].copy()

# Receivable accounts (ARMAS) feed the MOBILE lookup of the sales summaries.
df_armas = data["raw_hq_armas_receivable.csv"].copy()
# Payable accounts (APMAS) get their own name; assigning them to `df_armas`
# (as before) replaced the receivable accounts, so sales bills were matched
# against the payable file instead.
df_apmas = data["raw_hq_apmas_payable.csv"].copy()

In [9]:
# Show the SIMAS header columns that can be mapped onto the bill summaries.
df_hq_simas.columns


Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'JOURNO', 'JOURTIME',
       'DEPTNO', 'BOOKNO', 'BILLTYPE', 'BILLDATE', 'BILLTIME', 'BILLNO',
       'LINES', 'TAXIC', 'DISCOUNT', 'DEDUCT', 'BEFORETAX', 'VAT', 'TAX',
       'AFTERTAX', 'EXEMPT', 'SVCCHG', 'WITHHOLD', 'PAID', 'CASHED', 'CASHAMT',
       'CHKAMT', 'DUEAMT', 'PAYSTAT', 'ACCTNO', 'ACCTNAME', 'ADDR1', 'ADDR2',
       'PO', 'SALE', 'RE', 'TERM', 'DUEDATE', 'NOTEDATE', 'NOTENO',
       'VOUCDATE1', 'VOUCNO1', 'VOUCDATE2', 'VOUCNO2', 'POSTED1', 'POSTED2',
       'REMARKS', 'CANCELED', 'DONE'],
      dtype='object')

In [10]:
from datetime import datetime
import pytz

# Bangkok (Thailand) timezone
tz = pytz.timezone("Asia/Bangkok")

# Current wall-clock time in that zone.
today = datetime.now(tz)

# Reporting period = the current calendar month.
# NOTE: these depend on the run date, so re-running the notebook in a later
# month produces a different period.
YEAR = today.year
MONTH = today.month

print(YEAR, MONTH)

2026 3


In [11]:
# Build the per-document-type summaries for each branch for YEAR/MONTH.
hq_summaries = build_monthly_doc_summaries(
    df_hq_sidet,
    df_hq_simas,
    df_armas,
    source="hq",
    year=YEAR,
    month=MONTH,
)

syp_summaries = build_monthly_doc_summaries(
    df_syp_sidet,
    df_syp_simas,
    df_armas,
    source="syp",
    year=YEAR,
    month=MONTH,
)

# Unpack each document type into its own name for the export cells below.
(
    df_hq_td_summary,
    df_hq_tad_summary,
    df_hq_tr_summary,
    df_hq_cn_summary,
) = (hq_summaries[doc_type] for doc_type in ("TD", "TAD", "TR", "CN"))

(
    df_syp_td_summary,
    df_syp_tad_summary,
    df_syp_tr_summary,
    df_syp_cn_summary,
) = (syp_summaries[doc_type] for doc_type in ("TD", "TAD", "TR", "CN"))

[hq] TD: 0 rows
[hq] TAD: 0 rows
[map_simas_bill_fields] matched rows: 2/2
[map_armas_mobile] matched MOBILE: 0/2
[hq] TR: 4 rows -> 2 bills
[hq] CN: 0 rows
[syp] TD: 0 rows
[syp] TAD: 0 rows
[syp] TR: 0 rows
[syp] CN: 0 rows


In [12]:
import pandas as pd

def extract_remarks_code(df: pd.DataFrame, col: str = "REMARKS") -> pd.DataFrame:
    """
    Return a copy of `df` where `col` is replaced by the 13 characters that
    follow the first "##" marker in its text.

    Values without such a marker (or with fewer than 13 characters after it)
    become missing. If `col` does not exist it is created as an all-missing
    column first. The input frame is not modified.
    """
    result = df.copy()

    # Guarantee the column is there before touching it.
    if col not in result.columns:
        result[col] = pd.NA

    as_text = result[col].astype("string")
    result[col] = as_text.str.extract(r"##(.{13})", expand=False)

    return result

In [13]:
# Reduce REMARKS of the TR summaries to the 13-character code after "##".
# NOTE: the names are rebound in place, so running this cell a second time
# re-extracts from the already-extracted text (which no longer carries "##").
df_hq_tr_summary, df_syp_tr_summary = (
    extract_remarks_code(tr_summary)
    for tr_summary in (df_hq_tr_summary, df_syp_tr_summary)
)

In [14]:
def clean_and_select(df):
    """
    Prepare a sales bill summary for CSV export.

    Keeps only the export columns, strips a leading "##" from REMARKS and wraps
    REMARKS and MOBILE as ="<value>" (presumably so spreadsheet software keeps
    long digit strings as text -- confirm with the consumers of the CSV).
    Missing REMARKS / MOBILE values are exported as ="" rather than as the
    text "nan" / "<NA>".

    The input frame is never modified, so calling this repeatedly on the same
    frame gives the same result (the previous in-place version wrapped the
    values again on every call).

    Returns `df` unchanged when it is empty or lacks any export column.
    """
    keep_cols = [
        "BILLNO", "BILLDATE", "ACCTNO", "ACCTNAME",
        "REMARKS", "MOBILE", "BEFORETAX", "TAX", "AFTERTAX", "DETAIL", "PO"
    ]

    # Return if no rows
    if df.empty:
        return df

    # Return if required columns missing
    if not set(keep_cols).issubset(df.columns):
        return df

    # Keep only selected columns, on a private copy
    cleaned = df[keep_cols].copy()

    # Clean REMARKS column
    remarks = (
        cleaned["REMARKS"]
        .astype("string")
        .fillna("")
        .str.replace(r"^##", "", regex=True)
    )
    mobile = cleaned["MOBILE"].astype("string").fillna("")

    cleaned["REMARKS"] = '="' + remarks + '"'
    cleaned["MOBILE"] = '="' + mobile + '"'

    return cleaned

In [15]:
import os

# Root of the KCW data tree inside the shared drive; export folders below are built from it.
kcwdir = os.path.join(BASE_FOLDER, "KCW-Data")
print(kcwdir)

G:\Shared drives\KCW-Data


In [16]:
import os
from pathlib import Path

# Staging folder for this period: <kcwdir>/kcw_analytics/02_staging/VAT_Sales/<YYYY>_<MM>
out_dir = os.path.join(
    kcwdir, "kcw_analytics", "02_staging", "VAT_Sales", f"{YEAR}_{MONTH:02d}"
)
os.makedirs(out_dir, exist_ok=True)

# (cleaned frame, output file stem) for every branch / document type.
exports = [
    (clean_and_select(summary), stem)
    for summary, stem in (
        (df_hq_td_summary, "TD"),
        (df_hq_tad_summary, "TAD"),
        (df_hq_tr_summary, "TR"),
        (df_hq_cn_summary, "CN"),
        (df_syp_td_summary, "3TD"),
        (df_syp_tad_summary, "3TAD"),
        (df_syp_tr_summary, "3TR"),
        (df_syp_cn_summary, "3CN"),
    )
]

# One CSV per document type (empty frames are written too).
for df, name in exports:
    path = os.path.join(out_dir, f"{name}.csv")
    df.to_csv(path, encoding="utf-8-sig", index=False)
    print(f"Saved -> {path}")

Saved -> G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Sales\2026_03\TD.csv
Saved -> G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Sales\2026_03\TAD.csv
Saved -> G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Sales\2026_03\TR.csv
Saved -> G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Sales\2026_03\CN.csv
Saved -> G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Sales\2026_03\3TD.csv
Saved -> G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Sales\2026_03\3TAD.csv
Saved -> G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Sales\2026_03\3TR.csv
Saved -> G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Sales\2026_03\3CN.csv


In [17]:
# Purchase lines (PIDET), purchase bill headers (PIMAS) and payable accounts (APMAS).
df_pidet, df_pimas, df_apmas = (
    data[csv_name].copy()
    for csv_name in (
        "raw_hq_pidet_purchase_lines.csv",
        "raw_hq_pimas_purchase_bills.csv",
        "raw_hq_apmas_payable.csv",
    )
)

In [18]:
def filter_year_month(df, year, month, date_col="BILLDATE"):
    """
    Return the rows of `df` whose `date_col` falls in the given year/month.

    Works on a copy in which `date_col` has been converted to datetime
    (unparseable values become NaT and are filtered out), so the returned
    frame carries the converted column while `df` itself stays untouched.
    """
    converted = df.copy()
    converted[date_col] = pd.to_datetime(converted[date_col], errors="coerce")

    wanted = f"{year}-{month:02d}"
    in_month = converted[date_col].dt.to_period("M") == wanted
    return converted[in_month]

In [19]:
# Keep only the purchase headers / lines dated in the reporting month.
df_pimas_m, df_pidet_m = (
    filter_year_month(purchase_frame, YEAR, MONTH)
    for purchase_frame in (df_pimas, df_pidet)
)

In [20]:
# --- copy first (safe practice)
pidet_join = df_pidet_m.copy()
pimas_join = df_pimas_m.copy()

# --- normalize BILLNO (VERY important for old POS)
pidet_join["_JOIN_KEY"] = pidet_join["BILLNO"].astype("string").str.strip().str.upper()
pimas_join["_JOIN_KEY"] = pimas_join["BILLNO"].astype("string").str.strip().str.upper()

# --- build lookup from pimas (avoid duplicate explosion)
lookup = (
    pimas_join[["_JOIN_KEY", "ACCTNO" ,"ACCTNAME","BOOKNO", "BEFORETAX","TAX", "AFTERTAX", "REMARKS", "PO"]]
    .drop_duplicates(subset=["_JOIN_KEY"], keep="first")
)

# --- LEFT JOIN (pidet is base)
pidet_join = pidet_join.merge(
    lookup,
    on="_JOIN_KEY",
    how="left"
).drop(columns=["_JOIN_KEY"])

print("rows after join:", len(pidet_join))
pidet_join.head()

rows after join: 140


Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,CANCELED,DONE,ACCTNO_y,ACCTNAME,BOOKNO,BEFORETAX,TAX,AFTERTAX,REMARKS,PO
0,290441,2,PJ,2026-03-01,1,2026-03-01,20085.0,10.0,1.0,12052167,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,7780.0,0.0,7780.0,19/2/69##0103532025557,6902-306
1,290442,2,PJ,2026-03-01,1,2026-03-01,20085.0,20.0,2.0,12053251,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,7780.0,0.0,7780.0,19/2/69##0103532025557,6902-306
2,290443,2,PJ,2026-03-01,1,2026-03-01,20085.0,30.0,3.0,12053154,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,7780.0,0.0,7780.0,19/2/69##0103532025557,6902-306
3,290444,2,PJ,2026-03-01,1,2026-03-01,20085.0,40.0,4.0,12051152,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,7780.0,0.0,7780.0,19/2/69##0103532025557,6902-306
4,290445,2,PJ,2026-03-01,1,2026-03-01,20085.0,50.0,5.0,12010042,...,N,N,BM,ห้างหุ้นส่วนจำกัด บี เอ็ม แทรกเตอร์,1.0,7780.0,0.0,7780.0,19/2/69##0103532025557,6902-306


In [21]:
# Keep only VAT lines. Take an explicit copy: the next cell adds a column to
# this frame, and assigning into a filtered slice triggers pandas'
# chained-assignment (SettingWithCopy) warning.
pidet_join = pidet_join.loc[pidet_join["ISVAT"] == "Y"].copy()

In [22]:
# Last 13 characters of REMARKS (the text after "##" in the rows shown above).
# Missing REMARKS are stringified by astype(str) first, so they yield text, not NA.
pidet_join["REMARKS_EXTRACT"] = pidet_join["REMARKS"].astype(str).str.slice(start=-13)

In [23]:
# Attach MOBILE from the payable accounts via the line-level account number.
# The lookup is reduced to one row per ACCTNO first: with repeated accounts the
# left join would multiply purchase lines and inflate the amounts summed later.
pidet_join = pidet_join.merge(
    df_apmas[["ACCTNO", "MOBILE"]].drop_duplicates(subset="ACCTNO", keep="first"),
    left_on="ACCTNO_x",
    right_on="ACCTNO",
    how="left",
)

In [24]:
# Check the joined columns (ACCTNO_x / ACCTNO_y come from the PIMAS merge, ACCTNO / MOBILE from APMAS).
pidet_join.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'BILLTYPE', 'BILLDATE',
       'BILLNO', 'LINE', 'ITEMNO', 'BCODE', 'PCODE', 'MCODE', 'DETAIL',
       'WHNUMBER', 'LOCATION1', 'STATUS', 'SERIAL', 'TAXIC', 'EXMPT', 'ISVAT',
       'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'CHGAMT', 'ACCTNO_x', 'PAID',
       'SALEDATE', 'SALENO', 'SALEPRICE', 'ACCT_NO', 'CANCELED', 'DONE',
       'ACCTNO_y', 'ACCTNAME', 'BOOKNO', 'BEFORETAX', 'TAX', 'AFTERTAX',
       'REMARKS', 'PO', 'REMARKS_EXTRACT', 'ACCTNO', 'MOBILE'],
      dtype='object')

In [25]:
import pandas as pd
import numpy as np

def build_purchase_bill_summary_by_taxic(
    df: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    billdate_col: str = "BILLDATE",
    detail_col: str = "DETAIL",
    amount_col: str = "AMOUNT",
    taxic_col: str = "TAXIC",
    bookno_col: str = "BOOKNO",
    beforetax_col: str = "BEFORETAX",
    tax_col: str = "TAX",
    aftertax_col: str = "AFTERTAX",
    tax_rate: float = 0.07,
    tax_id_value: str = "0000000000000",
    taxic_yes: str = "Y",
) -> pd.DataFrame:
    """
    Collapse purchase lines into one summary row per bill.

    Purchase summary:
      - AMOUNT summed from pidet lines
      - KEEP BOOKNO, BEFORETAX, TAX, AFTERTAX from input (pimas), taking the
        first non-null value per bill
      - VAT split based on TAXIC (TAXIC == `taxic_yes` means AMOUNT already
        includes VAT; anything else is treated as VAT-exclusive)

    Besides the configurable columns, `df` must also contain ACCTNO, ACCTNAME,
    REMARKS, REMARKS_EXTRACT, MOBILE and PO; their first non-null value per
    bill is carried into the result. `df` itself is not modified.

    Returns a DataFrame sorted by bill number (stripped, upper-cased) with the
    aggregated columns plus `detail_col` (from the line with the highest
    AMOUNT), BEFORE_VAT, VAT_AMOUNT, TOTAL_INCL_VAT, TAX_ID and SEQ (1-based).
    """

    # A fresh 0..n-1 index guarantees that the labels returned by idxmax()
    # identify exactly one row each; with a duplicated input index the
    # DETAIL lookup below would contain repeated bill numbers and .map() fails.
    out = df.copy().reset_index(drop=True)

    out[billno_col] = out[billno_col].astype("string").str.strip().str.upper()
    out[amount_col] = pd.to_numeric(out[amount_col], errors="coerce").fillna(0)

    if taxic_col not in out.columns:
        out[taxic_col] = ""

    out[taxic_col] = out[taxic_col].astype("string").str.strip().str.upper()

    # ===== pick DETAIL from highest AMOUNT row =====
    idx_max_amt = out.groupby(billno_col)[amount_col].idxmax()
    detail_pick = (
        out.loc[idx_max_amt, [billno_col, detail_col]]
        .set_index(billno_col)[detail_col]
    )

    # ===== group totals (KEEP BOOKNO / TAX / AFTERTAX) =====
    totals = (
        out.groupby(billno_col, as_index=False)
        .agg(
            TOTAL_AMOUNT=(amount_col, "sum"),
            BILLDATE=(billdate_col, "first"),
            TAXIC=(taxic_col, "first"),
            BOOKNO=(bookno_col, "first"),
            BEFORETAX=(beforetax_col, "first"),
            TAX=(tax_col, "first"),
            AFTERTAX=(aftertax_col, "first"),
            ACCTNO=("ACCTNO", "first"),
            ACCTNAME=("ACCTNAME", "first"),
            REMARKS=("REMARKS", "first"),
            REMARKS_EXTRACT=("REMARKS_EXTRACT","first"),
            MOBILE=("MOBILE","first"),
            PO = ("PO", "first"),
        )
    )

    totals[detail_col] = totals[billno_col].map(detail_pick)

    # ===== VAT calculation based on TAXIC =====
    divisor = 1 + tax_rate
    is_vat_inclusive = totals["TAXIC"].eq(taxic_yes)

    totals["BEFORE_VAT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] / divisor).round(2),
        totals["TOTAL_AMOUNT"].round(2)
    )

    totals["VAT_AMOUNT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] - totals["BEFORE_VAT"]).round(2),
        (totals["TOTAL_AMOUNT"] * tax_rate).round(2)
    )

    totals["TOTAL_INCL_VAT"] = (totals["BEFORE_VAT"] + totals["VAT_AMOUNT"]).round(2)

    totals["TAX_ID"] = str(tax_id_value).zfill(13)[:13]

    totals = totals.sort_values(billno_col).reset_index(drop=True)
    totals["SEQ"] = np.arange(1, len(totals) + 1)

    return totals

In [26]:
# One row per purchase bill, built from the VAT-only joined lines (default column names).
purchase_summary = build_purchase_bill_summary_by_taxic(pidet_join)

In [27]:
def clean_and_select_purchase(df):
    """
    Prepare a purchase bill summary for CSV export.

    Keeps only the export columns, strips a leading "##" from REMARKS and wraps
    REMARKS, REMARKS_EXTRACT and MOBILE as ="<value>" (presumably so
    spreadsheet software keeps long digit strings as text -- confirm with the
    consumers of the CSV). Missing values in those columns are exported as =""
    rather than as the text "nan" / "<NA>".

    The input frame is never modified, so calling this repeatedly on the same
    frame gives the same result (the previous in-place version wrapped the
    values again on every call).

    Returns `df` unchanged when it is empty or lacks any export column.
    """
    keep_cols = [
        "BILLNO", "BOOKNO", "BILLDATE", "ACCTNO", "ACCTNAME",
        "REMARKS", "REMARKS_EXTRACT", "MOBILE", "BEFORETAX", "TAX", "AFTERTAX", "DETAIL", "PO"
    ]

    # Return if no rows
    if df.empty:
        return df

    # Return if required columns missing
    if not set(keep_cols).issubset(df.columns):
        return df

    # Keep only selected columns, on a private copy
    cleaned = df[keep_cols].copy()

    # Clean REMARKS column
    remarks = (
        cleaned["REMARKS"]
        .astype("string")
        .fillna("")
        .str.replace(r"^##", "", regex=True)
    )
    cleaned["REMARKS"] = '="' + remarks + '"'

    # Wrap the remaining text-like identifiers the same way
    for text_col in ("REMARKS_EXTRACT", "MOBILE"):
        cleaned[text_col] = '="' + cleaned[text_col].astype("string").fillna("") + '"'

    return cleaned

In [28]:
# Reduce the summary to the export columns and wrap the text identifiers.
# NOTE: `purchase_summary` is rebound to the wrapped result, so running this
# cell again wraps REMARKS / REMARKS_EXTRACT / MOBILE a second time.
purchase_summary = clean_and_select_purchase(purchase_summary)

In [29]:
# Display the export-ready purchase summary for a visual check.
purchase_summary

Unnamed: 0,BILLNO,BOOKNO,BILLDATE,ACCTNO,ACCTNAME,REMARKS,REMARKS_EXTRACT,MOBILE,BEFORETAX,TAX,AFTERTAX,DETAIL,PO
0,-20260300018,1.0,2026-03-02,7MC,บริษัท ไมโครซัพพลายทูลส์ จำกัด (สำนักงานใหญ่),"=""0745559003372""","=""0745559003372""","=""0745559003372""",9602.0,672.14,10274.14,สายไฮโดรลิค 2 ชั้น 5 หุน U2T-10,6902-568
1,0291443,1.0,2026-03-01,7รกชม,ร้าน ชัยมอเตอร์ พุเตย,"=""28/2/69##3670501044255""","=""3670501044255""","=""3670501044255""",37000.0,2590.0,39590.0,เครื่องตัดหญ้า 1.เมตร (15-22แร P.T,6902-460
2,6903-001,1.0,2026-03-02,7SKT,ห้างหุ้นส่วนจำกัด เอส.เค.ที. แมชชีนทูลส์ (สนญ.),"=""27/2/69##0103535014254""","=""0103535014254""","=""0103535014254""",1773.63,124.15,1897.78,รอกโซ่ 0.5T ทรงเหลี่ยม 0.5T โซ่คู่ ยาว MIT...,6902-557
3,IV26030010,1.0,2026-03-02,7LK,ห้างหุ้นส่วนจำกัด โลหะกิจกลการ อิมปอร์ต (สำนัก...,"=""27/2/69##0103528009909""","=""0103528009909""","=""0103528009909""",2901.5,203.11,3104.61,"สาแหรกแหนบหลังโค้ง 9/16"" 8"" S220..ELF LOCO",6902-569
4,IV6901700,1.0,2026-03-01,7KMK,บริษัท กิจมงคลอะไหล่ยนต์ (1991) จำกัด (สำนักงา...,"=""27/2/69##0105534058501""","=""0105534058501""","=""0105534058501""",1773.75,124.16,1897.91,"แม่ปั้มคลัชล่าง 3/4"" L200,ไซโคลน,สตา Assab",6902-571
5,PP6902562,1.0,2026-03-02,7SL,บริษัท 88 สมุทรปราการ จำกัด (สำนักงานใหญ่),"=""28/2/69##0115559008221""","=""0115559008221""","=""0115559008221""",3009.35,210.65,3220.0,หม้อน้ำ พต.หนา MT 18x24 K64 สตราด้า 2.5 SL,6902-590
6,RIV69030019,1.0,2026-03-02,7S.PRY,บจก. เอส.พี.อาร์.วาย ออโต้พาร์ท (สำนักงานใหญ่),"=""27/2/69##0115547003165""","=""0115547003165""","=""0115547003165""",778.0,54.46,832.46,"มือเปิดฝาท้าย กลาง D-MAX 03-11,เชฟ S.PRY",6902-551
7,SC6904203,1.0,2026-03-02,7SKKS,บริษัท อินเตอร์เนชั่นแนล ฟรอนเทียร์ ยูนิเทรด จ...,"=""26/2/69##0105556003920""","=""0105556003920""","=""0105556003920""",2046.73,143.27,2190.0,ชุดซ่อมแม่ปั้มเบรค ชุดเล็ก F/6640-TS120 (ก ...,6902-527
8,SC6904208,1.0,2026-03-02,7SKKS,บริษัท อินเตอร์เนชั่นแนล ฟรอนเทียร์ ยูนิเทรด จ...,"=""26/2/69##0105556003920""","=""0105556003920""","=""0105556003920""",10691.59,748.41,11440.0,"ปั้มไฮ แกนเฟือง 12T 4รู (หมุนซ้าย) L1802,L200...",6902-527
9,SC6904316,1.0,2026-03-02,7SKKS,บริษัท อินเตอร์เนชั่นแนล ฟรอนเทียร์ ยูนิเทรด จ...,"=""27/2/69##0105556003920""","=""0105556003920""","=""0105556003920""",4710.28,329.72,5040.0,ปั้มน้ำ มูเล่ย์เล็ก F/6600 แปลงมูเล UK,6902-565


In [30]:
import os

# your existing folder
# Staging folder for this period: <kcwdir>/kcw_analytics/02_staging/VAT_Purchases/<YYYY>_<MM>
out_dir = os.path.join(
    kcwdir, "kcw_analytics", "02_staging", "VAT_Purchases", f"{YEAR}_{MONTH:02d}"
)
os.makedirs(out_dir, exist_ok=True)

# --- export loop: one CSV per BOOKNO (bills without a BOOKNO go to "UNKNOWN")
for bookno, g in purchase_summary.groupby("BOOKNO", dropna=False):

    # Build a file-name-safe label: anything other than letters, digits,
    # "-" or "_" is replaced by "_".
    # NOTE(review): a float BOOKNO such as 1.0 therefore becomes "1_0" in the
    # file name (see the saved path in the cell output).
    if pd.isna(bookno):
        bookno_str = "UNKNOWN"
    else:
        bookno_str = str(bookno).strip()
    safe_bookno = "".join(
        c if (c.isalnum() or c in "-_") else "_"
        for c in bookno_str
    )

    file_path = os.path.join(
        out_dir,
        f"VAT_PURCHASE_{YEAR}_{MONTH:02d}_BOOK_{safe_bookno}.csv",
    )

    g.to_csv(file_path, encoding="utf-8-sig", index=False)

    print("saved:", file_path)

saved: G:\Shared drives\KCW-Data\kcw_analytics\02_staging\VAT_Purchases\2026_03\VAT_PURCHASE_2026_03_BOOK_1_0.csv
