In [1]:
# Fetch the kcw-analytics repository into the current working directory.
!git clone https://github.com/pthengtr/kcw-analytics.git

Cloning into 'kcw-analytics'...
remote: Enumerating objects: 387, done.[K
remote: Counting objects: 100% (165/165), done.[K
remote: Compressing objects: 100% (134/134), done.[K
remote: Total 387 (delta 111), reused 57 (delta 28), pack-reused 222 (from 1)[K
Receiving objects: 100% (387/387), 315.61 KiB | 6.86 MiB/s, done.
Resolving deltas: 100% (243/243), done.


In [2]:
# Bring the checkout at /content/kcw-analytics up to date with origin/main.
!cd /content/kcw-analytics && git pull origin main

From https://github.com/pthengtr/kcw-analytics
 * branch            main       -> FETCH_HEAD
Already up to date.


In [3]:
# Mount Google Drive at /content/drive; the raw CSV folder and the output
# folder used further down both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import pandas as pd

folder = "/content/drive/Shareddrives/KCW-Data/kcw_analytics/01_raw"

# Identifier columns are forced to string dtype instead of letting pandas
# infer a type for them. The same mapping is passed for every file.
key_dtypes = {
    "BCODE": "string",
    "ITEMNO": "string",
    "BILLNO": "string",
}

# Maps CSV file name -> DataFrame with that file's contents.
data = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the load
# order (and therefore the dict's insertion order and the log) reproducible.
for file in sorted(os.listdir(folder)):
    if not file.endswith(".csv"):
        continue

    path = os.path.join(folder, file)
    data[file] = pd.read_csv(
        path,
        dtype=key_dtypes,
        encoding="utf-8-sig",
        low_memory=False   # stops chunk guessing
    )
    print(f"Loaded: {file} -> {data[file].shape}")

Loaded: raw_inventory_hq_2024.csv -> (4983, 8)
Loaded: raw_syp_pidet_purchase_lines.csv -> (27233, 41)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2915, 49)
Loaded: raw_syp_simas_sales_bills.csv -> (12299, 49)
Loaded: raw_syp_sidet_sales_lines.csv -> (36360, 38)
Loaded: raw_hq_icmas_products.csv -> (114903, 94)
Loaded: raw_hq_pidet_purchase_lines.csv -> (153589, 41)
Loaded: raw_hq_pimas_purchase_bills.csv -> (50176, 49)
Loaded: raw_hq_sidet_sales_lines.csv -> (732969, 38)
Loaded: raw_hq_pvmas_notes_vouchers.csv -> (13730, 32)
Loaded: raw_hq_simas_sales_bills.csv -> (275965, 49)


In [51]:
import pandas as pd
import numpy as np

def build_bill_summary_by_taxic(
    df: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    billdate_col: str = "BILLDATE",
    detail_col: str = "DETAIL",
    amount_col: str = "AMOUNT",
    taxic_col: str = "TAXIC",
    tax_rate: float = 0.07,
    tax_id_value: str = "0000000000000",
    taxic_yes: str = "Y",
):
    """
    Collapse line-level rows into one summary row per bill, with VAT split out.

    VAT logic (decided per bill from its first non-missing TAXIC value):
      TAXIC == taxic_yes : AMOUNT includes VAT
      anything else      : AMOUNT excludes VAT (blank or missing TAXIC included)

    Parameters
    ----------
    df : line-level frame; must contain billno_col, billdate_col, detail_col
        and amount_col. The input is not modified.
    billno_col, billdate_col, detail_col, amount_col, taxic_col :
        Names of the source columns. A missing taxic_col is treated as blank.
    tax_rate : VAT rate as a fraction (0.07 == 7%).
    tax_id_value : written to TAX_ID, zero-padded / truncated to 13 characters.
    taxic_yes : TAXIC value (compared after strip + upper-casing the column)
        that marks a VAT-inclusive bill.

    Returns
    -------
    DataFrame sorted by bill number with columns: billno_col, TOTAL_AMOUNT,
    BILLDATE, TAXIC, detail_col, BEFORE_VAT, VAT_AMOUNT, TOTAL_INCL_VAT,
    TAX_ID and SEQ (1-based running number).
    """

    # Fresh 0..n-1 index: idxmax() returns index *labels*, so duplicate labels
    # in the caller's frame would make the .loc lookup below return extra rows
    # and break the bill -> DETAIL mapping.
    out = df.copy().reset_index(drop=True)
    out[billno_col] = out[billno_col].astype("string")
    # Non-numeric / missing amounts count as 0.
    out[amount_col] = pd.to_numeric(out[amount_col], errors="coerce").fillna(0)

    # normalize TAXIC
    if taxic_col not in out.columns:
        out[taxic_col] = ""

    out[taxic_col] = out[taxic_col].astype("string").str.strip().str.upper()

    # ===== pick DETAIL from highest AMOUNT row =====
    idx_max_amt = out.groupby(billno_col)[amount_col].idxmax()
    detail_pick = (
        out.loc[idx_max_amt, [billno_col, detail_col]]
        .set_index(billno_col)[detail_col]
    )

    # ===== group totals =====
    totals = (
        out.groupby(billno_col, as_index=False)
        .agg(
            TOTAL_AMOUNT=(amount_col, "sum"),
            BILLDATE=(billdate_col, "first"),
            TAXIC=(taxic_col, "first"),
        )
    )

    totals[detail_col] = totals[billno_col].map(detail_pick)

    # ===== VAT calculation based on TAXIC =====
    divisor = 1 + tax_rate
    # A bill with no TAXIC value at all yields <NA> here; np.where cannot
    # evaluate <NA> as a condition, so treat it as "not inclusive" (same as blank).
    is_vat_inclusive = totals["TAXIC"].eq(taxic_yes).fillna(False).astype(bool)

    totals["BEFORE_VAT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] / divisor).round(2),   # inclusive case
        totals["TOTAL_AMOUNT"].round(2)               # exclusive case
    )

    totals["VAT_AMOUNT"] = np.where(
        is_vat_inclusive,
        (totals["TOTAL_AMOUNT"] - totals["BEFORE_VAT"]).round(2),
        (totals["TOTAL_AMOUNT"] * tax_rate).round(2)
    )

    # optional: total including VAT (very useful downstream)
    totals["TOTAL_INCL_VAT"] = (totals["BEFORE_VAT"] + totals["VAT_AMOUNT"]).round(2)

    # TAX ID: same 13-character value on every row
    totals["TAX_ID"] = str(tax_id_value).zfill(13)[:13]

    # SEQ: running number in bill-number order
    totals = totals.sort_values(billno_col).reset_index(drop=True)
    totals["SEQ"] = np.arange(1, len(totals) + 1)

    return totals

import pandas as pd

def map_simas_bill_fields(
    df: pd.DataFrame,
    df_simas: pd.DataFrame,
    *,
    billno_col: str = "BILLNO",
    fields: tuple = ("DEDUCT", "TAX", "AFTERTAX"),
    copy: bool = True,
    verbose: bool = True,
):
    """
    Left-join bill-level fields from df_simas onto df by bill number.

    Bill numbers are compared after converting to string, stripping
    whitespace and upper-casing. When df_simas holds several rows for the
    same bill number, only the first one is used, so the row count of df is
    preserved.

    Parameters
    ----------
    df : frame to enrich; must contain billno_col. Not modified.
    df_simas : bill master frame; must contain billno_col. Not modified.
    billno_col : name of the bill-number column in both frames.
    fields : columns to bring over from df_simas. Names that do not exist in
        df_simas are skipped (and reported when verbose is True).
    copy : if True, return an explicit copy of the merged frame.
    verbose : if True, print how many rows of df found their bill number in
        df_simas.

    Returns
    -------
    DataFrame with the rows and columns of df plus the available fields.

    Raises
    ------
    ValueError : if billno_col is missing from df or df_simas.
    """

    if billno_col not in df.columns:
        raise ValueError(f"{billno_col} not found in df")

    if billno_col not in df_simas.columns:
        raise ValueError(f"{billno_col} not found in df_simas")

    # --- normalize join keys (VERY important for legacy POS)
    left = df.copy()
    right = df_simas.copy()

    left["_JOIN_KEY"] = left[billno_col].astype("string").str.strip().str.upper()
    right["_JOIN_KEY"] = right[billno_col].astype("string").str.strip().str.upper()

    # --- build lookup table (avoid duplicate explosion)
    available = [c for c in fields if c in right.columns]
    missing = [c for c in fields if c not in right.columns]

    # Rows without a bill number are excluded: pandas joins null keys to each
    # other, which would attach an arbitrary bill's values to key-less rows.
    lookup = (
        right.loc[right["_JOIN_KEY"].notna(), ["_JOIN_KEY"] + available]
        .drop_duplicates(subset=["_JOIN_KEY"], keep="first")
    )

    # --- merge; the indicator column records whether the key was found
    merged = left.merge(
        lookup,
        on="_JOIN_KEY",
        how="left",
        indicator="_JOIN_STATUS",
    )

    # Count key matches rather than non-null field values: a bill that exists
    # in df_simas with an empty first field is still a match.
    matched = int(merged["_JOIN_STATUS"].eq("both").sum())

    result = merged.drop(columns=["_JOIN_KEY", "_JOIN_STATUS"])

    if copy:
        result = result.copy()

    if verbose:
        print(f"[map_simas_bill_fields] matched rows: {matched:,}/{len(result):,}")
        if missing:
            print(f"[map_simas_bill_fields] fields not in df_simas (skipped): {missing}")

    return result

def filter_year_month(df, year, month, date_col="BILLDATE"):
    """
    Return the rows of df whose date_col falls in the given calendar month.

    Values in date_col that cannot be parsed as dates are treated as missing
    and never match. year and month may be any integer-like number
    (int, numpy integer, or a whole-valued float).
    """
    # errors="coerce" turns unparseable values into NaT instead of raising.
    dates = pd.to_datetime(df[date_col], errors="coerce")
    # NaT yields NaN for year/month, which compares False against any number.
    in_month = (dates.dt.year == int(year)) & (dates.dt.month == int(month))
    return df[in_month]

In [78]:
import pandas as pd

# 1) Config: BILLNO prefixes per document type, keyed by source.
#    The "syp" prefixes are the "hq" prefixes with a leading "3".
BILLTYPE_RULES = {
    source_key: {doc_type: (lead + doc_type,) for doc_type in ("TD", "TAD", "TR", "CN")}
    for source_key, lead in (("hq", ""), ("syp", "3"))
}

def build_monthly_doc_summaries(
    df_sidet: pd.DataFrame,
    df_simas: pd.DataFrame,
    *,
    source: str,     # "hq" or "syp"
    year: int,
    month: int,
    billno_col: str = "BILLNO",
    date_col: str = "BILLDATE",
    verbose: bool = True,
):
    """
    Build one bill-level summary per document type for a single month.

    Pipeline:
      1) filter month (on date_col)
      2) remove TF bills
      3) split TD/TAD/TR/CN by BILLNO prefix (prefixes from BILLTYPE_RULES)
      4) build summary (TAXIC logic)
      5) map SIMAS fields

    Parameters
    ----------
    df_sidet : sales line rows.
    df_simas : sales bill master rows, used to look up DEDUCT/TAX/AFTERTAX.
    source : key into BILLTYPE_RULES ("hq" or "syp"); surrounding whitespace
        and letter case are ignored.
    year, month : calendar month to summarise.
    billno_col, date_col : column names in df_sidet (billno_col also in df_simas).
    verbose : print per-document-type row and bill counts.

    Returns
    -------
    dict mapping document type -> DataFrame. A document type with no rows in
    the month maps to the empty *line-level* frame (df_sidet's columns), not
    to an empty summary frame.

    Raises
    ------
    ValueError : if source is not a key of BILLTYPE_RULES.
    """

    # ---- 0) resolve prefix rules up front so a bad source fails fast
    source_key = str(source).strip().lower()
    if source_key not in BILLTYPE_RULES:
        raise ValueError(
            f"Unknown source {source!r}; expected one of {sorted(BILLTYPE_RULES)}"
        )
    rules = BILLTYPE_RULES[source_key]

    # ---- 1) filter month (honour the caller's date column)
    df_m = filter_year_month(df_sidet, year, month, date_col=date_col)

    # normalize billno once
    s = df_m[billno_col].astype("string").str.strip().str.upper()

    # ---- 2) remove TF bills
    keep = ~s.str.startswith("TF", na=False)
    df_m = df_m.loc[keep].copy()
    s = s.loc[keep]

    out = {}

    # ---- 3) split by document-type prefix
    for doc_type, prefixes in rules.items():

        mask = s.str.startswith(prefixes, na=False)
        df_type = df_m.loc[mask].copy()

        if df_type.empty:
            out[doc_type] = df_type
            if verbose:
                print(f"[{source}] {doc_type}: 0 rows")
            continue

        # ---- 4) build summary (VAT logic by TAXIC)
        summ = build_bill_summary_by_taxic(
            df_type,
            billno_col=billno_col,
            billdate_col=date_col,
        )

        # ---- 5) map SIMAS fields
        summ = map_simas_bill_fields(
            summ,
            df_simas,
            billno_col=billno_col,
            fields=("DEDUCT", "TAX", "AFTERTAX"),
            verbose=verbose,
        )

        out[doc_type] = summ

        if verbose:
            print(f"[{source}] {doc_type}: {len(df_type):,} rows -> {len(summ):,} bills")

    return out

In [79]:
# Working copies of the sales line tables (SIDET), one per source; the
# frames held in `data` stay untouched.
df_hq_sidet, df_syp_sidet = (
    data[csv_name].copy()
    for csv_name in ("raw_hq_sidet_sales_lines.csv", "raw_syp_sidet_sales_lines.csv")
)

# Working copies of the sales bill tables (SIMAS), one per source.
df_hq_simas, df_syp_simas = (
    data[csv_name].copy()
    for csv_name in ("raw_hq_simas_sales_bills.csv", "raw_syp_simas_sales_bills.csv")
)

In [80]:
# Parse HQ bill dates; unparseable values become NaT instead of raising.
dt = pd.to_datetime(df_hq_sidet["BILLDATE"], errors="coerce")

# The latest date carries both the max year and the max month within that year.
latest = dt.max()  # NaT values are skipped
if pd.isna(latest):
    raise ValueError("No parseable BILLDATE values in df_hq_sidet")

# Plain ints on purpose: when NaT is present, .dt.year/.dt.month are floats,
# which would break the "{month:02d}" formatting and the output folder name.
YEAR  = int(latest.year)
MONTH = int(latest.month)

print(YEAR, MONTH)

2026 2


In [81]:
# Run the monthly pipeline once per source for the selected YEAR / MONTH.
hq_summaries = build_monthly_doc_summaries(
    df_hq_sidet,
    df_hq_simas,
    source="hq",
    year=YEAR,
    month=MONTH,
)

syp_summaries = build_monthly_doc_summaries(
    df_syp_sidet,
    df_syp_simas,
    source="syp",
    year=YEAR,
    month=MONTH,
)

# Unpack the per-document-type frames into individual names.
(
    df_hq_td_summary,
    df_hq_tad_summary,
    df_hq_tr_summary,
    df_hq_cn_summary,
) = (hq_summaries[doc_type] for doc_type in ("TD", "TAD", "TR", "CN"))

(
    df_syp_td_summary,
    df_syp_tad_summary,
    df_syp_tr_summary,
    df_syp_cn_summary,
) = (syp_summaries[doc_type] for doc_type in ("TD", "TAD", "TR", "CN"))

[map_simas_bill_fields] matched rows: 4/141
[hq] TD: 315 rows -> 141 bills
[map_simas_bill_fields] matched rows: 0/539
[hq] TAD: 593 rows -> 539 bills
[map_simas_bill_fields] matched rows: 0/27
[hq] TR: 64 rows -> 27 bills
[map_simas_bill_fields] matched rows: 41/41
[hq] CN: 44 rows -> 41 bills
[syp] TD: 0 rows
[syp] TAD: 0 rows
[map_simas_bill_fields] matched rows: 0/9
[syp] TR: 37 rows -> 9 bills
[syp] CN: 0 rows


In [87]:
# Root folder of the KCW-Data shared drive (under the Drive mount point);
# the export folder below is built from it.
kcwdir = '/content/drive/Shareddrives/KCW-Data'

In [90]:
import os

# One output folder per reporting month, created on first use.
out_dir = f"{kcwdir}/kcw_analytics/04_outputs/VAT_Sales/{YEAR}_{MONTH}"
os.makedirs(out_dir, exist_ok=True)

# (frame, file stem) pairs: HQ files use the bare document type,
# SYP files use the "3"-prefixed one.
exports = list(
    zip(
        (
            df_hq_td_summary,
            df_hq_tad_summary,
            df_hq_tr_summary,
            df_hq_cn_summary,
            df_syp_td_summary,
            df_syp_tad_summary,
            df_syp_tr_summary,
            df_syp_cn_summary,
        ),
        ("TD", "TAD", "TR", "CN", "3TD", "3TAD", "3TR", "3CN"),
    )
)

# Write each frame as a UTF-8 (with BOM) CSV, without the index column.
for df, name in exports:
    path = f"{out_dir}/{name}.csv"
    df.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"Saved -> {path}")

Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/04_outputs/VAT_Sales/2026_2/TD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/04_outputs/VAT_Sales/2026_2/TAD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/04_outputs/VAT_Sales/2026_2/TR.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/04_outputs/VAT_Sales/2026_2/CN.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/04_outputs/VAT_Sales/2026_2/3TD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/04_outputs/VAT_Sales/2026_2/3TAD.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/04_outputs/VAT_Sales/2026_2/3TR.csv
Saved -> /content/drive/Shareddrives/KCW-Data/kcw_analytics/04_outputs/VAT_Sales/2026_2/3CN.csv
