In [1]:
!git clone https://github.com/pthengtr/kcw-analytics.git

Cloning into 'kcw-analytics'...
remote: Enumerating objects: 121, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 121 (delta 56), reused 68 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (121/121), 95.51 KiB | 5.03 MiB/s, done.
Resolving deltas: 100% (56/56), done.


In [2]:
!cd /content/kcw-analytics && git pull origin main

From https://github.com/pthengtr/kcw-analytics
 * branch            main       -> FETCH_HEAD
Already up to date.


In [3]:
# Mount Google Drive at /content/drive so that later cells can read and write
# files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import pandas as pd

# Directory on the mounted Drive that holds the raw CSV exports.
folder = "/content/drive/MyDrive/kcw_analytics/01_raw"

# Maps each CSV file name to the DataFrame loaded from it.
data = {}

# os.listdir() returns entries in arbitrary order; sorting makes the load order
# (and therefore the dict order and the log below) identical on every run.
for file in sorted(os.listdir(folder)):
    if file.endswith(".csv"):
        path = os.path.join(folder, file)
        data[file] = pd.read_csv(
            path,
            # Identifier columns are read as text instead of letting pandas
            # infer a numeric type for them.
            dtype={
              "BCODE": "string",
              "ITEMNO": "string",
              "BILLNO": "string",
            },
            encoding="utf-8-sig",
            low_memory=False   # parse the file in one pass instead of chunk-wise dtype guessing
        )
        print(f"Loaded: {file} -> {data[file].shape}")



Loaded: raw_hq_pimas_purchase_bills.csv -> (82716, 49)
Loaded: raw_hq_simas_sales_bills.csv -> (481937, 49)
Loaded: raw_hq_pidet_purchase_lines.csv -> (246580, 41)
Loaded: raw_hq_sidet_sales_lines.csv -> (1187205, 38)
Loaded: raw_hq_icmas_products.csv -> (114755, 94)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2721, 49)
Loaded: raw_syp_simas_sales_bills.csv -> (10368, 49)
Loaded: raw_syp_sidet_sales_lines.csv -> (30374, 38)
Loaded: raw_syp_pidet_purchase_lines.csv -> (25555, 41)
Loaded: raw_inventory_hq_2024.csv -> (4983, 8)


In [6]:
import pandas as pd
import numpy as np

def build_flat_sales_for_powerbi(
    sidet: pd.DataFrame,
    pidet: pd.DataFrame,
    *,
    sidet_date_col: str = "BILLDATE",
    pidet_date_col: str = "BILLDATE",
    bcode_col: str = "BCODE",
    billno_col: str = "BILLNO",
    amount_col: str = "AMOUNT",
    status_col: str = "STATUS",
    canceled_col: str = "CANCELED",
    qty_col: str = "QTY",
    mtp_col: str = "MTP",
    acctno_col: str = "ACCTNO",
    isvat_col: str = "ISVAT",
    status_keep: float = 1.0,               # your data shows STATUS as 1.0 / 8.0
    canceled_keep: str = "N",
    unknown_cost_value: str = "UNKNOWN",
    add_cost_numeric_cols: bool = True,     # adds COST_NUM + COST_STATUS + SALE_UNITS (recommended)
    show_progress: bool = True,             # tqdm progress bar
    progress_every: int = 1000,             # used when tqdm is not available
) -> pd.DataFrame:
    """
    Build flat, Power BI–ready sales lines with progress reporting.

    SIDET rules:
      - keep STATUS == status_keep (compared numerically) and CANCELED == canceled_keep
      - drop noise rows missing BCODE or BILLNO or AMOUNT, and rows without a parseable sale date
      - CATEGORY = first 2 chars of BCODE
      - SALE_UNITS = QTY*MTP, PRICE = AMOUNT / SALE_UNITS (infinite results become NaN)

    PIDET cost rules:
      - keep CANCELED == canceled_keep (only when PIDET has that column)
      - purchase UNITS = QTY*MTP; rows with missing or zero units are dropped
      - cumulative average cost per BCODE up to each purchase date:
          AVG_COST = CUM_AMT / CUM_UNITS

    COST assignment:
      - for each sale line, the cost is the latest AVG_COST where
        purchase BILLDATE <= sale BILLDATE (per BCODE)
      - add_cost_numeric_cols=True: COST and COST_NUM are numeric (NaN when no
        cost is found) and COST_STATUS is 'OK' or unknown_cost_value
      - add_cost_numeric_cols=False: COST is a mixed column holding the number
        or unknown_cost_value

    Parameters
    ----------
    sidet, pidet : sales-line and purchase-line frames; neither is modified.
    *_col : names of the columns to read from the two frames.
    status_keep, canceled_keep : values a sales line must carry to be kept.
    unknown_cost_value : label used when no purchase cost is available.
    add_cost_numeric_cols : also return COST_NUM, COST_STATUS and SALE_UNITS.
    show_progress : report progress with tqdm, or with periodic prints when
        tqdm cannot be used.
    progress_every : print interval (in BCODE groups) for the fallback
        reporting; values below 1 are treated as 1.

    Returns
    -------
    A new DataFrame sorted by BCODE and sale date with columns
    BCODE, CATEGORY, BILLNO, sale date, STATUS, ACCTNO, ISVAT, QTY, MTP,
    PRICE, AMOUNT, CANCELED, COST (+ COST_NUM, COST_STATUS, SALE_UNITS).

    Raises
    ------
    KeyError if a required column is absent from `sidet` or `pidet`.
    """

    # -----------------------
    # 0) Fail fast on missing columns (before any expensive work)
    # -----------------------
    required_by_frame = (
        ("sidet", sidet, [
            bcode_col, billno_col, amount_col, sidet_date_col, status_col,
            canceled_col, qty_col, mtp_col, acctno_col, isvat_col,
        ]),
        # canceled_col is optional on the purchase side (handled below)
        ("pidet", pidet, [bcode_col, amount_col, qty_col, mtp_col, pidet_date_col]),
    )
    for frame_name, frame, required in required_by_frame:
        absent = [c for c in required if c not in frame.columns]
        if absent:
            raise KeyError(f"Missing expected columns in {frame_name}: {absent}")

    # Work on copies so the caller's frames are never mutated
    s = sidet.copy()
    p = pidet.copy()

    # -----------------------
    # 1) SIDET cleaning + drop noise
    # -----------------------
    s[bcode_col] = s[bcode_col].astype("string").str.strip()
    s[billno_col] = s[billno_col].astype("string").str.strip()

    s[amount_col] = pd.to_numeric(s[amount_col], errors="coerce")
    s[sidet_date_col] = pd.to_datetime(s[sidet_date_col], errors="coerce")

    # Drop noise rows (your rule)
    s = s[
        s[bcode_col].notna() & (s[bcode_col] != "") &
        s[billno_col].notna() & (s[billno_col] != "") &
        s[amount_col].notna()
    ].copy()

    # Must have sale date
    s = s[s[sidet_date_col].notna()].copy()

    # Normalize flags
    s[canceled_col] = s[canceled_col].astype("string").str.strip().str.upper()
    s[status_col] = pd.to_numeric(s[status_col], errors="coerce")

    # Enforce SIDET filters (your rule)
    s = s[(s[status_col] == status_keep) & (s[canceled_col] == canceled_keep)].copy()

    # Derived fields
    s["CATEGORY"] = s[bcode_col].astype(str).str[:2]

    s[qty_col] = pd.to_numeric(s[qty_col], errors="coerce")
    s[mtp_col] = pd.to_numeric(s[mtp_col], errors="coerce")

    s["SALE_UNITS"] = s[qty_col] * s[mtp_col]
    # Division by zero units yields +/-inf; report those as missing instead
    s["PRICE"] = (s[amount_col] / s["SALE_UNITS"]).replace([np.inf, -np.inf], np.nan)

    # -----------------------
    # 2) PIDET cleaning + cumulative avg cost by units (QTY*MTP)
    # -----------------------
    p[bcode_col] = p[bcode_col].astype("string").str.strip()
    p[amount_col] = pd.to_numeric(p[amount_col], errors="coerce")
    p[qty_col] = pd.to_numeric(p[qty_col], errors="coerce")
    p[mtp_col] = pd.to_numeric(p[mtp_col], errors="coerce")
    p[pidet_date_col] = pd.to_datetime(p[pidet_date_col], errors="coerce")

    if canceled_col in p.columns:
        p[canceled_col] = p[canceled_col].astype("string").str.strip().str.upper()
        p = p[p[canceled_col] == canceled_keep].copy()

    # Required fields + non-zero units
    p["PUR_UNITS"] = p[qty_col] * p[mtp_col]
    p = p[
        p[bcode_col].notna() & (p[bcode_col] != "") &
        p[pidet_date_col].notna() &
        p[amount_col].notna() &
        p["PUR_UNITS"].notna() & (p["PUR_UNITS"] != 0)
    ].copy()

    # Sort for cumulative stats (stable sort keeps same-day purchases in input order)
    p = p.sort_values([bcode_col, pidet_date_col], kind="mergesort").reset_index(drop=True)

    p["CUM_UNITS"] = p.groupby(bcode_col, sort=False)["PUR_UNITS"].cumsum()
    p["CUM_AMT"] = p.groupby(bcode_col, sort=False)[amount_col].cumsum()
    p["AVG_COST"] = (p["CUM_AMT"] / p["CUM_UNITS"]).replace([np.inf, -np.inf], np.nan)

    # -----------------------
    # 3) COST assignment (searchsorted per BCODE) + progress
    # -----------------------
    s = s.sort_values([bcode_col, sidet_date_col], kind="mergesort").reset_index(drop=True)
    cost_num = np.full(len(s), np.nan, dtype="float64")

    # Extract the arrays once; inside the loop only positional slices are taken,
    # which avoids building one DataFrame / one .loc lookup per BCODE.
    sale_dates = s[sidet_date_col].to_numpy(dtype="datetime64[ns]")
    pur_dates = p[pidet_date_col].to_numpy(dtype="datetime64[ns]")
    pur_costs = p["AVG_COST"].to_numpy(dtype="float64")

    # dict: bcode -> positional row numbers (ascending, hence date-ordered
    # because both frames were sorted by BCODE then date above)
    groups = s.groupby(bcode_col, sort=False).indices
    p_groups = p.groupby(bcode_col, sort=False).indices

    use_tqdm = False
    if show_progress:
        try:
            from tqdm import tqdm  # type: ignore
            use_tqdm = True
            iterator = tqdm(groups.keys(), total=len(groups), desc="Assigning COST (BCODE groups)")
        except Exception:
            # Progress reporting is best-effort: fall back to periodic prints
            use_tqdm = False
            iterator = groups.keys()
    else:
        iterator = groups.keys()

    # Guard the modulo below against progress_every <= 0
    progress_step = max(int(progress_every), 1)

    for i, bcode in enumerate(iterator, start=1):
        if (not use_tqdm) and show_progress and (i % progress_step == 0):
            print(f"Processed {i:,}/{len(groups):,} BCODE groups...")

        p_idx = p_groups.get(bcode)
        if p_idx is None or len(p_idx) == 0:
            continue  # product was never purchased -> cost stays NaN

        s_idx = groups[bcode]

        # Position of the last purchase dated on or before each sale; -1 = none
        pos = np.searchsorted(pur_dates[p_idx], sale_dates[s_idx], side="right") - 1
        valid = pos >= 0
        if np.any(valid):
            cost_num[s_idx[valid]] = pur_costs[p_idx[pos[valid]]]

    # Attach COST columns
    if add_cost_numeric_cols:
        s["COST_NUM"] = cost_num
        s["COST_STATUS"] = np.where(np.isfinite(s["COST_NUM"]), "OK", unknown_cost_value)
        # keep COST numeric for Power BI calculations; unknowns stay NaN
        s["COST"] = s["COST_NUM"]
    else:
        # Mixed type column: numbers + UNKNOWN string
        s["COST"] = np.where(np.isfinite(cost_num), cost_num.astype(object), unknown_cost_value)

    # -----------------------
    # 4) Final output columns
    # -----------------------
    final_cols = [
        bcode_col,
        "CATEGORY",
        billno_col,
        sidet_date_col,
        status_col,
        acctno_col,
        isvat_col,
        qty_col,
        mtp_col,
        "PRICE",
        amount_col,
        canceled_col,
        "COST",
    ]

    if add_cost_numeric_cols:
        final_cols += ["COST_NUM", "COST_STATUS", "SALE_UNITS"]

    # Every name in final_cols was either validated in step 0 or created above
    return s[final_cols].copy()


In [7]:
# Build the flat sales table from the HQ sales lines, costing each line from
# the HQ purchase lines.
flat_sales = build_flat_sales_for_powerbi(
    pidet=data["raw_hq_pidet_purchase_lines.csv"],
    sidet=data["raw_hq_sidet_sales_lines.csv"],
)

# Sanity checks: row count, then the value distribution of each flag column.
print("Rows output:", len(flat_sales))
for flag_col in ("STATUS", "CANCELED", "COST_STATUS"):
    print(flat_sales[flag_col].value_counts(dropna=False).head())


Assigning COST (BCODE groups): 100%|██████████| 32221/32221 [00:19<00:00, 1649.21it/s]


Rows output: 1129590
STATUS
1.0    1129590
Name: count, dtype: int64
CANCELED
N    1129590
Name: count, dtype: Int64
COST_STATUS
OK         1073156
UNKNOWN      56434
Name: count, dtype: int64


In [15]:
# Preview the first five rows of the flat sales table.
flat_sales.head(5)

Unnamed: 0,BCODE,CATEGORY,BILLNO,BILLDATE,STATUS,ACCTNO,ISVAT,QTY,MTP,PRICE,AMOUNT,CANCELED,COST,COST_NUM,COST_STATUS,SALE_UNITS
0,.,.,8C68-0000898,2025-05-19,1.0,กม,N,,1.0,,0.0,N,,,UNKNOWN,
1,01010023,01,8K62-00901,2019-01-26,1.0,,N,2.0,1.0,100.0,200.0,N,,,UNKNOWN,2.0
2,01010023,01,8K64-09980,2021-04-29,1.0,จอ,N,8.0,1.0,140.0,1120.0,N,78.508,78.508,OK,8.0
3,01010023,01,8K64-21472,2021-09-06,1.0,,N,8.0,1.0,130.0,1040.0,N,74.879,74.879,OK,8.0
4,01010023,01,TF6805-217,2025-05-30,1.0,KCW1,N,6.0,1.0,76.0,456.0,N,75.252667,75.252667,OK,6.0


In [17]:
# Write the flat sales table to the curated folder on Drive. The row index is
# omitted and the file is written as UTF-8 with a BOM.
flat_sales.to_csv(
    "/content/drive/MyDrive/kcw_analytics/03_curated/clean_sales_flat.csv",
    encoding="utf-8-sig",
    index=False,
)


In [20]:
import os

# Location of the exported CSV to verify.
path = "/content/drive/MyDrive/kcw_analytics/03_curated/clean_sales_flat.csv"

# Report whether the export is present on Drive.
export_present = os.path.exists(path)
print("Exists:", export_present)

# List the entries of the project root folder.
print("Folder content:")
root_entries = os.listdir("/content/drive/MyDrive/kcw_analytics")
print(root_entries)

Exists: True
Folder content:
['02_staging', '03_curated', '04_outputs', '05_powerbi', '01_raw', '00_fonts']


In [21]:
# Search up to three directory levels below the project folder for regular
# files whose name matches "*clean_sales_flat*.csv" (case-insensitive) and
# print the path of each match.
!find "/content/drive/MyDrive/kcw_analytics" -maxdepth 3 -type f -iname "*clean_sales_flat*.csv" -print


/content/drive/MyDrive/kcw_analytics/03_curated/clean_sales_flat.csv
