In [41]:
!git clone https://github.com/pthengtr/kcw-analytics.git

fatal: destination path 'kcw-analytics' already exists and is not an empty directory.


In [42]:
!cd /content/kcw-analytics && git pull origin main

From https://github.com/pthengtr/kcw-analytics
 * branch            main       -> FETCH_HEAD
Already up to date.


In [43]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
import os
import pandas as pd

folder = "/content/drive/MyDrive/kcw_analytics/01_raw"

data = {}

for file in os.listdir(folder):
    if file.endswith(".csv"):
        path = os.path.join(folder, file)
        data[file] = pd.read_csv(
            path,
            dtype={
              "BCODE": "string",
              "ITEMNO": "string",
              "BILLNO": "string",
            },
            encoding="utf-8-sig",
            low_memory=False   # stops chunk guessing
        )
        print(f"Loaded: {file} -> {data[file].shape}")



Loaded: raw_hq_pimas_purchase_bills.csv -> (82716, 49)
Loaded: raw_hq_simas_sales_bills.csv -> (481937, 49)
Loaded: raw_hq_pidet_purchase_lines.csv -> (246580, 41)
Loaded: raw_hq_sidet_sales_lines.csv -> (1187205, 38)
Loaded: raw_hq_icmas_products.csv -> (114755, 94)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2721, 49)
Loaded: raw_syp_simas_sales_bills.csv -> (10368, 49)
Loaded: raw_syp_sidet_sales_lines.csv -> (30374, 38)
Loaded: raw_syp_pidet_purchase_lines.csv -> (25555, 41)
Loaded: raw_inventory_hq_2024.csv -> (4983, 8)


In [45]:
hq_sales_lines = data['raw_hq_sidet_sales_lines.csv'].copy()
syp_sales_lines = data['raw_syp_sidet_sales_lines.csv'].copy()
purchase_lines = data['raw_hq_pidet_purchase_lines.csv'].copy()

In [46]:
hq_sales_lines.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'BILLTYPE', 'BILLDATE',
       'BILLNO', 'LINE', 'ITEMNO', 'BCODE', 'PCODE', 'MCODE', 'DETAIL',
       'WHNUMBER', 'LOCATION1', 'STATUS', 'SERIAL', 'TAXIC', 'EXMPT', 'ISVAT',
       'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'CHGAMT', 'ACCTNO', 'PAID',
       'ACCT_NO', 'DONE', 'CANCELED'],
      dtype='object')

In [47]:
import pandas as pd
import numpy as np
import re

_BCODE_RE = re.compile(r"^\d{8}$")

def remove_invalid_bcode(df: pd.DataFrame, *, bcode_col: str = "BCODE"):
    """
    Rule:
    - strip whitespace
    - BCODE must be exactly 8 digits (e.g., 22010585)
    Output:
    - clean_df, removed_idx
    """
    out = df.copy()

    b = out[bcode_col].astype("string").str.strip()
    out[bcode_col] = b  # keep cleaned BCODE

    valid = b.fillna("").str.match(_BCODE_RE)
    removed_idx = out.index[~valid].tolist()

    clean_df = out.loc[valid].copy()
    return clean_df, removed_idx


def _to_numeric_clean(series: pd.Series) -> pd.Series:
    """
    Convert common messy numeric strings to numbers.
    Handles: whitespace, commas. Non-convertible -> NaN.
    """
    s = series.astype("string").str.strip()
    s = s.str.replace(",", "", regex=False)  # "1,234.50" -> "1234.50"
    return pd.to_numeric(s, errors="coerce")


def remove_non_numeric_price_or_amount(
    df: pd.DataFrame,
    *,
    price_col: str = "PRICE",
    amount_col: str = "AMOUNT",
):
    """
    Rule:
    - PRICE must be numeric
    - AMOUNT must be numeric
    Output:
    - clean_df, removed_idx
    """
    out = df.copy()

    price_num = _to_numeric_clean(out[price_col])
    amount_num = _to_numeric_clean(out[amount_col])

    valid = price_num.notna() & amount_num.notna()

    # optionally overwrite with numeric versions (so downstream is safe)
    out[price_col] = price_num
    out[amount_col] = amount_num

    removed_idx = out.index[~valid].tolist()
    clean_df = out.loc[valid].copy()
    return clean_df, removed_idx


def remove_canceled_lines(df: pd.DataFrame, *, canceled_col: str = "CANCELED"):
    """
    Rule:
    - remove rows where CANCELED == 'Y' (case/whitespace insensitive)
    Output:
    - clean_df, removed_idx
    """
    out = df.copy()
    c = out[canceled_col].astype("string").str.strip().str.upper()

    is_canceled = c == "Y"
    removed_idx = out.index[is_canceled].tolist()

    clean_df = out.loc[~is_canceled].copy()
    return clean_df, removed_idx


In [48]:
!pip install tqdm



In [49]:
def enrich_sales_with_last_purchase_cost(
    sales: pd.DataFrame,
    purchases: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
    sale_date_col: str = "BILLDATE",
    purch_date_col: str = "BILLDATE",
    qty_col: str = "QTY",
    mtp_col: str = "MTP",
    amount_col: str = "AMOUNT",
    out_cost_col: str = "LAST_PURCHASE_COST",
    out_pdate_col: str = "LAST_PURCHASE_DATE",
    out_status_col: str = "COST_STATUS",
) -> pd.DataFrame:

    s = sales.copy()
    p = purchases.copy()

    s[bcode_col] = s[bcode_col].astype("string").str.strip()
    p[bcode_col] = p[bcode_col].astype("string").str.strip()

    s[sale_date_col] = pd.to_datetime(s[sale_date_col], errors="coerce")
    p[purch_date_col] = pd.to_datetime(p[purch_date_col], errors="coerce")

    denom = p[qty_col].astype(float) * p[mtp_col].astype(float)
    p["_UNIT_COST"] = np.where(denom != 0, p[amount_col].astype(float) / denom, np.nan)

    # Keep only valid purchases
    p = p[p[purch_date_col].notna() & p["_UNIT_COST"].notna()].copy()

    # ✅ Create a separate right-side date column so we don't overwrite sale BILLDATE
    p["_PURCH_DATE"] = p[purch_date_col]

    s["_POS"] = np.arange(len(s))
    s_valid = s[s[sale_date_col].notna()].copy()
    s_invalid = s[s[sale_date_col].isna()].copy()

    s_valid = s_valid.sort_values([sale_date_col, bcode_col, "_POS"], kind="mergesort")
    p = p.sort_values(["_PURCH_DATE", bcode_col], kind="mergesort")

    merged = pd.merge_asof(
        s_valid,
        p[[bcode_col, "_PURCH_DATE", "_UNIT_COST"]],
        left_on=sale_date_col,
        right_on="_PURCH_DATE",
        by=bcode_col,
        direction="backward",
        allow_exact_matches=True,
    )

    merged.rename(columns={"_UNIT_COST": out_cost_col, "_PURCH_DATE": out_pdate_col}, inplace=True)
    merged[out_status_col] = np.where(merged[out_cost_col].notna(), "OK", "UNKNOWN")

    if len(s_invalid) > 0:
        s_invalid[out_cost_col] = np.nan
        s_invalid[out_pdate_col] = pd.NaT
        s_invalid[out_status_col] = "UNKNOWN"
        merged = pd.concat([merged, s_invalid], ignore_index=False)

    merged = merged.sort_values("_POS", kind="mergesort").drop(columns=["_POS"])
    return merged


In [50]:
syp_sales_lines_cleaned, removed_bcode = remove_invalid_bcode(syp_sales_lines)
syp_sales_lines_cleaned, removed_nonnum = remove_non_numeric_price_or_amount(syp_sales_lines_cleaned)
syp_sales_lines_cleaned, removed_canceled = remove_canceled_lines(syp_sales_lines_cleaned)

total_rows = len(syp_sales_lines)

def pct(n, total):
    return (n / total * 100) if total else 0

print(f"Invalid BCODE removed: {len(removed_bcode)} ({pct(len(removed_bcode), total_rows):.2f}%)")
print(f"Non-numeric PRICE/AMOUNT removed: {len(removed_nonnum)} ({pct(len(removed_nonnum), total_rows):.2f}%)")
print(f"Canceled lines removed: {len(removed_canceled)} ({pct(len(removed_canceled), total_rows):.2f}%)")

total_removed = total_rows - len(syp_sales_lines_cleaned)
print(f"SYP Total removed: {total_removed} ({pct(total_removed, total_rows):.2f}%)")

Invalid BCODE removed: 6 (0.02%)
Non-numeric PRICE/AMOUNT removed: 4 (0.01%)
Canceled lines removed: 0 (0.00%)
SYP Total removed: 10 (0.03%)


In [51]:
hq_sales_lines_cleaned, removed_bcode = remove_invalid_bcode(hq_sales_lines)
hq_sales_lines_cleaned, removed_nonnum = remove_non_numeric_price_or_amount(hq_sales_lines_cleaned)
hq_sales_lines_cleaned, removed_canceled = remove_canceled_lines(hq_sales_lines_cleaned)

total_rows = len(hq_sales_lines)

def pct(n, total):
    return (n / total * 100) if total else 0

print(f"Invalid BCODE removed: {len(removed_bcode)} ({pct(len(removed_bcode), total_rows):.2f}%)")
print(f"Non-numeric PRICE/AMOUNT removed: {len(removed_nonnum)} ({pct(len(removed_nonnum), total_rows):.2f}%)")
print(f"Canceled lines removed: {len(removed_canceled)} ({pct(len(removed_canceled), total_rows):.2f}%)")

total_removed = total_rows - len(hq_sales_lines_cleaned)
print(f"HQ Total removed: {total_removed} ({pct(total_removed, total_rows):.2f}%)")

Invalid BCODE removed: 28958 (2.44%)
Non-numeric PRICE/AMOUNT removed: 553 (0.05%)
Canceled lines removed: 1698 (0.14%)
HQ Total removed: 31209 (2.63%)


In [52]:
purchase_lines_cleaned, removed_bcode = remove_invalid_bcode(purchase_lines)
purchase_lines_cleaned, removed_nonnum = remove_non_numeric_price_or_amount(purchase_lines_cleaned)
purchase_lines_cleaned, removed_canceled = remove_canceled_lines(purchase_lines_cleaned)

total_rows = len(purchase_lines)

def pct(n, total):
    return (n / total * 100) if total else 0

print(f"Invalid BCODE removed: {len(removed_bcode)} ({pct(len(removed_bcode), total_rows):.2f}%)")
print(f"Non-numeric PRICE/AMOUNT removed: {len(removed_nonnum)} ({pct(len(removed_nonnum), total_rows):.2f}%)")
print(f"Canceled lines removed: {len(removed_canceled)} ({pct(len(removed_canceled), total_rows):.2f}%)")

total_removed = total_rows - len(purchase_lines_cleaned)
print(f"Purchase HQ Total removed: {total_removed} ({pct(total_removed, total_rows):.2f}%)")

Invalid BCODE removed: 18311 (7.43%)
Non-numeric PRICE/AMOUNT removed: 1386 (0.56%)
Canceled lines removed: 13 (0.01%)
Purchase HQ Total removed: 19710 (7.99%)


In [53]:
hq_sales_enriched = enrich_sales_with_last_purchase_cost(
    hq_sales_lines_cleaned,
    purchase_lines_cleaned,
)

# QC
total = len(hq_sales_enriched)
unk = (hq_sales_enriched["COST_STATUS"] == "UNKNOWN").sum()
print(f"UNKNOWN: {unk} ({unk/total*100:.2f}%)")



UNKNOWN: 69015 (5.97%)


In [54]:
syp_sales_enriched = enrich_sales_with_last_purchase_cost(
    syp_sales_lines_cleaned,
    purchase_lines_cleaned,
)

# QC
total = len(syp_sales_enriched)
unk = (syp_sales_enriched["COST_STATUS"] == "UNKNOWN").sum()
print(f"UNKNOWN: {unk} ({unk/total*100:.2f}%)")

UNKNOWN: 465 (1.53%)


In [55]:
hq_sales_enriched["BRANCH"] = "HQ"
syp_sales_enriched["BRANCH"] = "SYP"

In [56]:
sales_all = pd.concat([hq_sales_enriched, syp_sales_enriched], ignore_index=True)

sales_all["BRANCH"] = sales_all["BRANCH"].astype("string")
sales_all["LAST_PURCHASE_COST"] = pd.to_numeric(sales_all["LAST_PURCHASE_COST"], errors="coerce")
sales_all["BILLDATE"] = pd.to_datetime(sales_all["BILLDATE"], errors="coerce")

sales_all["BRANCH_BILLNO"] = sales_all["BRANCH"] + "-" + sales_all["BILLNO"].astype(str)


In [57]:
import pandas as pd
import numpy as np

def _clean_str(s: pd.Series) -> pd.Series:
    return s.astype("string").str.strip()

def _to_dt(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s, errors="coerce")

# ------------------------
# DimDate
# ------------------------
def build_dim_date_from_sales(sales_all: pd.DataFrame, *, date_col: str = "BILLDATE") -> pd.DataFrame:
    d = _to_dt(sales_all[date_col]).dropna().dt.normalize()
    if d.empty:
        return pd.DataFrame(columns=["Date", "DateKey", "Year", "Month", "Day", "YearMonth", "Quarter", "WeekNum"])

    date_range = pd.date_range(d.min(), d.max(), freq="D")
    dim = pd.DataFrame({"Date": date_range})
    dim["DateKey"] = dim["Date"].dt.strftime("%Y%m%d").astype(int)
    dim["Year"] = dim["Date"].dt.year
    dim["Month"] = dim["Date"].dt.month
    dim["Day"] = dim["Date"].dt.day
    dim["YearMonth"] = dim["Date"].dt.strftime("%Y-%m")
    dim["Quarter"] = dim["Date"].dt.quarter
    dim["WeekNum"] = dim["Date"].dt.isocalendar().week.astype(int)
    return dim

# ------------------------
# DimBranch
# ------------------------
def build_dim_branch(sales_all: pd.DataFrame, *, branch_col: str = "BRANCH") -> pd.DataFrame:
    dim = pd.DataFrame({"BRANCH": _clean_str(sales_all[branch_col])}).dropna()
    dim = dim[dim["BRANCH"] != ""].drop_duplicates().sort_values("BRANCH").reset_index(drop=True)
    dim["BranchKey"] = dim["BRANCH"]
    return dim

# ------------------------
# DimProduct (BCODE)
# ------------------------
def build_dim_product(
    sales_all: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
    detail_col: str = "DETAIL",
    ui_col: str = "UI",
    last_seen_date_col: str = "BILLDATE",
) -> pd.DataFrame:
    df = sales_all.copy()
    df[bcode_col] = _clean_str(df[bcode_col])
    df[detail_col] = _clean_str(df.get(detail_col, ""))
    df[ui_col] = _clean_str(df.get(ui_col, ""))
    df[last_seen_date_col] = _to_dt(df[last_seen_date_col])

    df = df[df[bcode_col].notna() & (df[bcode_col] != "")]
    df = df.sort_values([bcode_col, last_seen_date_col], kind="mergesort")
    last = df.groupby(bcode_col, sort=False).tail(1)

    dim = pd.DataFrame({
        "BCODE": last[bcode_col],
        "DETAIL": last.get(detail_col, pd.Series([pd.NA]*len(last))),
        "UI": last.get(ui_col, pd.Series([pd.NA]*len(last))),
        "LastSeenDate": last[last_seen_date_col].dt.normalize(),
    }).reset_index(drop=True)

    dim["ProductKey"] = dim["BCODE"]
    # add CATEGORY_CODE (first 2 digits) for easy relationship too
    dim["CATEGORY_CODE"] = dim["BCODE"].astype("string").str.slice(0, 2)
    return dim

# ------------------------
# DimCategory (first 2 digits of BCODE)
# ------------------------
def build_dim_category_from_bcode(
    sales_all: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
) -> pd.DataFrame:
    b = _clean_str(sales_all[bcode_col])
    cat = b.dropna().str.slice(0, 2)
    # keep only exactly 2 digits
    cat = cat[cat.str.match(r"^\d{2}$", na=False)]

    dim = pd.DataFrame({"CATEGORY_CODE": cat}).drop_duplicates().sort_values("CATEGORY_CODE").reset_index(drop=True)
    dim["CategoryKey"] = dim["CATEGORY_CODE"]
    return dim

# ------------------------
# DimCustomer (ACCTNO)
# ------------------------
def build_dim_customer(
    sales_all: pd.DataFrame,
    *,
    customer_col: str = "ACCTNO",
) -> pd.DataFrame:
    c = _clean_str(sales_all.get(customer_col, pd.Series([pd.NA]*len(sales_all))))
    dim = pd.DataFrame({"CUSTOMER_ACCTNO": c}).dropna()
    dim = dim[dim["CUSTOMER_ACCTNO"] != ""].drop_duplicates().sort_values("CUSTOMER_ACCTNO").reset_index(drop=True)
    dim["CustomerKey"] = dim["CUSTOMER_ACCTNO"]
    return dim

# ------------------------
# DimSupplier (ACCT_NO)
# ------------------------
def build_dim_supplier(
    sales_all: pd.DataFrame,
    *,
    supplier_col: str = "ACCT_NO",
) -> pd.DataFrame:
    s = _clean_str(sales_all.get(supplier_col, pd.Series([pd.NA]*len(sales_all))))
    dim = pd.DataFrame({"SUPPLIER_ACCT_NO": s}).dropna()
    dim = dim[dim["SUPPLIER_ACCT_NO"] != ""].drop_duplicates().sort_values("SUPPLIER_ACCT_NO").reset_index(drop=True)
    dim["SupplierKey"] = dim["SUPPLIER_ACCT_NO"]
    return dim

# ------------------------
# Wrapper
# ------------------------
def build_all_dims_for_powerbi_v2(sales_all: pd.DataFrame):
    return {
        "dim_date": build_dim_date_from_sales(sales_all),
        "dim_branch": build_dim_branch(sales_all),
        "dim_product": build_dim_product(sales_all),
        "dim_category": build_dim_category_from_bcode(sales_all),
        "dim_customer": build_dim_customer(sales_all),
        "dim_supplier": build_dim_supplier(sales_all),
    }


In [58]:
dims = build_all_dims_for_powerbi_v2(sales_all)
{k: v.shape for k, v in dims.items()}

{'dim_date': (3872, 8),
 'dim_branch': (2, 2),
 'dim_product': (33839, 6),
 'dim_category': (36, 2),
 'dim_customer': (2175, 2),
 'dim_supplier': (768, 2)}

In [59]:
import os
out_dir = "/content/drive/MyDrive/kcw_analytics/03_curated"
os.makedirs(out_dir, exist_ok=True)

for name, df in dims.items():
    df.to_csv(f"{out_dir}/{name}.csv", index=False, encoding="utf-8-sig")

sales_all.to_csv(
    f"{out_dir}/fact_sales_all.csv",
    index=False,
    encoding="utf-8-sig"   # important for Thai + Excel
)


**DEBUG**

In [60]:
syp_sales_enriched

Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,CHGAMT,ACCTNO,PAID,ACCT_NO,DONE,CANCELED,LAST_PURCHASE_DATE,LAST_PURCHASE_COST,COST_STATUS,BRANCH
13,9,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000001,10,,15018750,...,,,Y,7VP,N,N,2025-05-15,109.350000,OK,SYP
11,10,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000001,20,,15013500,...,,,Y,7VP,N,N,2025-06-13,51.400000,OK,SYP
6,12,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000002,10,,13022630,...,,,Y,7STKG,N,N,2025-05-05,116.912500,OK,SYP
5,13,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000002,20,,12010135,...,,,Y,CRRF,N,N,2025-06-16,115.000000,OK,SYP
9,14,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000003,10,,14050200,...,,,Y,7BONUS,N,N,2025-06-10,30.373750,OK,SYP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30329,1835286,2,SJ,2026-01-26 00:00:00,1,2026-01-26,33K69-0000573,50,,13052292,...,,,Y,TOP,N,N,2026-01-15,0.000000,OK,SYP
30347,1835287,2,SJ,2026-01-26 00:00:00,1,2026-01-26,33K69-0000573,60,,22050095,...,,,Y,7PZ,N,N,2025-12-15,77.876583,OK,SYP
30338,1835288,2,SJ,2026-01-26 00:00:00,1,2026-01-26,32K69-0000601,10,,17008000,...,,,Y,LK,N,N,2025-08-20,0.420560,OK,SYP
30339,1835289,2,SJ,2026-01-26 00:00:00,1,2026-01-26,32K69-0000601,20,,17008550,...,,,Y,ANP,N,N,2025-07-30,3.180000,OK,SYP


In [61]:
df_unknown = syp_sales_enriched[syp_sales_enriched["LAST_PURCHASE_COST"].isna()]
df_unknown


Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,CHGAMT,ACCTNO,PAID,ACCT_NO,DONE,CANCELED,LAST_PURCHASE_DATE,LAST_PURCHASE_COST,COST_STATUS,BRANCH
77,20050,2,SJ,2025-06-25 00:00:00,1,2025-06-25,3K68-0000032,10,,70010011,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
383,70107,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000122,10,,13010000,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
384,70133,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000135,50,,13010000,...,,ชย,Y,,N,N,NaT,,UNKNOWN,SYP
447,70155,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000143,20,,70010011,...,,ชตขพ,Y,,N,N,NaT,,UNKNOWN,SYP
467,80288,2,SJ,2025-07-02 00:00:00,1,2025-07-02,3K68-0000153,10,,08054857,...,,ชล,Y,,N,N,NaT,,UNKNOWN,SYP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30152,1825236,2,SJ,2026-01-24 00:00:00,1,2026-01-24,33K69-0000539,10,,05057622,...,,ชคร,Y,7YRR,N,N,NaT,,UNKNOWN,SYP
30153,1825359,0,SJ,2026-01-24 00:00:00,1,2026-01-24,3TAR6901-165,20,2,05057622,...,,7000,N,,N,N,NaT,,UNKNOWN,SYP
30292,1825378,2,SJ,2026-01-24 00:00:00,1,2026-01-24,33K69-0000553,20,,70010400,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
30362,1835224,2,SJ,2026-01-26 00:00:00,1,2026-01-26,33K69-0000557,20,,70010300,...,,,Y,,N,N,NaT,,UNKNOWN,SYP


In [62]:
pidet = data["raw_hq_pidet_purchase_lines.csv"].copy()
pidet_filtered = pidet[pidet["BCODE"] == "13010000"].copy()

pidet_filtered[["BCODE", 'BILLDATE', "QTY", "MTP", "PRICE", "AMOUNT"]]

Unnamed: 0,BCODE,BILLDATE,QTY,MTP,PRICE,AMOUNT
