In [3]:
!git clone https://github.com/pthengtr/kcw-analytics.git

Cloning into 'kcw-analytics'...
remote: Enumerating objects: 184, done.[K
remote: Counting objects: 100% (184/184), done.[K
remote: Compressing objects: 100% (137/137), done.[K
remote: Total 184 (delta 103), reused 71 (delta 20), pack-reused 0 (from 0)[K
Receiving objects: 100% (184/184), 144.53 KiB | 2.13 MiB/s, done.
Resolving deltas: 100% (103/103), done.


In [4]:
!cd /content/kcw-analytics && git pull origin main

From https://github.com/pthengtr/kcw-analytics
 * branch            main       -> FETCH_HEAD
Already up to date.


In [5]:
# Mount Google Drive at /content/drive (Colab-only API); the raw CSV folder read
# below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os
import pandas as pd

# Folder on the mounted Drive that holds the raw CSV exports.
folder = "/content/drive/MyDrive/kcw_analytics/01_raw"

# Maps file name -> DataFrame, one entry per CSV found in `folder`.
data = {}

csv_files = [entry for entry in os.listdir(folder) if entry.endswith(".csv")]

for file in csv_files:
    path = os.path.join(folder, file)
    # Identifier columns are read as strings so codes keep their exact text
    # (e.g. leading zeros). low_memory=False parses each file in one pass, so
    # column dtypes are not guessed chunk by chunk.
    data[file] = pd.read_csv(
        path,
        dtype=dict(BCODE="string", ITEMNO="string", BILLNO="string"),
        encoding="utf-8-sig",
        low_memory=False,
    )
    print(f"Loaded: {file} -> {data[file].shape}")



Loaded: raw_inventory_hq_2024.csv -> (4983, 8)
Loaded: raw_hq_pimas_purchase_bills.csv -> (83130, 49)
Loaded: raw_hq_sidet_sales_lines.csv -> (1194399, 38)
Loaded: raw_hq_simas_sales_bills.csv -> (484283, 49)
Loaded: raw_hq_pidet_purchase_lines.csv -> (247915, 41)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2829, 49)
Loaded: raw_syp_simas_sales_bills.csv -> (11348, 49)
Loaded: raw_syp_pidet_purchase_lines.csv -> (26431, 41)
Loaded: raw_syp_sidet_sales_lines.csv -> (33314, 38)
Loaded: raw_hq_icmas_products.csv -> (114825, 94)


In [7]:
# Work on copies so the frames held in `data` stay untouched.
# NOTE(review): only the HQ purchase lines are used as the cost source here;
# "raw_syp_pidet_purchase_lines.csv" is loaded but not referenced — confirm intended.
hq_sales_lines, syp_sales_lines, purchase_lines = (
    data[csv_name].copy()
    for csv_name in (
        'raw_hq_sidet_sales_lines.csv',
        'raw_syp_sidet_sales_lines.csv',
        'raw_hq_pidet_purchase_lines.csv',
    )
)

In [8]:
# List the column names of the HQ sales-lines frame.
hq_sales_lines.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'BILLTYPE', 'BILLDATE',
       'BILLNO', 'LINE', 'ITEMNO', 'BCODE', 'PCODE', 'MCODE', 'DETAIL',
       'WHNUMBER', 'LOCATION1', 'STATUS', 'SERIAL', 'TAXIC', 'EXMPT', 'ISVAT',
       'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'CHGAMT', 'ACCTNO', 'PAID',
       'ACCT_NO', 'DONE', 'CANCELED'],
      dtype='object')

In [9]:
import pandas as pd
import numpy as np

def refill_last_cost_from_icmas(
    data: dict,
    df: pd.DataFrame,
    *,
    icmas_key: str = "raw_hq_icmas_products.csv",
    bcode_col: str = "BCODE",
    last_cost_col: str = "LAST_COST",
    icmas_cost_col: str = "COSTNET",
    status_col: str = "COST_STATUS",
) -> pd.DataFrame:
    """
    Refill LAST_COST when it is 0 or NaN using COSTNET from ICMAS.

    Parameters
    ----------
    data : dict
        Mapping of file name -> DataFrame; ``data[icmas_key]`` must contain
        ``bcode_col`` and ``icmas_cost_col``.
    df : pd.DataFrame
        Frame to refill; must contain ``bcode_col`` and ``last_cost_col``.
        It is not modified.
    icmas_key, bcode_col, last_cost_col, icmas_cost_col : str
        Names of the ICMAS entry and of the columns involved.
    status_col : str
        If this column exists in ``df``, it is refreshed for the rows that were
        candidates for a refill: "OK" when a cost is present afterwards,
        "UNKNOWN" otherwise. Frames without the column are left as they are.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` (fresh RangeIndex, same row count and order) with
        ``bcode_col`` stripped/upper-cased, ``last_cost_col`` numeric and
        refilled where it was 0 or missing.
    """
    # Private helper name so a column in `df` that is already called
    # `icmas_cost_col` is neither read by mistake nor dropped.
    helper_col = "_ICMAS_REFILL_COST"

    def _normalise_code(codes: pd.Series) -> pd.Series:
        # The nullable string dtype keeps missing codes missing; astype(str)
        # would turn them into literal text such as "nan" or "<NA>" that can
        # accidentally join with each other.
        return codes.astype("string").str.strip().str.upper()

    result = df.copy()
    result[bcode_col] = _normalise_code(result[bcode_col])
    result[last_cost_col] = pd.to_numeric(result[last_cost_col], errors="coerce")

    source = data[icmas_key]
    icmas = pd.DataFrame({
        bcode_col: _normalise_code(source[bcode_col]),
        helper_col: pd.to_numeric(source[icmas_cost_col], errors="coerce").astype("float64"),
    })
    # One usable cost per product: rows without a code or cost cannot refill
    # anything, and duplicate codes would multiply rows in the left merge.
    icmas = (
        icmas
        .dropna(subset=[bcode_col, helper_col])
        .drop_duplicates(subset=[bcode_col], keep="first")
    )

    result = result.merge(icmas, on=bcode_col, how="left", validate="many_to_one")

    # Rows whose current cost is missing or zero are the refill candidates.
    mask_invalid = result[last_cost_col].isna() | (result[last_cost_col] == 0)
    result.loc[mask_invalid, last_cost_col] = result.loc[mask_invalid, helper_col]

    # Keep the status flag in step with the cost that was just written.
    if status_col in result.columns:
        result.loc[mask_invalid, status_col] = np.where(
            result.loc[mask_invalid, last_cost_col].notna(), "OK", "UNKNOWN"
        )

    return result.drop(columns=[helper_col])

def qc_unknown(df, label):
    """Print the number and share of rows whose COST_STATUS is "UNKNOWN".

    `label` is echoed in square brackets at the start of the printed line.
    An empty frame reports a share of 0.
    """
    n_rows = len(df)
    n_unknown = df["COST_STATUS"].eq("UNKNOWN").sum()
    share = n_unknown / n_rows * 100 if n_rows else 0
    print(f"[{label}] UNKNOWN: {n_unknown:,} / {n_rows:,} ({share:.2f}%)")

In [10]:
import pandas as pd
import numpy as np
import re

# A valid BCODE is exactly eight digits, e.g. 22010585.
_BCODE_RE = re.compile(r"^\d{8}$")


def remove_invalid_bcode(df: pd.DataFrame, *, bcode_col: str = "BCODE"):
    """
    Drop rows whose BCODE is not exactly 8 digits.

    The BCODE column is converted to the string dtype and stripped of
    surrounding whitespace before the check; the stripped value is kept in
    the output. Missing codes count as invalid.

    Returns
    -------
    (clean_df, removed_idx)
        clean_df    : copy of the rows that passed the check
        removed_idx : list of index labels of the rows that were dropped
    """
    cleaned = df.copy()
    cleaned[bcode_col] = cleaned[bcode_col].astype("string").str.strip()

    # Missing codes become "" so they fail the pattern instead of yielding NA.
    is_valid = cleaned[bcode_col].fillna("").str.match(_BCODE_RE)

    rejected = cleaned.index[~is_valid].tolist()
    return cleaned.loc[is_valid].copy(), rejected


def _to_numeric_clean(series: pd.Series) -> pd.Series:
    """
    Convert common messy numeric strings to numbers.
    Handles: whitespace, commas. Non-convertible -> NaN.
    """
    s = series.astype("string").str.strip()
    s = s.str.replace(",", "", regex=False)  # "1,234.50" -> "1234.50"
    return pd.to_numeric(s, errors="coerce")


def remove_non_numeric_price_or_amount(
    df: pd.DataFrame,
    *,
    price_col: str = "PRICE",
    amount_col: str = "AMOUNT",
):
    """
    Drop rows where PRICE or AMOUNT cannot be parsed as a number.

    Both columns are overwritten with their parsed numeric values so that
    downstream code can rely on numeric dtypes.

    Returns
    -------
    (clean_df, removed_idx)
        clean_df    : copy of the rows where both values are numeric
        removed_idx : list of index labels of the rows that were dropped
    """
    # Parse both columns up front, from the untouched input.
    parsed = {col: _to_numeric_clean(df[col]) for col in (price_col, amount_col)}
    keep = parsed[price_col].notna() & parsed[amount_col].notna()

    cleaned = df.copy()
    for col, values in parsed.items():
        cleaned[col] = values

    dropped = cleaned.index[~keep].tolist()
    return cleaned.loc[keep].copy(), dropped


def remove_canceled_lines(df: pd.DataFrame, *, canceled_col: str = "CANCELED"):
    """
    Remove rows flagged as canceled.

    Rule:
    - a row is canceled when CANCELED == 'Y' (case/whitespace insensitive)
    - a missing CANCELED value is treated as "not canceled" and the row is kept

    Output:
    - clean_df    : copy of the rows that are not canceled
    - removed_idx : list of index labels of the canceled rows
    """
    out = df.copy()
    flag = out[canceled_col].astype("string").str.strip().str.upper()

    # Comparing a nullable string column yields NA for missing values. Left
    # as-is, those NA entries are treated as False by both selections below,
    # so the row would vanish from clean_df without being listed in
    # removed_idx. Resolve them to a plain boolean first.
    is_canceled = (flag == "Y").fillna(False).astype(bool)

    removed_idx = out.index[is_canceled].tolist()
    clean_df = out.loc[~is_canceled].copy()
    return clean_df, removed_idx


In [11]:
!pip install tqdm



In [12]:
def enrich_sales_with_last_purchase_cost(
    sales: pd.DataFrame,
    purchases: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
    sale_date_col: str = "BILLDATE",
    purch_date_col: str = "BILLDATE",
    qty_col: str = "QTY",
    mtp_col: str = "MTP",
    amount_col: str = "AMOUNT",
    out_cost_col: str = "LAST_PURCHASE_COST",
    out_pdate_col: str = "LAST_PURCHASE_DATE",
    out_status_col: str = "COST_STATUS",
) -> pd.DataFrame:
    """
    Attach to every sales line the unit cost of the latest purchase of the
    same BCODE dated on or before the sale.

    Unit cost of a purchase line is AMOUNT / (QTY * MTP). Purchase lines with
    an unparseable date, a zero denominator or non-numeric QTY/MTP/AMOUNT are
    ignored rather than raising.

    Parameters
    ----------
    sales, purchases : pd.DataFrame
        Sales and purchase lines; neither is modified.
    bcode_col, sale_date_col, purch_date_col, qty_col, mtp_col, amount_col : str
        Input column names.
    out_cost_col, out_pdate_col, out_status_col : str
        Names of the three columns appended to the result.

    Returns
    -------
    pd.DataFrame
        Copy of ``sales`` in the original row order and with the original
        index, with BCODE stripped, the sale date parsed to datetime, and the
        columns ``out_pdate_col``, ``out_cost_col`` and ``out_status_col``
        appended. Status is "OK" when a cost was found, "UNKNOWN" otherwise
        (including sales with an unparseable date).
    """

    def _as_float(col: pd.Series) -> pd.Series:
        # Non-numeric text becomes NaN instead of raising like astype(float).
        return pd.to_numeric(col, errors="coerce").astype("float64")

    s = sales.copy()
    p = purchases.copy()

    s[bcode_col] = s[bcode_col].astype("string").str.strip()
    p[bcode_col] = p[bcode_col].astype("string").str.strip()

    s[sale_date_col] = pd.to_datetime(s[sale_date_col], errors="coerce")
    p[purch_date_col] = pd.to_datetime(p[purch_date_col], errors="coerce")

    denom = _as_float(p[qty_col]) * _as_float(p[mtp_col])
    p["_UNIT_COST"] = (_as_float(p[amount_col]) / denom).where(denom != 0)

    # Separate right-side date column so the sale date is never overwritten.
    p["_PURCH_DATE"] = p[purch_date_col]

    # Keep only usable purchases, and only the columns the lookup needs.
    usable = p["_PURCH_DATE"].notna() & p["_UNIT_COST"].notna()
    p = p.loc[usable, [bcode_col, "_PURCH_DATE", "_UNIT_COST"]]
    p = p.sort_values(["_PURCH_DATE", bcode_col], kind="mergesort")

    # Row position in the input; used to map lookup results back.
    s["_POS"] = np.arange(len(s))

    # merge_asof needs a non-null, sorted key. Only the key columns go through
    # it, so the wide sales frame is neither reordered nor duplicated.
    left = (
        s.loc[s[sale_date_col].notna(), [bcode_col, sale_date_col, "_POS"]]
        .sort_values([sale_date_col, bcode_col, "_POS"], kind="mergesort")
    )

    matched = pd.merge_asof(
        left,
        p,
        left_on=sale_date_col,
        right_on="_PURCH_DATE",
        by=bcode_col,
        direction="backward",
        allow_exact_matches=True,
    )

    # Align results to input order; sales without a valid date get NaT / NaN.
    lookup = matched.set_index("_POS")[["_PURCH_DATE", "_UNIT_COST"]].reindex(s["_POS"])

    out = s.drop(columns=["_POS"])
    out[out_pdate_col] = lookup["_PURCH_DATE"].to_numpy()
    out[out_cost_col] = lookup["_UNIT_COST"].to_numpy()
    out[out_status_col] = np.where(out[out_cost_col].notna(), "OK", "UNKNOWN")
    return out


In [13]:
# Apply the three cleaning rules to the SYP sales lines, one after another,
# keeping the index labels each rule removed.
syp_sales_lines_cleaned, removed_bcode = remove_invalid_bcode(syp_sales_lines)
syp_sales_lines_cleaned, removed_nonnum = remove_non_numeric_price_or_amount(syp_sales_lines_cleaned)
syp_sales_lines_cleaned, removed_canceled = remove_canceled_lines(syp_sales_lines_cleaned)

# All percentages below are relative to the uncleaned row count.
total_rows = len(syp_sales_lines)


def pct(n, total):
    """Return n as a percentage of total; 0 when total is 0."""
    if not total:
        return 0
    return n / total * 100


print(f"Invalid BCODE removed: {len(removed_bcode)} ({pct(len(removed_bcode), total_rows):.2f}%)")
print(f"Non-numeric PRICE/AMOUNT removed: {len(removed_nonnum)} ({pct(len(removed_nonnum), total_rows):.2f}%)")
print(f"Canceled lines removed: {len(removed_canceled)} ({pct(len(removed_canceled), total_rows):.2f}%)")

total_removed = total_rows - len(syp_sales_lines_cleaned)
print(f"SYP Total removed: {total_removed} ({pct(total_removed, total_rows):.2f}%)")

Invalid BCODE removed: 7 (0.02%)
Non-numeric PRICE/AMOUNT removed: 5 (0.02%)
Canceled lines removed: 0 (0.00%)
SYP Total removed: 12 (0.04%)


In [14]:
# Apply the three cleaning rules to the HQ sales lines, one after another,
# keeping the index labels each rule removed.
hq_sales_lines_cleaned, removed_bcode = remove_invalid_bcode(hq_sales_lines)
hq_sales_lines_cleaned, removed_nonnum = remove_non_numeric_price_or_amount(hq_sales_lines_cleaned)
hq_sales_lines_cleaned, removed_canceled = remove_canceled_lines(hq_sales_lines_cleaned)

# All percentages below are relative to the uncleaned row count.
total_rows = len(hq_sales_lines)


def pct(n, total):
    """Return n as a percentage of total; 0 when total is 0."""
    if not total:
        return 0
    return n / total * 100


print(f"Invalid BCODE removed: {len(removed_bcode)} ({pct(len(removed_bcode), total_rows):.2f}%)")
print(f"Non-numeric PRICE/AMOUNT removed: {len(removed_nonnum)} ({pct(len(removed_nonnum), total_rows):.2f}%)")
print(f"Canceled lines removed: {len(removed_canceled)} ({pct(len(removed_canceled), total_rows):.2f}%)")

total_removed = total_rows - len(hq_sales_lines_cleaned)
print(f"HQ Total removed: {total_removed} ({pct(total_removed, total_rows):.2f}%)")

Invalid BCODE removed: 28960 (2.42%)
Non-numeric PRICE/AMOUNT removed: 566 (0.05%)
Canceled lines removed: 1794 (0.15%)
HQ Total removed: 31320 (2.62%)


In [15]:
# Apply the three cleaning rules to the HQ purchase lines, one after another,
# keeping the index labels each rule removed.
purchase_lines_cleaned, removed_bcode = remove_invalid_bcode(purchase_lines)
purchase_lines_cleaned, removed_nonnum = remove_non_numeric_price_or_amount(purchase_lines_cleaned)
purchase_lines_cleaned, removed_canceled = remove_canceled_lines(purchase_lines_cleaned)

# All percentages below are relative to the uncleaned row count.
total_rows = len(purchase_lines)


def pct(n, total):
    """Return n as a percentage of total; 0 when total is 0."""
    if not total:
        return 0
    return n / total * 100


print(f"Invalid BCODE removed: {len(removed_bcode)} ({pct(len(removed_bcode), total_rows):.2f}%)")
print(f"Non-numeric PRICE/AMOUNT removed: {len(removed_nonnum)} ({pct(len(removed_nonnum), total_rows):.2f}%)")
print(f"Canceled lines removed: {len(removed_canceled)} ({pct(len(removed_canceled), total_rows):.2f}%)")

total_removed = total_rows - len(purchase_lines_cleaned)
print(f"Purchase HQ Total removed: {total_removed} ({pct(total_removed, total_rows):.2f}%)")

Invalid BCODE removed: 18324 (7.39%)
Non-numeric PRICE/AMOUNT removed: 1387 (0.56%)
Canceled lines removed: 13 (0.01%)
Purchase HQ Total removed: 19724 (7.96%)


In [16]:
# Step 1: look up the last purchase cost for every HQ sales line.
hq_sales_enriched = enrich_sales_with_last_purchase_cost(
    sales=hq_sales_lines_cleaned,
    purchases=purchase_lines_cleaned,
)
qc_unknown(hq_sales_enriched, "before refill")

# Step 2: fall back to the ICMAS cost where the lookup gave 0 or nothing.
# qc_unknown counts COST_STATUS, so the "after refill" figure only moves if
# the refill step also updates that column.
hq_sales_enriched = refill_last_cost_from_icmas(
    data=data,
    df=hq_sales_enriched,
    last_cost_col="LAST_PURCHASE_COST",
)
qc_unknown(hq_sales_enriched, "after refill")



[before refill] UNKNOWN: 69,125 / 1,163,079 (5.94%)
[after refill] UNKNOWN: 69,125 / 1,163,079 (5.94%)


In [17]:
# Step 1: look up the last purchase cost for every SYP sales line
# (the purchase side is the cleaned HQ purchase lines).
syp_sales_enriched = enrich_sales_with_last_purchase_cost(
    sales=syp_sales_lines_cleaned,
    purchases=purchase_lines_cleaned,
)
qc_unknown(syp_sales_enriched, "before refill")

# Step 2: fall back to the ICMAS cost where the lookup gave 0 or nothing.
# qc_unknown counts COST_STATUS, so the "after refill" figure only moves if
# the refill step also updates that column.
syp_sales_enriched = refill_last_cost_from_icmas(
    data=data,
    df=syp_sales_enriched,
    last_cost_col="LAST_PURCHASE_COST",
)
qc_unknown(syp_sales_enriched, "after refill")

[before refill] UNKNOWN: 490 / 33,302 (1.47%)
[after refill] UNKNOWN: 490 / 33,302 (1.47%)


In [18]:
# Tag every row with the branch it came from (adds the column in place).
for branch_frame, branch_code in (
    (hq_sales_enriched, "HQ"),
    (syp_sales_enriched, "SYP"),
):
    branch_frame["BRANCH"] = branch_code

In [19]:
# Stack the enriched HQ and SYP sales lines into one fact table with a fresh index.
sales_all = pd.concat([hq_sales_enriched, syp_sales_enriched], ignore_index=True)

# Re-assert dtypes after the concat.
sales_all["BRANCH"] = sales_all["BRANCH"].astype("string")
sales_all["LAST_PURCHASE_COST"] = pd.to_numeric(sales_all["LAST_PURCHASE_COST"], errors="coerce")
sales_all["BILLDATE"] = pd.to_datetime(sales_all["BILLDATE"], errors="coerce")

# Branch-qualified bill number. The nullable string dtype keeps a missing
# BILLNO missing; astype(str) would turn it into the text "<NA>" and create a
# fake key such as "HQ-<NA>" shared by every bill without a number.
sales_all["BRANCH_BILLNO"] = sales_all["BRANCH"] + "-" + sales_all["BILLNO"].astype("string")


In [20]:
# Derive a standard bill type from the BILLNO prefix.
billno_upper = sales_all["BILLNO"].astype("string").str.upper()

# Drop a single leading "3" when present, then read the type code at the start.
billno_core = billno_upper.str.replace(r"^3", "", regex=True)

# Longer codes come first in the alternation so "TFV" is not read as "TF".
billtype_code = billno_core.str.extract(r"^(TFV|TAD|TAR|TR|TD|TF|CN)", expand=False)

# Bill numbers that start with none of the known codes are labelled UNKNOWN.
sales_all["BILLTYPE_STD"] = billtype_code.fillna("UNKNOWN")

In [21]:
import pandas as pd
import numpy as np

def _clean_str(s: pd.Series) -> pd.Series:
    return s.astype("string").str.strip()

def _to_dt(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s, errors="coerce")

# ------------------------
# DimDate
# ------------------------
def build_dim_date_from_sales(sales_all: pd.DataFrame, *, date_col: str = "BILLDATE") -> pd.DataFrame:
    """
    Build a calendar dimension with one row per day between the earliest and
    latest parseable date in `date_col` (inclusive).

    Columns: Date, DateKey (YYYYMMDD as int), Year, Month, Day,
    YearMonth ("YYYY-MM"), Quarter, WeekNum (ISO week number).
    Returns an empty frame with those columns when no date can be parsed.
    """
    columns = ["Date", "DateKey", "Year", "Month", "Day", "YearMonth", "Quarter", "WeekNum"]

    sale_days = _to_dt(sales_all[date_col]).dropna().dt.normalize()
    if sale_days.empty:
        return pd.DataFrame(columns=columns)

    days = pd.Series(pd.date_range(sale_days.min(), sale_days.max(), freq="D"))
    return pd.DataFrame({
        "Date": days,
        "DateKey": days.dt.strftime("%Y%m%d").astype(int),
        "Year": days.dt.year,
        "Month": days.dt.month,
        "Day": days.dt.day,
        "YearMonth": days.dt.strftime("%Y-%m"),
        "Quarter": days.dt.quarter,
        "WeekNum": days.dt.isocalendar().week.astype(int),
    })

# ------------------------
# DimBranch
# ------------------------
def build_dim_branch(sales_all: pd.DataFrame, *, branch_col: str = "BRANCH") -> pd.DataFrame:
    """
    Build the branch dimension: the distinct, non-empty, stripped branch codes
    in sorted order. BranchKey is a copy of BRANCH.
    """
    branches = _clean_str(sales_all[branch_col]).dropna()
    branches = (
        branches[branches != ""]
        .drop_duplicates()
        .sort_values()
        .reset_index(drop=True)
    )
    return pd.DataFrame({"BRANCH": branches, "BranchKey": branches})

# ------------------------
# DimProduct (BCODE)
# ------------------------
def build_dim_product(
    sales_all: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
    detail_col: str = "DETAIL",
    ui_col: str = "UI",
    last_seen_date_col: str = "BILLDATE",
) -> pd.DataFrame:
    """
    Build the product dimension: one row per BCODE, described by the sales
    line with the latest date for that product.

    Parameters
    ----------
    sales_all : pd.DataFrame
        Sales lines; must contain `bcode_col` and `last_seen_date_col`.
        `detail_col` and `ui_col` are optional; when a column is absent the
        corresponding output column is all-missing. The input is not modified.

    Returns
    -------
    pd.DataFrame
        Columns BCODE, DETAIL, UI, LastSeenDate (date at midnight),
        ProductKey (copy of BCODE) and CATEGORY_CODE (first two characters
        of BCODE).
    """
    optional_cols = [c for c in (detail_col, ui_col) if c in sales_all.columns]
    needed = list(dict.fromkeys([bcode_col, last_seen_date_col, *optional_cols]))

    # Copy only the columns used here instead of the whole fact table.
    df = sales_all[needed].copy()
    df[bcode_col] = _clean_str(df[bcode_col])
    for col in (detail_col, ui_col):
        if col in df.columns:
            df[col] = _clean_str(df[col])
        else:
            # Absent optional column -> all-missing values aligned to the rows.
            df[col] = pd.Series(pd.NA, index=df.index, dtype="string")
    df[last_seen_date_col] = _to_dt(df[last_seen_date_col])

    df = df[df[bcode_col].notna() & (df[bcode_col] != "")]

    # Sort undated rows first so tail(1) picks the latest dated line. By
    # default NaT sorts last, which would make an undated row the "last seen"
    # record.
    df = df.sort_values(
        [bcode_col, last_seen_date_col], kind="mergesort", na_position="first"
    )
    last = df.groupby(bcode_col, sort=False).tail(1)

    dim = pd.DataFrame({
        "BCODE": last[bcode_col],
        "DETAIL": last[detail_col],
        "UI": last[ui_col],
        "LastSeenDate": last[last_seen_date_col].dt.normalize(),
    }).reset_index(drop=True)

    dim["ProductKey"] = dim["BCODE"]
    # CATEGORY_CODE (first 2 characters) allows a direct relationship to the category dimension.
    dim["CATEGORY_CODE"] = dim["BCODE"].astype("string").str.slice(0, 2)
    return dim

# ------------------------
# DimCategory (first 2 digits of BCODE)
# ------------------------
def build_dim_category_from_bcode(
    sales_all: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
) -> pd.DataFrame:
    """
    Build the category dimension from the first two characters of BCODE.

    Only prefixes made of exactly two digits are kept. The result lists the
    distinct codes in sorted order; CategoryKey is a copy of CATEGORY_CODE.
    """
    prefixes = _clean_str(sales_all[bcode_col]).dropna().str[:2]
    two_digit = prefixes[prefixes.str.match(r"^\d{2}$", na=False)]

    codes = two_digit.drop_duplicates().sort_values().reset_index(drop=True)
    return pd.DataFrame({"CATEGORY_CODE": codes, "CategoryKey": codes})

# ------------------------
# DimCustomer (ACCTNO)
# ------------------------
def build_dim_customer(
    sales_all: pd.DataFrame,
    *,
    customer_col: str = "ACCTNO",
) -> pd.DataFrame:
    """
    Build the customer dimension: distinct, non-empty, stripped values of
    `customer_col` in sorted order. CustomerKey is a copy of CUSTOMER_ACCTNO.
    A missing column yields an empty dimension.
    """
    placeholder = pd.Series([pd.NA] * len(sales_all))
    accounts = _clean_str(sales_all.get(customer_col, placeholder)).dropna()
    accounts = (
        accounts[accounts != ""]
        .drop_duplicates()
        .sort_values()
        .reset_index(drop=True)
    )
    return pd.DataFrame({"CUSTOMER_ACCTNO": accounts, "CustomerKey": accounts})

# ------------------------
# DimSupplier (ACCT_NO)
# ------------------------
def build_dim_supplier(
    sales_all: pd.DataFrame,
    *,
    supplier_col: str = "ACCT_NO",
) -> pd.DataFrame:
    """
    Build the supplier dimension: distinct, non-empty, stripped values of
    `supplier_col` in sorted order. SupplierKey is a copy of SUPPLIER_ACCT_NO.
    A missing column yields an empty dimension.
    """
    placeholder = pd.Series([pd.NA] * len(sales_all))
    accounts = _clean_str(sales_all.get(supplier_col, placeholder)).dropna()
    accounts = (
        accounts[accounts != ""]
        .drop_duplicates()
        .sort_values()
        .reset_index(drop=True)
    )
    return pd.DataFrame({"SUPPLIER_ACCT_NO": accounts, "SupplierKey": accounts})

# -----------------------------
# DIM BILLTYPE (from BILLNO)
# -----------------------------
# Bill-type codes recognised in BILLNO prefixes (not referenced by the builder below).
KNOWN_TYPES = ["TFV", "TAD", "TAR", "TR", "TD", "TF", "CN"]

def build_dim_billtype(sales_all):
    """
    Build the bill-type dimension: distinct, stripped, upper-cased values of
    BILLTYPE_STD in sorted order. BillTypeKey is a copy of BILLTYPE_STD.
    """
    bill_types = _clean_str(sales_all["BILLTYPE_STD"]).str.upper()
    bill_types = bill_types.drop_duplicates().sort_values().reset_index(drop=True)
    return pd.DataFrame({"BILLTYPE_STD": bill_types, "BillTypeKey": bill_types})

# ------------------------
# Wrapper
# ------------------------
def build_all_dims(sales_all):
    """Build every dimension table from `sales_all`; returns {dimension name: DataFrame}."""
    builders = (
        ("dim_date", build_dim_date_from_sales),
        ("dim_product", build_dim_product),
        ("dim_category", build_dim_category_from_bcode),
        ("dim_customer", build_dim_customer),
        ("dim_supplier", build_dim_supplier),
        ("dim_branch", build_dim_branch),
        ("dim_billtype", build_dim_billtype),
    )
    return {dim_name: build(sales_all) for dim_name, build in builders}


In [22]:
# Build all dimension tables and show the (rows, columns) shape of each.
dims = build_all_dims(sales_all)
{dim_name: dim_frame.shape for dim_name, dim_frame in dims.items()}

{'dim_date': (3887, 8),
 'dim_product': (33950, 6),
 'dim_category': (36, 2),
 'dim_customer': (2202, 2),
 'dim_supplier': (771, 2),
 'dim_branch': (2, 2),
 'dim_billtype': (8, 2)}

In [23]:
import os
# Output folder for the curated tables; created when it does not exist yet.
out_dir = "/content/drive/MyDrive/kcw_analytics/03_curated"
os.makedirs(out_dir, exist_ok=True)

# One CSV per dimension, named after its key in `dims`.
for name, df in dims.items():
    target = os.path.join(out_dir, f"{name}.csv")
    df.to_csv(target, index=False, encoding="utf-8-sig")


In [24]:
# Write the combined sales fact table next to the dimension files.
# The utf-8-sig encoding (UTF-8 with BOM) matters for Thai text opened in Excel.
fact_path = os.path.join(out_dir, "fact_sales_all.csv")
sales_all.to_csv(fact_path, index=False, encoding="utf-8-sig")

**DEBUG**

In [25]:
# Display the enriched SYP sales lines for a visual check.
syp_sales_enriched

Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,CHGAMT,ACCTNO,PAID,ACCT_NO,DONE,CANCELED,LAST_PURCHASE_DATE,LAST_PURCHASE_COST,COST_STATUS,BRANCH
0,9,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000001,10,,15018750,...,,,Y,7VP,N,N,2025-05-15,109.350000,OK,SYP
1,10,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000001,20,,15013500,...,,,Y,7VP,N,N,2025-06-13,51.400000,OK,SYP
2,12,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000002,10,,13022630,...,,,Y,7STKG,N,N,2025-05-05,116.912500,OK,SYP
3,13,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000002,20,,12010135,...,,,Y,CRRF,N,N,2025-06-16,115.000000,OK,SYP
4,14,2,SJ,2025-06-23 00:00:00,1,2025-06-23,3K68-0000003,10,,14050200,...,,,Y,7BONUS,N,N,2025-06-10,30.373750,OK,SYP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33297,1947096,2,SJ,2026-02-09 00:00:00,1,2026-02-09,32K69-0001009,30,,07051647,...,,,Y,7KCW,N,N,2026-01-29,188.600000,OK,SYP
33298,1956879,2,SJ,2026-02-10 00:00:00,1,2026-02-10,33K69-0001017,10,,13018771,...,,ชคร,Y,SSW,N,N,2026-01-06,42.500000,OK,SYP
33299,1956880,2,SJ,2026-02-10 00:00:00,1,2026-02-10,32K69-0001010,10,,14050055,...,,,Y,SKT,N,N,2026-02-03,94.000000,OK,SYP
33300,1956881,2,SJ,2026-02-10 00:00:00,1,2026-02-10,32K69-0001011,10,,22010003,...,,,Y,7SSY,N,N,2026-02-03,165.953403,OK,SYP


In [26]:
# SYP sales lines that still have no purchase cost after enrichment and refill.
cost_missing = syp_sales_enriched["LAST_PURCHASE_COST"].isna()
df_unknown = syp_sales_enriched.loc[cost_missing]
df_unknown


Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,CHGAMT,ACCTNO,PAID,ACCT_NO,DONE,CANCELED,LAST_PURCHASE_DATE,LAST_PURCHASE_COST,COST_STATUS,BRANCH
50,20050,2,SJ,2025-06-25 00:00:00,1,2025-06-25,3K68-0000032,10,,70010011,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
198,70107,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000122,10,,13010000,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
219,70133,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000135,50,,13010000,...,,ชย,Y,,N,N,NaT,,UNKNOWN,SYP
237,70155,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000143,20,,70010011,...,,ชตขพ,Y,,N,N,NaT,,UNKNOWN,SYP
450,100340,2,SJ,2025-07-04 00:00:00,1,2025-07-04,3K68-0000193,10,,70010011,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32438,1926473,2,SJ,2026-02-05 00:00:00,1,2026-02-05,33K69-0000885,20,,70010400,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
32903,1936716,2,SJ,2026-02-07 00:00:00,1,2026-02-07,33K69-0000953,30,,70010300,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
33119,1946911,2,SJ,2026-02-09 00:00:00,1,2026-02-09,33K69-0000979,20,,70010400,...,,,Y,,N,N,NaT,,UNKNOWN,SYP
33174,1946966,2,SJ,2026-02-09 00:00:00,1,2026-02-09,32K69-0000980,10,,70010400,...,,,Y,,N,N,NaT,,UNKNOWN,SYP


In [27]:
# Check whether the raw HQ purchase lines contain BCODE 13010000,
# one of the codes that appears among the rows without a cost.
pidet = data["raw_hq_pidet_purchase_lines.csv"].copy()
matches_code = pidet["BCODE"].eq("13010000")
pidet_filtered = pidet.loc[matches_code].copy()

pidet_filtered[["BCODE", 'BILLDATE', "QTY", "MTP", "PRICE", "AMOUNT"]]

Unnamed: 0,BCODE,BILLDATE,QTY,MTP,PRICE,AMOUNT


In [28]:
# SYP sales lines whose standard bill type is "TAR".
# NOTE(review): the variable name says "tfv" but the filter selects "TAR" —
# confirm which bill type was intended.
is_tar = sales_all["BILLTYPE_STD"] == "TAR"
is_syp = sales_all["BRANCH"] == "SYP"
df_tfv_syp = sales_all[is_tar & is_syp]

df_tfv_syp

Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,PAID,ACCT_NO,DONE,CANCELED,LAST_PURCHASE_DATE,LAST_PURCHASE_COST,COST_STATUS,BRANCH,BRANCH_BILLNO,BILLTYPE_STD
1163089,29,0,SJ,2025-06-23 00:00:00,1.0,2025-06-23,3TAR6806-001,10,1,15018750,...,N,,N,N,2025-05-15,109.350000,OK,SYP,SYP-3TAR6806-001,TAR
1163090,30,0,SJ,2025-06-23 00:00:00,1.0,2025-06-23,3TAR6806-001,20,2,15013500,...,N,,N,N,2025-06-13,51.400000,OK,SYP,SYP-3TAR6806-001,TAR
1163091,31,0,SJ,2025-06-23 00:00:00,1.0,2025-06-23,3TAR6806-001,30,3,13022630,...,N,,N,N,2025-05-05,116.912500,OK,SYP,SYP-3TAR6806-001,TAR
1163092,32,0,SJ,2025-06-23 00:00:00,1.0,2025-06-23,3TAR6806-001,40,4,14050200,...,N,,N,N,2025-06-10,30.373750,OK,SYP,SYP-3TAR6806-001,TAR
1163093,33,0,SJ,2025-06-23 00:00:00,1.0,2025-06-23,3TAR6806-001,50,5,22050259,...,N,,N,N,2024-11-16,81.000000,OK,SYP,SYP-3TAR6806-001,TAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196159,1936897,0,SJ,2026-02-07 00:00:00,1.0,2026-02-07,3TAR6901-284,90,9,21050279,...,N,,N,N,2025-12-20,2788.928333,OK,SYP,SYP-3TAR6901-284,TAR
1196160,1936898,0,SJ,2026-02-07 00:00:00,1.0,2026-02-07,3TAR6901-284,100,10,22051820,...,N,,N,N,2025-12-08,276.500000,OK,SYP,SYP-3TAR6901-284,TAR
1196161,1936899,0,SJ,2026-02-07 00:00:00,1.0,2026-02-07,3TAR6901-285,10,1,03051342,...,N,,N,N,2026-01-05,110.200000,OK,SYP,SYP-3TAR6901-285,TAR
1196162,1936900,0,SJ,2026-02-07 00:00:00,1.0,2026-02-07,3TAR6901-285,20,2,13052134,...,N,,N,N,2025-11-06,11.130000,OK,SYP,SYP-3TAR6901-285,TAR
