In [1]:
!git clone https://github.com/pthengtr/kcw-analytics.git

Cloning into 'kcw-analytics'...
remote: Enumerating objects: 208, done.[K
remote: Counting objects: 100% (208/208), done.[K
remote: Compressing objects: 100% (161/161), done.[K
remote: Total 208 (delta 121), reused 71 (delta 20), pack-reused 0 (from 0)[K
Receiving objects: 100% (208/208), 165.63 KiB | 1.26 MiB/s, done.
Resolving deltas: 100% (121/121), done.


In [2]:
!cd /content/kcw-analytics && git pull origin main

From https://github.com/pthengtr/kcw-analytics
 * branch            main       -> FETCH_HEAD
Already up to date.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import pandas as pd

folder = "/content/drive/MyDrive/kcw_analytics/01_raw"

data = {}

for file in os.listdir(folder):
    if file.endswith(".csv"):
        path = os.path.join(folder, file)
        data[file] = pd.read_csv(
            path,
            dtype={
              "BCODE": "string",
              "ITEMNO": "string",
              "BILLNO": "string",
            },
            encoding="utf-8-sig",
            low_memory=False   # stops chunk guessing
        )
        print(f"Loaded: {file} -> {data[file].shape}")



Loaded: raw_inventory_hq_2024.csv -> (4983, 8)
Loaded: raw_hq_pimas_purchase_bills.csv -> (83130, 49)
Loaded: raw_hq_sidet_sales_lines.csv -> (1194399, 38)
Loaded: raw_hq_simas_sales_bills.csv -> (484283, 49)
Loaded: raw_hq_pidet_purchase_lines.csv -> (247915, 41)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2829, 49)
Loaded: raw_syp_simas_sales_bills.csv -> (11348, 49)
Loaded: raw_syp_pidet_purchase_lines.csv -> (26431, 41)
Loaded: raw_syp_sidet_sales_lines.csv -> (33314, 38)
Loaded: raw_hq_icmas_products.csv -> (114825, 94)


In [5]:
hq_sales_lines = data['raw_hq_sidet_sales_lines.csv'].copy()
syp_sales_lines = data['raw_syp_sidet_sales_lines.csv'].copy()
purchase_lines = data['raw_hq_pidet_purchase_lines.csv'].copy()

In [6]:
hq_sales_lines.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'BILLTYPE', 'BILLDATE',
       'BILLNO', 'LINE', 'ITEMNO', 'BCODE', 'PCODE', 'MCODE', 'DETAIL',
       'WHNUMBER', 'LOCATION1', 'STATUS', 'SERIAL', 'TAXIC', 'EXMPT', 'ISVAT',
       'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'CHGAMT', 'ACCTNO', 'PAID',
       'ACCT_NO', 'DONE', 'CANCELED'],
      dtype='object')

In [7]:
import pandas as pd
import numpy as np

def refill_last_cost_from_icmas(
    data: dict,
    df: pd.DataFrame,
    *,
    icmas_key: str = "raw_hq_icmas_products.csv",
    bcode_col: str = "BCODE",
    last_cost_col: str = "LAST_COST",
    icmas_cost_col: str = "COSTNET",
) -> pd.DataFrame:
    """
    Refill LAST_COST when it is 0 or NaN using COSTNET from ICMAS.
    Includes BCODE cleanup to avoid merge mismatch.
    """

    result = df.copy()

    # --- BCODE CLEANUP (VERY IMPORTANT for KCW datasets) ---
    result[bcode_col] = (
        result[bcode_col]
        .astype(str)
        .str.strip()
        .str.upper()
    )

    icmas = data[icmas_key][[bcode_col, icmas_cost_col]].copy()

    icmas[bcode_col] = (
        icmas[bcode_col]
        .astype(str)
        .str.strip()
        .str.upper()
    )

    # --- numeric safety ---
    result[last_cost_col] = pd.to_numeric(result[last_cost_col], errors="coerce")
    icmas[icmas_cost_col] = pd.to_numeric(icmas[icmas_cost_col], errors="coerce")

    # --- merge COSTNET ---
    result = result.merge(
        icmas,
        on=bcode_col,
        how="left",
        suffixes=("", "_ICMAS")
    )

    # --- detect invalid LAST_COST ---
    mask_invalid = result[last_cost_col].isna() | (result[last_cost_col] == 0)

    # --- refill only invalid rows ---
    result.loc[mask_invalid, last_cost_col] = result.loc[
        mask_invalid, icmas_cost_col
    ]

    # --- drop helper column ---
    result = result.drop(columns=[icmas_cost_col])

    return result

def qc_unknown(df, label):
    total = len(df)
    unk = (df["COST_STATUS"] == "UNKNOWN").sum()
    print(f"[{label}] UNKNOWN: {unk:,} / {total:,} ({(unk/total*100 if total else 0):.2f}%)")

In [8]:
import numpy as np
import pandas as pd

_BCODE_RE = r"^\d{8}$"

def _to_numeric_clean(series: pd.Series) -> pd.Series:
    """
    Convert common messy numeric strings to numbers.
    Handles: whitespace, NBSP, commas.
    Non-convertible -> NaN.
    """
    s = series.astype("string").str.strip()
    s = s.str.replace("\u00A0", " ", regex=False)  # non-breaking space
    s = s.str.replace(",", "", regex=False)        # "1,234.50" -> "1234.50"
    return pd.to_numeric(s, errors="coerce")


def add_sales_quality_flags(
    df: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
    price_col: str = "PRICE",
    amount_col: str = "AMOUNT",
    canceled_col: str = "CANCELED",
    add_row_id: bool = True,
) -> pd.DataFrame:
    """
    Adds Power-BI-friendly numeric columns + data-quality flags, without removing rows.

    Output columns added:
      - BCODE (trimmed)
      - PRICE_NUM, AMOUNT_NUM (numeric)
      - IS_VALID (bool)
      - INVALID_REASON (text, e.g. "BAD_BCODE|BAD_AMOUNT")
      - ROW_ID (optional)
    """
    out = df.copy()

    # --- BCODE clean + valid ---
    b = out[bcode_col].astype("string").str.strip()
    out[bcode_col] = b
    b_ok = b.fillna("").str.match(_BCODE_RE)

    # --- numeric clean (keep raw, add numeric columns) ---
    price_num = _to_numeric_clean(out[price_col])
    amount_num = _to_numeric_clean(out[amount_col])

    # Power BI hates inf/-inf
    price_ok = price_num.notna() & np.isfinite(price_num.to_numpy())
    amount_ok = amount_num.notna() & np.isfinite(amount_num.to_numpy())

    out[f"{price_col}_NUM"] = price_num
    out[f"{amount_col}_NUM"] = amount_num

    # --- canceled flag ---
    c = out[canceled_col].astype("string").str.strip().str.upper()
    canceled_ok = c != "Y"

    # --- overall validity ---
    out["IS_VALID"] = b_ok & price_ok & amount_ok & canceled_ok

    # --- reason text (can contain multiple reasons) ---
    reason = pd.Series("", index=out.index, dtype="string")

    def add_reason(mask, label):
        nonlocal reason
        reason = np.where(
            mask,
            np.where(reason == "", label, reason + "|" + label),
            reason
        )
        reason = pd.Series(reason, index=out.index, dtype="string")

    add_reason(~b_ok, "BAD_BCODE")
    add_reason(~price_ok, "BAD_PRICE")
    add_reason(~amount_ok, "BAD_AMOUNT")
    add_reason(~canceled_ok, "CANCELED")

    out["INVALID_REASON"] = reason.replace("", pd.NA)

    if add_row_id and "ROW_ID" not in out.columns:
        out["ROW_ID"] = np.arange(len(out), dtype=np.int64)

    return out


In [9]:
!pip install tqdm



In [10]:
def enrich_sales_with_last_purchase_cost(
    sales: pd.DataFrame,
    purchases: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
    sale_date_col: str = "BILLDATE",
    purch_date_col: str = "BILLDATE",
    qty_col: str = "QTY",
    mtp_col: str = "MTP",
    amount_col: str = "AMOUNT",
    out_cost_col: str = "LAST_PURCHASE_COST",
    out_pdate_col: str = "LAST_PURCHASE_DATE",
    out_status_col: str = "COST_STATUS",
) -> pd.DataFrame:

    s = sales.copy()
    p = purchases.copy()

    s[bcode_col] = s[bcode_col].astype("string").str.strip()
    p[bcode_col] = p[bcode_col].astype("string").str.strip()

    s[sale_date_col] = pd.to_datetime(s[sale_date_col], errors="coerce")
    p[purch_date_col] = pd.to_datetime(p[purch_date_col], errors="coerce")

    denom = p[qty_col].astype(float) * p[mtp_col].astype(float)
    p["_UNIT_COST"] = np.where(denom != 0, p[amount_col].astype(float) / denom, np.nan)

    # Keep only valid purchases
    p = p[p[purch_date_col].notna() & p["_UNIT_COST"].notna()].copy()

    # ✅ Create a separate right-side date column so we don't overwrite sale BILLDATE
    p["_PURCH_DATE"] = p[purch_date_col]

    s["_POS"] = np.arange(len(s))
    s_valid = s[s[sale_date_col].notna()].copy()
    s_invalid = s[s[sale_date_col].isna()].copy()

    s_valid = s_valid.sort_values([sale_date_col, bcode_col, "_POS"], kind="mergesort")
    p = p.sort_values(["_PURCH_DATE", bcode_col], kind="mergesort")

    merged = pd.merge_asof(
        s_valid,
        p[[bcode_col, "_PURCH_DATE", "_UNIT_COST"]],
        left_on=sale_date_col,
        right_on="_PURCH_DATE",
        by=bcode_col,
        direction="backward",
        allow_exact_matches=True,
    )

    merged.rename(columns={"_UNIT_COST": out_cost_col, "_PURCH_DATE": out_pdate_col}, inplace=True)
    merged[out_status_col] = np.where(merged[out_cost_col].notna(), "OK", "UNKNOWN")

    if len(s_invalid) > 0:
        s_invalid[out_cost_col] = np.nan
        s_invalid[out_pdate_col] = pd.NaT
        s_invalid[out_status_col] = "UNKNOWN"
        merged = pd.concat([merged, s_invalid], ignore_index=False)

    merged = merged.sort_values("_POS", kind="mergesort").drop(columns=["_POS"])
    return merged


In [11]:
syp_sales_flagged = add_sales_quality_flags(syp_sales_lines)

# QC summary (no deletion)
total = len(syp_sales_flagged)
invalid = (~syp_sales_flagged["IS_VALID"]).sum()
print(f"Invalid: {invalid:,}/{total:,} ({invalid/total*100:.2f}%)")

print(
    syp_sales_flagged["INVALID_REASON"]
    .fillna("OK")
    .value_counts()
    .head(20)
)

# For analytics (optional): just filter in pandas
syp_sales_valid_only = syp_sales_flagged[syp_sales_flagged["IS_VALID"]].copy()


Invalid: 12/33,314 (0.04%)
INVALID_REASON
OK            33302
BAD_BCODE         7
BAD_AMOUNT        5
Name: count, dtype: Int64


In [12]:
hq_sales_flagged = add_sales_quality_flags(hq_sales_lines)

# QC summary (no deletion)
total = len(hq_sales_flagged)
invalid = (~hq_sales_flagged["IS_VALID"]).sum()
print(f"Invalid: {invalid:,}/{total:,} ({invalid/total*100:.2f}%)")

print(
    hq_sales_flagged["INVALID_REASON"]
    .fillna("OK")
    .value_counts()
    .head(20)
)

# For analytics (optional): just filter in pandas
syp_sales_valid_only = hq_sales_flagged[hq_sales_flagged["IS_VALID"]].copy()

Invalid: 31,320/1,194,399 (2.62%)
INVALID_REASON
OK                                1163079
BAD_BCODE                           28570
CANCELED                             1794
BAD_AMOUNT                            404
BAD_BCODE|BAD_PRICE                   280
BAD_PRICE                             132
BAD_BCODE|BAD_PRICE|BAD_AMOUNT         93
BAD_PRICE|BAD_AMOUNT                   29
BAD_BCODE|CANCELED                     17
BAD_AMOUNT|CANCELED                     1
Name: count, dtype: Int64


In [13]:
hq_sales_enriched = enrich_sales_with_last_purchase_cost(
    hq_sales_flagged,
    purchase_lines,
)

qc_unknown(hq_sales_enriched, "before refill")

hq_sales_enriched = refill_last_cost_from_icmas(
    data,
    hq_sales_enriched,
    last_cost_col="LAST_PURCHASE_COST",
)

qc_unknown(hq_sales_enriched, "after refill")



[before refill] UNKNOWN: 69,075 / 1,194,399 (5.78%)
[after refill] UNKNOWN: 69,075 / 1,194,399 (5.78%)


In [14]:
syp_sales_enriched = enrich_sales_with_last_purchase_cost(
    syp_sales_flagged,
    purchase_lines,
)

qc_unknown(syp_sales_enriched, "before refill")

syp_sales_enriched = refill_last_cost_from_icmas(
    data,
    syp_sales_enriched,
    last_cost_col="LAST_PURCHASE_COST",
)

qc_unknown(syp_sales_enriched, "after refill")

[before refill] UNKNOWN: 491 / 33,314 (1.47%)
[after refill] UNKNOWN: 491 / 33,314 (1.47%)


In [15]:
hq_sales_enriched["BRANCH"] = "HQ"
syp_sales_enriched["BRANCH"] = "SYP"

In [16]:
sales_all = pd.concat([hq_sales_enriched, syp_sales_enriched], ignore_index=True)

sales_all["BRANCH"] = sales_all["BRANCH"].astype("string")
sales_all["LAST_PURCHASE_COST"] = pd.to_numeric(sales_all["LAST_PURCHASE_COST"], errors="coerce")
sales_all["BILLDATE"] = pd.to_datetime(sales_all["BILLDATE"], errors="coerce")

sales_all["BRANCH_BILLNO"] = sales_all["BRANCH"] + "-" + sales_all["BILLNO"].astype(str)


In [17]:
sales_all["BILLTYPE_STD"] = (
    sales_all["BILLNO"]
    .astype("string")
    .str.upper()
    .str.replace(r"^3", "", regex=True)   # remove leading 3 if exists
    .str.extract(r"^(TFV|TAD|TAR|TR|TD|TF|CN)", expand=False)
    .fillna("UNKNOWN")
)

In [18]:
KEEP_COLS = [
    'BILLDATE',
    'BILLNO', 'BCODE', 'DETAIL',
    'STATUS', 'ISVAT',
    'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
    'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'ACCTNO', 'PAID',
    'ACCT_NO', 'DONE', 'CANCELED',
    'PRICE_NUM', 'AMOUNT_NUM', 'IS_VALID', 'INVALID_REASON', 'ROW_ID',
    'LAST_PURCHASE_DATE', 'LAST_PURCHASE_COST', 'COST_STATUS',
    'BRANCH', 'BRANCH_BILLNO', 'BILLTYPE_STD'
]

# keep only columns that actually exist (prevents KeyError)
sales_all = sales_all[[c for c in KEEP_COLS if c in sales_all.columns]].copy()

In [19]:
import pandas as pd
import numpy as np

def _clean_str(s: pd.Series) -> pd.Series:
    s = s.astype("string")
    s = s.str.replace("\u00A0", " ", regex=False)  # NBSP
    s = s.str.strip()
    # convert common garbage to NA
    s = s.replace({"": pd.NA, "nan": pd.NA, "NaN": pd.NA, "None": pd.NA, "NULL": pd.NA, "null": pd.NA})
    return s


def _to_dt(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s, errors="coerce")

# ------------------------
# DimDate
# ------------------------
def build_dim_date_from_sales(sales_all: pd.DataFrame, *, date_col: str = "BILLDATE") -> pd.DataFrame:
    d = _to_dt(sales_all[date_col]).dropna().dt.normalize()
    if d.empty:
        return pd.DataFrame(columns=["Date", "DateKey", "Year", "Month", "Day", "YearMonth", "Quarter", "WeekNum"])

    date_range = pd.date_range(d.min(), d.max(), freq="D")
    dim = pd.DataFrame({"Date": date_range})
    dim["DateKey"] = dim["Date"].dt.strftime("%Y%m%d").astype(int)
    dim["Year"] = dim["Date"].dt.year
    dim["Month"] = dim["Date"].dt.month
    dim["Day"] = dim["Date"].dt.day
    dim["YearMonth"] = dim["Date"].dt.strftime("%Y-%m")
    dim["Quarter"] = dim["Date"].dt.quarter
    dim["WeekNum"] = dim["Date"].dt.isocalendar().week.astype(int)
    return dim

# ------------------------
# DimBranch
# ------------------------
def build_dim_branch(sales_all: pd.DataFrame, *, branch_col: str = "BRANCH") -> pd.DataFrame:
    dim = pd.DataFrame({"BRANCH": _clean_str(sales_all[branch_col])}).dropna()
    dim = dim[dim["BRANCH"] != ""].drop_duplicates().sort_values("BRANCH").reset_index(drop=True)
    dim["BranchKey"] = dim["BRANCH"]
    return dim

# ------------------------
# DimProduct (BCODE)
# ------------------------
def build_dim_product(
    sales_all: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
    detail_col: str = "DETAIL",
    ui_col: str = "UI",
    last_seen_date_col: str = "BILLDATE",
) -> pd.DataFrame:
    df = sales_all.copy()
    df[bcode_col] = _clean_str(df[bcode_col])
    df[detail_col] = _clean_str(df.get(detail_col, ""))
    df[ui_col] = _clean_str(df.get(ui_col, ""))
    df[last_seen_date_col] = _to_dt(df[last_seen_date_col])

    df = df[df[bcode_col].notna() & (df[bcode_col] != "")]
    df = df.sort_values([bcode_col, last_seen_date_col], kind="mergesort")
    last = df.groupby(bcode_col, sort=False).tail(1)

    dim = pd.DataFrame({
        "BCODE": last[bcode_col],
        "DETAIL": last.get(detail_col, pd.Series([pd.NA]*len(last))),
        "UI": last.get(ui_col, pd.Series([pd.NA]*len(last))),
        "LastSeenDate": last[last_seen_date_col].dt.normalize(),
    }).reset_index(drop=True)

    dim["ProductKey"] = dim["BCODE"]
    # add CATEGORY_CODE (first 2 digits) for easy relationship too
    dim["CATEGORY_CODE"] = dim["BCODE"].astype("string").str.slice(0, 2)
    return dim

# ------------------------
# DimCategory (first 2 digits of BCODE)
# ------------------------
def build_dim_category_from_bcode(
    sales_all: pd.DataFrame,
    *,
    bcode_col: str = "BCODE",
) -> pd.DataFrame:
    b = _clean_str(sales_all[bcode_col])
    cat = b.dropna().str.slice(0, 2)
    # keep only exactly 2 digits
    cat = cat[cat.str.match(r"^\d{2}$", na=False)]

    dim = pd.DataFrame({"CATEGORY_CODE": cat}).drop_duplicates().sort_values("CATEGORY_CODE").reset_index(drop=True)
    dim["CategoryKey"] = dim["CATEGORY_CODE"]
    return dim

# ------------------------
# DimCustomer (ACCTNO)
# ------------------------
def build_dim_customer(sales_all: pd.DataFrame, *, customer_col="ACCTNO") -> pd.DataFrame:
    c = _clean_str(sales_all[customer_col]) if customer_col in sales_all.columns else pd.Series([pd.NA]*len(sales_all), dtype="string")
    dim = pd.DataFrame({"CustomerKey": c}).dropna()
    dim = dim[dim["CustomerKey"] != ""].drop_duplicates(subset=["CustomerKey"]).sort_values("CustomerKey").reset_index(drop=True)
    dim["CUSTOMER_ACCTNO"] = dim["CustomerKey"]
    return dim

# ------------------------
# DimSupplier (ACCT_NO)
# ------------------------
def build_dim_supplier(sales_all: pd.DataFrame, *, supplier_col="ACCT_NO") -> pd.DataFrame:
    s = _clean_str(sales_all[supplier_col]) if supplier_col in sales_all.columns else pd.Series([pd.NA]*len(sales_all), dtype="string")
    dim = pd.DataFrame({"SupplierKey": s}).dropna()
    dim = dim[dim["SupplierKey"] != ""].drop_duplicates(subset=["SupplierKey"]).sort_values("SupplierKey").reset_index(drop=True)
    dim["SUPPLIER_ACCT_NO"] = dim["SupplierKey"]
    return dim

# -----------------------------
# DIM BILLTYPE (from BILLNO)
# -----------------------------
KNOWN_TYPES = ["TFV", "TAD", "TAR", "TR", "TD", "TF", "CN"]

def build_dim_billtype(sales_all):
    dim = pd.DataFrame({"BILLTYPE_STD": _clean_str(sales_all["BILLTYPE_STD"]).str.upper()})
    dim = dim.drop_duplicates().sort_values("BILLTYPE_STD").reset_index(drop=True)
    dim["BillTypeKey"] = dim["BILLTYPE_STD"]
    return dim

# ------------------------
# Wrapper
# ------------------------
def build_all_dims(sales_all):
    return {
        "dim_date": build_dim_date_from_sales(sales_all),
        "dim_product": build_dim_product(sales_all),
        "dim_category": build_dim_category_from_bcode(sales_all),
        "dim_customer": build_dim_customer(sales_all),
        "dim_supplier": build_dim_supplier(sales_all),
        "dim_branch": build_dim_branch(sales_all),
        "dim_billtype": build_dim_billtype(sales_all),
    }


In [20]:
dims = build_all_dims(sales_all)
{k: v.shape for k, v in dims.items()}

{'dim_date': (3889, 8),
 'dim_product': (34042, 6),
 'dim_category': (37, 2),
 'dim_customer': (2226, 2),
 'dim_supplier': (805, 2),
 'dim_branch': (2, 2),
 'dim_billtype': (8, 2)}

In [21]:
import os
out_dir = "/content/drive/MyDrive/kcw_analytics/03_curated"
os.makedirs(out_dir, exist_ok=True)

for name, df in dims.items():
    df.to_csv(f"{out_dir}/{name}.csv", index=False, encoding="utf-8-sig")


In [22]:
sales_all.to_csv(
    f"{out_dir}/fact_sales_all.csv",
    index=False,
    encoding="utf-8-sig"   # important for Thai + Excel
)

**DEBUG**

In [23]:
sales_all

Unnamed: 0,BILLDATE,BILLNO,BCODE,DETAIL,STATUS,ISVAT,QTY,UI,MTP,PRICE,...,AMOUNT_NUM,IS_VALID,INVALID_REASON,ROW_ID,LAST_PURCHASE_DATE,LAST_PURCHASE_COST,COST_STATUS,BRANCH,BRANCH_BILLNO,BILLTYPE_STD
0,2015-06-20,KC1506-0006,,สลักปีกนก,1.0,N,1.0,ตัว,1.0,270.0,...,270.0,False,BAD_BCODE,0,2015-06-19,,OK,HQ,HQ-KC1506-0006,UNKNOWN
1,2015-06-20,KC1506-0006,,ลูกหมากปีกนกแท้ D-MAX,1.0,N,1.0,ตัว,1.0,750.0,...,750.0,False,BAD_BCODE,1,2015-06-19,,OK,HQ,HQ-KC1506-0006,UNKNOWN
2,2015-06-20,KC1506-0006,,แผ่นผ้าทราย,1.0,N,2.0,แผ่น,1.0,15.0,...,30.0,False,BAD_BCODE,2,2015-06-19,,OK,HQ,HQ-KC1506-0006,UNKNOWN
3,2015-06-20,KC1506-0006,,ยางกันโครง,1.0,N,1.0,ตัว,1.0,20.0,...,20.0,False,BAD_BCODE,3,2015-06-19,,OK,HQ,HQ-KC1506-0006,UNKNOWN
4,2015-06-20,KC1506-0006,,ยางรองคอยสปริง,1.0,N,2.0,ตัว,1.0,70.0,...,140.0,False,BAD_BCODE,4,2015-06-19,,OK,HQ,HQ-KC1506-0006,UNKNOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227708,2026-02-09,32K69-0001009,07051647,"ไส้กรองเครื่อง กระดาษ BT50 PRO,R/G 2012",1.0,N,1.0,หน่วย,1.0,290.0,...,290.0,True,,33309,2026-01-29,188.600000,OK,SYP,SYP-32K69-0001009,UNKNOWN
1227709,2026-02-10,33K69-0001017,13018771,ปะเก็นหนัง 0.8มิล กลาง 0.8 มิล,1.0,N,1.0,หน่วย,1.0,80.0,...,80.0,True,,33310,2026-01-06,42.500000,OK,SYP,SYP-33K69-0001017,UNKNOWN
1227710,2026-02-10,32K69-0001010,14050055,กาวมหาอุด ตราช้าง,1.0,N,1.0,ชุด,1.0,120.0,...,120.0,True,,33311,2026-02-03,94.000000,OK,SYP,SYP-32K69-0001010,UNKNOWN
1227711,2026-02-10,32K69-0001011,22010003,น้ำมันเบรคเชลล์ บ. 1LT DOT3,1.0,N,1.0,กป.,1.0,190.0,...,190.0,True,,33312,2026-02-03,165.953403,OK,SYP,SYP-32K69-0001011,UNKNOWN


In [24]:
df_unknown = syp_sales_enriched[syp_sales_enriched["LAST_PURCHASE_COST"].isna()]
df_unknown


Unnamed: 0,ID,JOURMODE,JOURTYPE,JOURDATE,BILLTYPE,BILLDATE,BILLNO,LINE,ITEMNO,BCODE,...,CANCELED,PRICE_NUM,AMOUNT_NUM,IS_VALID,INVALID_REASON,ROW_ID,LAST_PURCHASE_DATE,LAST_PURCHASE_COST,COST_STATUS,BRANCH
50,20050,2,SJ,2025-06-25 00:00:00,1,2025-06-25,3K68-0000032,10,,70010011,...,N,400.0,-400.0,True,,50,NaT,,UNKNOWN,SYP
198,70107,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000122,10,,13010000,...,N,1000.0,1000.0,True,,198,NaT,,UNKNOWN,SYP
221,70133,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000135,50,,13010000,...,N,1000.0,-1000.0,True,,221,NaT,,UNKNOWN,SYP
239,70155,2,SJ,2025-07-01 00:00:00,1,2025-07-01,3K68-0000143,20,,70010011,...,N,400.0,-400.0,True,,239,NaT,,UNKNOWN,SYP
452,100340,2,SJ,2025-07-04 00:00:00,1,2025-07-04,3K68-0000193,10,,70010011,...,N,400.0,-400.0,True,,452,NaT,,UNKNOWN,SYP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32449,1926473,2,SJ,2026-02-05 00:00:00,1,2026-02-05,33K69-0000885,20,,70010400,...,N,400.0,-400.0,True,,32449,NaT,,UNKNOWN,SYP
32915,1936716,2,SJ,2026-02-07 00:00:00,1,2026-02-07,33K69-0000953,30,,70010300,...,N,300.0,-300.0,True,,32915,NaT,,UNKNOWN,SYP
33131,1946911,2,SJ,2026-02-09 00:00:00,1,2026-02-09,33K69-0000979,20,,70010400,...,N,400.0,-400.0,True,,33131,NaT,,UNKNOWN,SYP
33186,1946966,2,SJ,2026-02-09 00:00:00,1,2026-02-09,32K69-0000980,10,,70010400,...,N,400.0,-800.0,True,,33186,NaT,,UNKNOWN,SYP


In [25]:
pidet = data["raw_hq_pidet_purchase_lines.csv"].copy()
pidet_filtered = pidet[pidet["BCODE"] == "13010000"].copy()

pidet_filtered[["BCODE", 'BILLDATE', "QTY", "MTP", "PRICE", "AMOUNT"]]

Unnamed: 0,BCODE,BILLDATE,QTY,MTP,PRICE,AMOUNT


In [26]:
df_tfv_syp = sales_all[
    (sales_all["BILLTYPE_STD"] == "TAR") &
    (sales_all["BRANCH"] == "SYP")
]

df_tfv_syp

Unnamed: 0,BILLDATE,BILLNO,BCODE,DETAIL,STATUS,ISVAT,QTY,UI,MTP,PRICE,...,AMOUNT_NUM,IS_VALID,INVALID_REASON,ROW_ID,LAST_PURCHASE_DATE,LAST_PURCHASE_COST,COST_STATUS,BRANCH,BRANCH_BILLNO,BILLTYPE_STD
1194409,2025-06-23,3TAR6806-001,15018750,ลูกปืน 30-72-19 6306 2RS มีฝายาง,1.0,Y,1.0,ตับ,1.0,170.0,...,170.0,True,,10,2025-05-15,109.350000,OK,SYP,SYP-3TAR6806-001,TAR
1194410,2025-06-23,3TAR6806-001,15013500,ลูกปืน (ล้อหน้า L2600) 6205 2RSCM(ล้อหน้า L,1.0,Y,1.0,ตับ,1.0,85.0,...,85.0,True,,11,2025-06-13,51.400000,OK,SYP,SYP-3TAR6806-001,TAR
1194411,2025-06-23,3TAR6806-001,13022630,"สายพาน 52"" 12.5x1350",1.0,Y,1.0,เส้น,1.0,190.0,...,190.0,True,,12,2025-05-05,116.912500,OK,SYP,SYP-3TAR6806-001,TAR
1194412,2025-06-23,3TAR6806-001,14050200,สีสเปรย์ #36(No.300) บอร์นเงิน,1.0,Y,1.0,ก.ป.,1.0,45.0,...,45.0,True,,13,2025-06-10,30.373750,OK,SYP,SYP-3TAR6806-001,TAR
1194413,2025-06-23,3TAR6806-001,22050259,น้ำมันเบรค Brembo 0.5 L,1.0,Y,1.0,กป.,1.0,110.0,...,110.0,True,,14,2024-11-16,81.000000,OK,SYP,SYP-3TAR6806-001,TAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227491,2026-02-07,3TAR6901-284,21050279,แบตเตอรี่ FB HYBRID 3000 L,1.0,Y,1.0,หน่วย,1.0,3350.0,...,3350.0,True,,33092,2025-12-20,2788.928333,OK,SYP,SYP-3TAR6901-284,TAR
1227492,2026-02-07,3TAR6901-284,22051820,น้ำยาหม้อน้ำ มิตซูบิชิ 5L,1.0,Y,1.0,กป.,1.0,390.0,...,390.0,True,,33093,2025-12-08,276.500000,OK,SYP,SYP-3TAR6901-284,TAR
1227493,2026-02-07,3TAR6901-285,03051342,ท่อยางหม้อน้ำ บน D-MAX 2.5 4JA1,1.0,Y,1.0,ท่อน,1.0,160.0,...,160.0,True,,33094,2026-01-05,110.200000,OK,SYP,SYP-3TAR6901-285,TAR
1227494,2026-02-07,3TAR6901-285,13052134,เข็มขัดรัดท่อยาง-สแตนเลสกว้าง 30-40 (1.1/8-1.5...,1.0,Y,2.0,ตัว,1.0,25.0,...,50.0,True,,33095,2025-11-06,11.130000,OK,SYP,SYP-3TAR6901-285,TAR
