In [1]:
!git clone https://github.com/pthengtr/kcw-analytics.git

Cloning into 'kcw-analytics'...
remote: Enumerating objects: 156, done.
remote: Counting objects: 100% (156/156), done.
remote: Compressing objects: 100% (109/109), done.
remote: Total 156 (delta 82), reused 71 (delta 20), pack-reused 0 (from 0)
Receiving objects: 100% (156/156), 119.39 KiB | 2.78 MiB/s, done.
Resolving deltas: 100% (82/82), done.


In [2]:
# Update the existing checkout at /content/kcw-analytics with the latest
# commits from the `main` branch of `origin`.
!cd /content/kcw-analytics && git pull origin main

From https://github.com/pthengtr/kcw-analytics
 * branch            main       -> FETCH_HEAD
Already up to date.


In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the raw CSV folder read later in this
# notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import pandas as pd

# Folder on the mounted Drive that holds the raw CSV exports.
folder = "/content/drive/MyDrive/kcw_analytics/01_raw"

# Maps CSV file name -> DataFrame with that file's contents.
data = {}

# os.listdir() returns entries in arbitrary order; sorting makes the load
# order, the log lines below and the key order of `data` reproducible.
for file in sorted(os.listdir(folder)):
    if file.endswith(".csv"):
        path = os.path.join(folder, file)
        data[file] = pd.read_csv(
            path,
            # Read identifier-like columns as text instead of letting pandas
            # infer a numeric dtype for them.
            dtype={
              "BCODE": "string",
              "ITEMNO": "string",
              "BILLNO": "string",
            },
            encoding="utf-8-sig",  # skips a UTF-8 byte-order mark if the file starts with one
            low_memory=False,  # parse in one pass so dtypes are not inferred chunk by chunk
        )
        print(f"Loaded: {file} -> {data[file].shape}")



Loaded: raw_hq_pimas_purchase_bills.csv -> (82716, 49)
Loaded: raw_hq_simas_sales_bills.csv -> (481937, 49)
Loaded: raw_hq_pidet_purchase_lines.csv -> (246580, 41)
Loaded: raw_hq_sidet_sales_lines.csv -> (1187205, 38)
Loaded: raw_hq_icmas_products.csv -> (114755, 94)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2721, 49)
Loaded: raw_syp_simas_sales_bills.csv -> (10368, 49)
Loaded: raw_syp_sidet_sales_lines.csv -> (30374, 38)
Loaded: raw_syp_pidet_purchase_lines.csv -> (25555, 41)
Loaded: raw_inventory_hq_2024.csv -> (4983, 8)


In [12]:
# Work on independent copies so the frames held in `data` stay untouched.
hq_sales_lines, syp_sales_lines = (
    data[csv_name].copy()
    for csv_name in (
        "raw_hq_sidet_sales_lines.csv",
        "raw_syp_sidet_sales_lines.csv",
    )
)

In [7]:
# Display the column names of the HQ sales-lines frame (inspection only;
# nothing is modified).
hq_sales_lines.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'BILLTYPE', 'BILLDATE',
       'BILLNO', 'LINE', 'ITEMNO', 'BCODE', 'PCODE', 'MCODE', 'DETAIL',
       'WHNUMBER', 'LOCATION1', 'STATUS', 'SERIAL', 'TAXIC', 'EXMPT', 'ISVAT',
       'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'CHGAMT', 'ACCTNO', 'PAID',
       'ACCT_NO', 'DONE', 'CANCELED'],
      dtype='object')

In [8]:
import pandas as pd
import numpy as np
import re

# Explicit ASCII class on purpose: in a str pattern `\d` also matches non-ASCII
# decimal digits (e.g. Thai or Arabic-Indic numerals), which would let values
# through that do not look like the documented example 22010585.
_BCODE_RE = re.compile(r"^[0-9]{8}$")

def remove_invalid_bcode(df: pd.DataFrame, *, bcode_col: str = "BCODE"):
    """
    Drop rows whose BCODE is not exactly eight ASCII digits.

    Rule:
    - strip whitespace
    - BCODE must be exactly 8 digits 0-9 (e.g., 22010585)
    - missing BCODE values count as invalid

    Parameters:
    - df: input frame; it is not modified.
    - bcode_col: name of the column holding the code.

    Output:
    - clean_df: copy of the valid rows, with the BCODE column replaced by its
      whitespace-stripped string version
    - removed_idx: list of index labels of the rows that were dropped
    """
    out = df.copy()

    b = out[bcode_col].astype("string").str.strip()
    out[bcode_col] = b  # keep cleaned BCODE

    # Missing values become "" so they fail the pattern instead of yielding
    # <NA> in the mask. fullmatch requires the whole value to match.
    valid = b.fillna("").str.fullmatch(_BCODE_RE.pattern)
    removed_idx = out.index[~valid].tolist()

    clean_df = out.loc[valid].copy()
    return clean_df, removed_idx


def _to_numeric_clean(series: pd.Series) -> pd.Series:
    """
    Convert common messy numeric strings to numbers.
    Handles: whitespace, commas. Non-convertible -> NaN.
    """
    s = series.astype("string").str.strip()
    s = s.str.replace(",", "", regex=False)  # "1,234.50" -> "1234.50"
    return pd.to_numeric(s, errors="coerce")


def remove_non_numeric_price_or_amount(
    df: pd.DataFrame,
    *,
    price_col: str = "PRICE",
    amount_col: str = "AMOUNT",
):
    """
    Keep only rows whose PRICE and AMOUNT both parse as numbers.

    Both columns are converted with `_to_numeric_clean`; a row is dropped when
    either converted value is missing. The input frame is not modified.

    Output:
    - clean_df: copy of the kept rows, with both columns replaced by their
      numeric versions
    - removed_idx: list of index labels of the dropped rows
    """
    out = df.copy()

    # Parse both columns before overwriting either of them.
    parsed_price = _to_numeric_clean(out[price_col])
    parsed_amount = _to_numeric_clean(out[amount_col])

    # Store the numeric versions so downstream code gets numbers, not text.
    out[price_col] = parsed_price
    out[amount_col] = parsed_amount

    invalid = parsed_price.isna() | parsed_amount.isna()

    removed_idx = out.index[invalid].tolist()
    clean_df = out.loc[~invalid].copy()
    return clean_df, removed_idx


def remove_canceled_lines(df: pd.DataFrame, *, canceled_col: str = "CANCELED"):
    """
    Drop rows that are flagged as canceled.

    Rule:
    - remove rows where CANCELED == 'Y' (case/whitespace insensitive)
    - rows with a missing CANCELED value are kept (they are not flagged 'Y')

    Parameters:
    - df: input frame; it is not modified.
    - canceled_col: name of the column holding the cancellation flag.

    Output:
    - clean_df: copy of the rows that are not canceled
    - removed_idx: list of index labels of the canceled rows
    """
    out = df.copy()
    c = out[canceled_col].astype("string").str.strip().str.upper()

    # The comparison is <NA> where the flag is missing, and boolean indexing
    # treats <NA> as False for both the mask and its negation. Without the
    # fillna, such rows would disappear from clean_df without being listed in
    # removed_idx.
    is_canceled = (c == "Y").fillna(False).astype(bool)
    removed_idx = out.index[is_canceled].tolist()

    clean_df = out.loc[~is_canceled].copy()
    return clean_df, removed_idx


In [19]:
# NOTE: `pct`, `total_rows`, `total_removed` and the `removed_*` lists are
# notebook-wide names; the HQ cell rebinds all of them when it runs.

# Baseline for every percentage below: SYP row count before any rule is applied.
total_rows = len(syp_sales_lines)


def pct(n, total):
    """Return n as a percentage of total, or 0 when total is zero."""
    if not total:
        return 0
    return n / total * 100


# Apply the three cleaning rules in sequence. Each step consumes the previous
# step's output and returns the index labels it dropped.
syp_sales_lines_cleaned, removed_bcode = remove_invalid_bcode(syp_sales_lines)
syp_sales_lines_cleaned, removed_nonnum = remove_non_numeric_price_or_amount(syp_sales_lines_cleaned)
syp_sales_lines_cleaned, removed_canceled = remove_canceled_lines(syp_sales_lines_cleaned)

# Per-rule report, each expressed against the original row count.
print(f"Invalid BCODE removed: {len(removed_bcode)} ({pct(len(removed_bcode), total_rows):.2f}%)")
print(f"Non-numeric PRICE/AMOUNT removed: {len(removed_nonnum)} ({pct(len(removed_nonnum), total_rows):.2f}%)")
print(f"Canceled lines removed: {len(removed_canceled)} ({pct(len(removed_canceled), total_rows):.2f}%)")

# Overall loss is derived from the frame sizes rather than summing the lists.
total_removed = total_rows - len(syp_sales_lines_cleaned)
print(f"SYP Total removed: {total_removed} ({pct(total_removed, total_rows):.2f}%)")

Invalid BCODE removed: 6 (0.02%)
Non-numeric PRICE/AMOUNT removed: 4 (0.01%)
Canceled lines removed: 0 (0.00%)
SYP Total removed: 10 (0.03%)


In [20]:
# NOTE: `pct`, `total_rows`, `total_removed` and the `removed_*` lists are
# notebook-wide names that the SYP cell also binds; running this cell
# overwrites them with the HQ values.

# Baseline for every percentage below: HQ row count before any rule is applied.
total_rows = len(hq_sales_lines)


def pct(n, total):
    """Return n as a percentage of total, or 0 when total is zero."""
    if not total:
        return 0
    return n / total * 100


# Apply the three cleaning rules in sequence. Each step consumes the previous
# step's output and returns the index labels it dropped.
hq_sales_lines_cleaned, removed_bcode = remove_invalid_bcode(hq_sales_lines)
hq_sales_lines_cleaned, removed_nonnum = remove_non_numeric_price_or_amount(hq_sales_lines_cleaned)
hq_sales_lines_cleaned, removed_canceled = remove_canceled_lines(hq_sales_lines_cleaned)

# Per-rule report, each expressed against the original row count.
print(f"Invalid BCODE removed: {len(removed_bcode)} ({pct(len(removed_bcode), total_rows):.2f}%)")
print(f"Non-numeric PRICE/AMOUNT removed: {len(removed_nonnum)} ({pct(len(removed_nonnum), total_rows):.2f}%)")
print(f"Canceled lines removed: {len(removed_canceled)} ({pct(len(removed_canceled), total_rows):.2f}%)")

# Overall loss is derived from the frame sizes rather than summing the lists.
total_removed = total_rows - len(hq_sales_lines_cleaned)
print(f"HQ Total removed: {total_removed} ({pct(total_removed, total_rows):.2f}%)")

Invalid BCODE removed: 28958 (2.44%)
Non-numeric PRICE/AMOUNT removed: 553 (0.05%)
Canceled lines removed: 1698 (0.14%)
HQ Total removed: 31209 (2.63%)
