In [1]:
!pip install sqlalchemy pyodbc



In [2]:
import getpass
import os
import re
import urllib.parse

import pandas as pd
from sqlalchemy import create_engine

# --- SQL Server connection settings ---------------------------------------
server = r'KSS'
database = 'PARTS9'
username = "python_reader"

# The password must not live in the notebook (it ends up in version control
# and in every shared copy). Read it from the KCW_DB_PASSWORD environment
# variable; when that is unset or empty, prompt for it interactively.
# NOTE(review): the previously committed password should be rotated.
password = os.environ.get("KCW_DB_PASSWORD") or getpass.getpass(
    f"Password for {username}@{server}: "
)

# Raw ODBC connection string for the SQL Server driver.
odbc_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    f"SERVER={server};"
    f"DATABASE={database};"
    f"UID={username};"
    f"PWD={password};"
    "TrustServerCertificate=yes;"
)

# SQLAlchemy accepts a pass-through ODBC string via `odbc_connect`; it has to
# be URL-encoded because it contains characters such as ';', '=' and '{'.
engine = create_engine("mssql+pyodbc:///?odbc_connect=" + urllib.parse.quote_plus(odbc_str))


In [3]:
# Table and column names cannot be bound as SQL parameters, so they are
# checked against a strict identifier pattern before being interpolated.
_SQL_IDENTIFIER = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")


def read_last_5y(table, years=5, date_col="BILLDATE"):
    """Read the most recent `years` years of rows from ``dbo.<table>``.

    The window is anchored on the newest ``date_col`` value present in the
    table itself (not on today's date): rows are kept when
    ``date_col >= DATEADD(YEAR, -years, MAX(date_col))``.

    Parameters
    ----------
    table : str
        Name of a table in the ``dbo`` schema. Must be a plain identifier
        (letters, digits, underscore; not starting with a digit).
    years : int, default 5
        Size of the look-back window in years; must be a positive integer.
    date_col : str, default "BILLDATE"
        Date column used for the filter; same identifier rules as `table`.

    Returns
    -------
    pandas.DataFrame
        All columns of the selected rows, read through the notebook-level
        ``engine``.

    Raises
    ------
    ValueError
        If `table` or `date_col` is not a plain identifier, or `years` is
        not a positive integer.
    """
    for label, ident in (("table", table), ("date_col", date_col)):
        if not isinstance(ident, str) or not _SQL_IDENTIFIER.fullmatch(ident):
            raise ValueError(f"Invalid SQL identifier for {label}: {ident!r}")
    # bool is a subclass of int, so reject it explicitly.
    if isinstance(years, bool) or not isinstance(years, int) or years <= 0:
        raise ValueError(f"years must be a positive integer, got {years!r}")

    query = f"""
    SELECT *
    FROM dbo.{table}
    WHERE {date_col} >= DATEADD(YEAR, -{years}, (SELECT MAX({date_col}) FROM dbo.{table}))
    """
    return pd.read_sql(query, engine)

# Transaction tables: each one is pulled through read_last_5y, in the order
# SIDET, PIDET, SIMAS, PIMAS.
df_hq_sidet, df_hq_pidet, df_hq_simas, df_hq_pimas = (
    read_last_5y(table_name)
    for table_name in ("SIDET", "PIDET", "SIMAS", "PIMAS")
)

# ICMAS is read in full, without any date filter.
icmas_sql = "SELECT * FROM dbo.ICMAS;"
df_hq_icmas = pd.read_sql(icmas_sql, engine)

In [4]:
import os

# Switch the working directory to the raw-data folder. The files below, and
# the read_csv calls in the next cell, all use bare file names relative to it.
os.chdir(r"G:\Shared drives\KCW-Data\kcw_analytics\01_raw")

# Write every HQ extract as UTF-8 with BOM, without the index column.
for csv_name, hq_frame in (
    ("raw_hq_icmas_products.csv", df_hq_icmas),
    ("raw_hq_pidet_purchase_lines.csv", df_hq_pidet),
    ("raw_hq_pimas_purchase_bills.csv", df_hq_pimas),
    ("raw_hq_sidet_sales_lines.csv", df_hq_sidet),
    ("raw_hq_simas_sales_bills.csv", df_hq_simas),
):
    hq_frame.to_csv(csv_name, index=False, encoding="utf-8-sig")

In [5]:
# `data` maps a raw file name to its DataFrame.
# SYP frames are read from CSV files in the current working directory.
data = {
    syp_file: pd.read_csv(syp_file)
    for syp_file in (
        "raw_syp_pimas_purchase_bills.csv",
        "raw_syp_pidet_purchase_lines.csv",
        "raw_syp_sidet_sales_lines.csv",
        "raw_syp_simas_sales_bills.csv",
    )
}

# HQ frames come from the in-memory SQL extracts; copies are stored so that
# later changes to `data` entries do not touch the df_hq_* originals.
data.update(
    {
        "raw_hq_pimas_purchase_bills.csv": df_hq_pimas.copy(),
        "raw_hq_pidet_purchase_lines.csv": df_hq_pidet.copy(),
        "raw_hq_sidet_sales_lines.csv": df_hq_sidet.copy(),
        "raw_hq_simas_sales_bills.csv": df_hq_simas.copy(),
        "raw_hq_icmas_products.csv": df_hq_icmas.copy(),
    }
)

In [6]:
# Independent working copies: HQ sales lines, SYP sales lines, and the HQ
# purchase lines.
hq_sales_lines, syp_sales_lines, purchase_lines = (
    data[source_key].copy()
    for source_key in (
        "raw_hq_sidet_sales_lines.csv",
        "raw_syp_sidet_sales_lines.csv",
        "raw_hq_pidet_purchase_lines.csv",
    )
)

In [7]:
import sys
import importlib

# Make the local repository checkout importable (appended once only).
repo_path = r"C:\Users\Windows 11\Notebook\kcw-analytics"
if repo_path not in sys.path:
    sys.path.append(repo_path)

# Import the helper module as a whole, then reload it so that edits made to
# the .py file since the kernel started are picked up.
supabase_utils = importlib.import_module("src.kcw.supabase_utils")
supabase_utils = importlib.reload(supabase_utils)

# Short aliases, bound after the reload so they point at the fresh functions.
add_sales_quality_flags = supabase_utils.add_sales_quality_flags
enrich_sales_with_last_purchase_cost = supabase_utils.enrich_sales_with_last_purchase_cost
refill_last_cost_from_icmas = supabase_utils.refill_last_cost_from_icmas
qc_unknown = supabase_utils.qc_unknown
build_all_dims = supabase_utils.build_all_dims

In [8]:
# Flag SYP sales lines; rows are only annotated, none are removed.
syp_sales_flagged = add_sales_quality_flags(syp_sales_lines)

# QC summary: share of rows whose IS_VALID flag is False.
# NOTE(review): the saved output of this cell reports every SYP row as invalid
# (almost all BAD_BCODE) - possibly a column/dtype mismatch in the SYP CSVs;
# confirm before relying on syp_sales_valid_only.
syp_valid_mask = syp_sales_flagged["IS_VALID"]
total = len(syp_sales_flagged)
invalid = (~syp_valid_mask).sum()
print(f"Invalid: {invalid:,}/{total:,} ({invalid/total*100:.2f}%)")

# Top 20 invalid-reason combinations; rows without a reason are shown as "OK".
syp_reason_counts = syp_sales_flagged["INVALID_REASON"].fillna("OK").value_counts()
print(syp_reason_counts.head(20))

# Optional analytics subset: only the rows flagged as valid.
syp_sales_valid_only = syp_sales_flagged[syp_valid_mask].copy()

Invalid: 35,910/35,910 (100.00%)
INVALID_REASON
BAD_BCODE               35904
BAD_BCODE|BAD_AMOUNT        6
Name: count, dtype: Int64


In [9]:
# Flag HQ sales lines; rows are only annotated, none are removed.
hq_sales_flagged = add_sales_quality_flags(hq_sales_lines)

# QC summary (no deletion): share of rows whose IS_VALID flag is False.
total = len(hq_sales_flagged)
invalid = (~hq_sales_flagged["IS_VALID"]).sum()
print(f"Invalid: {invalid:,}/{total:,} ({invalid/total*100:.2f}%)")

# Top 20 invalid-reason combinations; rows without a reason are shown as "OK".
print(
    hq_sales_flagged["INVALID_REASON"]
    .fillna("OK")
    .value_counts()
    .head(20)
)

# For analytics (optional): valid-only subset of the HQ rows.
# Stored under its own name; assigning it to `syp_sales_valid_only` (as this
# cell previously did) silently replaced the SYP subset with HQ data.
hq_sales_valid_only = hq_sales_flagged[hq_sales_flagged["IS_VALID"]].copy()

Invalid: 5,933/732,407 (0.81%)
INVALID_REASON
OK                                726474
BAD_BCODE                           3796
CANCELED                            1552
BAD_AMOUNT                           385
BAD_PRICE                             90
BAD_BCODE|BAD_PRICE|BAD_AMOUNT        50
BAD_BCODE|BAD_PRICE                   32
BAD_PRICE|BAD_AMOUNT                  22
BAD_BCODE|CANCELED                     5
BAD_AMOUNT|CANCELED                    1
Name: count, dtype: Int64


In [10]:
# Step 1: attach the last purchase cost to each HQ sales line, using the HQ
# purchase lines as the cost source.
hq_sales_enriched = enrich_sales_with_last_purchase_cost(hq_sales_flagged, purchase_lines)
qc_unknown(hq_sales_enriched, "before refill")

# Step 2: refill LAST_PURCHASE_COST from ICMAS via the `data` dict.
# NOTE(review): the saved output shows the same UNKNOWN count before and after
# this step - verify that the refill actually finds and uses the ICMAS frame.
hq_sales_enriched = refill_last_cost_from_icmas(
    data, hq_sales_enriched, last_cost_col="LAST_PURCHASE_COST"
)
qc_unknown(hq_sales_enriched, "after refill")

[before refill] UNKNOWN: 65,973 / 732,407 (9.01%)
[after refill] UNKNOWN: 65,973 / 732,407 (9.01%)


In [11]:

# Step 1: attach the last purchase cost to each SYP sales line.
# NOTE(review): `purchase_lines` is the HQ purchase extract; the SYP purchase
# lines loaded into `data` are not used here - confirm this is intended.
syp_sales_enriched = enrich_sales_with_last_purchase_cost(syp_sales_flagged, purchase_lines)
qc_unknown(syp_sales_enriched, "before refill")

# Step 2: refill LAST_PURCHASE_COST from ICMAS via the `data` dict.
syp_sales_enriched = refill_last_cost_from_icmas(
    data, syp_sales_enriched, last_cost_col="LAST_PURCHASE_COST"
)
qc_unknown(syp_sales_enriched, "after refill")

[before refill] UNKNOWN: 35,904 / 35,910 (99.98%)
[after refill] UNKNOWN: 35,904 / 35,910 (99.98%)


In [12]:

# Tag every row with the branch it came from, so the frames stay
# distinguishable once they are concatenated.
for branch_frame, branch_code in (
    (hq_sales_enriched, "HQ"),
    (syp_sales_enriched, "SYP"),
):
    branch_frame["BRANCH"] = branch_code

In [13]:
# Stack both branches into one frame with a fresh 0..n-1 index.
branch_frames = [hq_sales_enriched, syp_sales_enriched]
sales_all = pd.concat(branch_frames, ignore_index=True)

# Normalise dtypes; values that cannot be parsed become NaN / NaT.
sales_all = sales_all.assign(
    BRANCH=lambda frame: frame["BRANCH"].astype("string"),
    LAST_PURCHASE_COST=lambda frame: pd.to_numeric(frame["LAST_PURCHASE_COST"], errors="coerce"),
    BILLDATE=lambda frame: pd.to_datetime(frame["BILLDATE"], errors="coerce"),
)

# Composite key "<BRANCH>-<BILLNO>".
billno_text = sales_all["BILLNO"].astype(str)
sales_all["BRANCH_BILLNO"] = sales_all["BRANCH"] + "-" + billno_text

In [14]:
# Derive a standard bill-type code from the bill number prefix.
billno_upper = sales_all["BILLNO"].astype("string").str.upper()

# Drop a single leading "3" when present, so the prefix match sees the letters.
billno_core = billno_upper.str.replace(r"^3", "", regex=True)

# Longer alternatives come first in the pattern (TFV before TF, TAD/TAR before
# TD/TR) so the most specific prefix wins.
billtype_prefix = billno_core.str.extract(r"^(TFV|TAD|TAR|TR|TD|TF|CN|DN)", expand=False)

# Anything without a recognised prefix is labelled UNKNOWN.
sales_all["BILLTYPE_STD"] = billtype_prefix.fillna("UNKNOWN")

In [15]:
# Columns kept in the final fact table, in output order.
KEEP_COLS = [
    'BILLDATE', 'BILLTYPE', 'JOURMODE', 'BILLNO', 'BCODE', 'DETAIL',
    'STATUS', 'ISVAT', 'QTY', 'UI', 'MTP',
    'PRICE', 'XPRICE',
    'DISCNT1', 'DISCNT2', 'DISCNT3', 'DISCNT4',
    'DED', 'VAT', 'AMOUNT',
    'ACCTNO', 'PAID', 'ACCT_NO', 'DONE', 'CANCELED',
    'PRICE_NUM', 'AMOUNT_NUM', 'IS_VALID', 'INVALID_REASON', 'ROW_ID',
    'LAST_PURCHASE_DATE', 'LAST_PURCHASE_COST', 'COST_STATUS',
    'BRANCH', 'BRANCH_BILLNO', 'BILLTYPE_STD',
]

# Silently skip names that are absent from the frame, which avoids a KeyError.
available_cols = [col for col in KEEP_COLS if col in sales_all.columns]
sales_all = sales_all[available_cols].copy()

In [16]:
# Build all dimension tables from the combined sales frame; the result is
# used below as a mapping of dimension name -> DataFrame.
dims = build_all_dims(sales_all)

# Quick sanity check: (rows, columns) of each dimension table.
{dim_name: dim_frame.shape for dim_name, dim_frame in dims.items()}

{'dim_date': (1827, 8),
 'dim_product': (35435, 6),
 'dim_category': (40, 2),
 'dim_account': (2360, 4),
 'dim_branch': (2, 3),
 'dim_billtype': (9, 2)}

In [17]:
import shutil
from pathlib import Path

# Output folder for the curated CSV files (dimension tables and the combined
# sales fact table) written by the cells below.
out_dir = Path(r"G:\Shared drives\KCW-Data\kcw_analytics\03_curated")

In [18]:

import os

# Create the output folder (and any missing parents) if it does not exist yet.
os.makedirs(out_dir, exist_ok=True)

# One CSV per dimension table, named after its key in `dims`.
# The path is joined with pathlib rather than an f-string with "/", so it is
# not built with mixed "\" and "/" separators.
# utf-8-sig writes a BOM, matching the other exports in this notebook.
for name, df in dims.items():
    df.to_csv(out_dir / f"{name}.csv", index=False, encoding="utf-8-sig")

In [19]:

# Write the combined sales fact table next to the dimension tables.
# The path is joined with pathlib rather than an f-string with "/", so it is
# not built with mixed "\" and "/" separators.
sales_all.to_csv(
    out_dir / "fact_sales_all.csv",
    index=False,
    encoding="utf-8-sig"   # important for Thai + Excel
)