In [1]:
!pip install sqlalchemy pyodbc



In [2]:
import getpass
import os
import urllib.parse

import pandas as pd
from sqlalchemy import create_engine

# SQL Server connection settings.
server = r'KSS'
database = 'PARTS9'
username = "python_reader"

# The password must not live in the notebook (it would be saved and shared with it).
# Take it from the KCW_DB_PASSWORD environment variable; if that is unset or empty,
# prompt for it interactively (getpass does not echo the input).
# NOTE(review): a password was previously hard-coded in this cell — rotate it.
password = os.environ.get("KCW_DB_PASSWORD") or getpass.getpass(
    f"Password for {username} on {server}: "
)

# Raw ODBC connection string; adjacent literals are concatenated into one string.
odbc_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    f"SERVER={server};"
    f"DATABASE={database};"
    f"UID={username};"
    f"PWD={password};"
    "TrustServerCertificate=yes;"
)

# The ODBC string is URL-encoded because it travels inside the SQLAlchemy URL
# as the value of the `odbc_connect` query parameter.
engine = create_engine("mssql+pyodbc:///?odbc_connect=" + urllib.parse.quote_plus(odbc_str))


In [3]:
def read_last_5y(table, years=5):
    """Load the most recent `years` years of rows from ``dbo.<table>``.

    The window is anchored on the table's own latest ``BILLDATE`` rather than on
    today's date: a row is kept when its ``BILLDATE`` is on or after
    ``MAX(BILLDATE)`` minus `years` years.

    Parameters
    ----------
    table : str
        Unqualified table name in the ``dbo`` schema. It is formatted into the
        SQL text, so only a plain identifier (letters, digits, underscore; not
        starting with a digit) is accepted.
    years : int, default 5
        Size of the look-back window in years.

    Returns
    -------
    pandas.DataFrame
        Every column of the selected rows, read through the notebook-level `engine`.

    Raises
    ------
    ValueError
        If `table` is not a plain identifier or `years` is not a positive integer.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # Identifiers cannot be passed as bound SQL parameters, so the name is
    # validated before it is interpolated into the statement.
    if not isinstance(table, str) or re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", table) is None:
        raise ValueError(f"Invalid table name: {table!r}")
    # bool is a subclass of int; reject it explicitly so True is not read as 1 year.
    if isinstance(years, bool) or not isinstance(years, int) or years <= 0:
        raise ValueError(f"years must be a positive integer, got {years!r}")

    query = f"""
    SELECT *
    FROM dbo.{table}
    WHERE BILLDATE >= DATEADD(YEAR, -{years}, (SELECT MAX(BILLDATE) FROM dbo.{table}))
    """
    return pd.read_sql(query, engine)

In [4]:
# Transaction tables: each is read through read_last_5y, in the order listed.
df_hq_sidet, df_hq_pidet, df_hq_simas, df_hq_pimas = (
    read_last_5y(table_name)
    for table_name in ("SIDET", "PIDET", "SIMAS", "PIMAS")
)

# ICMAS is read in full (no date filter).
df_hq_icmas = pd.read_sql(sql="SELECT * FROM dbo.ICMAS;", con=engine)

In [5]:
# PVMAS is read in full (no date filter).
df_hq_pvmas = pd.read_sql(sql="SELECT * FROM dbo.PVMAS;", con=engine)

In [6]:
import os

# Switch the process working directory to the raw-data folder. The cells that
# follow pass bare file names to to_csv / read_csv, so those names resolve
# against this directory.
os.chdir(r"G:\Shared drives\KCW-Data\kcw_analytics\01_raw")

In [7]:
# Save each HQ extract under its short file name; the names are relative, so the
# files land in the current working directory.
hq_raw_exports = {
    "raw_hq_icmas_products.csv": df_hq_icmas,
    "raw_hq_pidet_purchase_lines.csv": df_hq_pidet,
    "raw_hq_pimas_purchase_bills.csv": df_hq_pimas,
    "raw_hq_sidet_sales_lines.csv": df_hq_sidet,
    "raw_hq_simas_sales_bills.csv": df_hq_simas,
}
for csv_name, frame in hq_raw_exports.items():
    frame.to_csv(csv_name, index=False, encoding="utf-8-sig")

In [8]:
# Write the PVMAS extract as UTF-8 with BOM, without the index column.
df_hq_pvmas.to_csv(
    "raw_hq_pvmas_notes_vouchers.csv",
    encoding="utf-8-sig",
    index=False,
)

In [9]:
# BCODE and BILLNO are forced to the pandas "string" dtype when the SYP files are read.
syp_key_dtypes = {"BCODE": "string", "BILLNO": "string"}

# SYP extracts are read from CSV files in the current working directory.
# Each frame is stored under its own file name.
data = {
    csv_name: pd.read_csv(csv_name, dtype=syp_key_dtypes)
    for csv_name in (
        "raw_syp_pimas_purchase_bills.csv",
        "raw_syp_pidet_purchase_lines.csv",
        "raw_syp_sidet_sales_lines.csv",
        "raw_syp_simas_sales_bills.csv",
    )
}

# HQ extracts come from the frames already in memory. Copies are stored so later
# edits to the entries in `data` do not touch the df_hq_* originals.
data.update(
    {
        "raw_hq_pimas_purchase_bills.csv": df_hq_pimas.copy(),
        "raw_hq_pidet_purchase_lines.csv": df_hq_pidet.copy(),
        "raw_hq_sidet_sales_lines.csv": df_hq_sidet.copy(),
        "raw_hq_simas_sales_bills.csv": df_hq_simas.copy(),
        "raw_hq_icmas_products.csv": df_hq_icmas.copy(),
    }
)

In [10]:
# Take independent working copies so the frames held in `data` stay unmodified.
# Note that purchase_lines is the HQ purchase-lines extract.
hq_sales_lines, syp_sales_lines, purchase_lines = (
    data[source_key].copy()
    for source_key in (
        'raw_hq_sidet_sales_lines.csv',
        'raw_syp_sidet_sales_lines.csv',
        'raw_hq_pidet_purchase_lines.csv',
    )
)

In [11]:
# Replace missing TAXIC values in the purchase lines with "N".
# NOTE(review): presumably "N" is the intended default for a missing tax flag — confirm.
purchase_lines["TAXIC"] = purchase_lines["TAXIC"].fillna("N")

In [12]:
import sys
import importlib

# Make the local repository importable by adding it to sys.path (once).
# NOTE(review): this is a machine-specific absolute path; the cell only works
# where the repository lives at this location.
repo_path = r"C:\Users\Windows 11\Notebook\kcw-analytics"
if repo_path not in sys.path:
    sys.path.append(repo_path)

# Import the module itself (not individual functions) so it can be reloaded below.
import src.kcw.supabase_utils as supabase_utils

# Reload so edits made to the .py file after the first import are picked up
# without restarting the kernel.
importlib.reload(supabase_utils)

# Bind the helpers used by later cells to short names. This must come after the
# reload, otherwise the names would keep pointing at the pre-reload functions.
add_sales_quality_flags = supabase_utils.add_sales_quality_flags
enrich_sales_with_last_purchase_cost = supabase_utils.enrich_sales_with_last_purchase_cost
qc_unknown = supabase_utils.qc_unknown
refill_last_cost_from_icmas = supabase_utils.refill_last_cost_from_icmas
build_all_dims = supabase_utils.build_all_dims

In [13]:
syp_sales_flagged = add_sales_quality_flags(syp_sales_lines)

# QC summary only: nothing is removed from syp_sales_flagged in this cell.
# `invalid` counts the rows whose IS_VALID flag is False.
total = syp_sales_flagged.shape[0]
invalid = (~syp_sales_flagged["IS_VALID"]).sum()
print(f"Invalid: {invalid:,}/{total:,} ({invalid/total*100:.2f}%)")

# The 20 most frequent INVALID_REASON values; rows with no reason are shown as "OK".
print(syp_sales_flagged["INVALID_REASON"].fillna("OK").value_counts().head(20))

# Optional subset for analytics: an independent copy holding only the valid rows.
syp_sales_valid_only = syp_sales_flagged.loc[syp_sales_flagged["IS_VALID"]].copy()

Invalid: 13/36,049 (0.04%)
INVALID_REASON
OK            36036
BAD_BCODE         7
BAD_AMOUNT        6
Name: count, dtype: Int64


In [14]:
hq_sales_flagged = add_sales_quality_flags(hq_sales_lines)

# QC summary for HQ (no deletion): `invalid` counts rows whose IS_VALID flag is False.
total = len(hq_sales_flagged)
invalid = (~hq_sales_flagged["IS_VALID"]).sum()
print(f"Invalid: {invalid:,}/{total:,} ({invalid/total*100:.2f}%)")

# The 20 most frequent INVALID_REASON values; rows with no reason are shown as "OK".
print(
    hq_sales_flagged["INVALID_REASON"]
    .fillna("OK")
    .value_counts()
    .head(20)
)

# For analytics (optional): just filter in pandas.
# The HQ subset gets its own HQ-specific name so that `syp_sales_valid_only`
# (the SYP subset) is not overwritten with HQ rows.
hq_sales_valid_only = hq_sales_flagged[hq_sales_flagged["IS_VALID"]].copy()

Invalid: 5,933/732,930 (0.81%)
INVALID_REASON
OK                                726997
BAD_BCODE                           3794
CANCELED                            1553
BAD_AMOUNT                           385
BAD_PRICE                             90
BAD_BCODE|BAD_PRICE|BAD_AMOUNT        51
BAD_BCODE|BAD_PRICE                   32
BAD_PRICE|BAD_AMOUNT                  22
BAD_BCODE|CANCELED                     5
BAD_AMOUNT|CANCELED                    1
Name: count, dtype: Int64


In [15]:
# Pass 1: enrich the flagged HQ sales lines using the purchase lines, then report
# the UNKNOWN share.
hq_sales_enriched = enrich_sales_with_last_purchase_cost(hq_sales_flagged, purchase_lines)
qc_unknown(hq_sales_enriched, "before refill")

# Pass 2: refill the LAST_PURCHASE_COST column via refill_last_cost_from_icmas,
# which receives the whole `data` dict, then report the UNKNOWN share again.
hq_sales_enriched = refill_last_cost_from_icmas(
    data, hq_sales_enriched, last_cost_col="LAST_PURCHASE_COST"
)
qc_unknown(hq_sales_enriched, "after refill")
# NOTE(review): the recorded output shows the same UNKNOWN count before and after
# the refill — confirm that the refill step is actually filling anything.

[before refill] UNKNOWN: 65,583 / 732,930 (8.95%)
[after refill] UNKNOWN: 65,583 / 732,930 (8.95%)


[after refill] UNKNOWN: 65,973 / 732,419 (9.01%)


In [16]:

# Pass 1: enrich the flagged SYP sales lines, then report the UNKNOWN share.
# NOTE(review): `purchase_lines` holds the HQ purchase lines; the SYP purchase
# lines loaded into `data` are not used here — confirm this is intended.
syp_sales_enriched = enrich_sales_with_last_purchase_cost(syp_sales_flagged, purchase_lines)
qc_unknown(syp_sales_enriched, "before refill")

# Pass 2: refill the LAST_PURCHASE_COST column via refill_last_cost_from_icmas,
# which receives the whole `data` dict, then report the UNKNOWN share again.
syp_sales_enriched = refill_last_cost_from_icmas(
    data, syp_sales_enriched, last_cost_col="LAST_PURCHASE_COST"
)
qc_unknown(syp_sales_enriched, "after refill")

[before refill] UNKNOWN: 781 / 36,049 (2.17%)
[after refill] UNKNOWN: 781 / 36,049 (2.17%)


In [17]:

# Tag every row with its source branch before the two frames are combined.
for sales_frame, branch_code in ((hq_sales_enriched, "HQ"), (syp_sales_enriched, "SYP")):
    sales_frame["BRANCH"] = branch_code

In [18]:
# Stack the HQ and SYP enriched sales lines into one frame with a fresh 0..n-1 index.
sales_all = pd.concat([hq_sales_enriched, syp_sales_enriched], ignore_index=True)

# Normalise dtypes. errors="coerce" turns unparseable costs/dates into NaN/NaT
# instead of raising.
sales_all["BRANCH"] = sales_all["BRANCH"].astype("string")
sales_all["LAST_PURCHASE_COST"] = pd.to_numeric(sales_all["LAST_PURCHASE_COST"], errors="coerce")
sales_all["BILLDATE"] = pd.to_datetime(sales_all["BILLDATE"], errors="coerce")

# Branch-qualified bill key "<BRANCH>-<BILLNO>".
# BILLNO is cast to the nullable "string" dtype rather than plain `str`: with `str`
# a missing BILLNO is stringified into literal text (e.g. "HQ-nan"), producing a
# fake key shared by every row without a bill number. With "string" the key stays
# missing (<NA>) for those rows.
sales_all["BRANCH_BILLNO"] = sales_all["BRANCH"] + "-" + sales_all["BILLNO"].astype("string")

In [19]:
# Derive a standard bill type from the prefix of the bill number.
line_billno = sales_all["BILLNO"].astype("string").str.upper()
# Drop a single leading "3" if there is one.
line_billno = line_billno.str.replace(r"^3", "", regex=True)
# Keep the first matching prefix; the alternatives are tried left to right.
line_billtype = line_billno.str.extract(r"^(TFV|TAD|TAR|TR|TD|TF|CN|DN)", expand=False)
# Bill numbers without a recognised prefix are labelled "UNKNOWN".
sales_all["BILLTYPE_STD"] = line_billtype.fillna("UNKNOWN")

In [20]:
# Columns kept in the combined sales-lines frame, in output order.
KEEP_COLS = [
    'BILLDATE', 'BILLTYPE', 'JOURMODE', 'BILLNO', 'BCODE', 'DETAIL',
    'STATUS', 'TAXIC', 'ISVAT', 'QTY', 'UI', 'MTP',
    'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3', 'DISCNT4',
    'DED', 'VAT', 'AMOUNT', 'ACCTNO', 'PAID', 'ACCT_NO',
    'DONE', 'CANCELED', 'PRICE_NUM', 'AMOUNT_NUM', 'IS_VALID', 'INVALID_REASON',
    'ROW_ID', 'LAST_PURCHASE_DATE', 'LAST_PURCHASE_COST', 'COST_STATUS',
    'BRANCH', 'BRANCH_BILLNO', 'BILLTYPE_STD',
]

# Select only the listed columns that are actually present, so a column missing
# from the frame cannot raise KeyError.
present_cols = [col for col in KEEP_COLS if col in sales_all.columns]
sales_all = sales_all.loc[:, present_cols].copy()

In [21]:
dims = build_all_dims(sales_all)

# Quick check: the .shape of every entry in `dims`, keyed by its name.
{dim_name: dim_frame.shape for dim_name, dim_frame in dims.items()}

{'dim_date': (1827, 8),
 'dim_product': (29405, 6),
 'dim_category': (36, 2),
 'dim_account': (2361, 4),
 'dim_branch': (2, 3),
 'dim_billtype': (9, 2)}

In [22]:
# Independent working copies of the HQ and SYP sales-bill frames held in `data`.
hq_sales_bills, syp_sales_bills = (
    data[source_key].copy()
    for source_key in ('raw_hq_simas_sales_bills.csv', 'raw_syp_simas_sales_bills.csv')
)

In [23]:
# Tag every bill with its source branch before the two frames are combined.
for bills_frame, branch_code in ((hq_sales_bills, "HQ"), (syp_sales_bills, "SYP")):
    bills_frame["BRANCH"] = branch_code

In [24]:
# Remove columns that are entirely missing, separately for each branch.
hq_sales_bills, syp_sales_bills = (
    bills.dropna(axis="columns", how="all")
    for bills in (hq_sales_bills, syp_sales_bills)
)

# Stack both branches into one frame with a fresh 0..n-1 index.
sales_bills_all = pd.concat([hq_sales_bills, syp_sales_bills], ignore_index=True)

sales_bills_all["BRANCH"] = sales_bills_all["BRANCH"].astype("string")

In [25]:
# Derive a standard bill type from the prefix of the bill number.
header_billno = sales_bills_all["BILLNO"].astype("string").str.upper()
# Drop a single leading "3" if there is one.
header_billno = header_billno.str.replace(r"^3", "", regex=True)
# Keep the first matching prefix; the alternatives are tried left to right.
header_billtype = header_billno.str.extract(r"^(TFV|TAD|TAR|TR|TD|TF|CN|DN)", expand=False)
# Bill numbers without a recognised prefix are labelled "UNKNOWN".
sales_bills_all["BILLTYPE_STD"] = header_billtype.fillna("UNKNOWN")

In [26]:
# Inspect the combined bill columns; the result is displayed as the cell output.
sales_bills_all.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'JOURNO', 'JOURTIME',
       'DEPTNO', 'BOOKNO', 'BILLTYPE', 'BILLDATE', 'BILLTIME', 'BILLNO',
       'LINES', 'TAXIC', 'DISCOUNT', 'DEDUCT', 'BEFORETAX', 'VAT', 'TAX',
       'AFTERTAX', 'EXEMPT', 'SVCCHG', 'PAID', 'CASHED', 'CASHAMT', 'CHKAMT',
       'DUEAMT', 'PAYSTAT', 'ACCTNO', 'ACCTNAME', 'ADDR1', 'ADDR2', 'PO',
       'SALE', 'RE', 'TERM', 'DUEDATE', 'NOTEDATE', 'NOTENO', 'VOUCDATE1',
       'VOUCNO1', 'VOUCDATE2', 'VOUCNO2', 'POSTED1', 'POSTED2', 'REMARKS',
       'CANCELED', 'DONE', 'BRANCH', 'BILLTYPE_STD'],
      dtype='object')

In [27]:
# Columns kept in the combined sales-bills frame, in output order. Compared with
# the column listing displayed above, only 'ID' is left out.
KEEP_COLS = [
    'JOURMODE', 'JOURTYPE', 'JOURDATE', 'JOURNO', 'JOURTIME', 'DEPTNO', 'BOOKNO',
    'BILLTYPE', 'BILLDATE', 'BILLTIME', 'BILLNO', 'LINES', 'TAXIC',
    'DISCOUNT', 'DEDUCT', 'BEFORETAX', 'VAT', 'TAX', 'AFTERTAX', 'EXEMPT', 'SVCCHG',
    'PAID', 'CASHED', 'CASHAMT', 'CHKAMT', 'DUEAMT', 'PAYSTAT',
    'ACCTNO', 'ACCTNAME', 'ADDR1', 'ADDR2', 'PO', 'SALE', 'RE', 'TERM',
    'DUEDATE', 'NOTEDATE', 'NOTENO', 'VOUCDATE1', 'VOUCNO1', 'VOUCDATE2', 'VOUCNO2',
    'POSTED1', 'POSTED2', 'REMARKS', 'CANCELED', 'DONE', 'BRANCH', 'BILLTYPE_STD',
]

# Select only the listed columns that are actually present, so a column missing
# from the frame cannot raise KeyError.
present_bill_cols = [col for col in KEEP_COLS if col in sales_bills_all.columns]
sales_bills_all = sales_bills_all.loc[:, present_bill_cols].copy()

In [28]:
# # take only the needed columns from the header table
# taxic_map = sales_bills_all[["BILLNO", "TAXIC"]].copy()

# # merge into sales lines
# sales_all = sales_all.merge(
#     taxic_map,
#     on="BILLNO",
#     how="left"
# )

# # default TAXIC = "Y" if missing
# sales_all["TAXIC"] = (
#     sales_all["TAXIC"]
#     .astype("string")
#     .str.strip()
#     .str.upper()
#     .fillna("Y")
# )


In [29]:
import shutil
from pathlib import Path

# Output folder for the curated CSV files written by the cells below.
# NOTE(review): machine-specific absolute path — it must exist or be creatable
# on the machine running the notebook.
out_dir = Path(r"G:\Shared drives\KCW-Data\kcw_analytics\03_curated")

In [30]:

import os
# Create the output folder (and any missing parents); no error if it already exists.
os.makedirs(out_dir, exist_ok=True)

# Write every entry of `dims` to <out_dir>/<name>.csv.
for name in dims:
    df = dims[name]
    df.to_csv(f"{out_dir}/{name}.csv", index=False, encoding="utf-8-sig")

In [31]:

# utf-8-sig (UTF-8 with BOM) is important for Thai text when the file is opened in Excel.
sales_all.to_csv(f"{out_dir}/fact_sales_all.csv", encoding="utf-8-sig", index=False)

In [32]:
# utf-8-sig (UTF-8 with BOM) is important for Thai text when the file is opened in Excel.
sales_bills_all.to_csv(f"{out_dir}/fact_sales_bills_all.csv", encoding="utf-8-sig", index=False)