In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd

folder = "/content/drive/Shareddrives/KCW-Data/kcw_analytics/01_raw"

data = {}

for file in os.listdir(folder):
    if file.endswith(".csv"):
        path = os.path.join(folder, file)
        data[file] = pd.read_csv(
            path,
            dtype={
              "BCODE": "string",
              "ITEMNO": "string",
              "BILLNO": "string",
            },
            encoding="utf-8-sig",
            low_memory=False   # stops chunk guessing
        )
        print(f"Loaded: {file} -> {data[file].shape}")

Loaded: raw_inventory_hq_2024.csv -> (4983, 8)
Loaded: raw_hq_icmas_products.csv -> (114890, 94)
Loaded: raw_hq_pimas_purchase_bills.csv -> (50127, 49)
Loaded: raw_hq_pidet_purchase_lines.csv -> (153425, 41)
Loaded: raw_hq_sidet_sales_lines.csv -> (732564, 38)
Loaded: raw_hq_simas_sales_bills.csv -> (275955, 49)
Loaded: raw_syp_pimas_purchase_bills.csv -> (2908, 49)
Loaded: raw_syp_sidet_sales_lines.csv -> (35978, 38)
Loaded: raw_syp_simas_sales_bills.csv -> (12182, 49)
Loaded: raw_syp_pidet_purchase_lines.csv -> (27182, 41)


In [4]:
df_simas = data["raw_hq_simas_sales_bills.csv"].copy()
df_sidet = data["raw_hq_sidet_sales_lines.csv"].copy()

df_sidet["AMOUNT"] = pd.to_numeric(df_sidet["AMOUNT"], errors="coerce")
df_simas["AFTERTAX"] = pd.to_numeric(df_simas["AFTERTAX"], errors="coerce")
df_simas["BEFORETAX"] = pd.to_numeric(df_simas["BEFORETAX"], errors="coerce")

In [11]:
df_simas.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'JOURNO', 'JOURTIME',
       'DEPTNO', 'BOOKNO', 'BILLTYPE', 'BILLDATE', 'BILLTIME', 'BILLNO',
       'LINES', 'TAXIC', 'DISCOUNT', 'DEDUCT', 'BEFORETAX', 'VAT', 'TAX',
       'AFTERTAX', 'EXEMPT', 'SVCCHG', 'WITHHOLD', 'PAID', 'CASHED', 'CASHAMT',
       'CHKAMT', 'DUEAMT', 'PAYSTAT', 'ACCTNO', 'ACCTNAME', 'ADDR1', 'ADDR2',
       'PO', 'SALE', 'RE', 'TERM', 'DUEDATE', 'NOTEDATE', 'NOTENO',
       'VOUCDATE1', 'VOUCNO1', 'VOUCDATE2', 'VOUCNO2', 'POSTED1', 'POSTED2',
       'REMARKS', 'CANCELED', 'DONE', 'BILLTYPE_BILLNO'],
      dtype='object')

In [22]:
df_simas["BILLTYPE_BILLNO"] = df_simas["BILLTYPE"].astype(str) + "_" + df_simas["BILLNO"].astype(str)

non_unique = (
    df_simas[df_simas.duplicated(subset=["BILLTYPE_BILLNO"], keep=False)]
    .sort_values("BILLTYPE_BILLNO")
)

non_unique[['BILLNO', 'BILLTYPE', 'BILLTYPE_BILLNO', 'BOOKNO', 'TAXIC', 'BILLDATE', 'JOURMODE', 'JOURTYPE']]

Unnamed: 0,BILLNO,BILLTYPE,BILLTYPE_BILLNO,BOOKNO,TAXIC,BILLDATE,JOURMODE,JOURTYPE
179398,1122787,0,0_1122787,1,N,2024-07-15,1,SJ
179925,1122787,0,0_1122787,1,N,2024-07-18,2,SJ
166,1K65,1,1_1K65,1,N,2021-02-23,2,SJ
772,1K65,1,1_1K65,1,N,2021-02-27,2,SJ
1096,1K65,1,1_1K65,1,N,2021-03-01,2,SJ
2471,1K65,1,1_1K65,1,N,2021-03-10,2,SJ
7497,4K64-00001,1,1_4K64-00001,5,N,2021-04-10,2,SJ
7608,4K64-00001,1,1_4K64-00001,5,N,2021-04-11,2,SJ
221064,4K68-0000375,1,1_4K68-0000375,4,N,2025-04-08,2,SJ
221065,4K68-0000375,1,1_4K68-0000375,4,N,2025-04-08,2,SJ


In [20]:
filtered = df_sidet[df_sidet["BILLNO"].isin(["4K68-0000375"])]

filtered[['BILLDATE', 'BILLNO', 'BILLTYPE']]

Unnamed: 0,BILLDATE,BILLNO,BILLTYPE
559394,2025-04-08,4K68-0000375,1.0
559395,2025-04-08,4K68-0000375,1.0


In [34]:
df_sidet.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'BILLTYPE', 'BILLDATE',
       'BILLNO', 'LINE', 'ITEMNO', 'BCODE', 'PCODE', 'MCODE', 'DETAIL',
       'WHNUMBER', 'LOCATION1', 'STATUS', 'SERIAL', 'TAXIC', 'EXMPT', 'ISVAT',
       'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'CHGAMT', 'ACCTNO', 'PAID',
       'ACCT_NO', 'DONE', 'CANCELED'],
      dtype='object')

In [62]:
df_sidet["ISVAT"]

Unnamed: 0,ISVAT
0,N
1,N
2,N
3,N
4,N
...,...
732414,N
732415,N
732416,Y
732417,N


In [63]:
# ensure numeric first (legacy POS safe)
cols = ["AMOUNT", "VAT", "DED"]
df_sidet[cols] = df_sidet[cols].apply(pd.to_numeric, errors="coerce")

# fill null VAT with 0 (important)
df_sidet["VAT"] = df_sidet["VAT"].fillna(0)

# ðŸ”µ create VAT-removed column at line level
df_sidet["BASE_AMOUNT"] = df_sidet["AMOUNT"].where(
    df_sidet["ISVAT"] == "N",
    (100 / (100 + df_sidet["VAT"])) * df_sidet["AMOUNT"]
)

# groupby bill
sidet_sum = (
    df_sidet
    .groupby("BILLNO", as_index=False)
    .agg(
        SIDET_AMOUNT_SUM=("AMOUNT", "sum"),
        SIDET_BASE_AMOUNT_SUM=("BASE_AMOUNT", "sum")
    )
)


In [64]:
df_check = df_simas.merge(
    sidet_sum,
    on="BILLNO",
    how="left"
)

In [65]:
df_check["DELTA"] = (
    pd.to_numeric(df_check["BEFORETAX"], errors="coerce")
    - pd.to_numeric(df_check["SIDET_BASE_AMOUNT_SUM"], errors="coerce")
)

In [66]:
df_problem = df_check[df_check["DELTA"].abs() > 0.01]

In [82]:
df_problem = df_problem.copy()

df_problem["BILLDATE"] = pd.to_datetime(
    df_problem["BILLDATE"],
    errors="coerce"
)

df_problem_21feb = df_problem[
    (df_problem["BILLDATE"] >= "2026-02-20") &
    (df_problem["BILLDATE"] < "2026-02-21")
]

In [85]:
df_problem_21feb[['BILLNO', 'BILLTYPE','BEFORETAX','AFTERTAX','VAT', 'DEDUCT','SIDET_AMOUNT_SUM', 'DELTA', 'TAXIC']]

Unnamed: 0,BILLNO,BILLTYPE,BEFORETAX,AFTERTAX,VAT,DEDUCT,SIDET_AMOUNT_SUM,DELTA,TAXIC
275410,8K69-0003078,1,100.0,100.0,0.0,9.0,109.0,-9.0,N
275417,8K69-0003082,1,6900.0,6900.0,0.0,10.0,6910.0,-10.0,N
275424,6K69-0001960,1,15.0,15.0,0.0,0.5,15.5,-0.5,N
275433,TD6902-114,1,3190.66,3414.01,7.0,,3190.66,208.734766,N
275438,6K69-0001964,1,1260.0,1260.0,0.0,-5.0,1255.0,5.0,N
275465,8K69-0003097,1,655.0,655.0,0.0,1.0,656.0,-1.0,N
275468,TD6902-117,1,2784.02,2978.9,7.0,,2784.02,182.13215,N
275472,6K69-0001974,1,7960.0,7960.0,0.0,5.0,7965.0,-5.0,N
275500,TFV6902-003,2,-2782.0,-2782.0,0.0,6567.36,-197.12,-2584.88,N
275524,6K69-0001998,1,2160.0,2160.0,0.0,1.0,2161.0,-1.0,N


In [87]:
filtered = df_sidet[df_sidet["BILLNO"].isin(["TFV6902-003"])]

filtered[['BCODE','BILLNO','ISVAT','DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'AMOUNT']]

Unnamed: 0,BCODE,BILLNO,ISVAT,DISCNT1,DISCNT2,DISCNT3,DISCNT4,DED,AMOUNT
721579,22050198,TFV6902-003,N,,,,,,2324.88
721580,12051898,TFV6902-003,N,,,,,,130.0
721581,12051753,TFV6902-003,N,,,,,,40.0
721582,12050530,TFV6902-003,N,,,,,,90.0
731193,12053133,TFV6902-003,N,,,,,,-2782.0


In [78]:
df_sidet.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'BILLTYPE', 'BILLDATE',
       'BILLNO', 'LINE', 'ITEMNO', 'BCODE', 'PCODE', 'MCODE', 'DETAIL',
       'WHNUMBER', 'LOCATION1', 'STATUS', 'SERIAL', 'TAXIC', 'EXMPT', 'ISVAT',
       'QTY', 'UI', 'MTP', 'PRICE', 'XPRICE', 'DISCNT1', 'DISCNT2', 'DISCNT3',
       'DISCNT4', 'DED', 'VAT', 'AMOUNT', 'CHGAMT', 'ACCTNO', 'PAID',
       'ACCT_NO', 'DONE', 'CANCELED', 'BASE_AMOUNT'],
      dtype='object')

In [79]:
df_simas.columns

Index(['ID', 'JOURMODE', 'JOURTYPE', 'JOURDATE', 'JOURNO', 'JOURTIME',
       'DEPTNO', 'BOOKNO', 'BILLTYPE', 'BILLDATE', 'BILLTIME', 'BILLNO',
       'LINES', 'TAXIC', 'DISCOUNT', 'DEDUCT', 'BEFORETAX', 'VAT', 'TAX',
       'AFTERTAX', 'EXEMPT', 'SVCCHG', 'WITHHOLD', 'PAID', 'CASHED', 'CASHAMT',
       'CHKAMT', 'DUEAMT', 'PAYSTAT', 'ACCTNO', 'ACCTNAME', 'ADDR1', 'ADDR2',
       'PO', 'SALE', 'RE', 'TERM', 'DUEDATE', 'NOTEDATE', 'NOTENO',
       'VOUCDATE1', 'VOUCNO1', 'VOUCDATE2', 'VOUCNO2', 'POSTED1', 'POSTED2',
       'REMARKS', 'CANCELED', 'DONE'],
      dtype='object')

In [89]:
filtered = df_simas[df_simas["BILLNO"].isin(["TFV6902-003"])]
filtered[['BILLNO','VAT', 'TAX', 'BEFORETAX', 'AFTERTAX', 'TAXIC']]

Unnamed: 0,BILLNO,VAT,TAX,BEFORETAX,AFTERTAX,TAXIC
272324,TFV6902-003,0.0,0.0,2584.88,2584.88,N
275500,TFV6902-003,0.0,0.0,-2782.0,-2782.0,N


In [61]:
df_problem_21feb[['BILLNO','DISCOUNT', 'TAX', 'VAT', 'DEDUCT','BEFORETAX', 'AFTERTAX', 'SIDET_AMOUNT_SUM', 'DELTA', 'SIDET_BASE_AMOUNT_SUM']]

Unnamed: 0,BILLNO,DISCOUNT,TAX,VAT,DEDUCT,BEFORETAX,AFTERTAX,SIDET_AMOUNT_SUM,DELTA,SIDET_BASE_AMOUNT_SUM
275596,TAD6902-453,,75.56,7.0,,1079.44,1155.0,1155.0,-75.56,1155.0
275599,TAD6902-454,,51.68,7.0,,738.32,790.0,790.0,-51.68,790.0
275600,TAD6902-455,,18.97,7.0,,271.03,290.0,290.0,-18.97,290.0
275601,TAD6902-456,,136.73,7.0,,1953.27,2090.0,2090.0,-136.73,2090.0
275604,TAD6902-457,,18.97,7.0,,271.03,290.0,290.0,-18.97,290.0
275607,TAD6902-458,,5.17,7.0,,73.83,79.0,79.0,-5.17,79.0
275608,TAD6902-459,,12.43,7.0,,177.57,190.0,190.0,-12.43,190.0
275618,TAD6902-460,,183.18,7.0,,2616.82,2800.0,2800.0,-183.18,2800.0
275620,8K69-0003147,,0.0,0.0,8.0,3200.0,3200.0,3208.0,-8.0,3208.0
275622,TAD6902-461,,128.88,7.0,,1841.12,1970.0,1970.0,-128.88,1970.0


In [11]:
df_problem_21feb[['DELTA','AFTERTAX','SIDET_AMOUNT_SUM', 'DEDUCT', 'TAX']].sum()

Unnamed: 0,0
DELTA,840.13
AFTERTAX,37262.8
SIDET_AMOUNT_SUM,36422.67
DEDUCT,46.5
TAX,886.63


In [22]:
total = (
    df_check.loc[
        (df_check["BILLDATE"] >= "2026-02-21") &
        (df_check["BILLDATE"] <  "2026-02-22"),
        ["BEFORETAX"]
    ]
    .sum(axis=1)
    .sum()
)

print(total)

340133.16


In [45]:
mask = (
    (df_sidet["BILLDATE"] >= "2026-02-21") &
    (df_sidet["BILLDATE"] <  "2026-02-22")
)

subset = df_sidet.loc[mask].copy()

# fill VAT null with 0
subset["VAT"] = subset["VAT"].fillna(0)

base_amount = subset["AMOUNT"].where(
    subset["VAT"] <= 0,
    (100 / (100 + subset["VAT"])) * subset["AMOUNT"]
)

total = base_amount.sum()

print(total + 886.63 - 46.50)

340191.14485981304


In [21]:
total = (
    df_check.loc[
        (df_check["BILLDATE"] >= "2026-02-21") &
        (df_check["BILLDATE"] <  "2026-02-22"),
        ["SIDET_AMOUNT_SUM"]
    ]
    .sum(axis=1)
    .sum()
)

print(total)

350174.04000000004
