In [1]:
# Build Non-Idempotent Loader (The Problem)

import pandas as pd
import os

def load_non_idempotent(daily_file, warehouse_file):
    """
    NON-IDEMPOTENT loader: blindly appends daily data to warehouse.
    Running this twice = DUPLICATE DATA.
    """
    print(f"\n{'=' * 50}")
    print(f"NON-IDEMPOTENT LOAD: {daily_file}")
    print(f"{'=' * 50}")
    
    # Read daily extract
    daily_df = pd.read_csv(daily_file)
    print(f"Daily rows to load: {len(daily_df)}")
    
    # Read current warehouse
    if os.path.exists(warehouse_file) and os.path.getsize(warehouse_file) > 0:
        warehouse_df = pd.read_csv(warehouse_file)
    else:
        warehouse_df = pd.DataFrame(columns=daily_df.columns)
    
    print(f"Warehouse rows BEFORE: {len(warehouse_df)}")
    
    # BLIND APPEND — the problem!
    warehouse_df = pd.concat([warehouse_df, daily_df], ignore_index=True)
    
    # Save back
    warehouse_df.to_csv(warehouse_file, index=False)
    print(f"Warehouse rows AFTER: {len(warehouse_df)}")
    
    return warehouse_df

    # Reset warehouse
pd.DataFrame(columns=["order_id","order_date","customer","product","amount"]).to_csv("warehouse.csv", index=False)

# First run
result = load_non_idempotent("sales_2024_06_01.csv", "warehouse.csv")
print(f"\nWarehouse after first run:")
print(result)




NON-IDEMPOTENT LOAD: sales_2024_06_01.csv
Daily rows to load: 3
Warehouse rows BEFORE: 0
Warehouse rows AFTER: 3

Warehouse after first run:
  order_id  order_date customer   product  amount
0     S001  2024-06-01    Alice  Widget A    75.0
1     S002  2024-06-01      Bob  Widget B    50.0
2     S003  2024-06-01  Charlie  Widget A   150.0


  warehouse_df = pd.concat([warehouse_df, daily_df], ignore_index=True)


In [2]:
# Second run — same file, same day
result = load_non_idempotent("sales_2024_06_01.csv", "warehouse.csv")
print(f"\nWarehouse after SECOND run (re-run):")
print(result)
print(f"\n⚠️  PROBLEM: {len(result)} rows — we have DUPLICATES!")



NON-IDEMPOTENT LOAD: sales_2024_06_01.csv
Daily rows to load: 3
Warehouse rows BEFORE: 3
Warehouse rows AFTER: 6

Warehouse after SECOND run (re-run):
  order_id  order_date customer   product  amount
0     S001  2024-06-01    Alice  Widget A    75.0
1     S002  2024-06-01      Bob  Widget B    50.0
2     S003  2024-06-01  Charlie  Widget A   150.0
3     S001  2024-06-01    Alice  Widget A    75.0
4     S002  2024-06-01      Bob  Widget B    50.0
5     S003  2024-06-01  Charlie  Widget A   150.0

⚠️  PROBLEM: 6 rows — we have DUPLICATES!


In [3]:
# Build Idempotent Loader (The Fix)

def load_idempotent(daily_file, warehouse_file):
    """
    IDEMPOTENT loader: delete existing data for the partition, then insert.
    Running this 1 time or 100 times produces the SAME result.
    """
    print(f"\n{'=' * 50}")
    print(f"IDEMPOTENT LOAD: {daily_file}")
    print(f"{'=' * 50}")
    
    # Read daily extract
    daily_df = pd.read_csv(daily_file)
    partition_date = daily_df["order_date"].iloc[0]
    print(f"Daily rows to load: {len(daily_df)}")
    print(f"Partition date: {partition_date}")
    
    # Read current warehouse
    if os.path.exists(warehouse_file) and os.path.getsize(warehouse_file) > 0:
        warehouse_df = pd.read_csv(warehouse_file)
    else:
        warehouse_df = pd.DataFrame(columns=daily_df.columns)
    
    print(f"Warehouse rows BEFORE: {len(warehouse_df)}")
    
    # STEP 1: DELETE existing rows for this partition date
    rows_before_delete = len(warehouse_df)
    warehouse_df = warehouse_df[warehouse_df["order_date"] != partition_date]
    rows_deleted = rows_before_delete - len(warehouse_df)
    print(f"Rows deleted for partition {partition_date}: {rows_deleted}")
    
    # STEP 2: INSERT fresh data
    warehouse_df = pd.concat([warehouse_df, daily_df], ignore_index=True)
    print(f"Rows inserted: {len(daily_df)}")
    
    # Save back
    warehouse_df.to_csv(warehouse_file, index=False)
    print(f"Warehouse rows AFTER: {len(warehouse_df)}")
    
    return warehouse_df


In [4]:
# Reset warehouse
pd.DataFrame(columns=["order_id","order_date","customer","product","amount"]).to_csv("warehouse.csv", index=False)

print("===== RUN 1 =====")
result1 = load_idempotent("sales_2024_06_01.csv", "warehouse.csv")

print("\n===== RUN 2 (re-run) =====")
result2 = load_idempotent("sales_2024_06_01.csv", "warehouse.csv")

print("\n===== RUN 3 (another re-run) =====")
result3 = load_idempotent("sales_2024_06_01.csv", "warehouse.csv")

print("\n===== IDEMPOTENCY VERIFICATION =====")
print(f"Rows after run 1: {len(result1)}")
print(f"Rows after run 2: {len(result2)}")
print(f"Rows after run 3: {len(result3)}")

are_equal = result1.equals(result2) and result2.equals(result3)
print(f"All results identical: {are_equal}")

if are_equal:
    print("✅ IDEMPOTENT: Safe to re-run!")
else:
    print("❌ NOT IDEMPOTENT: Results differ!")


===== RUN 1 =====

IDEMPOTENT LOAD: sales_2024_06_01.csv
Daily rows to load: 3
Partition date: 2024-06-01
Warehouse rows BEFORE: 0
Rows deleted for partition 2024-06-01: 0
Rows inserted: 3
Warehouse rows AFTER: 3

===== RUN 2 (re-run) =====

IDEMPOTENT LOAD: sales_2024_06_01.csv
Daily rows to load: 3
Partition date: 2024-06-01
Warehouse rows BEFORE: 3
Rows deleted for partition 2024-06-01: 3
Rows inserted: 3
Warehouse rows AFTER: 3

===== RUN 3 (another re-run) =====

IDEMPOTENT LOAD: sales_2024_06_01.csv
Daily rows to load: 3
Partition date: 2024-06-01
Warehouse rows BEFORE: 3
Rows deleted for partition 2024-06-01: 3
Rows inserted: 3
Warehouse rows AFTER: 3

===== IDEMPOTENCY VERIFICATION =====
Rows after run 1: 3
Rows after run 2: 3
Rows after run 3: 3
All results identical: True
✅ IDEMPOTENT: Safe to re-run!


  warehouse_df = pd.concat([warehouse_df, daily_df], ignore_index=True)


In [6]:
# Reset warehouse
pd.DataFrame(columns=["order_id","order_date","customer","product","amount"]).to_csv("warehouse.csv", index=False)

print("===== LOADING JUNE 1 =====")
load_idempotent("sales_2024_06_01.csv", "warehouse.csv")

print("\n===== LOADING JUNE 2 =====")
load_idempotent("sales_2024_06_02.csv", "warehouse.csv")

print("\n===== FINAL WAREHOUSE =====")
final = pd.read_csv("warehouse.csv")
print(final)
print(f"\nTotal rows: {len(final)}")
print(f"Unique dates: {sorted(final['order_date'].unique())}")


===== LOADING JUNE 1 =====

IDEMPOTENT LOAD: sales_2024_06_01.csv
Daily rows to load: 3
Partition date: 2024-06-01
Warehouse rows BEFORE: 0
Rows deleted for partition 2024-06-01: 0
Rows inserted: 3
Warehouse rows AFTER: 3

===== LOADING JUNE 2 =====

IDEMPOTENT LOAD: sales_2024_06_02.csv
Daily rows to load: 2
Partition date: 2024-06-02
Warehouse rows BEFORE: 3
Rows deleted for partition 2024-06-02: 0
Rows inserted: 2
Warehouse rows AFTER: 5

===== FINAL WAREHOUSE =====
  order_id  order_date customer   product  amount
0     S001  2024-06-01    Alice  Widget A    75.0
1     S002  2024-06-01      Bob  Widget B    50.0
2     S003  2024-06-01  Charlie  Widget A   150.0
3     S004  2024-06-02    Diana  Widget C    30.0
4     S005  2024-06-02    Alice  Widget B   100.0

Total rows: 5
Unique dates: ['2024-06-01', '2024-06-02']


  warehouse_df = pd.concat([warehouse_df, daily_df], ignore_index=True)


In [7]:
print("\n===== RE-RUN JUNE 1 =====")
load_idempotent("sales_2024_06_01.csv", "warehouse.csv")

final = pd.read_csv("warehouse.csv")
print(f"\nTotal rows: {len(final)}")
print(f"June 1 rows: {len(final[final['order_date'] == '2024-06-01'])}")
print(f"June 2 rows: {len(final[final['order_date'] == '2024-06-02'])}")



===== RE-RUN JUNE 1 =====

IDEMPOTENT LOAD: sales_2024_06_01.csv
Daily rows to load: 3
Partition date: 2024-06-01
Warehouse rows BEFORE: 5
Rows deleted for partition 2024-06-01: 3
Rows inserted: 3
Warehouse rows AFTER: 5

Total rows: 5
June 1 rows: 3
June 2 rows: 2
