In [1]:
import pandas as pd
import json

# Load the cleaned transactions CSV. InvoiceDate is parsed as datetimes so that
# later cells can take its min/max and call .date() on the results.
cleaned_csv_path = "../data/processed/cleaned_transactions.csv"
df_clean = pd.read_csv(cleaned_csv_path, parse_dates=["InvoiceDate"])

# Report the frame's dimensions, then preview the first rows as the cell output.
print("Cleaned Dataset Shape:", df_clean.shape)
df_clean.head()

Cleaned Dataset Shape: (333234, 13)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Year,Month,DayOfWeek,Hour
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2010,12,2,8
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2010,12,2,8
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,8


In [2]:
# Check 1: no cell anywhere in the frame may be missing (NaN/NaT/None).
has_missing_values = df_clean.isna().any().any()
assert not has_missing_values, "❌ Missing values found!"
print("✅ Check 1 Passed: No missing values")

✅ Check 1 Passed: No missing values


In [3]:
# Check 2: every Quantity must be strictly greater than zero.
# Note: a quantity of exactly 0 also fails this check, even though the
# assertion message only mentions negative values.
assert df_clean["Quantity"].gt(0).all(), "❌ Negative quantities found!"
print("✅ Check 2 Passed: All quantities positive")

✅ Check 2 Passed: All quantities positive


In [4]:
# Check 3: every UnitPrice must be strictly greater than zero.
price_is_positive = df_clean["UnitPrice"] > 0
assert price_is_positive.all(), "❌ Invalid prices found!"
print("✅ Check 3 Passed: All prices positive")

✅ Check 3 Passed: All prices positive


In [5]:
# Check 4: CustomerID must be stored with an integer dtype.
# Comparing against the literal "int64" would reject columns that are integer
# but use another width or pandas' nullable integer dtype (e.g. int32, Int64),
# failing with a misleading "not integer" message. is_integer_dtype accepts
# any integer dtype, which is what the message and the report key claim.
assert pd.api.types.is_integer_dtype(df_clean["CustomerID"]), "❌ CustomerID is not integer!"
print("✅ Check 4 Passed: CustomerID is integer")

✅ Check 4 Passed: CustomerID is integer


In [6]:
# Show the earliest and latest invoice timestamps present in the cleaned data.
print("Date Range:")
invoice_dates = df_clean["InvoiceDate"]
first_invoice = invoice_dates.min()
print("Start:", first_invoice)
last_invoice = invoice_dates.max()
print("End:", last_invoice)

Date Range:
Start: 2010-12-01 08:26:00
End: 2011-12-09 12:50:00


In [7]:
# Re-derive every check from the data instead of hard-coding True, so the
# persisted report reflects the frame as it actually is even if the assert
# cells were skipped, edited, or run out of order. Each result is wrapped in
# bool() because pandas returns numpy booleans, which json cannot serialize.
checks = {
    "no_missing_values": bool(df_clean.notna().all().all()),
    "all_quantities_positive": bool((df_clean["Quantity"] > 0).all()),
    "all_prices_positive": bool((df_clean["UnitPrice"] > 0).all()),
    "customer_id_is_integer": bool(
        pd.api.types.is_integer_dtype(df_clean["CustomerID"])
    ),
}

# Summary statistics of the cleaned dataset plus the outcome of the checks.
validation_report = {
    "total_rows": len(df_clean),
    "total_columns": len(df_clean.columns),
    "date_range": {
        # Only the calendar date (not the time of day) is recorded.
        "start": str(df_clean["InvoiceDate"].min().date()),
        "end": str(df_clean["InvoiceDate"].max().date())
    },
    # int()/float() convert numpy scalars into JSON-serializable Python types.
    "unique_customers": int(df_clean["CustomerID"].nunique()),
    "unique_products": int(df_clean["StockCode"].nunique()),
    "unique_countries": int(df_clean["Country"].nunique()),
    "total_revenue": float(df_clean["TotalPrice"].sum()),
    # Mean of per-invoice totals: sum TotalPrice within each InvoiceNo first.
    "average_order_value": float(
        df_clean.groupby("InvoiceNo")["TotalPrice"].sum().mean()
    ),
    # Overall verdict is True only when every individual check passed.
    "validation_passed": all(checks.values()),
    "checks": checks
}

# Persist the report next to the processed data.
with open("../data/processed/validation_report.json", "w") as f:
    json.dump(validation_report, f, indent=4)

validation_report

{'total_rows': 333234,
 'total_columns': 13,
 'date_range': {'start': '2010-12-01', 'end': '2011-12-09'},
 'unique_customers': 4191,
 'unique_products': 3392,
 'unique_countries': 37,
 'total_revenue': 4299476.184,
 'average_order_value': 255.52574491857837,
 'validation_passed': True,
 'checks': {'no_missing_values': True,
  'all_quantities_positive': True,
  'all_prices_positive': True,
  'customer_id_is_integer': True}}