In [1]:
import os
import sqlite3

import pandas as pd

# Location of the SQLite database that holds the fact table used by this notebook.
db_path = r"C:\Users\vedas\Downloads\enterprise-analytics-platform\data\raw\enterprise_analytics.db"

# sqlite3.connect() creates a brand-new, empty database when the file does not
# exist, so a wrong path would only show up later as a "no such table" error.
# Fail fast with a clear message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

conn = sqlite3.connect(db_path)

print("✅ Connected to SQLite DB")


✅ Connected to SQLite DB


In [2]:
# Load every row and column of the order-line fact table through the open
# connection, then display (rows, columns) as the cell output.
order_lines_query = "SELECT * FROM fct_order_lines;"
df = pd.read_sql_query(order_lines_query, conn)
df.shape


(112650, 17)

In [3]:
# Accumulator for the check results; add_test() appends one record per check.
tests = []


def add_test(name, failed_count, total_rows):
    """Append the outcome of a single data-quality check to ``tests``.

    Args:
        name: Label stored under ``test_name``.
        failed_count: Number of rows that violated the check.
        total_rows: Number of rows the check was evaluated against.

    The stored record keeps both counts as ``int``, the share of failing rows
    as a percentage rounded to four decimals, and a ``status`` that is
    ``"PASS"`` only when ``failed_count`` equals zero.
    """
    failed_rows = int(failed_count)
    row_total = int(total_rows)

    # A falsy total (e.g. an empty table) would divide by zero; report 0.0 instead.
    if total_rows:
        failed_pct = round((failed_count / total_rows) * 100, 4)
    else:
        failed_pct = 0.0

    if failed_count == 0:
        status = "PASS"
    else:
        status = "FAIL"

    record = {
        "test_name": name,
        "failed_rows": failed_rows,
        "total_rows": row_total,
        "failed_pct": failed_pct,
        "status": status,
    }
    tests.append(record)

# Every check below is reported against the full row count of the fact table.
total = len(df)

# 1) Critical key columns must not contain nulls.
for label, column in (
    ("Null order_id", "order_id"),
    ("Null customer_id", "customer_id"),
    ("Null product_id", "product_id"),
):
    add_test(label, df[column].isna().sum(), total)

# 2) The line key (order_id, order_item_id) must be unique; duplicated() flags
#    every repeat after the first occurrence of a key.
line_key = ["order_id", "order_item_id"]
dup_keys = df.duplicated(subset=line_key).sum()
add_test("Duplicate (order_id, order_item_id)", dup_keys, total)

# 3) Monetary amounts must not be negative.
for label, column in (
    ("Price < 0", "price"),
    ("Freight < 0", "freight_value"),
    ("Total_paid < 0", "total_paid"),
):
    add_test(label, df[column].lt(0).sum(), total)

# 4) A delivery cannot precede its order; rows missing either date are skipped.
# NOTE(review): if these columns come back from SQLite as text, `<` compares
# them as strings — confirm the stored format sorts chronologically.
delivered = df["delivered_date"]
ordered = df["order_date"]
both_present = delivered.notna() & ordered.notna()
date_logic_fail = (both_present & (delivered < ordered)).sum()
add_test("Delivered date earlier than order date", date_logic_fail, total)

# 5) delivery_days must not be negative where it is populated.
days = df["delivery_days"]
delivery_days_neg = (days.notna() & (days < 0)).sum()
add_test("Negative delivery_days", delivery_days_neg, total)

# One row per check, in the order the checks were registered.
quality_report = pd.DataFrame(tests)
quality_report


Unnamed: 0,test_name,failed_rows,total_rows,failed_pct,status
0,Null order_id,0,112650,0.0,PASS
1,Null customer_id,0,112650,0.0,PASS
2,Null product_id,0,112650,0.0,PASS
3,"Duplicate (order_id, order_item_id)",0,112650,0.0,PASS
4,Price < 0,0,112650,0.0,PASS
5,Freight < 0,0,112650,0.0,PASS
6,Total_paid < 0,0,112650,0.0,PASS
7,Delivered date earlier than order date,0,112650,0.0,PASS
8,Negative delivery_days,0,112650,0.0,PASS


In [6]:
# Destination of the exported data-quality report.
output_path = r"C:\Users\vedas\Downloads\enterprise-analytics-platform\data\curated\data_quality_report.csv"

# to_csv() does not create missing parent folders, so make sure the target
# directory exists before writing. The dirname is empty when the path has no
# directory component, in which case there is nothing to create.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame's positional index out of the CSV.
quality_report.to_csv(output_path, index=False)

print("✅ Data quality report saved to:", output_path)


✅ Data quality report saved to: C:\Users\vedas\Downloads\enterprise-analytics-platform\data\curated\data_quality_report.csv


In [7]:
# Release the SQLite connection opened at the top of the notebook; any later
# query through `conn` requires reconnecting first.
conn.close()
print("✅ DB connection closed")


✅ DB connection closed
