Load interim parquet files

In [1]:
import os
import pandas as pd

# The notebook sits in project_root/notebooks, so the project root is one level up
# from the current working directory.
PROJECT_ROOT = os.path.abspath("..")

INTERIM_DIR = os.path.join(PROJECT_ROOT, "data", "interim")
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")

# Make sure the output folder exists; exist_ok=True makes this a no-op on re-runs.
os.makedirs(PROCESSED_DIR, exist_ok=True)


def read_interim(table_name):
    """Read `<INTERIM_DIR>/<table_name>.parquet` into a DataFrame."""
    return pd.read_parquet(os.path.join(INTERIM_DIR, f"{table_name}.parquet"))


orders_df = read_interim("orders")
order_products_df = read_interim("order_products")
products_df = read_interim("products")
aisles_df = read_interim("aisles")
departments_df = read_interim("departments")

# One line per table: its name followed by its (rows, columns) shape.
print(
    "✅ Loaded interim data",
    f"orders_df: {orders_df.shape}",
    f"order_products_df: {order_products_df.shape}",
    f"products_df: {products_df.shape}",
    f"aisles_df: {aisles_df.shape}",
    f"departments_df: {departments_df.shape}",
    sep="\n",
)


✅ Loaded interim data
orders_df: (3421083, 7)
order_products_df: (33819106, 4)
products_df: (49688, 4)
aisles_df: (134, 2)
departments_df: (21, 2)


**This table is the product catalog enriched with aisle + department.**

In [2]:
# Build the product dimension: one row per catalog product, with the aisle and
# department lookup columns attached.
#
# how="left" keeps every product row even if a lookup key has no match.
# validate="many_to_one" makes pandas raise MergeError when aisle_id or
# department_id is duplicated in its lookup table; without it such duplicates
# would silently multiply product rows.
dim_products_df = (
    products_df
    .merge(aisles_df, on="aisle_id", how="left", validate="many_to_one")
    .merge(departments_df, on="department_id", how="left", validate="many_to_one")
)

print("dim_products_df:", dim_products_df.shape)
dim_products_df.head()


dim_products_df: (49688, 6)


Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry


**Data quality checks (dim)**


In [3]:
# Fraction of product rows whose aisle / department value is null after the joins.
missing_aisle = dim_products_df["aisle"].isna().mean()
missing_dept = dim_products_df["department"].isna().mean()

print(
    f"Missing aisle rate: {missing_aisle:.6f}",
    f"Missing department rate: {missing_dept:.6f}",
    sep="\n",
)

# Flag the table when either rate is above 0.001 (0.1%). This only prints a
# message; it never raises.
join_is_suspect = any(rate > 0.001 for rate in (missing_aisle, missing_dept))
print(
    "⚠️ Too many missing values — check joins."
    if join_is_suspect
    else "✅ dim_products_df looks good."
)

Missing aisle rate: 0.000000
Missing department rate: 0.000000
✅ dim_products_df looks good.


**Build fact_order_items** (core table for Apriori/FP-Growth later).

In [4]:
# Build the order-item fact table: every order line from order_products_df,
# enriched with the product's name, aisle and department.
#
# how="left" keeps every order line even when its product_id is not found in
# the dimension. validate="many_to_one" makes pandas raise MergeError if
# product_id is duplicated in dim_products_df; without it a duplicated product
# would silently duplicate order lines in the result.
fact_order_items_df = (
    order_products_df
    .merge(
        dim_products_df[["product_id", "product_name", "aisle_id", "aisle", "department_id", "department"]],
        on="product_id",
        how="left",
        validate="many_to_one",
    )
)

print("fact_order_items_df:", fact_order_items_df.shape)
fact_order_items_df.head()


fact_order_items_df: (33819106, 9)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,aisle,department_id,department
0,2,33120,1,1,Organic Egg Whites,86,eggs,16,dairy eggs
1,2,28985,2,1,Michigan Organic Kale,83,fresh vegetables,4,produce
2,2,9327,3,0,Garlic Powder,104,spices seasonings,13,pantry
3,2,45918,4,1,Coconut Butter,19,oils vinegars,13,pantry
4,2,30035,5,0,Natural Sweetener,17,baking ingredients,13,pantry


**Data quality checks (fact)**

In [5]:
# Fraction of order lines whose product_name is null after the product join.
missing_names = fact_order_items_df["product_name"].isna().mean()
print(f"Missing product_name rate: {missing_names:.6f}")

# Above 0.001 (0.1%) a warning is printed; nothing is raised either way.
verdict = (
    "⚠️ Missing product names — check products join."
    if missing_names > 0.001
    else "✅ fact_order_items_df looks good."
)
print(verdict)

Missing product_name rate: 0.000000
✅ fact_order_items_df looks good.


**Summary counts (fact)**

In [6]:
# Headline cardinalities of the fact table: distinct orders, distinct products,
# and the total number of order-item rows.
order_count = fact_order_items_df["order_id"].nunique()
product_count = fact_order_items_df["product_id"].nunique()
row_count = len(fact_order_items_df)

print(
    f"Unique orders: {order_count}",
    f"Unique products: {product_count}",
    f"Total rows (order-items): {row_count}",
    sep="\n",
)


Unique orders: 3346083
Unique products: 49685
Total rows (order-items): 33819106


**Save processed files**

In [8]:
# Write both processed tables into PROCESSED_DIR as parquet files.
# index=False leaves the DataFrame index out of the written files.
dim_products_path = os.path.join(PROCESSED_DIR, "dim_products.parquet")
fact_order_items_path = os.path.join(PROCESSED_DIR, "fact_order_items.parquet")

dim_products_df.to_parquet(dim_products_path, index=False)
fact_order_items_df.to_parquet(fact_order_items_path, index=False)

print("✅ Saved processed tables to:", PROCESSED_DIR)


✅ Saved processed tables to: /Users/pattern115/Desktop/1st Year Project/data/processed
