<div style="background:linear-gradient(135deg,#0ea5e9,#1d4ed8);padding:22px;border-radius:16px;color:white;box-shadow:0 10px 25px rgba(0,0,0,.18);">
  <div style="font-size:12px;letter-spacing:.12em;opacity:.85;text-transform:uppercase">Data Engineering</div>
  <div style="font-size:22px;font-weight:800;line-height:1.2;">Phase A — Ingestion & Validation</div>
  <div style="margin-top:8px;font-size:14px;opacity:.95">Load raw Instacart CSVs, validate schema, and persist to Parquet for downstream layers.</div>
</div>

In [3]:
import os
import pandas as pd

# The kernel's working directory is expected to be instacart-retail-project/notebooks,
# so the project root is one level up from wherever this notebook runs.
PROJECT_ROOT = os.path.abspath(os.pardir)

# data/raw holds the source CSVs; data/interim and data/processed receive outputs.
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(PROJECT_ROOT, "data", stage)
    for stage in ("raw", "interim", "processed")
)

# Only the output directories are created; the raw directory must already exist.
os.makedirs(INTERIM_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

print(f"PROJECT_ROOT: {PROJECT_ROOT}")
print(f"RAW_DIR: {RAW_DIR}")


PROJECT_ROOT: /Users/imakdoun/PycharmProjects/instacart-retail-project
RAW_DIR: /Users/imakdoun/PycharmProjects/instacart-retail-project/data/raw


<div style="margin-top:14px;background:rgba(255,255,255,.75);backdrop-filter:blur(10px);-webkit-backdrop-filter:blur(10px);
border:1px solid rgba(59,130,246,.25);border-left:6px solid #2563eb;border-radius:14px;padding:16px;">
  <div style="font-weight:800;color:#0f172a;font-size:15px;">Step 1 — Verify Raw Inputs</div>
  <ul style="margin-top:10px;color:#334155;font-size:13.5px;line-height:1.7;"><li>Check that all expected CSV files exist in <code>data/raw</code>.</li>
<li>Fail fast if any file is missing to avoid partial pipelines.</li></ul>
  
</div>

In [5]:
# Every raw CSV that the loading step reads; ingestion must not start unless all are present.
expected_files = [
    "orders.csv",
    "order_products__prior.csv",
    "order_products__train.csv",
    "products.csv",
    "aisles.csv",
    "departments.csv",
]

# os.path.isfile (not os.path.exists) so that a directory which happens to carry one of
# these names is reported as missing here, instead of failing later inside read_csv.
missing = [f for f in expected_files if not os.path.isfile(os.path.join(RAW_DIR, f))]

# Fail fast: a partial set of inputs would otherwise produce a partial pipeline.
if missing:
    raise FileNotFoundError(f"Missing files in data/raw: {missing}")

print("All raw files found.")


All raw files found.


<div style="margin-top:14px;background:rgba(255,255,255,.75);backdrop-filter:blur(10px);-webkit-backdrop-filter:blur(10px);
border:1px solid rgba(59,130,246,.25);border-left:6px solid #2563eb;border-radius:14px;padding:16px;">
  <div style="font-weight:800;color:#0f172a;font-size:15px;">Step 2 — Load Raw Tables</div>
  <ul style="margin-top:10px;color:#334155;font-size:13.5px;line-height:1.7;"><li>Read CSVs into DataFrames with consistent paths.</li>
<li>Apply minimal cleaning only when required (keep Bronze semantics).</li></ul>
  <div style="margin-top:10px;font-size:13px;opacity:.85">Output: in-memory DataFrames (orders, order_products, products, aisles, departments)</div>
</div>

In [6]:
def load_csv(filename, dtypes=None):
    """Read one CSV from RAW_DIR into a DataFrame and print its shape.

    Args:
        filename: File name relative to RAW_DIR.
        dtypes: Optional value forwarded to ``pd.read_csv`` as ``dtype``;
            ``None`` lets pandas infer column types.

    Returns:
        The loaded DataFrame.
    """
    frame = pd.read_csv(os.path.join(RAW_DIR, filename), dtype=dtypes)
    print(f"{filename}: {frame.shape}")
    return frame

orders_df = load_csv("orders.csv")

# A customer's first-ever order has no predecessor, so the Instacart data leaves
# days_since_prior_order as NaN on those rows. Substitute -1 as an explicit
# "first order" marker. Association rule mining does not need this, but it keeps
# the column numerically consistent for later analysis (e.g., segmentation or
# time-based features).
orders_df["days_since_prior_order"] = orders_df["days_since_prior_order"].fillna(-1)

prior_df = load_csv("order_products__prior.csv")
train_df = load_csv("order_products__train.csv")
products_df = load_csv("products.csv")
aisles_df = load_csv("aisles.csv")
departments_df = load_csv("departments.csv")

# Stack the prior and train line items into a single table with a fresh index.
order_products_df = pd.concat((prior_df, train_df), ignore_index=True)
print(f"order_products_df: {order_products_df.shape}")


orders.csv: (3421083, 7)
order_products__prior.csv: (32434489, 4)
order_products__train.csv: (1384617, 4)
products.csv: (49688, 4)
aisles.csv: (134, 2)
departments.csv: (21, 2)
order_products_df: (33819106, 4)


<div style="margin-top:14px;background:rgba(255,255,255,.75);backdrop-filter:blur(10px);-webkit-backdrop-filter:blur(10px);
border:1px solid rgba(59,130,246,.25);border-left:6px solid #2563eb;border-radius:14px;padding:16px;">
  <div style="font-weight:800;color:#0f172a;font-size:15px;">Step 3 — Schema & Quality Gates</div>
  <ul style="margin-top:10px;color:#334155;font-size:13.5px;line-height:1.7;"><li>Validate required columns for each table.</li>
<li>Catch missing fields early before modeling (Gold layer).</li></ul>
  
</div>

In [7]:
def check_required_cols(df, required, name):
    """Verify that ``df`` contains every column listed in ``required``.

    Args:
        df: DataFrame to inspect.
        required: Iterable of column names that must be present.
        name: Label for the table, used in the printed / raised message.

    Raises:
        ValueError: If one or more required columns are absent; the message
            lists them in the order they appear in ``required``.
    """
    absent = []
    for col in required:
        if col not in df.columns:
            absent.append(col)

    if absent:
        raise ValueError(f" {name} missing columns: {absent}")

    print(f" {name}: required columns OK")

# Schema gate: every table loaded in Step 2 must expose the columns listed here.
check_required_cols(orders_df, ["order_id", "user_id", "eval_set"], "orders_df")
check_required_cols(order_products_df, ["order_id", "product_id", "add_to_cart_order", "reordered"], "order_products_df")
check_required_cols(products_df, ["product_id", "product_name", "aisle_id", "department_id"], "products_df")
# The two lookup tables were previously unchecked. Only the key that products_df
# refers to (aisle_id / department_id) is required; their other columns are not assumed.
check_required_cols(aisles_df, ["aisle_id"], "aisles_df")
check_required_cols(departments_df, ["department_id"], "departments_df")

# Quick null scan: per-column null counts, largest first, at most 10 columns per table.
# NOTE: orders_df.days_since_prior_order reports 0 here because its NaNs were already
# replaced with -1 when the table was loaded.
for table_name, table in (
    ("orders_df", orders_df),
    ("order_products_df", order_products_df),
    ("products_df", products_df),
):
    print(f"\nNulls in {table_name}:\n", table.isna().sum().sort_values(ascending=False).head(10))


 orders_df: required columns OK
 order_products_df: required columns OK
 products_df: required columns OK

Nulls in orders_df:
 order_id                  0
user_id                   0
eval_set                  0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

Nulls in order_products_df:
 order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

Nulls in products_df:
 product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: int64


<div style="margin-top:14px;background:rgba(255,255,255,.75);backdrop-filter:blur(10px);-webkit-backdrop-filter:blur(10px);
border:1px solid rgba(59,130,246,.25);border-left:6px solid #2563eb;border-radius:14px;padding:16px;">
  <div style="font-weight:800;color:#0f172a;font-size:15px;">Step 4 — Persist Silver Parquet</div>
  <ul style="margin-top:10px;color:#334155;font-size:13.5px;line-height:1.7;"><li>Write Parquet files to <code>data/interim</code>.</li>
<li>Parquet improves IO performance and stabilizes schema for modeling.</li></ul>
  <div style="margin-top:10px;font-size:13px;opacity:.85">Output: <code>orders.parquet</code>, <code>order_products.parquet</code>, <code>products.parquet</code>, <code>aisles.parquet</code>, <code>departments.parquet</code></div>
</div>

In [9]:
# Write each table to INTERIM_DIR as <stem>.parquet, without the DataFrame index.
for stem, table in (
    ("orders", orders_df),
    ("order_products", order_products_df),
    ("products", products_df),
    ("aisles", aisles_df),
    ("departments", departments_df),
):
    table.to_parquet(os.path.join(INTERIM_DIR, f"{stem}.parquet"), index=False)

print("Saved interim parquet files to:", INTERIM_DIR)

Saved interim parquet files to: /Users/imakdoun/PycharmProjects/instacart-retail-project/data/interim
