*<B>Paths + imports</B>*

In [1]:
import os
import pandas as pd

# Resolve the project layout relative to the kernel's working directory.
# NOTE(review): ".." is only the project root when the kernel is started from a
# folder directly beneath it (e.g. <project>/notebooks) — confirm before reuse.
PROJECT_ROOT = os.path.abspath("..")

# <PROJECT_ROOT>/data/raw, /data/interim and /data/processed.
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(PROJECT_ROOT, "data", stage)
    for stage in ("raw", "interim", "processed")
)

# Only the interim and processed folders are created here; RAW_DIR is not.
for _output_dir in (INTERIM_DIR, PROCESSED_DIR):
    os.makedirs(_output_dir, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DIR:", RAW_DIR)


PROJECT_ROOT: /Users/pattern115/Desktop/1st Year Project
RAW_DIR: /Users/pattern115/Desktop/1st Year Project/data/raw


*<B>Confirm raw files exist</B>*


In [2]:
# File names that must be present in RAW_DIR before anything is loaded.
expected_files = [
    "orders.csv",
    "order_products__prior.csv",
    "order_products__train.csv",
    "products.csv",
    "aisles.csv",
    "departments.csv",
]

# Keep (in listed order) every expected name with no matching path in RAW_DIR.
missing = list(
    filter(
        lambda csv_name: not os.path.exists(os.path.join(RAW_DIR, csv_name)),
        expected_files,
    )
)

# Stop the notebook here if anything is absent; otherwise confirm and move on.
if not missing:
    print("✅ All raw files found.")
else:
    raise FileNotFoundError(f"Missing files in data/raw: {missing}")


✅ All raw files found.


*<B>Load CSVs (defines all DFs properly)</B>*


In [3]:
def load_csv(filename, dtypes=None):
    """Read one CSV from RAW_DIR into a DataFrame and print its shape.

    Parameters
    ----------
    filename : str
        File name, joined onto RAW_DIR to form the path that is read.
    dtypes : optional
        Passed unchanged to ``pd.read_csv`` as its ``dtype`` argument.
        Defaults to ``None``.

    Returns
    -------
    pandas.DataFrame
        The parsed file.
    """
    frame = pd.read_csv(os.path.join(RAW_DIR, filename), dtype=dtypes)
    rows_by_cols = frame.shape
    print(f"{filename}: {rows_by_cols}")
    return frame

# Load every raw table; load_csv prints each file's shape as it is read.
orders_df = load_csv("orders.csv")
prior_df = load_csv("order_products__prior.csv")
train_df = load_csv("order_products__train.csv")
products_df = load_csv("products.csv")
aisles_df = load_csv("aisles.csv")
departments_df = load_csv("departments.csv")

# pd.concat aligns columns by name (outer join) and fills the gaps with NaN,
# so a column mismatch between the two files would otherwise pass silently.
if set(prior_df.columns) != set(train_df.columns):
    raise ValueError(
        "order_products__prior.csv and order_products__train.csv have different columns: "
        f"{list(prior_df.columns)} vs {list(train_df.columns)}"
    )

# Stack both order-product files into one table with a fresh 0..n-1 index.
order_products_df = pd.concat([prior_df, train_df], ignore_index=True)
print("order_products_df:", order_products_df.shape)


orders.csv: (3421083, 7)
order_products__prior.csv: (32434489, 4)
order_products__train.csv: (1384617, 4)
products.csv: (49688, 4)
aisles.csv: (134, 2)
departments.csv: (21, 2)
order_products_df: (33819106, 4)


*<B>Basic validation checks</B>*

In [4]:
def check_required_cols(df, required, name):
    """Fail fast when ``df`` lacks any column listed in ``required``.

    Parameters
    ----------
    df : object exposing ``columns``
        The table to inspect (a DataFrame in this notebook).
    required : iterable of str
        Column names that must all be present.
    name : str
        Label for ``df`` used in the printed or raised message.

    Raises
    ------
    ValueError
        If any required column is absent; the message lists the absent
        columns in the order they appear in ``required``.
    """
    absent = []
    for column in required:
        if column not in df.columns:
            absent.append(column)

    if not absent:
        print(f"✅ {name}: required columns OK")
        return

    raise ValueError(f"❌ {name} missing columns: {absent}")

# Schema check: (table, label, columns that must be present), run in this order.
for _frame, _label, _required in (
    (orders_df, "orders_df", ["order_id", "user_id", "eval_set"]),
    (order_products_df, "order_products_df", ["order_id", "product_id", "add_to_cart_order", "reordered"]),
    (products_df, "products_df", ["product_id", "product_name", "aisle_id", "department_id"]),
):
    check_required_cols(_frame, _required, _label)

# Quick null scan: per-column missing-value counts, largest first, top ten only.
for _frame, _label in (
    (orders_df, "orders_df"),
    (order_products_df, "order_products_df"),
    (products_df, "products_df"),
):
    _null_counts = _frame.isna().sum().sort_values(ascending=False).head(10)
    print(f"\nNulls in {_label}:\n", _null_counts)


✅ orders_df: required columns OK
✅ order_products_df: required columns OK
✅ products_df: required columns OK

Nulls in orders_df:
 days_since_prior_order    206209
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
dtype: int64

Nulls in order_products_df:
 order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

Nulls in products_df:
 product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: int64


*<B>Save as Parquet</B>*


In [5]:
# Output file name -> table; each one is written under INTERIM_DIR without its index.
_interim_tables = {
    "orders.parquet": orders_df,
    "order_products.parquet": order_products_df,
    "products.parquet": products_df,
    "aisles.parquet": aisles_df,
    "departments.parquet": departments_df,
}

for _parquet_name, _table in _interim_tables.items():
    _table.to_parquet(os.path.join(INTERIM_DIR, _parquet_name), index=False)

print("✅ Saved interim parquet files to:", INTERIM_DIR)

✅ Saved interim parquet files to: /Users/pattern115/Desktop/1st Year Project/data/interim
