# 01 — Data Validation & Exploration

Validate raw CSV files, check for nulls/duplicates, and preview all entities before pipeline processing.

In [1]:
import sys, os
sys.path.insert(0, os.path.abspath(".."))

import pandas as pd

# Read every raw entity table once. The frames are collected in a dict so the
# summary loop below can walk them by name, and each one is also bound to its
# own module-level variable because later cells refer to the frames directly.
raw_tables = {
    "orders": pd.read_csv("../data/raw/orders.csv"),
    "order_items": pd.read_csv("../data/raw/order_items.csv"),
    "menu_items": pd.read_csv("../data/raw/menu_items.csv"),
    "categories": pd.read_csv("../data/raw/categories.csv"),
    "customers": pd.read_csv("../data/raw/customers.csv"),
    "payments": pd.read_csv("../data/raw/payments.csv"),
    "staff": pd.read_csv("../data/raw/staff.csv"),
}
orders = raw_tables["orders"]
order_items = raw_tables["order_items"]
menu_items = raw_tables["menu_items"]
categories = raw_tables["categories"]
customers = raw_tables["customers"]
payments = raw_tables["payments"]
staff = raw_tables["staff"]

# One summary line per table: row count, total null cells across all columns,
# and number of fully duplicated rows.
print("=== Row Counts ===")
for label, frame in raw_tables.items():
    n_rows = len(frame)
    n_nulls = frame.isnull().sum().sum()
    n_dupes = frame.duplicated().sum()
    print(f"  {label:15s}: {n_rows:>6,} rows, {n_nulls} nulls, {n_dupes} duplicates")

=== Row Counts ===
  orders         :  5,000 rows, 0 nulls, 0 duplicates
  order_items    : 12,895 rows, 0 nulls, 0 duplicates
  menu_items     :     45 rows, 0 nulls, 0 duplicates
  categories     :      6 rows, 0 nulls, 0 duplicates
  customers      :    200 rows, 0 nulls, 0 duplicates
  payments       :  5,000 rows, 0 nulls, 0 duplicates
  staff          :     10 rows, 0 nulls, 0 duplicates


## Orders Preview

In [2]:
# Eyeball the first ten raw orders to confirm the columns parsed as expected.
orders.head(n=10)

Unnamed: 0,order_id,customer_id,staff_id,order_timestamp,order_status,location
0,1,80,10,2022-08-07 12:01:59,completed,Suburb Plaza
1,2,176,10,2022-04-22 19:19:54,completed,Downtown
2,3,28,7,2022-03-07 21:52:31,completed,Airport Terminal
3,4,188,10,2022-12-01 20:29:27,completed,Airport Terminal
4,5,8,8,2022-12-17 18:24:21,completed,Airport Terminal
5,6,133,4,2023-02-23 21:35:00,completed,Downtown
6,7,5,2,2023-01-25 19:28:50,completed,Airport Terminal
7,8,141,6,2022-12-09 18:27:47,completed,Airport Terminal
8,9,190,8,2023-03-26 19:12:13,completed,Airport Terminal
9,10,33,5,2022-05-31 15:45:34,completed,Downtown


## Validate Orders (null timestamps, required columns)

In [3]:
from src.services.validator import OrderValidator

# Pass the raw orders through the project's validator, then compare the row
# counts before and after so any change in size is visible at a glance.
order_validator = OrderValidator()
validated_orders = order_validator.validate(orders)

rows_before = len(orders)
rows_after = len(validated_orders)
print(f"Before: {rows_before} → After validation: {rows_after}")

# Column-level non-null counts and dtypes of the validated frame.
validated_orders.info()

Before: 5000 → After validation: 5000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   order_id         5000 non-null   int64 
 1   customer_id      5000 non-null   int64 
 2   staff_id         5000 non-null   int64 
 3   order_timestamp  5000 non-null   object
 4   order_status     5000 non-null   object
 5   location         5000 non-null   object
dtypes: int64(3), object(3)
memory usage: 234.5+ KB


## Menu Items & Categories

In [4]:
# Show the category lookup table, then a headline count of menu items.
print("Categories:")
print(categories.to_string(index=False))
print(f"\nMenu items: {len(menu_items)} items across {categories['category_name'].nunique()} categories")

# Items per category.
# - how="left" together with dropna=False keeps menu items whose category_id
#   has no match in `categories`; they show up under a NaN group instead of
#   silently disappearing, which is what the default inner join would do.
# - validate="many_to_one" makes pandas raise a MergeError if `categories`
#   contains a repeated category_id, which would otherwise duplicate menu
#   rows and inflate the counts.
# On clean data the displayed result is identical to a plain inner join.
(
    menu_items
    .merge(categories, on="category_id", how="left", validate="many_to_one")
    .groupby("category_name", dropna=False)["item_name"]
    .count()
)

Categories:
 category_id category_name
           1      Fastfood
           2     Beverages
           3        Coffee
           4           Tea
           5      Desserts
           6   Main Course

Menu items: 45 items across 6 categories


category_name
Beverages       8
Coffee          8
Desserts        6
Fastfood       10
Main Course     8
Tea             5
Name: item_name, dtype: int64

## Data Types & Schema Check

In [5]:
# Print the column dtypes of the three transactional tables, one titled
# section per table, so type surprises are easy to spot.
schema_tables = {
    "orders": orders,
    "order_items": order_items,
    "payments": payments,
}
for name, df in schema_tables.items():
    print(f"\n=== {name} ===", df.dtypes.to_string(), sep="\n")


=== orders ===
order_id            int64
customer_id         int64
staff_id            int64
order_timestamp    object
order_status       object
location           object

=== order_items ===
order_item_id    int64
order_id         int64
menu_item_id     int64
quantity         int64
item_price       int64

=== payments ===
payment_id             int64
order_id               int64
payment_method        object
payment_amount       float64
payment_timestamp     object
