# Step 2 — Hard Pre-Filters

**CSAO Rail Recommendation System · Zomathon Hackathon**

These are deterministic rules applied **before** any ML model runs. The five filters execute in strict order:

| Filter | Purpose | Key Rule |
|--------|---------|----------|
| **A** | Availability & Margin | Remove out-of-stock and < 10 % margin items |
| **B** | Dietary Toggle | Session-level veg / vegan / non-veg enforcement |
| **C** | Cuisine Coherence | Keep only the dominant cuisine; suppress combo-covered subcategories |
| **D** | Quantity Saturation | Cap per-subcategory count in cart |
| **E** | Dedup & Fatigue | Remove cart duplicates; suppress categories ignored 3× in a row |

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(".."))

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

from hard_filters import HardFilterPipeline

# Load every CSV table the notebook uses from the shared data directory,
# in the same order the tables are named on the left-hand side.
DATA = "../data"
restaurants, menu, users, orders, sessions, events = (
    pd.read_csv(f"{DATA}/{stem}.csv")
    for stem in (
        "restaurants",
        "menu_items",
        "users",
        "order_history",
        "sessions",
        "cart_events",
    )
)

# One pipeline instance is built here and reused by every later cell.
pipe = HardFilterPipeline(menu, restaurants, users)

# Row counts as a quick sanity check that the tables loaded.
print(
    "\n".join(
        f"{caption}: {len(table):>6,}"
        for caption, table in (
            ("Menu items  ", menu),
            ("Sessions    ", sessions),
            ("Cart events ", events),
        )
    )
)
print("Pipeline loaded.")

In [None]:
def get_cart_items(session_id):
    """Collect the distinct item_ids carted during one session.

    Reads the notebook-level ``events`` table and keeps only rows that
    belong to ``session_id`` and have a non-null ``cart_position``.
    Returns a ``set`` of the matching ``item_id`` values.
    """
    in_session = events["session_id"] == session_id
    has_position = events["cart_position"].notna()
    return set(events.loc[in_session & has_position, "item_id"].tolist())

def get_cart_df(cart_ids):
    """Look up the rows of the notebook-level ``menu`` table whose
    ``item_id`` appears in ``cart_ids``."""
    in_cart = menu["item_id"].isin(cart_ids)
    return menu.loc[in_cart]

---
## Filter A — Availability & Margin

In [None]:
# Menu-wide tally of the two conditions this cell checks:
# not available, and margin below 10%.
unavailable = ~menu["availability"]
thin_margin = menu["margin_pct"] < 10
print("=== Menu-wide Availability & Margin Stats ===")
print(f"  Out of stock  : {unavailable.sum()} items")
print(f"  Margin < 10%  : {thin_margin.sum()} items")
print(f"  Both combined : {(unavailable | thin_margin).sum()} items")
print()

# Worked example: run Filter A on the first restaurant in the table.
first_rest_row = restaurants.iloc[0]
sample_rest = first_rest_row["restaurant_id"]
rest_menu = menu.loc[menu["restaurant_id"] == sample_rest].copy()
print(f"--- Restaurant {sample_rest}: {first_rest_row['name']} ---")
print(f"  Total items: {len(rest_menu)}")

filtered_a, log_a = pipe.filter_a_availability_margin(rest_menu, sample_rest)
print(f"  After Filter A: {len(filtered_a)}")
print(f"  Removed (out of stock) : {log_a['removed_out_of_stock']}")
print(f"  Removed (low margin)   : {log_a['removed_low_margin']}")

# List every item that did not survive, with the condition(s) it matches.
survived = rest_menu["item_id"].isin(filtered_a["item_id"])
removed_items = rest_menu[~survived]
if len(removed_items) > 0:
    print("\n  Items removed:")
    for _, r in removed_items.iterrows():
        checks = (
            (not r["availability"], "out-of-stock"),
            (r["margin_pct"] < 10, f"margin={r['margin_pct']:.1f}%"),
        )
        reason = [text for hit, text in checks if hit]
        print(f"    {r['item_id']} {r['name']:<30} {', '.join(reason)}")

---
## Filter B — Dietary Toggle

In [None]:
# Case 1: a session whose dietary toggle is explicitly "veg"
is_veg_session = sessions["dietary_toggle"] == "veg"
veg_sessions = sessions[is_veg_session]
print(f"Sessions with veg toggle: {len(veg_sessions)}")

# Take the first such session and rebuild its candidate set by hand, using
# the same availability / margin conditions the Filter A cell demonstrates.
vs = veg_sessions.iloc[0]
rid = vs["restaurant_id"]
rest_items = menu.loc[menu["restaurant_id"] == rid].copy()
in_stock_with_margin = rest_items["availability"] & (rest_items["margin_pct"] >= 10)
rest_items_avail = rest_items[in_stock_with_margin]
cart_ids = get_cart_items(vs["session_id"])
cart_df = get_cart_df(cart_ids)

print(f"\n--- Session {vs['session_id']} (veg toggle) ---")
print(f"  Restaurant: {rid}  |  Candidates before: {len(rest_items_avail)}")
non_veg_count = (~rest_items_avail["veg_flag"]).sum()
print(f"  Non-veg candidates: {non_veg_count}")

filtered_b, log_b = pipe.filter_b_dietary(
    rest_items_avail,
    "veg",
    cart_df,
    vs["user_id"],
    vs["start_time"],
)
removed_b = log_b["input"] - log_b["output"]
print(f"  After Filter B (veg): {len(filtered_b)}")
print(f"  Removed: {removed_b} non-veg items")
print(f"  Log: {log_b}")

In [None]:
# Case 2: Session with no toggle — inference from cart
none_sessions = sessions[sessions["dietary_toggle"] == "none"]
ns = none_sessions.iloc[0]
rid2 = ns["restaurant_id"]
rest_items2 = menu[menu["restaurant_id"] == rid2].copy()
rest_items2_avail = rest_items2[(rest_items2["availability"]) & (rest_items2["margin_pct"] >= 10)]
cart_ids2 = get_cart_items(ns["session_id"])
cart_df2 = get_cart_df(cart_ids2)

# Series.all() is True for an empty Series, so an empty cart would otherwise
# be reported as "all veg"; label that case explicitly instead.
if cart_df2.empty:
    cart_veg_status = "empty"
elif cart_df2["veg_flag"].all():
    cart_veg_status = "all veg"
else:
    cart_veg_status = "has non-veg"
print(f"--- Session {ns['session_id']} (no toggle, cart is {cart_veg_status}) ---")
print(f"  Cart items: {list(cart_df2['name'].values)}")

filtered_b2, log_b2 = pipe.filter_b_dietary(
    rest_items2_avail, "none", cart_df2, ns["user_id"], ns["start_time"]
)
print(f"  After Filter B: {len(filtered_b2)} (no hard filter expected for 'none')")
# 'inferred' may be absent from the log; fall back to a placeholder.
print(f"  Inference: {log_b2.get('inferred', 'N/A')}")

In [None]:
# Case 3: veg-day override — pick a user with at least one veg day configured
from datetime import datetime, timedelta

# veg_days is stored as a JSON string per user; keep users whose list is non-empty.
has_veg_days = users["veg_days"].apply(lambda raw: len(json.loads(raw)) > 0)
users_with_vd = users[has_veg_days]
print(f"Users with veg days configured: {len(users_with_vd)}")

sample_user = users_with_vd.iloc[0]
vd = json.loads(sample_user["veg_days"])
day_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
veg_day_labels = [day_names[d] for d in vd]
print(f"  User {sample_user['user_id']} veg days: {veg_day_labels}")

# Build a mock session start that falls on the user's first configured veg day.
base = datetime(2025, 12, 1)  # a Monday, so base.weekday() == 0
target_day = vd[0]
days_ahead = (target_day - base.weekday()) % 7
mock_start = base + timedelta(days=days_ahead)
print(f"  Mock session date: {mock_start.strftime('%A %Y-%m-%d')} (weekday {mock_start.weekday()})")

# Candidates: the first restaurant's items that are available with margin >= 10.
any_rest = restaurants.iloc[0]["restaurant_id"]
rest_items3 = menu.loc[menu["restaurant_id"] == any_rest].copy()
keep3 = rest_items3["availability"] & (rest_items3["margin_pct"] >= 10)
rest_items3_avail = rest_items3[keep3]

# Toggle is "none" and the cart is an empty frame; only the date is meant to matter here.
filtered_vd, log_vd = pipe.filter_b_dietary(
    rest_items3_avail,
    "none",
    pd.DataFrame(),
    sample_user["user_id"],
    mock_start.isoformat(),
)
print(f"  Toggle was 'none' but veg_day override applied: {log_vd.get('veg_day_override', False)}")
print(f"  Effective toggle: {log_vd['effective_toggle']}")
print(f"  Candidates: {log_vd['input']} -> {log_vd['output']}")

---
## Filter C — Cuisine Coherence

In [None]:
# Single-cuisine cart — the dominant cuisine should come from the cart itself
is_north_indian = restaurants["primary_cuisine"] == "North Indian"
ni_rests = restaurants[is_north_indian]
ni_rest = ni_rests.iloc[0]["restaurant_id"]
ni_mask = (
    (menu["restaurant_id"] == ni_rest)
    & menu["availability"]
    & (menu["margin_pct"] >= 10)
)
ni_menu = menu[ni_mask]

# Use the first two "main" items of that restaurant as the cart
ni_mains = ni_menu.loc[ni_menu["category"] == "main"].head(2)
cart_c = ni_mains.copy()
print("--- Cuisine Coherence: North Indian cart ---")
print(f"  Cart: {list(cart_c['name'].values)}")
print(f"  Cart cuisine tags: {list(cart_c['cuisine_tag'].values)}")
print(f"  Total candidates: {len(ni_menu)}")

filtered_c, log_c = pipe.filter_c_cuisine_coherence(
    ni_menu,
    cart_c,
    ni_rest,
)
print(f"  Dominant cuisine: {log_c['dominant_cuisine']}")
print(f"  After Filter C: {log_c['output']}")
print(f"  Source: {log_c['source']}")

In [None]:
# Combo/thali scenario — a combo in the cart should suppress the subcategories it covers
is_combo_meal = menu["is_combo"] & menu["subcategory"].isin(["thali", "meal_combo"])
combos = menu[is_combo_meal]
print(f"Total combos/thalis in menu: {len(combos)}")

# Take the first combo and the eligible menu of the restaurant that sells it.
sample_combo = combos.iloc[0]
combo_rest = sample_combo["restaurant_id"]
combo_mask = (
    (menu["restaurant_id"] == combo_rest)
    & menu["availability"]
    & (menu["margin_pct"] >= 10)
)
combo_menu = menu[combo_mask]

# The cart is just that combo (empty if the combo itself is not eligible).
cart_combo = combo_menu.loc[combo_menu["item_id"] == sample_combo["item_id"]]
components = json.loads(sample_combo["combo_components"])  # stored as a JSON string
print(f"\n--- Combo in cart: {sample_combo['name']} ---")
print(f"  Components: {components}")
print(f"  Candidates before: {len(combo_menu)}")

filtered_combo, log_combo = pipe.filter_c_cuisine_coherence(
    combo_menu,
    cart_combo,
    combo_rest,
)
print(f"  Suppressed subcategories: {log_combo.get('combo_suppressed_subcats', 'none')}")
print(f"  After Filter C: {log_combo['output']}")

In [None]:
# Empty cart — the filter is expected to fall back to the restaurant's primary cuisine
empty_cart = pd.DataFrame()
third_rest_row = restaurants.iloc[2]
mixed_rest = third_rest_row["restaurant_id"]
mixed_mask = (
    (menu["restaurant_id"] == mixed_rest)
    & menu["availability"]
    & (menu["margin_pct"] >= 10)
)
mixed_menu = menu[mixed_mask]
rest_cuisine = third_rest_row["primary_cuisine"]

filtered_empty, log_empty = pipe.filter_c_cuisine_coherence(
    mixed_menu,
    empty_cart,
    mixed_rest,
)
print("--- Empty cart: defaults to restaurant cuisine ---")
print(f"  Restaurant: {mixed_rest} ({rest_cuisine})")
print(f"  Source: {log_empty['source']}")
print(f"  Dominant cuisine: {log_empty['dominant_cuisine']}")
print(f"  Candidates: {log_empty['input']} -> {log_empty['output']}")

---
## Filter D — Quantity Saturation

In [None]:
# Demonstrate saturation using the 'curry' subcategory.
# NOTE(review): this demo assumes the pipeline's curry cap is 4 — confirm
# against HardFilterPipeline's configuration.
curry_items = menu[
    (menu["subcategory"] == "curry")
    & (menu["availability"])
    & (menu["margin_pct"] >= 10)
]

# Pick the restaurant with the most eligible curry items in one pass.
# sort=False keeps groups in first-appearance order and idxmax returns the
# first maximum, so ties resolve to the restaurant that appears first.
curry_counts = curry_items.groupby("restaurant_id", sort=False).size()
if curry_counts.empty:
    best_rid, best_count = None, 0
else:
    best_rid, best_count = curry_counts.idxmax(), int(curry_counts.max())

curry_rest = best_rid
rest_full_menu = menu[
    (menu["restaurant_id"] == curry_rest)
    & (menu["availability"])
    & (menu["margin_pct"] >= 10)
]
curries_available = rest_full_menu[rest_full_menu["subcategory"] == "curry"]
cart_curries = curries_available.head(4)  # fill the cart up to the assumed cap

print("--- Quantity Saturation Demo (curry, cap=4) ---")
print(f"  Restaurant: {curry_rest} ({best_count} curry items available)")
print(f"  Cart has {len(cart_curries)} curries: {list(cart_curries['name'].values)}")
if best_count < 4:
    # The demo only shows saturation when the cart can actually reach the cap.
    print(f"  NOTE: only {best_count} curry items available; the cap of 4 is not reached")

curry_cands_before = len(curries_available)
filtered_d, log_d = pipe.filter_d_quantity_saturation(rest_full_menu, cart_curries)
curry_cands_after = len(filtered_d[filtered_d["subcategory"] == "curry"])
print(f"  Curry candidates before Filter D: {curry_cands_before}")
print(f"  Curry candidates after  Filter D: {curry_cands_after} (saturated)")
print(f"  Non-curry items unaffected: {len(filtered_d[filtered_d['subcategory'] != 'curry'])}")
print(f"  Removed (saturated): {log_d['removed_saturated']}")
print(f"  Scale factor: {log_d['scale_factor']}")

In [None]:
# Group-order scenario — with more than 5 items in the cart the thresholds are expected to scale up
big_rest = restaurants.iloc[0]["restaurant_id"]
big_mask = (
    (menu["restaurant_id"] == big_rest)
    & menu["availability"]
    & (menu["margin_pct"] >= 10)
)
big_menu = menu[big_mask]
big_cart = big_menu.head(7)  # first 7 eligible items stand in for a group order
print(f"--- Group Order (cart size {len(big_cart)}) ---")
print(f"  Subcategories in cart: {dict(big_cart['subcategory'].value_counts())}")

filtered_big, log_big = pipe.filter_d_quantity_saturation(
    big_menu,
    big_cart,
)
print(f"  Scale factor: {log_big['scale_factor']} (scaled up for group order)")
print(f"  Candidates: {log_big['input']} -> {log_big['output']}")
print(f"  Removed: {log_big['removed_saturated']}")

---
## Filter E — Deduplication & Recommendation Fatigue

In [None]:
# Deduplication — anything already in the cart should drop out of the candidates
dedup_rest = restaurants.iloc[0]["restaurant_id"]
dedup_mask = (
    (menu["restaurant_id"] == dedup_rest)
    & menu["availability"]
    & (menu["margin_pct"] >= 10)
)
dedup_menu = menu[dedup_mask]
# Pretend the first four eligible items are already in the cart.
cart_for_dedup = set(dedup_menu["item_id"].head(4).tolist())

print("--- Deduplication ---")
print(f"  Items in cart: {cart_for_dedup}")
print(f"  Candidates before: {len(dedup_menu)}")

filtered_e, log_e = pipe.filter_e_dedup_fatigue(
    dedup_menu,
    cart_for_dedup,
)
print(f"  After dedup: {log_e['output']}")
print(f"  Removed (duplicates): {log_e['removed_duplicates']}")

In [None]:
# Recommendation fatigue — simulate 'beverage' having been ignored 3 times in a row
ignore_tracker = {"beverage": 3, "dessert": 1}

print("--- Recommendation Fatigue ---")
print(f"  Ignore tracker: {ignore_tracker}")
bev_before = int((dedup_menu["category"] == "beverage").sum())
print(f"  Beverage candidates before: {bev_before}")

# Empty cart set, so only the ignore tracker can remove candidates here.
filtered_fat, log_fat = pipe.filter_e_dedup_fatigue(
    dedup_menu,
    set(),
    ignore_tracker,
)
bev_after = int((filtered_fat["category"] == "beverage").sum())
dessert_after = int((filtered_fat["category"] == "dessert").sum())
print(f"  Beverage candidates after: {bev_after} (suppressed)")
print(f"  Dessert candidates: {dessert_after} (not suppressed, only 1 ignore)")
print(f"  Suppressed categories: {log_fat.get('suppressed_categories', 'none')}")
print(f"  Total removed by fatigue: {log_fat['removed_fatigue']}")

---
## Full Pipeline Funnel — 100 Random Sessions

In [None]:
# Run the whole pipeline on a reproducible sample of sessions and record how
# many candidates survive each stage.
np.random.seed(42)
# DataFrame.sample raises if asked for more rows than exist, so clamp to the
# number of sessions actually available.
n_sample = min(100, len(sessions))
sample_sessions = sessions.sample(n_sample, random_state=42)

funnel_records = []
failed_sessions = []  # (session_id, error repr) for every session that raised
for _, s in sample_sessions.iterrows():
    cart_ids = get_cart_items(s["session_id"])
    try:
        _, flog = pipe.run_filters(
            restaurant_id=s["restaurant_id"],
            user_id=s["user_id"],
            session_start=s["start_time"],
            dietary_toggle=s["dietary_toggle"],
            cart_item_ids=cart_ids,
        )
        funnel_records.append({
            "session_id": s["session_id"],
            "Initial": flog["initial"],
            "After A": flog["after_A"],
            "After B": flog["after_B"],
            "After C": flog["after_C"],
            "After D": flog["after_D"],
            "After E": flog["after_E"],
        })
    except Exception as exc:
        # Best-effort: one bad session must not abort the sweep, but the
        # failure is recorded and reported below rather than discarded.
        failed_sessions.append((s["session_id"], repr(exc)))

if not funnel_records:
    # Nothing to aggregate; fail with the cause instead of a KeyError below.
    first_error = failed_sessions[0] if failed_sessions else "no sessions were sampled"
    raise RuntimeError(
        f"No session could be processed out of {len(sample_sessions)}; first error: {first_error}"
    )

funnel_df = pd.DataFrame(funnel_records)
print(f"Successfully processed {len(funnel_df)} / {len(sample_sessions)} sessions")
if failed_sessions:
    print(f"  {len(failed_sessions)} session(s) failed; first few:")
    for failed_id, failure in failed_sessions[:5]:
        print(f"    {failed_id}: {failure}")
print()
print("=== Average Candidate Counts at Each Stage ===")
stages = ["Initial", "After A", "After B", "After C", "After D", "After E"]
means = funnel_df[stages].mean()
initial_mean = means["Initial"]
for stage in stages:
    # Bar length is the stage mean relative to the initial mean (40 chars = 100%);
    # a zero initial mean would otherwise give NaN and crash int().
    ratio = means[stage] / initial_mean if initial_mean > 0 else 0.0
    bar = "█" * int(ratio * 40)
    print(f"  {stage:<10} {means[stage]:>6.1f}  {bar}")

In [None]:
# Two side-by-side views of the funnel: per-stage averages and the
# per-session distribution at each stage.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart — average candidates at each stage
ax = axes[0]
colors = ["#4C72B0", "#55A868", "#C44E52", "#8172B2", "#CCB974", "#64B5CD"]
ax.bar(stages, means.values, color=colors, edgecolor="white", linewidth=0.5)
for i, v in enumerate(means.values):
    # Value label just above each bar
    ax.text(i, v + 0.3, f"{v:.1f}", ha="center", fontsize=9, fontweight="bold")
ax.set_ylabel("Avg candidates")
ax.set_title("Filter Funnel — Average Candidates per Stage")
ax.set_ylim(0, means["Initial"] * 1.15)  # headroom for the value labels

# Box plot — distribution across sessions
ax2 = axes[1]
funnel_df[stages].boxplot(ax=ax2, vert=True, patch_artist=True,
                          boxprops=dict(facecolor="#64B5CD", alpha=0.7),
                          medianprops=dict(color="#C44E52", linewidth=2))
ax2.set_ylabel("Candidates")
# funnel_df only holds sessions that were processed successfully, so report
# that count rather than a fixed 100.
ax2.set_title(f"Filter Funnel — Distribution Across {len(funnel_df)} Sessions")

plt.tight_layout()
plt.show()

---
## Summary Statistics

In [None]:
# Reduction ratios per filter: percentage of candidates removed relative to
# the stage before it, plus the end-to-end total. The columns are added to
# funnel_df in place.
reduction_columns = {
    "pct_A": ("After A", "Initial"),
    "pct_B": ("After B", "After A"),
    "pct_C": ("After C", "After B"),
    "pct_D": ("After D", "After C"),
    "pct_E": ("After E", "After D"),
    "pct_total": ("After E", "Initial"),
}
for col, (after_stage, before_stage) in reduction_columns.items():
    funnel_df[col] = (1 - funnel_df[after_stage] / funnel_df[before_stage]) * 100

print("=" * 62)
print("  FILTER REDUCTION SUMMARY (avg % removed at each stage)")
print("=" * 62)
labels = {
    "pct_A": "Filter A (Availability & Margin)",
    "pct_B": "Filter B (Dietary Toggle)",
    "pct_C": "Filter C (Cuisine Coherence)",
    "pct_D": "Filter D (Quantity Saturation)",
    "pct_E": "Filter E (Dedup & Fatigue)",
    "pct_total": "TOTAL pipeline reduction",
}
# The filter with the largest average reduction is the same for every row,
# so compute it once instead of inside the loop.
largest_col = funnel_df[["pct_A", "pct_B", "pct_C", "pct_D", "pct_E"]].mean().idxmax()
for col, label in labels.items():
    val = funnel_df[col].mean()
    # One bar character per 2 percentage points; int() cannot take NaN,
    # which occurs when every row of the column is NaN.
    bar = "█" * int(val / 2) if pd.notna(val) else ""
    marker = "  ◄ largest" if col == largest_col else ""
    sep = "—" * 50 if col == "pct_total" else ""
    if sep:
        # Rule separating the per-filter rows from the total
        print(f"  {sep}")
    print(f"  {label:<40} {val:>5.1f}%  {bar}{marker}")
print("=" * 62)

avg_final = funnel_df["After E"].mean()
avg_initial = funnel_df["Initial"].mean()
print(f"\n  Average: {avg_initial:.0f} items → {avg_final:.0f} candidates")
print("  Ready for Step 3 (Candidate Generation)")