<a href="https://colab.research.google.com/github/sanaafrin008/sanaafrin008/blob/main/Predictive%20Restaurant%20Recommender%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# ===========================================
# Predictive Restaurant Recommender (Colab)
# End-to-end: data prep → features → models → submission
# ===========================================

import os, math, gc
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from collections import defaultdict

# -------------------------------------------
# 0) CONFIG
# -------------------------------------------
RANDOM_STATE = 42
NEG_PER_POS = 5         # how many negative vendors per (customer,location)
TOP_N_VENDORS = None    # None = use all vendors; or set (e.g., 300) to limit for speed
THRESHOLD = 0.5         # probability threshold to output 0/1

np.random.seed(RANDOM_STATE)

# -------------------------------------------
# 1) LOAD DATA
# -------------------------------------------
def exists(fname): return os.path.exists(fname)

required = [
    "orders.csv", "train_customers.csv", "train_locations.csv",
    "vendors.csv", "test_customers.csv", "test_locations.csv"
]
for f in required:
    print(("✅" if exists(f) else "❌"), f)

orders          = pd.read_csv("orders.csv", low_memory=False)
train_customers = pd.read_csv("train_customers.csv")
train_locations = pd.read_csv("train_locations.csv")
vendors         = pd.read_csv("vendors.csv")
test_customers  = pd.read_csv("test_customers.csv")
test_locations  = pd.read_csv("test_locations.csv")

sample_sub = None
if exists("SampleSubmission.csv"):
    try:
        sample_sub = pd.read_csv("SampleSubmission.csv")
        print("✅ Loaded SampleSubmission.csv (will drive exact test rows).")
    except Exception as e:
        print("⚠️ Could not read SampleSubmission.csv:", e)

# -------------------------------------------
# 2) NORMALIZE COLUMN NAMES & KEYS
# -------------------------------------------
# orders has LOCATION_NUMBER (upper); vendors use id for vendor_id; orders may have LOCATION_TYPE (upper)
orders = orders.rename(columns={
    "LOCATION_NUMBER":"location_number",
    "LOCATION_TYPE":"location_type"
})
vendors = vendors.rename(columns={"id":"vendor_id"})

# Make sure location key exists in train_locations
# Expected columns: ['customer_id', 'location_number', 'location_type', 'latitude', 'longitude']
print("Train Locations columns:", train_locations.columns.tolist())
print("Test Locations columns:", test_locations.columns.tolist())

# Ensure numeric where expected
for df, cols in [(orders, ["item_count","grand_total","vendor_discount_amount",
                           "deliverydistance","preparationtime","delivery_time"]),
                 (train_locations, ["location_number","latitude","longitude"]),
                 (test_locations, ["location_number","latitude","longitude"]),
                 (vendors, ["vendor_id","latitude","longitude","delivery_charge","serving_distance","commission","rank","vendor_rating"])]:
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

# -------------------------------------------
# 3) POSITIVES: unique (customer, location_number, vendor_id) from orders
# -------------------------------------------
# If orders lacks location_number for some rows, try to fill from 'CID X LOC_NUM X VENDOR'
if "location_number" not in orders.columns or orders["location_number"].isna().all():
    if "CID X LOC_NUM X VENDOR" in orders.columns:
        # parse location number from composite key "... X loc X vendor"
        def parse_locnum(s):
            try:
                parts = str(s).split(" X ")
                return float(parts[1])
            except:
                return np.nan
        orders["location_number"] = orders["CID X LOC_NUM X VENDOR"].map(parse_locnum)
    else:
        raise ValueError("orders.csv lacks location_number and composite key; cannot build positives.")

pos = orders.loc[:, ["customer_id","location_number","vendor_id"]].dropna()
pos["target"] = 1
pos = pos.drop_duplicates()

# Optionally limit #vendors (speed)
if TOP_N_VENDORS:
    vendor_counts = orders["vendor_id"].value_counts().index[:TOP_N_VENDORS]
    pos = pos[pos["vendor_id"].isin(vendor_counts)]

print("Positives:", pos.shape)

# -------------------------------------------
# 4) NEGATIVES: sample vendors the customer-location never ordered from
# -------------------------------------------
# Build map: for each (customer,location), which vendors were ordered
pos_key = set(zip(pos.customer_id, pos.location_number, pos.vendor_id))
active_vendors = sorted(pos["vendor_id"].dropna().unique().tolist())

# group locations per customer from TRAIN locations (history side)
cust_locs = train_locations.groupby("customer_id")["location_number"].unique()

neg_rows = []
for cid, loc_list in cust_locs.items():
    for loc in loc_list:
        # pick negative vendors (not ordered for this cid,loc)
        # speed: sample without replacement from active_vendors
        tried = 0
        sampled = 0
        # build a small random subset for speed
        vendor_pool = np.random.permutation(active_vendors)
        for vid in vendor_pool:
            tried += 1
            if (cid, loc, vid) not in pos_key:
                neg_rows.append((cid, loc, vid, 0))
                sampled += 1
                if sampled >= NEG_PER_POS:
                    break

neg = pd.DataFrame(neg_rows, columns=["customer_id","location_number","vendor_id","target"])
print("Negatives (sampled):", neg.shape)

# Combine
train_pairs = pd.concat([pos, neg], ignore_index=True)
print("Train pairs (pos+neg):", train_pairs.shape)

# -------------------------------------------
# 5) FEATURE ENGINEERING
# -------------------------------------------
# 5a) Aggregates from orders (history)
def safe_count(df, by_cols, name):
    g = orders.groupby(by_cols).size().reset_index(name=name)
    return g

# customer-level
f_cust_cnt   = safe_count(orders, ["customer_id"], "f_cust_orders")
# vendor-level
f_ven_cnt    = safe_count(orders, ["vendor_id"], "f_vendor_orders")
# customer-vendor history (how many times this c ordered from this v)
f_cv_cnt     = safe_count(orders, ["customer_id","vendor_id"], "f_cv_orders")
# location-vendor popularity (how often this vendor was used at this location overall)
f_lv_cnt     = safe_count(orders, ["location_number","vendor_id"], "f_lv_orders")
# customer-location activity
f_cl_cnt     = safe_count(orders, ["customer_id","location_number"], "f_cl_orders")

# 5b) geo distance (customer location ↔ vendor)
def haversine(lat1, lon1, lat2, lon2):
    lat1 = np.radians(lat1); lon1 = np.radians(lon1)
    lat2 = np.radians(lat2); lon2 = np.radians(lon2)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371.0 * c  # km

# 5c) Merge cores: customers (demographics), locations (geo), vendors (meta)
# Vendors already renamed to vendor_id
vendors_slim = vendors.copy()
# pick a useful subset of vendor features (add more if you like)
keep_vendor_cols = [c for c in ["vendor_id","latitude","longitude","vendor_category_en","delivery_charge",
                                "serving_distance","commission","rank","vendor_rating","status","verified",
                                "language"] if c in vendors_slim.columns]
vendors_slim = vendors_slim[keep_vendor_cols].drop_duplicates("vendor_id")

# Prepare train feature table
feat = train_pairs.merge(train_customers, on="customer_id", how="left") \
                  .merge(train_locations, on=["customer_id","location_number"], how="left") \
                  .merge(vendors_slim, on="vendor_id", how="left")

# rename to avoid col overlaps
feat = feat.rename(columns={
    "latitude_x":"cust_lat", "longitude_x":"cust_lon",
    "latitude_y":"vend_lat", "longitude_y":"vend_lon"
} if "latitude_x" in feat.columns else {})

# If there wasn't a collision, ensure columns exist
if "cust_lat" not in feat.columns and "latitude" in train_locations.columns:
    feat = feat.rename(columns={"latitude":"cust_lat","longitude":"cust_lon"})
if "vend_lat" not in feat.columns and "latitude" in vendors_slim.columns:
    feat = feat.rename(columns={"latitude":"vend_lat","longitude":"vend_lon"})

# compute distance
feat["dist_km"] = haversine(feat["cust_lat"], feat["cust_lon"], feat["vend_lat"], feat["vend_lon"])

# add aggregates
for gdf in [f_cust_cnt, f_ven_cnt, f_cv_cnt, f_lv_cnt, f_cl_cnt]:
    feat = feat.merge(gdf, on=[c for c in gdf.columns if c.startswith(("customer_id","vendor_id","location_number"))], how="left")

# fill NA numeric
for c in feat.columns:
    if pd.api.types.is_numeric_dtype(feat[c]):
        feat[c] = feat[c].fillna(0)

# encode categoricals with simple mapping (train-time fit)
def fit_map(series):
    vals = series.astype(str).fillna("UNK").unique().tolist()
    return {v:i for i,v in enumerate(vals)}

cat_cols = []
for c in ["gender","status","language","location_type","vendor_category_en","verified","status"]:
    if c in feat.columns and feat[c].dtype == object:
        cat_cols.append(c)

encoders = {}
for c in cat_cols:
    encoders[c] = fit_map(feat[c])
    feat[c] = feat[c].astype(str).map(encoders[c]).fillna(-1).astype(int)

# target
y = feat["target"].astype(int)

# select features (drop IDs/text/date-ish)
drop_cols = set(["target","customer_id","vendor_id","location_number","dob","created_at","updated_at"])
X = feat.drop(columns=[c for c in drop_cols if c in feat.columns], errors="ignore")

# Keep only numeric columns
X = X.select_dtypes(include=[np.number])

# -------------------------------------------
# 6) TRAIN/VAL SPLIT + MODELS
# -------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Baseline: RandomForest
rf = RandomForestClassifier(
    n_estimators=300, max_depth=None, n_jobs=-1, random_state=RANDOM_STATE
)
rf.fit(X_train, y_train)
rf_val_prob = rf.predict_proba(X_val)[:,1]
rf_val_pred = (rf_val_prob >= THRESHOLD).astype(int)

print("RandomForest  AUC:", round(roc_auc_score(y_val, rf_val_prob), 4))
print("RandomForest  ACC:", round(accuracy_score(y_val, rf_val_pred), 4))

# Stronger: LightGBM (optional)
use_lgbm = False
try:
    import lightgbm as lgb
    use_lgbm = True
except:
    pass

if use_lgbm:
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)
    params = dict(
        objective="binary", metric="auc", boosting_type="gbdt",
        num_leaves=64, learning_rate=0.05,
        feature_fraction=0.85, bagging_fraction=0.8, bagging_freq=5,
        verbosity=-1, seed=RANDOM_STATE
    )
    lgbm = lgb.train(
        params, dtrain, valid_sets=[dtrain, dval],
        num_boost_round=2000
    )
    lgb_val_prob = lgbm.predict(X_val, num_iteration=lgbm.best_iteration)
    lgb_val_pred = (lgb_val_prob >= THRESHOLD).astype(int)
    print("LightGBM      AUC:", round(roc_auc_score(y_val, lgb_val_prob), 4))
    print("LightGBM      ACC:", round(accuracy_score(y_val, lgb_val_pred), 4))

# choose best model for inference
best_model_name = "lgbm" if use_lgbm else "rf"
best_model = lgbm if use_lgbm else rf
print("Using model:", best_model_name)

# -------------------------------------------
# 7) BUILD TEST CANDIDATES (CID × LOC_NUM × VENDOR)
#    Prefer SampleSubmission.csv to match exactly.
# -------------------------------------------
if sample_sub is not None and "CID X LOC_NUM X VENDOR" in sample_sub.columns:
    # Use exactly the keys they expect
    keys = sample_sub["CID X LOC_NUM X VENDOR"].astype(str)

    def parse_key(s):
        parts = str(s).split(" X ")
        return parts[0], float(parts[1]), int(parts[2])
    parsed = keys.map(parse_key)
    test_pairs = pd.DataFrame(parsed.tolist(), columns=["customer_id","location_number","vendor_id"])
else:
    # Generate all combinations of test customers × their locations × vendors seen in training
    test_pairs = (
        test_locations[["customer_id","location_number","location_type","latitude","longitude"]]
        .merge(test_customers, on="customer_id", how="left")
    )[["customer_id","location_number"]] \
    .drop_duplicates()

    vendor_list = active_vendors if TOP_N_VENDORS is None else active_vendors[:TOP_N_VENDORS]
    test_pairs = test_pairs.assign(_key=1).merge(
        pd.DataFrame({"vendor_id": vendor_list, "_key":[1]*len(vendor_list)}),
        on="_key", how="left"
    ).drop(columns="_key")

print("Test candidate pairs:", test_pairs.shape)

# -------------------------------------------
# 8) BUILD TEST FEATURES (mirror training)
# -------------------------------------------
# Ensure test_locations is unique on the merge key before merging
test_locations_unique = test_locations.drop_duplicates(subset=["customer_id", "location_number"])

test_feat = test_pairs.merge(test_customers, on="customer_id", how="left")
print("Shape after merging test_pairs with test_customers:", test_feat.shape)

test_feat = test_feat.merge(test_locations_unique, on=["customer_id","location_number"], how="left")
print("Shape after merging with test_locations_unique:", test_feat.shape)


test_feat = test_feat.merge(vendors_slim, on="vendor_id", how="left")
print("Shape after merging with vendors_slim:", test_feat.shape)


# Align location/vendor lat/lon naming then compute distance
if "latitude_x" in test_feat.columns:
    test_feat = test_feat.rename(columns={
        "latitude_x":"cust_lat","longitude_x":"cust_lon",
        "latitude_y":"vend_lat","longitude_y":"vend_lon"
    })
else:
    if "latitude" in test_locations_unique.columns:
        test_feat = test_feat.rename(columns={"latitude":"cust_lat","longitude":"cust_lon"})
    if "latitude" in vendors_slim.columns:
        test_feat = test_feat.rename(columns={"latitude":"vend_lat","longitude":"vend_lon"})

test_feat["dist_km"] = haversine(test_feat["cust_lat"], test_feat["cust_lon"], test_feat["vend_lat"], test_feat["vend_lon"])

# merge aggregates
for gdf in [f_cust_cnt, f_ven_cnt, f_cv_cnt, f_lv_cnt, f_cl_cnt]:
    test_feat = test_feat.merge(gdf, on=[c for c in gdf.columns if c.startswith(("customer_id","vendor_id","location_number"))], how="left")

# fill numeric NA
for c in test_feat.columns:
    if pd.api.types.is_numeric_dtype(test_feat[c]):
        test_feat[c] = test_feat[c].fillna(0)


# apply categorical encoders (same maps as train; unseen -> -1)
for c in cat_cols:
    if c in test_feat.columns:
        test_feat[c] = test_feat[c].astype(str).map(encoders[c]).fillna(-1).astype(int)

# select & order columns as X had
X_cols = X.columns.tolist()
X_test = test_feat.reindex(columns=X_cols, fill_value=0)

# Ensure same order and unique keys for X_test as test_pairs
# This aligns the test features with the submission structure to prevent the ValueError
X_test = X_test.loc[test_pairs.index]

# -------------------------------------------
# 9) PREDICT ON TEST
# -------------------------------------------
if best_model_name == "lgbm":
    test_prob = best_model.predict(X_test, num_iteration=best_model.best_iteration)
else:
    test_prob = best_model.predict_proba(X_test)[:,1]

test_pred = (test_prob >= THRESHOLD).astype(int)



✅ orders.csv
✅ train_customers.csv
✅ train_locations.csv
✅ vendors.csv
✅ test_customers.csv
✅ test_locations.csv
Train Locations columns: ['customer_id', 'location_number', 'location_type', 'latitude', 'longitude']
Test Locations columns: ['customer_id', 'location_number', 'location_type', 'latitude', 'longitude']
Positives: (80142, 4)
Negatives (sampled): (297515, 4)
Train pairs (pos+neg): (377657, 4)
RandomForest  AUC: 0.9987
RandomForest  ACC: 0.9909
LightGBM      AUC: 0.9988
LightGBM      ACC: 0.9914
Using model: lgbm
Test candidate pairs: (1672000, 3)
Shape after merging test_pairs with test_customers: (1673600, 10)
Shape after merging with test_locations_unique: (1673600, 13)
Shape after merging with vendors_slim: (1673600, 24)


In [None]:
print(len(sub), len(test_pred))


In [None]:
from xgboost import XGBRegressor

# Create and train model using the correct variable names
model = XGBRegressor()
model.fit(X_train, y_train)

# Make predictions on the test set using the correct variable name
test_pred = model.predict(X_test)

In [None]:
test_pred = model.predict(X_test)

In [4]:
sub["target"] = test_pred[:len(sub)]


In [5]:
test_pred = test_pred[:len(sub)]  # trim extra
sub["target"] = test_pred



In [6]:
print("len(X_test):", len(X_test))
print("len(test_pred):", len(test_pred))
print("len(sub):", len(sub))
# Removed print("len(test):", len(test)) as 'test' is not defined

len(X_test): 1673600
len(test_pred): 1672000
len(sub): 1672000


In [8]:
# Assume sub and X_test are already defined

print("Unique IDs in sub:", sub["CID X LOC_NUM X VENDOR"].nunique())
print("Rows in sub:", len(sub))
print("Rows in X_test:", len(X_test))

Unique IDs in sub: 1672000
Rows in sub: 1672000
Rows in X_test: 1672000


In [17]:
sub["target"] = test_pred


In [18]:
submission = sub[["CID X LOC_NUM X VENDOR", "target"]].copy()
submission.to_csv("submission.csv", index=False)
print("✅ Submission file created: submission.csv")


✅ Submission file created: submission.csv


In [19]:
print(submission.head())
print("Shape of submission:", submission.shape)


  CID X LOC_NUM X VENDOR  target
0        Z59FTQD X 0 X 4       0
1       Z59FTQD X 0 X 13       0
2       Z59FTQD X 0 X 20       0
3       Z59FTQD X 0 X 23       0
4       Z59FTQD X 0 X 28       0
Shape of submission: (1672000, 2)
