# PART 15: Setup & Data Loading

In [1]:
# Fixed thresholds and weights are defined in this file.

from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# 0) Road and entry/exit files
ROOT = Path("..")  # notebook is defaulted under 'notebooks/'
DATA_DIR = ROOT / "data" / "processed"
RAW_FILE = DATA_DIR / "delivery_cleaned.csv"  # for coordinates and context

X_TEST_CSV = DATA_DIR / "X_test.csv"   # _row_id, Order_ID + features
Y_TEST_CSV = DATA_DIR / "y_test.csv"   # _row_id, Order_ID, y_true
Y_PRED_CSV = DATA_DIR / "y_pred.csv"   # _row_id, Order_ID, y_pred

OUT_RISK   = DATA_DIR / "risk_scored.csv"
OUT_ROUTE  = DATA_DIR / "smart_routing.csv"

# PART 16: Loading and Merging Artifacts

In [3]:
X_test = pd.read_csv(X_TEST_CSV)
y_test = pd.read_csv(Y_TEST_CSV)
y_pred = pd.read_csv(Y_PRED_CSV)

# Are the expected identity columns available?
for col in ["_row_id", "Order_ID"]:
    if col not in X_test.columns:
        raise KeyError(f"Expecting '{col}' inside X_test..")
    if col not in y_test.columns:
        raise KeyError(f"Expecting '{col}' inside y_test.")
    if col not in y_pred.columns:
        raise KeyError(f"Expecting '{col}' inside y_pred.")
        
# Merge
df = (
    X_test.merge(y_test[["_row_id", "Order_ID", "y_true"]], on=["_row_id", "Order_ID"], how="left")
         .merge(y_pred[["_row_id", "Order_ID", "y_pred"]], on=["_row_id", "Order_ID"], how="left")
)

In [4]:
# Optional: Extract coordinate and contextual fields from the raw file (if any)
raw_cols_needed = [
    "Store_Latitude","Store_Longitude","Drop_Latitude","Drop_Longitude",
    "Vehicle","Traffic","Order_Period","Order_Hour","Area","Category"
]
raw_cols_present = []
try:
    raw_df = pd.read_csv(RAW_FILE)
    # If _row_id does not exist in delivery_cleaned.csv, add it.
    if "_row_id" not in raw_df.columns:
        raw_df = raw_df.reset_index().rename(columns={"index": "_row_id"})
    raw_cols_present = ["_row_id"] + [c for c in raw_cols_needed if c in raw_df.columns]
    if raw_cols_present:
        df = df.merge(raw_df[raw_cols_present], on="_row_id", how="left")
except FileNotFoundError:
    print("Warning: delivery_cleaned.csv not found; coordinate/context fields will not be added.")

# PART 17: Errors and Risk Levels

In [5]:
# Error metrics
if df["y_true"].isna().any() or df["y_pred"].isna().any():
    raise ValueError("There is a NaN in y_true or y_pred. Check the artifact compatibility.")

df["error"] = df["y_true"] - df["y_pred"]
df["abs_error"] = df["error"].abs()

# Risk threshold method: fixed threshold (in minutes)
# We can update the following constants according to the operation/SLA if desired.
LOW_MAX   = 15   # 0-15: low
MED_MAX   = 30   # 15-30: medium
# 30+: high

def label_risk(abs_err: float) -> str:
    if abs_err <= LOW_MAX:
        return "low"
    elif abs_err <= MED_MAX:
        return "medium"
    else:
        return "high"

df["risk_level"] = df["abs_error"].apply(label_risk)

# Optional: Data-driven dynamic thresholds (IQR/percentile)
# Q50, Q75, Q90, etc. Step 10 is disabled because it is out of scope.
# pct50, pct75, pct90 = np.percentile(df["abs_error"], [50, 75, 90])

# PART 18: Priority Score

In [6]:
# Factor maps
risk_map = {"low": 1.0, "medium": 2.0, "high": 3.0}
traffic_map = {
    "Low": 1.0, "Medium": 2.0, "High": 3.0,
    "low": 1.0, "medium": 2.0, "high": 3.0
}
vehicle_map = {
    # Faster vehicles can have a lower coefficient (balancing risk).
    "motorcycle": 1.0,
    "bicycle": 1.4,
    "car": 1.1,
    "scooter": 1.1,
}

In [7]:
# Peak hour factor: According to Order_Period
def peak_factor(period: str) -> float:
    if pd.isna(period):
        return 1.0
    p = str(period).lower()
    if any(k in p for k in ["lunch", "evening", "rush", "peak"]):
        return 2.0
    return 1.0

In [8]:
# Weights (No Step 10; fixed values)
W_RISK    = 0.50
W_TRAFFIC = 0.25
W_PEAK    = 0.15
W_VEHICLE = 0.10

In [9]:
# Mapping columns (apply default if none exist)
df["risk_factor"] = df["risk_level"].map(risk_map).fillna(1.0)
if "Traffic" in df.columns:
    df["traffic_factor"] = df["Traffic"].map(traffic_map).fillna(1.0)
else:
    df["traffic_factor"] = 1.0

if "Vehicle" in df.columns:
    df["vehicle_factor"] = df["Vehicle"].str.lower().map(vehicle_map).fillna(1.2)
else:
    df["vehicle_factor"] = 1.2

if "Order_Period" in df.columns:
    df["peak_factor"] = df["Order_Period"].apply(peak_factor)
else:
    df["peak_factor"] = 1.0

In [10]:
# Score
cols_needed = ["risk_factor","traffic_factor","peak_factor","vehicle_factor"]
if df[cols_needed].isna().any().any():
    raise ValueError("NaN was found in the priority score; check the factor columns.")

df["priority_score"] = (
    W_RISK*df["risk_factor"] +
    W_TRAFFIC*df["traffic_factor"] +
    W_PEAK*df["peak_factor"] +
    W_VEHICLE*df["vehicle_factor"]
)

# PART 19: Clustering & intracluster sorting â€” default OFF

In [11]:
ENABLE_CLUSTERING = False  # If I set it to True, a sample flow will run with KMeans.

if ENABLE_CLUSTERING:
    from sklearn.cluster import KMeans
    from sklearn.neighbors import NearestNeighbors

    # Are the coordinates available?
    for c in ["Drop_Latitude","Drop_Longitude"]:
        if c not in df.columns:
            raise KeyError("The following coordinates are required for clustering: 'Drop_Latitude' and 'Drop_Longitude'.")

    coords = df[["Drop_Latitude","Drop_Longitude"]].values
    n = len(df)
    k = max(2, min(15, round(n/50)))  # a crude intuition

    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    df["cluster_id"] = km.fit_predict(coords)

    # Simple nearest neighbor order within a set (example)
    seq_list = np.zeros(n, dtype=int)
    for cid, grp in df.groupby("cluster_id"):
        idx = grp.index.values
        if len(idx) <= 1:
            seq_list[idx] = 1
            continue
        sub = grp[["Drop_Latitude","Drop_Longitude"]].values
        # Starting point: the point closest to the cluster center
        center = sub.mean(axis=0, keepdims=True)
        start = np.argmin(((sub - center)**2).sum(axis=1))
        # Greedy NN
        visited = [start]
        remain = set(range(len(sub))) - set(visited)
        while remain:
            last = visited[-1]
            dists = ((sub[list(remain)] - sub[last])**2).sum(axis=1)
            nxt_local = list(remain)[int(np.argmin(dists))]
            visited.append(nxt_local)
            remain.remove(nxt_local)
        # Assign the rows
        for order, local_i in enumerate(visited, start=1):
            seq_list[idx[local_i]] = order
    df["seq_in_cluster"] = seq_list

# PART 20: Outputs

In [12]:
# risk_scored.csv: error and risk-focused summary
risk_cols = [
    "_row_id","Order_ID","y_true","y_pred","error","abs_error","risk_level",
    "Traffic","Vehicle","Order_Period","Order_Hour","Area","Category"
]
risk_cols = [c for c in risk_cols if c in df.columns]

df[risk_cols].to_csv(OUT_RISK, index=False)

In [13]:
# smart_routing.csv: priority and (if applicable) coordinates/cluster information
route_cols = [
    "_row_id","Order_ID","y_pred","risk_level","priority_score",
    "Traffic","Vehicle","Order_Period","Order_Hour","Area","Category",
    "Store_Latitude","Store_Longitude","Drop_Latitude","Drop_Longitude",
    "cluster_id","seq_in_cluster"
]
route_cols = [c for c in route_cols if c in df.columns]

df[route_cols].to_csv(OUT_ROUTE, index=False)

In [14]:
# A brief summary
print("\nSummary:")
print(df["risk_level"].value_counts(dropna=False).rename_axis("risk").to_frame("count"))
print("\npriority_score examples:")
print(df[["Order_ID","risk_level","priority_score"]].head(10))


Summary:
        count
risk         
low      4456
medium   2773
high     1474

priority_score examples:
        Order_ID risk_level  priority_score
0  qjzw184742800        low            1.17
1  zgdu330581471     medium            1.67
2  geop858602748        low            1.17
3  psqt401935008     medium            1.67
4  ocbg529160503       high            2.17
5  tvpc165252510     medium            1.67
6  hzkr102840172     medium            1.67
7  qanh567637196     medium            1.67
8  kkni283102635       high            2.17
9  yzod046149213       high            2.17


# Some Additions

In [15]:
# Priority-score sanity check
print('corr(abs_error, priority_score)=', df[['abs_error','priority_score']].corr().iloc[0,1].round(3))
print(df.groupby('Traffic')['priority_score'].mean().round(2))

corr(abs_error, priority_score)= 0.882
Traffic
High       1.36
Jam        1.44
Low        1.29
Medium     1.50
Name: priority_score, dtype: float64
