In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Load the raw delivery records; the path is relative to the notebook's working directory.
df = pd.read_csv('amazon delivery.csv')

In [None]:
# Feature Engineering: Extract useful time-based features
#
# Both timestamps are built from Order_Date plus a clock time. Parsing
# Pickup_Time on its own (as the previous version did) yields a timestamp whose
# date is not the order's date, which makes pickup_dayofweek meaningless and
# turns the later order-to-pickup difference into a huge offset.
# NOTE(review): this assumes Pickup_Time holds a time of day only, like
# Order_Time -- confirm against the CSV.
order_dt = pd.to_datetime(df['Order_Date'] + ' ' + df['Order_Time'])
pickup_dt = pd.to_datetime(df['Order_Date'] + ' ' + df['Pickup_Time'])

# A pickup clock time earlier than the order clock time means the pickup
# happened after midnight, i.e. on the following calendar day. Comparisons
# involving NaT are False, so rows with a missing timestamp are left untouched.
crossed_midnight = pickup_dt < order_dt
pickup_dt = pickup_dt.mask(crossed_midnight, pickup_dt + pd.Timedelta(days=1))

df['order_datetime'] = order_dt
df['pickup_datetime'] = pickup_dt

df['order_hour'] = df['order_datetime'].dt.hour
df['order_dayofweek'] = df['order_datetime'].dt.dayofweek
# dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are the weekend.
df['order_weekend'] = df['order_dayofweek'].isin([5, 6]).astype(int)

df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_dayofweek'] = df['pickup_datetime'].dt.dayofweek

In [None]:
# Waiting time between placing the order and pickup, expressed in minutes.
pickup_wait = df['pickup_datetime'] - df['order_datetime']
df['order_to_pickup_mins'] = pickup_wait.dt.total_seconds() / 60

# The raw date/time strings and the intermediate timestamps have served their
# purpose; remove them so only the derived numeric features remain.
consumed_time_columns = [
    'Order_Date',
    'Order_Time',
    'Pickup_Time',
    'order_datetime',
    'pickup_datetime',
]
df.drop(columns=consumed_time_columns, inplace=True)

In [None]:
# Expand each categorical column into indicator columns. drop_first=True omits
# the first level of every column, so that level becomes the implicit baseline.
categorical_features = [
    'Traffic',
    'Weather',
    'Vehicle',
    'Area',
]
df = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
)

In [None]:
# Map each distinct 'Category' value to an integer code. The fitted encoder is
# kept in `le` so the codes can be translated back with le.inverse_transform.
le = LabelEncoder()
le.fit(df["Category"])
df["Category"] = le.transform(df["Category"])

In [None]:
from haversine import haversine

def calculate_distance(row):
    """Return the haversine distance between a row's store and drop-off points.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            "Store_Latitude", "Store_Longitude", "Drop_Latitude" and
            "Drop_Longitude".

    Returns:
        Whatever ``haversine`` returns for the two (latitude, longitude) pairs.
    """
    store_point = (row["Store_Latitude"], row["Store_Longitude"])
    drop_point = (row["Drop_Latitude"], row["Drop_Longitude"])
    return haversine(store_point, drop_point)

In [None]:
# Remove stray leading/trailing whitespace from column names so the dummy
# columns referenced below can be addressed by their clean names.
df.columns = df.columns.map(str.strip)

# Store-to-drop distance, computed row by row.
df["distance_km"] = df.apply(calculate_distance, axis=1)

# Interaction terms: distance counted only for rows flagged as traffic jam /
# delivered by van (the indicator is 0/False everywhere else).
trip_distance = df["distance_km"]
df["distance_traffic"] = trip_distance * df["Traffic_Jam"]
df["vehicle_distance"] = trip_distance * df["Vehicle_van"]

In [None]:
# Summarise column dtypes and non-null counts. DataFrame.info() prints its
# report itself; the previous `print(df.info)` never called the method and
# printed the bound-method repr (effectively dumping the frame) instead.
df.info()

In [None]:
# Display the current column index (after encoding and feature engineering).
df.columns

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Split deliveries into two groups by clustering the one-dimensional
# Delivery_Time values; the groups are interpreted as on-time vs. late.
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
delivery_times = df[["Delivery_Time"]]  # 2-D input, as KMeans expects
df["delay_cluster"] = kmeans.fit_predict(delivery_times)

# K-Means cluster ids are arbitrary, so identify the "late" cluster as the one
# whose members have the larger mean Delivery_Time.
cluster_means = df.groupby("delay_cluster")["Delivery_Time"].mean()
late_cluster = cluster_means.idxmax()
df["is_late"] = df["delay_cluster"].eq(late_cluster).astype(int)

# Class balance of the derived label (0 = on-time, 1 = late).
late_counts = df["is_late"].value_counts()
print(late_counts)



In [None]:
# Keep an untouched snapshot of the engineered frame.
df_original = df.copy()

# Everything derived from the delivery duration is removed from the features:
# the duration itself, the cluster id and the label built from it.
target_related_columns = ["Delivery_Time", "delay_cluster", "is_late"]
X = df.drop(columns=target_related_columns)
y = df["is_late"]  # 0 (on-time) or 1 (late)

# 80/20 hold-out split, stratified so both parts keep the same late ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# 1. LightGBM classifier with hand-picked hyperparameters (baseline model).
baseline_lgbm_params = {
    "n_estimators": 1500,
    "learning_rate": 0.03,
    "max_depth": 10,
    "num_leaves": 50,
}
lgbm_clf = lgb.LGBMClassifier(**baseline_lgbm_params)
lgbm_clf.fit(X_train, y_train)

# Hard class predictions for the hold-out set.
y_pred = lgbm_clf.predict(X_test)

In [None]:
# Hold-out accuracy followed by the per-class precision / recall / F1 table.
baseline_accuracy = accuracy_score(y_test, y_pred)
print("LGBM Classification Accuracy:", baseline_accuracy)

baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

In [None]:
#2.optuna to optimize LGBM
import optuna
from sklearn.model_selection import cross_val_score

# Settings that are held constant instead of being searched.
# LightGBM ignores bagging_fraction unless bagging_freq is non-zero, so without
# this the tuned "bagging_fraction" would have no effect on the model.
fixed_lgbm_params = {"bagging_freq": 1}


def objective(trial):
    """Score one Optuna trial by cross-validated accuracy on the training set.

    The previous version fitted on the training set and scored on
    X_test / y_test. That selects hyperparameters using the hold-out data, so
    the accuracy reported on that same data afterwards is optimistically
    biased. Cross-validating inside the training set keeps the test set unseen
    until the final evaluation.

    Args:
        trial: The optuna trial used to sample hyperparameters.

    Returns:
        Mean accuracy over 3 cross-validation folds of (X_train, y_train).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 1.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 1.0),
    }

    model = lgb.LGBMClassifier(**params, **fixed_lgbm_params)

    # For classifiers an integer cv uses stratified folds.
    fold_scores = cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy")
    return fold_scores.mean()


# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Best parameters: the searched values plus the fixed settings, so that
# best_params alone reproduces the configuration that was actually evaluated.
best_params = {**study.best_params, **fixed_lgbm_params}
print("Best Hyperparameters:", best_params)

# Train final model on the full training set
lgbm_clf = lgb.LGBMClassifier(**best_params)
lgbm_clf.fit(X_train, y_train)

# Evaluate once on the hold-out set, which played no part in the search
y_pred = lgbm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Optimized Classification Accuracy:", accuracy)


In [None]:
# Recap of the tuned LightGBM model: chosen hyperparameters, hold-out accuracy
# and the per-class metric table.
tuned_report = classification_report(y_test, y_pred)
for label, value in (
    ("Best Hyperparameters:", best_params),
    ("Optimized Classification Accuracy:", accuracy),
):
    print(label, value)
print(tuned_report)

In [None]:
#3.Stacking Model Classifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# Level-0 learners. The two boosted models share the same number of trees,
# learning rate and depth; only their library-specific settings differ.
shared_boosting_kwargs = dict(n_estimators=1500, learning_rate=0.03, max_depth=10)
lgbm_base = lgb.LGBMClassifier(num_leaves=50, **shared_boosting_kwargs)
xgb_base = xgb.XGBClassifier(colsample_bytree=0.8, **shared_boosting_kwargs)
rf_base = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42)

base_classifiers = [
    ('lgbm', lgbm_base),
    ('xgb', xgb_base),
    ('rf', rf_base),
]

# Level-1 learner: a logistic regression fitted on the base models' outputs.
stacking_clf = StackingClassifier(
    estimators=base_classifiers,
    final_estimator=LogisticRegression(),
)

# Fit the whole ensemble on the training split.
stacking_clf.fit(X_train, y_train)

# Hold-out predictions and accuracy.
y_pred_stack = stacking_clf.predict(X_test)
accuracy_stack = accuracy_score(y_test, y_pred_stack)
print("Stacking Model Accuracy:", accuracy_stack)


In [None]:
# Stacking ensemble: hold-out accuracy plus per-class precision / recall / F1.
stacking_report = classification_report(y_test, y_pred_stack)
print("Stacking Model Accuracy:", accuracy_stack)
print(stacking_report)

In [None]:
#4.XGBoost Classifier
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Stand-alone XGBoost classifier configured like the one used in the stack.
xgb_settings = {
    "n_estimators": 1500,
    "learning_rate": 0.03,
    "max_depth": 10,
    "colsample_bytree": 0.8,
}
xgb_clf = xgb.XGBClassifier(**xgb_settings)
xgb_clf.fit(X_train, y_train)

# Hard class predictions on the hold-out set.
y_pred_xgb = xgb_clf.predict(X_test)

# Share of hold-out rows classified correctly.
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Classification Accuracy:", accuracy_xgb)

In [None]:
#5.Adjusting threshold Classification
from sklearn.metrics import precision_recall_curve

import numpy as np

# Get predicted probabilities for the positive ("late") class
y_probs = xgb_clf.predict_proba(X_test)[:, 1]

# Find optimal threshold
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

# precision_recall_curve returns one more precision/recall value than
# thresholds: the final (precision=1, recall=0) point has no threshold. Drop it
# so that an argmax over the scores is always a valid index into `thresholds`.
threshold_precisions = precisions[:-1]
threshold_recalls = recalls[:-1]

# Use F1 (the harmonic mean) as the balance criterion. A plain sum of precision
# and recall can be maximised by a lopsided point such as recall=1 with poor
# precision. Points where both are 0 get an F1 of 0 rather than a 0/0 NaN.
pr_sum = threshold_precisions + threshold_recalls
f1_scores = np.divide(
    2 * threshold_precisions * threshold_recalls,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
optimal_idx = f1_scores.argmax()
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.2f}")

# Apply new threshold. The curve counts a sample as positive when its score is
# >= the threshold, so use the same inclusive comparison here.
# NOTE(review): the threshold is selected on the same hold-out set it is then
# scored on, so this accuracy is optimistic -- consider choosing the threshold
# on a separate validation split.
y_pred_adjusted = (y_probs >= optimal_threshold).astype(int)
accuracy_adjusted = accuracy_score(y_test, y_pred_adjusted)
print("Adjusted Threshold Accuracy:", accuracy_adjusted)

In [None]:
#delivery time
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Restrict to the rows labelled late by the clustering step.
late_mask = df["is_late"] == 1
late_deliveries = df[late_mask]

# Features: everything except the duration and the two columns derived from it.
# Target: the actual Delivery_Time of each late delivery.
regression_excluded = ["Delivery_Time", "delay_cluster", "is_late"]
X_late = late_deliveries.drop(columns=regression_excluded)
y_late = late_deliveries["Delivery_Time"]

# 80/20 hold-out split of the late deliveries.
X_train_late, X_test_late, y_train_late, y_test_late = train_test_split(
    X_late,
    y_late,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted regression trees.
gbm_regressor = GradientBoostingRegressor(
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=8,
)
gbm_regressor.fit(X_train_late, y_train_late)

# Predicted durations for the held-out late deliveries.
y_pred_late = gbm_regressor.predict(X_test_late)

# Error metrics: mean absolute error and root mean squared error.
mae = mean_absolute_error(y_test_late, y_pred_late)
mse = mean_squared_error(y_test_late, y_pred_late)
rmse = np.sqrt(mse)

print(f"MAE (Late Deliveries): {mae:.2f} min")
print(f"RMSE (Late Deliveries): {rmse:.2f} min")


In [None]:
# List the feature columns of the classification training split.
print(X_train.columns)  # Check column list again

In [None]:
# Calculate expected delivery time based on key factors (store, vehicle, distance)
expected_times = df[df["is_late"] == 0].groupby(["Store_Latitude", "Store_Longitude", 
                                                 "Drop_Latitude", "Drop_Longitude", 
                                                 "Vehicle_scooter", "Vehicle_van", "distance_km"])["Delivery_Time"].mean().reset_index()

# Merge with main dataset
df = df.merge(expected_times, on=["Store_Latitude", "Store_Longitude", 
                                  "Drop_Latitude", "Drop_Longitude", 
                                  "Vehicle_scooter", "Vehicle_van", "distance_km"], 
              how="left", suffixes=("", "_expected"))

# Calculate delay duration
df["delay_duration"] = df["Delivery_Time"] - df["Delivery_Time_expected"]

# Print to confirm
print(df[["Delivery_Time", "Delivery_Time_expected", "delay_duration"]].head(10))


In [None]:
from sklearn.ensemble import RandomForestRegressor

# This cell (and the ones after it) address columns by lower-case name, but at
# this point the frame still carries the original casing ("Delivery_Time",
# "Delivery_Time_expected", ...), which raised KeyError on a fresh
# Restart & Run All. Normalise the names first; this is a no-op when the
# columns are already lower-case.
df.rename(columns=lambda x: x.strip().lower(), inplace=True)

# Columns that must never be used as features when predicting the expected
# delivery time: the target itself and the two columns it is combined with.
expected_target_cols = ["delivery_time_expected", "delivery_time", "delay_duration"]

# Step 1: Fill missing "delivery_time_expected"
has_expected = df["delivery_time_expected"].notnull()
train_data = df[has_expected]
test_data = df[~has_expected]

# Features (exclude target variables)
X_train_fill = train_data.drop(columns=expected_target_cols)
y_train_fill = train_data["delivery_time_expected"]

# Train Random Forest Model for Expected Delivery Time
fill_model_expected = RandomForestRegressor(n_estimators=100, random_state=42)
fill_model_expected.fit(X_train_fill, y_train_fill)

# Predict missing values for delivery_time_expected. predict() rejects an
# empty frame, so only call it when there is something to fill.
if not test_data.empty:
    df.loc[~has_expected, "delivery_time_expected"] = fill_model_expected.predict(
        test_data.drop(columns=expected_target_cols)
    )

# Step 2: Calculate "delay_duration" after filling "delivery_time_expected"
df["delay_duration"] = df["delivery_time"] - df["delivery_time_expected"]

# Step 3: Fill any "delay_duration" that is still missing using another
# Random Forest model. After Step 1 every row has an expected time, so rows
# only remain here if "delivery_time" itself is missing.
train_data_delay = df[df["delay_duration"].notnull()]
test_data_delay = df[df["delay_duration"].isnull()]

# NOTE(review): these features still contain delivery_time and
# delivery_time_expected, whose difference is exactly the target -- confirm
# that this is intended.
X_train_delay = train_data_delay.drop(columns=["delay_duration"])
y_train_delay = train_data_delay["delay_duration"]

# Train Random Forest Model for Delay Duration
fill_model_delay = RandomForestRegressor(n_estimators=100, random_state=42)
fill_model_delay.fit(X_train_delay, y_train_delay)

# Predict missing values for delay_duration (skipped when nothing is missing,
# because predict() raises on an empty frame).
if not test_data_delay.empty:
    df.loc[df["delay_duration"].isnull(), "delay_duration"] = fill_model_delay.predict(
        test_data_delay.drop(columns=["delay_duration"])
    )


In [None]:
# Inspect the rows that had no delay_duration before the fill step: how many
# there are, and which of their columns contain missing values.
print(test_data_delay.shape)  # Check if test_data_delay has any rows
print(test_data_delay.isnull().sum())  # Verify missing values

In [None]:
# Remaining missing values per column across the whole frame.
print(df.isnull().sum())  # Count NaN values in each column

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
# Step 1: keep only the deliveries labelled late (is_late == 1).
is_late_row = df["is_late"] == 1
late_deliveries = df[is_late_row]

# Step 2: the target is the delay; the features are everything except the
# actual duration, the delay itself and the late flag.
delay_excluded = ["delivery_time", "delay_duration", "is_late"]
X_late = late_deliveries.drop(columns=delay_excluded)
y_late = late_deliveries["delay_duration"]

# Step 3: 80/20 hold-out split.
X_train_late, X_test_late, y_train_late, y_test_late = train_test_split(
    X_late,
    y_late,
    test_size=0.2,
    random_state=42,
)

# Step 4: LightGBM regressor with many shallow trees and a small step size.
delay_model_params = {
    "n_estimators": 2716,
    "learning_rate": 0.0217,
    "max_depth": 3,
}
delay_model = lgb.LGBMRegressor(**delay_model_params)
delay_model.fit(X_train_late, y_train_late)

# Step 5: predicted delays for the held-out late deliveries.
y_pred_late = delay_model.predict(X_test_late)

# Step 6: mean absolute error and root mean squared error.
mae = mean_absolute_error(y_test_late, y_pred_late)
mse = mean_squared_error(y_test_late, y_pred_late)
rmse = np.sqrt(mse)

print(f"\nDelay Duration Prediction Model Performance:")
print(f"MAE (Late Deliveries): {mae:.2f} min")
print(f"RMSE (Late Deliveries): {rmse:.2f} min")

In [None]:
# Normalise every column name in place: trim surrounding whitespace and
# convert to lower-case.
df.columns = [column_name.strip().lower() for column_name in df.columns]
print(df.columns)  # Confirm the normalised names

In [None]:
# Preview the first rows. Column names were lower-cased by the rename above, so
# the merged expected-time column appears as "delivery_time_expected".
print(df.head())  # Check that the expected-time column is actually there

In [None]:
import shap

# Explain the tuned LightGBM classifier that was fitted on X_train earlier.
# The previous code built `lgb.LGBMClassifier(best_params)`, which passes the
# dict as the first positional argument instead of unpacking it and, more
# importantly, never fits the model -- TreeExplainer needs a fitted model.
model = lgbm_clf

# Create a SHAP explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Depending on the installed shap version, a binary classifier yields either a
# list with one array per class, a 3-D array whose last axis is the class, or a
# single 2-D array for the positive class. Reduce all of them to the 2-D
# (rows x features) array for class 1 ("late").
if isinstance(shap_values, list):
    shap_values_late = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    shap_values_late = shap_values[:, :, 1]
else:
    shap_values_late = shap_values

# Summary plot
shap.summary_plot(shap_values_late, X_test, feature_names=X.columns)

# Dependence plots for key features. X / X_test were created before the column
# names were lower-cased, so the traffic indicator is still "Traffic_Jam".
shap.dependence_plot("distance_km", shap_values_late, X_test, feature_names=X.columns)
shap.dependence_plot("Traffic_Jam", shap_values_late, X_test, feature_names=X.columns)