In [2]:
import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Read the cleaned delivery dataset from disk and preview its first rows.
csv_path = r"D:\Guvi\Projects\mini\Amazon Delivery Time Prediction\Amazon_delivery_time_prediction\data\amazon_delivery_cleaned.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,...,Vehicle,Area,Delivery_Time,Category,Order_DateTime,Pickup_DateTime,Time_To_Pickup,Distance_km,Order_Hour,Order_DayOfWeek
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,...,motorcycle,Urban,120,Clothing,2022-03-19 11:30:00,2022-03-19 11:45:00,15.0,3.020737,11,5
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,...,scooter,Metropolitian,165,Electronics,2022-03-25 19:45:00,2022-03-25 19:50:00,5.0,20.143737,19,4
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,...,motorcycle,Urban,130,Sports,2022-03-19 08:30:00,2022-03-19 08:45:00,15.0,1.549693,8,5
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,...,motorcycle,Metropolitian,105,Cosmetics,2022-04-05 18:00:00,2022-04-05 18:10:00,10.0,7.774497,18,1
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,...,scooter,Metropolitian,150,Toys,2022-03-26 13:30:00,2022-03-26 13:45:00,15.0,6.197898,13,5


### Feature/Target Split

In [4]:
# Columns fed to the models as inputs, in the order they appear in X.
feature_cols = [
    'Agent_Age', 'Agent_Rating', 'Distance_km', 'Time_To_Pickup',
    'Order_Hour', 'Order_DayOfWeek',
    'Weather', 'Traffic', 'Vehicle', 'Area', 'Category',
]
X = df[feature_cols]

# Regression target.
y = df['Delivery_Time']

### Preprocessing Pipeline

In [5]:
# Categorical feature columns; these are one-hot encoded below.
cat_cols = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']

# Every remaining feature column, kept in X's own column order.
# (A set difference would produce an arbitrary order that can differ
# between kernel sessions, making anything built on num_cols unstable.)
num_cols = [col for col in X.columns if col not in cat_cols]

# One-hot encode the categorical columns; categories not seen during fit are
# ignored at transform time instead of raising. All other columns are passed
# through unchanged.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
], remainder='passthrough')

### Train/Test Split

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Define Evaluation Function

In [7]:
def eval_metrics(actual, pred):
    """Score regression predictions against the true target values.

    Args:
        actual: Ground-truth target values.
        pred: Predicted values to compare against ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error, and the R² score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

### Train Models with MLflow Logging

In [8]:
# Candidate regressors, keyed by the name used for the MLflow run.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

mlflow.set_experiment("amazon_delivery_prediction")

# Train each candidate in its own MLflow run, score it on the hold-out split,
# and log the metrics together with the fitted pipeline.
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Clone the preprocessor so every pipeline owns an independent
        # transformer; otherwise each fit would re-fit (and overwrite) the one
        # shared `preprocessor` instance referenced by all pipelines.
        pipe = Pipeline(steps=[
            ('preprocess', clone(preprocessor)),
            ('model', model)
        ])

        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, preds)

        mlflow.log_param("model_type", name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        mlflow.sklearn.log_model(pipe, "model")

        print(f"{name}:\n  RMSE={rmse:.2f}, MAE={mae:.2f}, R²={r2:.2f}\n")



LinearRegression:
  RMSE=33.15, MAE=26.22, R²=0.58





RandomForest:
  RMSE=22.71, MAE=17.41, R²=0.80





GradientBoosting:
  RMSE=24.58, MAE=19.04, R²=0.77



### Save Best Model (Manually Pick or Based on R²)

In [8]:
# Save the best model to disk (manual pick). In the recorded hold-out
# evaluation RandomForest scored best on every metric (R²=0.80, RMSE=22.71,
# MAE=17.41) versus GradientBoosting (R²=0.77, RMSE=24.58, MAE=19.04), so
# RandomForest -- with the same hyperparameters used in the comparison -- is
# refit here on the full dataset. Update this choice if the metrics change.
best_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
best_model.fit(X, y)
joblib.dump(best_model, r"D:\Guvi\Projects\mini\Amazon Delivery Time Prediction\Amazon_delivery_time_prediction\models/best_model.pkl")
print("✅ Best model saved as 'best_model.pkl'")


✅ Best model saved as 'best_model.pkl'


In [9]:
import joblib

### Save Linear Regression Model

In [None]:
# Categorical columns are one-hot encoded; every other column passes through as-is.
cat_cols = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    remainder='passthrough',
)

# Assemble the Linear Regression pipeline and train it on the complete dataset
# (Pipeline.fit returns the fitted pipeline itself, so the call can be chained).
linear_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LinearRegression()),
]).fit(X, y)

# Persist the fitted pipeline.
joblib.dump(
    linear_pipe,
    r'D:\Guvi\Projects\mini\Amazon Delivery Time Prediction\Amazon_delivery_time_prediction\models/linear_regression_model.pkl',
)
print("✅ Linear Regression model saved as 'linear_regression_model.pkl'")

✅ Linear Regression model saved as 'linear_regression_model.pkl'


### Save Random Forest Model

In [12]:
# Random Forest regressor: 100 trees, fixed seed for repeatable training.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Wrap preprocessing and the regressor in one pipeline, then train it on the
# full dataset (fit returns the pipeline, so construction and fitting chain).
pipeline = Pipeline(
    steps=[('preprocessor', preprocessor), ('model', rf_model)]
).fit(X, y)

# Write the fitted pipeline to disk.
rf_model_path = r"D:\Guvi\Projects\mini\Amazon Delivery Time Prediction\Amazon_delivery_time_prediction\models/random_forest_model.pkl"
joblib.dump(pipeline, rf_model_path)
print("✅ Random Forest model saved to models/random_forest_model.pkl")

✅ Random Forest model saved to models/random_forest_model.pkl


### Save Gradient Boosting Model

In [11]:
# Gradient Boosting regressor: 100 boosting stages, learning rate 0.1, fixed seed.
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

# Preprocessing followed by the regressor, combined into a single estimator.
gb_steps = [
    ('preprocessor', preprocessor),
    ('model', gb_model),
]
pipeline = Pipeline(steps=gb_steps)

# Train on the full dataset.
pipeline.fit(X, y)

# Serialize the fitted pipeline to disk.
joblib.dump(
    pipeline,
    r"D:\Guvi\Projects\mini\Amazon Delivery Time Prediction\Amazon_delivery_time_prediction\models/gradient_boosting_model.pkl",
)
print("✅ Gradient Boosting model saved to models/gradient_boosting_model.pkl")

✅ Gradient Boosting model saved to models/gradient_boosting_model.pkl
