In [1]:
import pandas as pd
from geopy.distance import geodesic

In [2]:
# Raw delivery dataset; note this is an absolute, machine-specific Windows path.
raw_csv_path = r"D:\Guvi\Projects\mini\Amazon Delivery Time Prediction\Amazon_delivery_time_prediction\data\amazon_delivery.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys


### Convert to DateTime (with error handling)

In [3]:
# Combine the date column with each time-of-day column into full timestamps.
# errors='coerce' turns unparseable values into NaT instead of raising.
df['Order_DateTime'] = pd.to_datetime(df['Order_Date'] + ' ' + df['Order_Time'], errors='coerce')
df['Pickup_DateTime'] = pd.to_datetime(df['Order_Date'] + ' ' + df['Pickup_Time'], errors='coerce')

# Drop rows with failed datetime parsing; keep an independent copy so the
# in-place correction below operates on its own data.
df = df.dropna(subset=['Order_DateTime', 'Pickup_DateTime']).copy()

# Pickup_Time carries no date of its own, so it was stamped with Order_Date.
# When an order is placed shortly before midnight and picked up after it, the
# pickup timestamp ends up earlier than the order timestamp (off by one day),
# which would make any pickup-minus-order duration strongly negative.
# Roll those pickups forward to the following calendar day.
crossed_midnight = df['Pickup_DateTime'] < df['Order_DateTime']
df.loc[crossed_midnight, 'Pickup_DateTime'] += pd.Timedelta(days=1)

In [6]:
# Show the current (rows, columns) of the working frame.
row_count, column_count = df.shape
print((row_count, column_count))

(43648, 18)


### Feature Engineering – Time to Pickup

In [7]:
# Elapsed time between placing the order and pickup, expressed in minutes.
pickup_delay = df['Pickup_DateTime'] - df['Order_DateTime']
df['Time_To_Pickup'] = pickup_delay.dt.total_seconds().div(60)

### Feature Engineering – Geodesic Distance (Store to Drop-off, km)

In [8]:
def _store_to_drop_km(order_row):
    """Return the geodesic distance, in kilometres, between a row's store and drop coordinates."""
    store_point = (order_row['Store_Latitude'], order_row['Store_Longitude'])
    drop_point = (order_row['Drop_Latitude'], order_row['Drop_Longitude'])
    return geodesic(store_point, drop_point).km


# axis=1 passes each row to the helper, giving one distance per order.
df['Distance_km'] = df.apply(_store_to_drop_km, axis=1)

### Extract Time-Based Features

In [9]:
# Derive calendar features from the order timestamp.
order_timestamp = df['Order_DateTime']
df['Order_Hour'] = order_timestamp.dt.hour
# .dt.weekday is an alias of .dt.dayofweek (Monday=0 ... Sunday=6).
df['Order_DayOfWeek'] = order_timestamp.dt.weekday

### Final Cleanup – Drop Remaining Missing Values

In [10]:
# Discard rows lacking an agent rating or a weather value, then renumber the
# index from 0 so it is contiguous again.
required_columns = ['Agent_Rating', 'Weather']
df = df.dropna(subset=required_columns).reset_index(drop=True)

# Preview the engineered columns alongside the originals.
df.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,...,Vehicle,Area,Delivery_Time,Category,Order_DateTime,Pickup_DateTime,Time_To_Pickup,Distance_km,Order_Hour,Order_DayOfWeek
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,...,motorcycle,Urban,120,Clothing,2022-03-19 11:30:00,2022-03-19 11:45:00,15.0,3.020737,11,5
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,...,scooter,Metropolitian,165,Electronics,2022-03-25 19:45:00,2022-03-25 19:50:00,5.0,20.143737,19,4
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,...,motorcycle,Urban,130,Sports,2022-03-19 08:30:00,2022-03-19 08:45:00,15.0,1.549693,8,5
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,...,motorcycle,Metropolitian,105,Cosmetics,2022-04-05 18:00:00,2022-04-05 18:10:00,10.0,7.774497,18,1
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,...,scooter,Metropolitian,150,Toys,2022-03-26 13:30:00,2022-03-26 13:45:00,15.0,6.197898,13,5


### Save Preprocessed Data

In [None]:
# Output location for the cleaned dataset (kept as a comment: a bare path is
# not valid Python and would abort a Restart & Run All):
# D:\Guvi\Projects\mini\Amazon Delivery Time Prediction\Amazon_delivery_time_prediction\data\amazon_delivery_cleaned.csv

In [14]:
# Destination for the preprocessed dataset; absolute, machine-specific Windows path.
cleaned_csv_path = r"D:\Guvi\Projects\mini\Amazon Delivery Time Prediction\Amazon_delivery_time_prediction\data\amazon_delivery_cleaned.csv"

# index=False leaves the DataFrame's row index out of the written file.
df.to_csv(cleaned_csv_path, index=False)
print("✅ Cleaned dataset saved to 'amazon_delivery_cleaned.csv'")

✅ Cleaned dataset saved to 'amazon_delivery_cleaned.csv'
