In [None]:
!pip install geopy
!pip install folium



In [None]:
# Import required libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import os
import folium
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from geopy.distance import geodesic
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split



warnings.filterwarnings('ignore')



In [None]:
# Load the raw dataset; "train.csv" is resolved relative to the notebook's working directory.
data = pd.read_csv("train.csv")

In [None]:
# Data cleaning:

# 1. Strip the '(min)' marker (and any whitespace right after it) from the
#    target column, then cast what is left to int.
time_taken_text = data['Time_taken(min)'].str.replace(r'\(min\)\s*', '', regex=True)
data['Time_taken(min)'] = time_taken_text.astype(int)

# 2. Drop the literal 'conditions ' prefix from the weather labels.
data['Weatherconditions'] = data['Weatherconditions'].str.replace('conditions ', '')

# 3. Parse the numeric columns; entries that cannot be parsed become NaN.
for numeric_col in ('Delivery_person_Age', 'Delivery_person_Ratings', 'multiple_deliveries'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

# 4. Cast the label-like columns to pandas' 'category' dtype, one column at a time.
categorical_columns = [
    'Weatherconditions', 'Road_traffic_density', 'Type_of_order',
    'Type_of_vehicle', 'Festival', 'City',
]
for cat_col in categorical_columns:
    data[cat_col] = data[cat_col].astype('category')

# 5. Replace every restaurant / delivery coordinate by its absolute value,
#    so negative latitudes and longitudes become positive.
for coord_col in (
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
):
    data[coord_col] = data[coord_col].abs()

# 6. Parse the order / pickup time columns as datetimes; unparseable values become NaT.
for time_col in ('Time_Orderd', 'Time_Order_picked'):
    data[time_col] = pd.to_datetime(data[time_col], errors='coerce')

# 7. Per-column count of missing entries: true NaN/NaT cells plus cells that
#    hold the literal text 'NaN' or 'NaN ' (with a trailing space).
nan_cell_counts = data.isna().sum()
nan_text_counts = ((data == 'NaN') | (data == 'NaN ')).sum()
missing_value = nan_cell_counts + nan_text_counts
print(missing_value)

# Columns examined when removing rows that have exactly 3 blank values.
cols_to_check = [
    'Delivery_person_Age', 'Delivery_person_Ratings', 'Time_Orderd',
    'Weatherconditions', 'Road_traffic_density', 'multiple_deliveries',
    'Festival', 'City',
]

def is_blank(val):
    """Return True when `val` is missing or is the text 'nan'.

    The text comparison ignores case and surrounding whitespace, so values
    such as 'NaN ' count as blank.
    """
    if pd.isna(val):
        return True
    as_text = str(val).strip().lower()
    return as_text == 'nan'
# Count the blank cells per row across cols_to_check, then keep only the
# rows whose blank count is not exactly 3.
blank_counts = data[cols_to_check].applymap(is_blank).sum(axis=1)
rows_with_3_blank = blank_counts == 3
data = data[~rows_with_3_blank]



In [None]:
#8. Distance between restaurant and delivery location (in km)
def calculate_distance(row):
    """Return the geodesic distance, in kilometres, between the restaurant
    coordinates and the delivery coordinates stored in `row`."""
    origin = (row['Restaurant_latitude'], row['Restaurant_longitude'])
    destination = (row['Delivery_location_latitude'], row['Delivery_location_longitude'])
    separation = geodesic(origin, destination)
    return separation.km
# Row-wise distance feature.
data['Delivery_distance_km'] = data.apply(calculate_distance, axis=1)

#9. New column 'Order_to_Pickup_Duration' (minutes between order and pickup).
# A negative gap is shifted forward by one day — presumably a pickup that
# happened after midnight; confirm against the raw data.
zero_gap = pd.Timedelta(0)
one_day = pd.Timedelta(days=1)
duration = data['Time_Order_picked'] - data['Time_Orderd']
is_non_negative = duration >= zero_gap
duration = duration.where(is_non_negative, duration + one_day)

data['Order_to_Pickup_Duration'] = duration.dt.total_seconds() / 60

In [None]:
# Step 1: Keep only the restaurant coordinate columns
restaurant_data = data[['Restaurant_latitude', 'Restaurant_longitude']]

# Step 2: Base map centred on India
india_center = [20.5937, 78.9629]
map_all_restaurants = folium.Map(location=india_center, zoom_start=5)

# Step 3: One blue circle marker per restaurant coordinate pair
restaurant_points = zip(
    restaurant_data['Restaurant_latitude'],
    restaurant_data['Restaurant_longitude'],
)
for lat, lon in restaurant_points:
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color='blue',
        fill=True,
        fill_opacity=0.6,
    )
    marker.add_to(map_all_restaurants)

# Step 4: Show the map inline (the commented lines would save it as HTML instead)
map_all_restaurants
#map_all_restaurants.save("all_restaurants_map.html")
#print("✅ Map saved as 'all_restaurants_map.html'")


In [None]:

# Step 1: Keep only the delivery-location coordinate columns
delivery_data = data[['Delivery_location_latitude', 'Delivery_location_longitude']]

# Step 2: Base map centred on India
india_center = [20.5937, 78.9629]
map_all_delivery_locations = folium.Map(location=india_center, zoom_start=5)

# Step 3: One red circle marker per delivery coordinate pair
delivery_points = zip(
    delivery_data['Delivery_location_latitude'],
    delivery_data['Delivery_location_longitude'],
)
for lat, lon in delivery_points:
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color='red',
        fill=True,
        fill_opacity=0.6,
    )
    marker.add_to(map_all_delivery_locations)

# Step 4: Show the map inline (the commented lines would save it as HTML instead)
map_all_delivery_locations
#map_all_delivery_locations.save("map_all_delivery_locations.html")
#print("✅ Map saved as 'map_all_delivery_locations.html'")

In [None]:
# Continuous / count features (these are the columns standardised before modelling).
numerical_vars = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Delivery_distance_km',
    'multiple_deliveries',
    'Order_to_Pickup_Duration',
]

# Label-like features.
categorical_vars = [
    'Vehicle_condition',
    'Weatherconditions',
    'Type_of_order',
    'Type_of_vehicle',
    'Festival',
    'City',
    'Road_traffic_density',
]

# Target variable: the cleaned 'Time_taken(min)' column.
y = data['Time_taken(min)']

In [None]:
# Histogram of every numerical and categorical variable on a 4x3 grid
# (missing values are dropped per variable before plotting).
all_vars = numerical_vars + categorical_vars
plt.figure(figsize=(15, 10))
for position, var in enumerate(all_vars, start=1):
    plt.subplot(4, 3, position)
    observed = data[var].dropna()
    sns.histplot(observed, kde=False, bins=30, color='skyblue')
    plt.title(f'Distribution of {var}')
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric columns only
corr_matrix = data.corr(numeric_only=True)

# Annotated heatmap of the full matrix
plt.figure(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    linewidths=0.5,
)
plt.title("Correlation Heatmap of Features", fontsize=14)

# Tilt the x labels and keep the y labels horizontal for readability
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Correlation of each numeric feature with the target, sorted ascending
# (the target's correlation with itself is removed first).
target_column = corr_matrix['Time_taken(min)']
target_corr = target_column.drop('Time_taken(min)').sort_values(ascending=True)

# Horizontal bar chart, one bar per feature
plt.figure(figsize=(10, 6))
sns.barplot(x=target_corr.values, y=target_corr.index, palette='coolwarm')
plt.xlabel("Correlation Coefficient")
plt.ylabel("Features")
plt.title("Feature Correlation with Delivery Time")
plt.tight_layout()
plt.show()

In [None]:
# Based on the correlation figure above, fill the missing values of
# 'Delivery_person_Ratings', 'Festival' and 'City' with each column's mode
# (most frequent value).
#
# The filled Series is assigned back to the column rather than calling
# `data[col].fillna(..., inplace=True)`: the inplace form operates on a
# temporary Series obtained by chained indexing, which pandas 2.x flags as
# deprecated and which does not update `data` under Copy-on-Write.

mode_rating = data['Delivery_person_Ratings'].mode()[0]
data['Delivery_person_Ratings'] = data['Delivery_person_Ratings'].fillna(mode_rating)

for label_col in ('Festival', 'City'):
    # Work on plain strings: trim whitespace so 'NaN ' and 'NaN' both become
    # real missing values before the mode is computed.
    trimmed = data[label_col].astype(str).str.strip().replace('NaN', np.nan)
    data[label_col] = trimmed.fillna(trimmed.mode()[0])

In [None]:
# Turn the remaining 'NaN' / 'NaN ' strings into real missing values, drop
# every row that still has a missing value, and print the fraction of rows
# removed (the original author noted this was about 4%).
original_rows = len(data)

data_cleaned = data.replace(['NaN', 'NaN '], np.nan).dropna()

cleaned_rows = len(data_cleaned)
deleted_rows = original_rows - cleaned_rows
print(deleted_rows/original_rows)

data = data_cleaned

In [None]:

# Ordinal encoding for 'Road_traffic_density'.
# The labels keep their trailing space exactly as listed here; the integer
# code is the position in this list (a value not in the list gets code -1).
categories = ['Low ', 'Medium ', 'High ', 'Jam ']
traffic_levels = pd.Categorical(
    data['Road_traffic_density'], categories=categories, ordered=True
)
data['Rd_traffic_density'] = traffic_levels.codes

# One-hot encode the remaining label columns; drop_first=True removes one
# level per column to avoid multicollinearity.
one_hot_cols = ['Weatherconditions', 'Type_of_order', 'Type_of_vehicle', 'Festival', 'City']
data = pd.get_dummies(data, columns=one_hot_cols, drop_first=True)

# The text column is now replaced by 'Rd_traffic_density'.
data = data.drop(columns='Road_traffic_density')


In [None]:
# Indicator columns produced by the one-hot encoding; they are cast to
# integers (0/1) below. Several names end with a space, matching the labels.
boolean_columns = [
    # weather
    'Weatherconditions_Fog',
    'Weatherconditions_Sandstorms',
    'Weatherconditions_Stormy',
    'Weatherconditions_Sunny',
    'Weatherconditions_Windy',
    # order type
    'Type_of_order_Drinks ',
    'Type_of_order_Meal ',
    'Type_of_order_Snack ',
    # vehicle type
    'Type_of_vehicle_electric_scooter ',
    'Type_of_vehicle_motorcycle ',
    'Type_of_vehicle_scooter ',
    # festival / city
    'Festival_Yes',
    'City_Semi-Urban',
    'City_Urban',
]

indicator_ints = data[boolean_columns].astype(int)
data[boolean_columns] = indicator_ints


In [None]:
# Build the model inputs: y is the target, X is every column except the
# target, the identifiers, the raw coordinates and the raw date/time columns.
non_feature_columns = [
    'Time_taken(min)',               # target
    'ID',                            # unique identifier
    'Delivery_person_ID',            # personal identifier
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
    'Order_Date',
    'Time_Orderd',
    'Time_Order_picked',
]

y = data['Time_taken(min)']
X = data.drop(columns=non_feature_columns)

# Report the shapes
print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:
# 80/20 train/test split, stratified on the target values, with a fixed seed.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Standardise the numerical columns. The scaler is fitted on the training
# rows only; the test rows are transformed with the training statistics.
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[numerical_vars]),
    index=X_train.index,
    columns=numerical_vars,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[numerical_vars]),
    index=X_test.index,
    columns=numerical_vars,
)

# Swap the raw numerical columns for their scaled versions. The scaled
# columns are appended on the right, so the column order of X_train / X_test
# differs from that of X after this cell.
X_train = pd.concat([X_train.drop(columns=numerical_vars), X_train_scaled], axis=1)
X_test = pd.concat([X_test.drop(columns=numerical_vars), X_test_scaled], axis=1)


In [None]:
#1. k-NN regressor tuned by 5-fold cross-validated grid search over n_neighbors
param_knn = {'n_neighbors': [5, 10, 15, 20, 25]}
knn = KNeighborsRegressor()

grid_knn = GridSearchCV(estimator=knn, param_grid=param_knn, cv=5)
grid_knn.fit(X_train, y_train)

# (1) the best k chosen
print(grid_knn.best_params_)

# (2) score of the best model on the test set
#     (the estimator's default score, which is R^2 for scikit-learn regressors)
print(grid_knn.score(X_test, y_test))

# (3) mean cross-validation score of the best model
print(grid_knn.best_score_)

In [None]:
# Predict on the test set with the best k-NN model found by the grid search.
y_pred = grid_knn.predict(X_test)

# Predicted vs actual; the dashed red diagonal marks perfect predictions.
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.5, edgecolor='k')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Time Taken')
plt.ylabel('Predicted Time Taken')
plt.title('KNN Regression: Predicted vs Actual')
plt.grid(True)
plt.tight_layout()
plt.show()

# RMSE is computed as sqrt(MSE) instead of passing squared=False, because
# that keyword was deprecated and later removed from mean_squared_error in
# newer scikit-learn releases; sqrt(MSE) works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")

In [None]:

# 2. Decision tree regressor tuned by 5-fold grid search over max_depth (2..19)
dt_params = {'max_depth':  range(2,20) }
opt_tree = DecisionTreeRegressor(random_state=0)

grid_tree = GridSearchCV(estimator=opt_tree, param_grid=dt_params, cv=5)
grid_tree.fit(X_train, y_train)

# (1) the best max_depth chosen
best_depth = grid_tree.best_params_['max_depth']
print(best_depth)
# (2) score of the best model on the test set (R^2 for scikit-learn regressors)
print(grid_tree.score(X_test, y_test))
# (3) mean cross-validation score of the best model
print(grid_tree.best_score_)



In [None]:
# Horizontal bar chart of the best tree's feature importances, labelled with
# the columns the tree was fitted on.
fitted_tree = grid_tree.best_estimator_
importances = fitted_tree.feature_importances_
plt.barh(y=X_train.columns, width=importances)
plt.title("Feature Importances")
plt.show()

In [None]:
# Draw the best decision tree found by the grid search.
best_tree = grid_tree.best_estimator_

plt.figure(figsize=(20, 10))
# feature_names must follow the column order the tree was fitted on, i.e.
# X_train. X has a different order (the scaled numerical columns were moved
# to the end of X_train), so using X.columns would mislabel the split nodes.
plot_tree(best_tree,
          filled=True,
          feature_names=list(X_train.columns),
          rounded=True,
          fontsize=10)

plt.title("Best Decision Tree from GridSearch")
plt.show()

In [None]:
# Test-set predictions from the best decision tree.
y_pred = best_tree.predict(X_test)

# Predicted vs actual; the dashed red diagonal marks perfect predictions.
actual_range = [y_test.min(), y_test.max()]
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.title("Prediction vs Actual")
plt.xlabel("Actual Time Taken")
plt.ylabel("Predicted Time Taken")
plt.plot(actual_range, actual_range, 'r--')
plt.show()

# Mean squared error on the test set.
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.2f}")

In [None]:
# XG Boost
!pip install xgboost

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Rebuild X and y for the XGBoost section: only the target and the two
# identifier columns are removed here.
y = data['Time_taken(min)']
X = data.drop(columns=['Time_taken(min)', 'ID', 'Delivery_person_ID'])

# 80/20 train-test split with a fixed seed (no stratification)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Inspect the dtype of every training column.
print(X_train.dtypes)

In [None]:
# Remove the raw date/time columns BEFORE encoding and splitting.
# Order_Date is never converted from its CSV dtype in this notebook, so
# (assuming it is read as text) get_dummies would expand it into one indicator
# column per distinct date; the later X.drop(..., errors='ignore') calls then
# find no 'Order_Date' column and never touch X_train / X_test anyway.
X = X.drop(columns=['Order_Date', 'Time_Orderd', 'Time_Order_picked'], errors='ignore')

# One-hot encode any remaining non-numeric columns, then split 80/20.
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# List the training columns that have a datetime64 dtype.
print(X_train.select_dtypes(include='datetime64').columns)

In [None]:
# Drop the raw date/time columns from X if they are present (errors='ignore'
# skips missing names). This rebinds X only; X_train / X_test are not re-split here.
X = X.drop(['Order_Date', 'Time_Orderd', 'Time_Order_picked'], axis=1, errors='ignore')

In [None]:
# Show the columns currently in X.
print(X.columns)

In [None]:
# Same drop as in the earlier cell, repeated; errors='ignore' means nothing
# happens for names that are already absent. X_train / X_test are unaffected.
X = X.drop(['Order_Date', 'Time_Orderd', 'Time_Order_picked'], axis=1, errors='ignore')

In [None]:
# Show which training columns still have dtype datetime64[ns].
print(X_train.dtypes[X_train.dtypes == 'datetime64[ns]'])

In [None]:
# Keep only the non-datetime columns in both splits.
datetime_kinds = ['datetime64[ns]']
X_train, X_test = (
    split.select_dtypes(exclude=datetime_kinds) for split in (X_train, X_test)
)

In [None]:
# Booster settings: squared-error regression, depth-6 trees, learning rate
# 0.1, RMSE as the evaluation metric.
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.1,
    'eval_metric': 'rmse'
}

# Wrap both splits in XGBoost's DMatrix container, labels included.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Train for 100 boosting rounds.
xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

In [None]:
# Test-set predictions from the trained booster.
y_pred = xgb_model.predict(dtest)

print("MAE:", mean_absolute_error(y_test, y_pred))
# RMSE as the square root of the MSE: the squared=False keyword was deprecated
# and later removed from mean_squared_error in newer scikit-learn releases.
print("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)
print("R² Score:", r2_score(y_test, y_pred))

In [None]:
# n = number of test observations, k = number of features
n, k = X_test.shape

# Plain R-squared on the test set
r2 = r2_score(y_test, y_pred)

# Adjusted R-squared: 1 - (1 - R^2) * (n - 1) / (n - k - 1)
unexplained = 1 - r2
adjusted_r2 = 1 - unexplained * (n - 1) / (n - k - 1)

print("Adjusted R² Score:", adjusted_r2)

In [None]:
# Re-train the booster with the same params and dtrain for 100 rounds; this rebinds xgb_model.
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Same training call as in the earlier cells, repeated; xgb_model is rebound again.
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
# Plot Feature Importance
import matplotlib.pyplot as plt

# Top-10 features of the booster, ranked by gain.
xgb.plot_importance(
    xgb_model,
    importance_type='gain',
    max_num_features=10,
    height=0.5,
)
plt.title('Top 10 Important Features - XGBoost')
plt.tight_layout()
plt.show()

In [None]:
# Recompute the test-set predictions from the current xgb_model.
y_pred = xgb_model.predict(dtest)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Predicted vs actual for XGBoost; the dashed red diagonal is the ideal line.
actual_range = [y_test.min(), y_test.max()]

plt.figure(figsize=(6,6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
plt.plot(actual_range, actual_range, 'r--')
plt.title("XGBoost: Predicted vs Actual")
plt.xlabel("Actual Time Taken")
plt.ylabel("Predicted Time Taken")
plt.grid(True)
plt.tight_layout()
plt.show()