In [None]:
!pip install geopy

# Import required libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import os
# NOTE(review): absolute, machine-specific working directory; the notebook cannot
# run on another machine without editing this path.
os.chdir(r"C:\Users\ronch\OneDrive\Documents\Applied ML")
# Load the raw training data; the relative path resolves against the directory set above.
data = pd.read_csv("train.csv")

# Preview the first rows of the raw frame.
data.head()


In [None]:
# Data cleaning:
# Strip the '(min)' marker from 'Time_taken(min)' and convert it to an integer.
# Casting to str first keeps this step re-runnable: once the column has been
# converted to int, the `.str` accessor would otherwise raise on a second run.
data['Time_taken(min)'] = (
    data['Time_taken(min)']
    .astype(str)
    .str.replace(r'\(min\)\s*', '', regex=True)
    .astype(int)
)

# Remove the literal 'conditions ' prefix from 'Weatherconditions'.
# regex=False makes this a plain substring replacement regardless of the
# pandas version's default for `regex`.
data['Weatherconditions'] = data['Weatherconditions'].str.replace('conditions ', '', regex=False)

data.head()

In [None]:
# Print each column's dtype after the text clean-up of the target and weather columns.
print(data.dtypes)

In [None]:
import pandas as pd

# Coerce text columns that hold numbers; anything unparseable becomes NaN
# (so the resulting dtype is float wherever missing values are present).
for numeric_col in ('Delivery_person_Age', 'Delivery_person_Ratings', 'multiple_deliveries'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

# Low-cardinality text columns are stored as pandas categoricals.
categorical_columns = [
    'Weatherconditions', 'Road_traffic_density', 'Type_of_order',
    'Type_of_vehicle', 'Festival', 'City',
]
data[categorical_columns] = data[categorical_columns].astype('category')



In [None]:
# Re-check the dtypes after the numeric and categorical conversions.
print(data.dtypes)

In [None]:
# Preview the frame after the dtype conversions.
data.head()

In [None]:
# Count the NaN entries in every column and report the per-column totals.
missing_values = data.isnull().sum()
print(missing_values)

In [None]:
!pip install folium

import pandas as pd
import folium

# Step 1: Keep only the delivery-location coordinate columns
delivery_data = data[['Delivery_location_latitude', 'Delivery_location_longitude']]

# Step 2: Base map centred on India
india_center = [20.5937, 78.9629]
map_all_delivery_locations = folium.Map(location=india_center, zoom_start=5)

# Step 3: Add one red circle marker per delivery location
for lat, lon in delivery_data.itertuples(index=False, name=None):
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color='red',  # red marks delivery locations
        fill=True,
        fill_opacity=0.6,
    )
    marker.add_to(map_all_delivery_locations)

# Step 4: Write the map to an HTML file (in the current working directory)
map_all_delivery_locations.save("map_all_delivery_locations.html")
print("✅ Map saved as 'map_all_delivery_locations.html'")

In [None]:
# Replace every restaurant / delivery coordinate with its absolute value,
# so negative latitudes and longitudes become positive.
coordinate_columns = [
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
]
for coord_col in coordinate_columns:
    data[coord_col] = data[coord_col].abs()


In [None]:
# Import necessary libraries for distance calculation
from geopy.distance import geodesic

def calculate_distance(row):
    """Return the geodesic distance in km between a row's restaurant and delivery point.

    `row` must provide 'Restaurant_latitude', 'Restaurant_longitude',
    'Delivery_location_latitude' and 'Delivery_location_longitude'.
    """
    origin = (row['Restaurant_latitude'], row['Restaurant_longitude'])
    destination = (row['Delivery_location_latitude'], row['Delivery_location_longitude'])
    separation = geodesic(origin, destination)
    return separation.km


# Row-wise distance becomes the new 'Delivery_distance_km' column
data['Delivery_distance_km'] = data.apply(calculate_distance, axis=1)

In [None]:
# Preview the frame, which now includes 'Delivery_distance_km'.
data.head()

In [None]:
# Discard every row in which any of the four coordinates is exactly 0.
cols_to_check = [
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
]
has_zero_coord = data[cols_to_check].eq(0).any(axis=1)
data = data[~has_zero_coord]


In [None]:
# Numeric columns for the distribution plots (the target 'Time_taken(min)' included)
numerical_vars = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Delivery_distance_km',
    'Time_taken(min)',
    'multiple_deliveries',
]

# Categorical columns compared against delivery time
categorical_vars = [
    'Vehicle_condition',
    'Weatherconditions',
    'Type_of_order',
    'Type_of_vehicle',
    'Festival',
    'City',
    'Road_traffic_density',
]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram per numeric variable on a 2x3 grid (missing values excluded)
plt.figure(figsize=(15, 10))
for position, var in enumerate(numerical_vars, start=1):
    plt.subplot(2, 3, position)
    observed = data[var].dropna()
    sns.histplot(observed, kde=False, bins=30, color='skyblue')
    plt.title(f'Distribution of {var}')
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd

# Turn the literal string "NaN" into a real missing value everywhere
data.replace("NaN", np.nan, inplace=True)

# Every object-dtype column: cast to str, trim, lowercase, then map the
# resulting 'nan' text back to a real missing value.
# Note that this lowercases all object columns, not only the categorical ones.
object_cols = data.select_dtypes(include='object').columns
for col in object_cols:
    as_text = data[col].astype(str)
    data[col] = as_text.str.strip().str.lower().replace('nan', np.nan)


# Categorical columns that may still carry 'nan' / 'none' as a category label
cols_to_clean = ['City', 'Festival', 'Road_traffic_density']

for col in cols_to_clean:
    # Same text normalisation: str -> trimmed -> lowercase
    normalised = data[col].astype(str).str.strip().str.lower()
    # 'nan' / 'none' become missing, and the column is stored as a category again
    data[col] = normalised.replace(['nan', 'none'], np.nan).astype('category')

data.dtypes

In [None]:
# Treat a literal 'NaN' string in 'Type_of_vehicle' as missing, then remove
# every row whose vehicle type is missing.
vehicle_type = data['Type_of_vehicle'].replace('NaN', pd.NA)
data['Type_of_vehicle'] = vehicle_type
data = data.dropna(subset=['Type_of_vehicle'])

# Show the vehicle types that remain
print(data['Type_of_vehicle'].unique())


In [None]:
# Strip plot of delivery time against each categorical variable (3x3 grid)
plt.figure(figsize=(18, 12))
for position, var in enumerate(categorical_vars, start=1):
    plt.subplot(3, 3, position)
    sns.stripplot(
        data=data,
        x=var,
        y='Time_taken(min)',
        palette='Set2',
        jitter=True,
    )
    plt.xticks(rotation=45)
    plt.title(f'Time Taken by {var}')
plt.tight_layout()
plt.show()

In [None]:
# Numeric predictors to scatter against the target
numeric_vs_time = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'multiple_deliveries',
    'Delivery_distance_km',
]

# One scatter panel per predictor on a 2x2 grid
plt.figure(figsize=(15, 10))
for position, var in enumerate(numeric_vs_time, start=1):
    plt.subplot(2, 2, position)
    sns.scatterplot(data=data, x=var, y='Time_taken(min)', alpha=0.6)
    plt.title(f'{var} vs Time Taken')
plt.tight_layout()
plt.show()

In [None]:
# Scaling and transforming/encoding: feature groups used from here on.
# Unlike the plotting list, the numeric group no longer contains the target.
numerical_vars = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Delivery_distance_km',
    'multiple_deliveries',
]

categorical_vars = [
    'Vehicle_condition',
    'Weatherconditions',
    'Type_of_order',
    'Type_of_vehicle',
    'Festival',
    'City',
    'Road_traffic_density',
]

# Regression target
y = data['Time_taken(min)']


In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Cast every categorical column to trimmed strings, then turn the literal
# 'NaN' string into a real missing value.
# NOTE(review): astype(str) renders real missing values as the lowercase text
# 'nan', which the 'NaN' replacement does not match. A later cell drops the
# resulting '*_nan' dummy columns, so that behaviour is intentionally kept.
for col in categorical_vars:
    data[col] = data[col].astype(str).str.strip()
    data[col] = data[col].replace('NaN', np.nan)

# Encode 'Road_traffic_density' ordinally: Low < Medium < High < Jam.
# LabelEncoder would number the levels alphabetically (and would treat the
# 'nan' text as a genuine level), so an explicit rank mapping is used instead.
# Levels are matched case-insensitively; anything outside the four known
# levels (including missing values) stays NaN and is left to the imputer in
# the modelling pipelines.
road_traffic_order = ['Low', 'Medium', 'High', 'Jam']
traffic_rank = {level.lower(): rank for rank, level in enumerate(road_traffic_order)}
data['Rd_traffic_density'] = data['Road_traffic_density'].str.lower().map(traffic_rank)


# One-Hot Encode the rest (drop_first=True to avoid multicollinearity)
one_hot_cols = ['Weatherconditions', 'Type_of_order', 'Type_of_vehicle', 'Festival', 'City']
data = pd.get_dummies(data, columns=one_hot_cols, drop_first=True)

# The text column is now represented by 'Rd_traffic_density'
data.drop(columns='Road_traffic_density', inplace=True)



In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric predictors.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before any
# train/test split, so test rows contribute to the mean and variance.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_vars])
scaled_numericals = pd.DataFrame(scaled_values, columns=numerical_vars, index=data.index)

# Swap the raw numeric columns for their scaled versions (appended at the end)
data.drop(columns=numerical_vars, inplace=True)
data = pd.concat([data, scaled_numericals], axis=1)


In [None]:
# Drop dummy columns that are not wanted as features: the two 'nan' level
# indicators and the 'Festival_no' indicator.
redundant_dummies = ['Weatherconditions_nan', 'City_nan', 'Festival_no']
data.drop(columns=redundant_dummies, inplace=True)

In [None]:
# Dummy columns to convert from boolean to 0/1 integers
boolean_columns = [
    'Weatherconditions_Fog',
    'Weatherconditions_Sandstorms',
    'Weatherconditions_Stormy',
    'Weatherconditions_Sunny',
    'Weatherconditions_Windy',
    'Type_of_order_Drinks',
    'Type_of_order_Meal',
    'Type_of_order_Snack',
    'Type_of_vehicle_electric_scooter',
    'Type_of_vehicle_motorcycle',
    'Type_of_vehicle_scooter',
    'Festival_yes',
    'City_semi-urban',
    'City_urban',
]

# Cast all of them in one assignment
as_int = data[boolean_columns].astype(int)
data[boolean_columns] = as_int

# Show the first converted rows
print(data[boolean_columns].head())


In [None]:
# Columns that must not be fed to the model: the target, identifiers,
# raw coordinates and raw date/time text.
excluded_columns = [
    'Time_taken(min)',        # target
    'ID',                     # unique identifier
    'Delivery_person_ID',     # personal identifier
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
    'Order_Date',
    'Time_Orderd',
    'Time_Order_picked',
]

# Feature matrix and target vector for modelling
X = data.drop(columns=excluded_columns)
y = data['Time_taken(min)']

# Report their shapes
print("X shape:", X.shape)
print("y shape:", y.shape)



In [None]:
# Preview the feature matrix defined in the previous cell.
X.head()

In [None]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import HalvingRandomSearchCV, train_test_split
from scipy.stats import loguniform
# mean_squared_error / mean_absolute_error are used in step 9; importing them
# here lets the cell run on a fresh kernel instead of relying on a later cell.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# 1) Split off full train/test
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2) Subsample 50% of the training split for hyperparameter tuning
X_sub, _, y_sub, _ = train_test_split(
    X_train_full, y_train_full, train_size=0.5, random_state=42
)

# 3) Build pipeline: impute → MLP
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('mlp', MLPRegressor(random_state=42))
])

# 4) Tiny hyperparameter space
param_dist = {
    'mlp__hidden_layer_sizes': [(64, 32), (128, 64)],
    'mlp__activation': ['relu'],
    'mlp__alpha':        loguniform(1e-4, 1e-2),
    'mlp__learning_rate_init': loguniform(1e-4, 1e-2),
    'mlp__batch_size':   [32],
    'mlp__tol':          [1e-3]
}

# 5) Successive-halving on the `mlp__max_iter` budget
halving_mlp = HalvingRandomSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    resource='mlp__max_iter',    # the budget parameter name in our pipeline
    max_resources=50,            # upper bound on the max_iter budget
    min_resources=10,            # budget of the first rung
    factor=3,
    cv=2,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# 6) Run tuning on the subsample
halving_mlp.fit(X_sub, y_sub)

# 7) Grab best pipeline and hyperparameters
best_pipe = halving_mlp.best_estimator_
print("Best hyperparams:", halving_mlp.best_params_)

# 8) Retrain best MLP on full training set with a higher epoch cap
best_pipe.set_params(mlp__max_iter=200)
best_pipe.fit(X_train_full, y_train_full)

# 9) Final evaluation on hold-out test set
y_pred = best_pipe.predict(X_test)
print("Final Test R²:", r2_score(y_test, y_pred))
mse  = mean_squared_error(y_test, y_pred)
mae  = mean_absolute_error(y_test, y_pred)
# RMSE as the square root of the MSE: the `squared=False` flag of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
rmse = np.sqrt(mse)
r2   = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.2f}")
print(f"Test MAE:  {mae:.2f}")
print(f"Test R²:   {r2:.4f}")

# Adjusted R² penalises R² for the number of predictors p given n test samples
n       = len(y_test)
p       = X_test.shape[1]
adj_r2  = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R²:   {adj_r2:.4f}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from scipy.stats import uniform
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1) Add cyclical time features (assumes `data` is your preprocessed DataFrame)
# Order date and order time are joined into one text timestamp and parsed;
# values that cannot be parsed become NaT.
# NOTE(review): no explicit format / dayfirst is passed, so the day-month order
# is left to pandas' inference; confirm it against the raw 'Order_Date' values.
order_timestamp_text = data['Order_Date'].str.strip() + ' ' + data['Time_Orderd'].str.strip()
data['order_dt'] = pd.to_datetime(order_timestamp_text, errors='coerce')

# Rows without a parsed timestamp fall back to hour 0 / weekday 0
data['hour'] = data['order_dt'].dt.hour.fillna(0)
data['dow']  = data['order_dt'].dt.weekday.fillna(0)

# Project hour-of-day (period 24) and day-of-week (period 7) onto sine/cosine pairs
hour_angle = 2 * np.pi * data['hour'] / 24
dow_angle  = 2 * np.pi * data['dow'] / 7
data['hour_sin'] = np.sin(hour_angle)
data['hour_cos'] = np.cos(hour_angle)
data['dow_sin']  = np.sin(dow_angle)
data['dow_cos']  = np.cos(dow_angle)

# 2) Define features and target
# Everything except the target, identifiers, raw coordinates, raw date/time
# text and the intermediate timestamp columns is used as a feature.
non_feature_cols = [
    'Time_taken(min)',
    'ID',
    'Delivery_person_ID',
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
    'Order_Date',
    'Time_Orderd',
    'Time_Order_picked',
    'order_dt',
    'hour',
    'dow',
]
X = data.drop(columns=non_feature_cols)
y = data['Time_taken(min)']

# 3) Split into train/test (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4) Build pipeline: impute → MLP with a fixed (128, 64) architecture
mlp = MLPRegressor(hidden_layer_sizes=(128, 64), activation='relu', random_state=42)
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('mlp', mlp),
])

# 5) Small hyperparameter space
# scipy's uniform(loc, scale) samples from [loc, loc + scale]
param_dist = {
    'mlp__alpha':              uniform(1e-5, 1e-3),
    'mlp__learning_rate_init': uniform(1e-3, 2e-3),
    'mlp__tol':                [1e-4, 1e-3],
    'mlp__batch_size':         [32, 64],
    'mlp__max_iter':           [80, 100, 120],
}

# 6) RandomizedSearchCV for refinement: 15 candidates, 3-fold CV, scored by R²
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=15,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
search.fit(X_train, y_train)

best_pipe = search.best_estimator_
print("Refined best params:", search.best_params_)

# 7) Predictions of the refitted best pipeline on the hold-out test set
y_pred = best_pipe.predict(X_test)



In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated R² of the tuned MLP pipeline over the full X / y
cv_scores = cross_val_score(
    search.best_estimator_,
    X,
    y,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print("5‑fold CV R²: %.3f ± %.3f" % (cv_mean, cv_std))


In [None]:
# 1) If best_svr isn’t defined in your current session, re-tune or reload it.
# Here’s a quick RandomizedSearchCV to get a strong SVR:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from scipy.stats import loguniform
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# SVR pipeline: mean-impute, then support vector regression
svr_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('svr', SVR())
])

# Hyperparameter space (you can adjust n_iter for speed/quality trade-off)
param_svr = {
    'svr__C': loguniform(1e-2, 1e3),
    'svr__gamma': loguniform(1e-4, 1),
    'svr__epsilon': [0.1, 0.2, 0.3]
}

rand_svr = RandomizedSearchCV(
    svr_pipe,
    param_svr,
    n_iter=10,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
# Tune on X_train / y_train, the split the stacking model is fitted on.
# X_train_full comes from the earlier feature matrix, built before the
# cyclical time features were added, so tuning on it would select SVR
# hyperparameters for a different feature set than the one used in the stack.
rand_svr.fit(X_train, y_train)
best_svr = rand_svr.best_estimator_
print("Best SVR params:", rand_svr.best_params_)


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1) Build the stacking pipeline (impute → stack)
# Base learners are the tuned SVR and MLP pipelines; a RidgeCV meta-learner
# combines their predictions, and passthrough=True also feeds it the
# (imputed) original features.
stacking_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('stack', StackingRegressor(
        estimators=[
            ('svr', best_svr),
            ('mlp', best_pipe)
        ],
        final_estimator=RidgeCV(),
        passthrough=True,
        n_jobs=-1
    ))
])

# 2) Fit on your training data
stacking_pipeline.fit(X_train, y_train)

# 3) Predict on your hold-out set
y_stack = stacking_pipeline.predict(X_test)

# 4) Evaluate
# RMSE is taken as the square root of the MSE: the `squared=False` flag of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
rmse_stack = mean_squared_error(y_test, y_stack) ** 0.5
mae_stack  = mean_absolute_error(y_test, y_stack)
r2_stack   = r2_score(y_test, y_stack)

print(f"Stacked RMSE: {rmse_stack:.2f}")
print(f"Stacked MAE:  {mae_stack:.2f}")
print(f"Stacked R²:   {r2_stack:.4f}")



In [None]:
from sklearn.metrics import mean_absolute_percentage_error, median_absolute_error

# Percentage and median absolute errors of the stacked model's predictions
# (y_stack) against the true delivery times (y_test).
mape_fraction = mean_absolute_percentage_error(y_test, y_stack)
mape = mape_fraction * 100
medae = median_absolute_error(y_test, y_stack)

print(f"MAPE: {mape:.2f}%")
print(f"Median AE: {medae:.2f} minutes")


In [None]:
from sklearn.metrics import r2_score

# R² of the stacked model on the hold-out set
r2 = r2_score(y_test, y_stack)

# Adjusted R²: n = number of test samples, p = number of predictor columns
n, p = len(y_test), X_test.shape[1]
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"R²:          {r2:.4f}")
print(f"Adjusted R²: {adj_r2:.4f}")

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated R² of the tuned MLP pipeline (search.best_estimator_,
# not the stacked model) over the full X / y.
# NOTE(review): this repeats the cross-validation run in an earlier cell.
cv_scores = cross_val_score(
    search.best_estimator_,
    X,
    y,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print("5‑fold CV R²: %.3f ± %.3f" % (cv_mean, cv_std))


In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual delivery time for the stacked model; the dashed
# diagonal marks perfect predictions.
plt.figure(figsize=(6,6))
plt.scatter(y_test, y_stack, alpha=0.4)
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, 'k--', linewidth=2)
plt.xlabel('Actual Delivery Time (min)')
plt.ylabel('Predicted Delivery Time (min)')
plt.title('Predicted vs. Actual Delivery Time')
plt.tight_layout()
plt.show()
