In [9]:
import numpy as np
import pandas as pd
import data_clean_utils
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    LabelEncoder,
    MinMaxScaler,
    PowerTransformer,
    OrdinalEncoder
)
from sklearn.model_selection import train_test_split

In [10]:
from sklearn import set_config
set_config(transform_output="pandas")

In [11]:
#Load the data
df=pd.read_csv("swiggy.csv")
df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26
3,0x7a6a,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1,No,Metropolitian,(min) 21
4,0x70a2,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,13:45:00,conditions Cloudy,High,1,Snack,scooter,1,No,Metropolitian,(min) 30


In [12]:
data_clean_utils.perform_data_cleaning(df)

In [13]:
#load cleaned data set
df_final=pd.read_csv('swiggy_cleaned.csv')
df_final

Unnamed: 0,rider_id,age,ratings,restaurant_latitude,restaurant_longitude,delivery_latitude,delivery_longitude,order_date,weather,traffic,...,city_name,order_day,order_month,order_day_of_week,is_weekend,pickup_time_minutes,order_time_hour,order_time_of_day,distance,distance_type
0,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,sunny,high,...,INDO,19,3,saturday,1,15.0,11.0,morning,3.025149,short
1,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,stormy,jam,...,BANG,25,3,friday,0,5.0,19.0,evening,20.183530,very_long
2,BANGRES19DEL01,23.0,4.4,12.914264,77.678400,12.924264,77.688400,2022-03-19,sandstorms,low,...,BANG,19,3,saturday,1,15.0,8.0,morning,1.552758,short
3,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,sunny,medium,...,COIMB,5,4,tuesday,0,10.0,18.0,evening,7.790401,medium
4,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,cloudy,high,...,CHEN,26,3,saturday,1,15.0,13.0,afternoon,6.210138,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45497,JAPRES04DEL01,30.0,4.8,26.902328,75.794257,26.912328,75.804257,2022-03-24,windy,high,...,JAP,24,3,thursday,0,10.0,11.0,morning,1.489846,short
45498,AGRRES16DEL01,21.0,4.6,,,,,2022-02-16,windy,jam,...,AGR,16,2,wednesday,0,15.0,19.0,evening,,
45499,CHENRES08DEL03,30.0,4.9,13.022394,80.242439,13.052394,80.272439,2022-03-11,cloudy,low,...,CHEN,11,3,friday,0,15.0,23.0,night,4.657195,short
45500,COIMBRES11DEL01,20.0,4.7,11.001753,76.986241,11.041753,77.026241,2022-03-07,cloudy,high,...,COIMB,7,3,monday,0,5.0,13.0,afternoon,6.232393,medium


In [14]:
# drop columns not required for model input
# Define the list of columns you want to remove
columns_to_drop = [
    'rider_id',
    'restaurant_latitude',
    'restaurant_longitude',
    'delivery_latitude',
    'delivery_longitude',
    'order_date',
    "order_time_hour",
    "order_day"
]

# Find which of these columns actually exist in the DataFrame
cols_that_exist = [col for col in columns_to_drop if col in df_final.columns]

# Drop only the columns that were found
df_final.drop(columns=cols_that_exist, inplace=True)

print(f"Successfully dropped columns: {cols_that_exist}")

Successfully dropped columns: ['rider_id', 'restaurant_latitude', 'restaurant_longitude', 'delivery_latitude', 'delivery_longitude', 'order_date', 'order_time_hour', 'order_day']


In [15]:
df_final.head()

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city_type,time_taken,city_name,order_month,order_day_of_week,is_weekend,pickup_time_minutes,order_time_of_day,distance,distance_type
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,24,INDO,3,saturday,1,15.0,morning,3.025149,short
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,33,BANG,3,friday,0,5.0,evening,20.18353,very_long
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,26,BANG,3,saturday,1,15.0,morning,1.552758,short
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,21,COIMB,4,tuesday,0,10.0,evening,7.790401,medium
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,30,CHEN,3,saturday,1,15.0,afternoon,6.210138,medium


In [16]:
#check missing values
df_final.isnull().sum()

age                    1854
ratings                1908
weather                 525
traffic                 510
vehicle_condition         0
type_of_order             0
type_of_vehicle           0
multiple_deliveries     993
festival                228
city_type              1198
time_taken                0
city_name                 0
order_month               0
order_day_of_week         0
is_weekend                0
pickup_time_minutes    1640
order_time_of_day      2070
distance               3630
distance_type          3630
dtype: int64

In [17]:
#check for duplicates
df_final.duplicated().sum()

np.int64(0)

In [18]:

# columns that have missing values
missing_cols = (
    df_final
    .isna()
    .any(axis=0)
    .loc[lambda x: x]
    .index
)

In [19]:
missing_cols

Index(['age', 'ratings', 'weather', 'traffic', 'multiple_deliveries',
       'festival', 'city_type', 'pickup_time_minutes', 'order_time_of_day',
       'distance', 'distance_type'],
      dtype='object')

In [20]:
temp_df=df_final.copy().dropna()

In [21]:
# split into X and y

X = temp_df.drop(columns='time_taken')
y = temp_df['time_taken']

In [22]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
print("The size of train data is", X_train.shape)
print("The shape of test data is", X_test.shape)

The size of train data is (30156, 18)
The shape of test data is (7539, 18)


In [24]:
# do basic preprocessing
num_cols = ["age", "ratings", "pickup_time_minutes", "distance"]

nominal_cat_cols = [
    'weather', 'type_of_order', 'type_of_vehicle', 'festival',
    "city_type", "order_month", "order_day_of_week",
    "is_weekend", "order_time_of_day"
]

ordinal_cat_cols = ["traffic", "distance_type"]

In [25]:
len(num_cols + nominal_cat_cols + ordinal_cat_cols)

15

In [26]:
for col in ordinal_cat_cols:
    print(col, X_train[col].unique())

traffic ['jam' 'medium' 'high' 'low']
distance_type ['medium' 'short' 'long' 'very_long']


In [27]:
# generate order for ordinal encoding
traffic_order = ["low", "medium", "high", "jam"]
distance_type_order = ["short", "medium", "long", "very_long"]

In [28]:
# build a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", MinMaxScaler(), num_cols),
        (
            "nominal_encode",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
            nominal_cat_cols,
        ),
        (
            "ordinal_encode",
            OrdinalEncoder(categories=[traffic_order, distance_type_order]),
            ordinal_cat_cols,
        ),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)

preprocessor.set_output(transform="pandas")

0,1,2
,transformers,"[('scale', ...), ('nominal_encode', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['low', 'medium', ...], ['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [29]:
# transform the data
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)
X_train_trans



Unnamed: 0,age,ratings,pickup_time_minutes,distance,weather_fog,weather_sandstorms,weather_stormy,weather_sunny,weather_windy,type_of_order_drinks,...,order_day_of_week_wednesday,is_weekend_1,order_time_of_day_evening,order_time_of_day_morning,order_time_of_day_night,traffic,distance_type,vehicle_condition,multiple_deliveries,city_name
8708,0.473684,0.56,1.0,0.404165,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,3.0,1.0,0,2.0,COIMB
25198,1.000000,0.76,0.0,0.154044,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,1.0,JAP
34049,0.473684,0.80,0.5,0.002461,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,2.0,0.0,1,0.0,INDO
25987,1.000000,0.92,1.0,0.460411,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0,1.0,KNP
37121,0.526316,0.76,0.5,0.243676,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1,1.0,MYS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20239,0.578947,0.92,0.5,0.451895,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,3.0,2.0,0,0.0,DEH
7590,0.052632,1.00,1.0,0.612270,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,2.0,1,1.0,JAP
13610,0.526316,0.92,0.0,0.322877,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1,0.0,BANG
1045,0.947368,0.96,0.5,0.004486,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,1.0,BANG


In [30]:
# transform target column
pt = PowerTransformer()

y_train_pt = pt.fit_transform(y_train.values.reshape(-1, 1))
y_test_pt = pt.transform(y_test.values.reshape(-1, 1))

There are some exterme values in target so applies power transformer

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
# --- Train the model ---
lr = LinearRegression()
lr.fit(X_train_trans, y_train_pt)

ValueError: could not convert string to float: 'COIMB'

In [None]:
# --- Get the predictions (in transformed scale) ---
y_pred_train = lr.predict(X_train_trans)
y_pred_test = lr.predict(X_test_trans)

In [None]:
# --- Get the actual prediction values (inverse transform) ---
# Assumes 'pt' is the previously fitted PowerTransformer
y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1, 1))
y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1, 1))

In [None]:
# --- Evaluate the model ---
print(f"the train error is {mean_absolute_error(y_train, y_pred_train_org):.2f} minutes")
print(f"the test error is {mean_absolute_error(y_test, y_pred_test_org):.2f} minutes")


In [None]:
# --- Calculate and print Mean Absolute Error ---
print(f"The train error is {mean_absolute_error(y_train, y_pred_train_org):.2f} minutes")
print(f"The test error is {mean_absolute_error(y_test, y_pred_test_org):.2f} minutes")

In [None]:
# --- Calculate and print R2 Score ---
print(f"The train r2 score is {r2_score(y_train, y_pred_train_org):.2f}")
print(f"The test r2 score is {r2_score(y_test, y_pred_test_org):.2f}")

# now we will impute missing values and find prediction

In [None]:
temp_df=df_final.copy()

In [None]:
# split into X and y
X = temp_df.drop(columns='time_taken')
y = temp_df['time_taken']


In [None]:
X

In [None]:
# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# --- Print Shapes of the Sets ---
print("The size of train data is", X_train.shape)
print("The shape of test data is", X_test.shape)

In [None]:
X_train.isna().sum()

In [None]:
# --- Transform target column ---
pt = PowerTransformer()
y_train_pt = pt.fit_transform(y_train.values.reshape(-1, 1))
y_test_pt = pt.transform(y_test.values.reshape(-1, 1))

In [None]:
missing_cols

In [None]:
# --- Calculate percentage of rows with missing values ---
(
    X_train
    .isna()
    .any(axis=1)
    .mean()
    .round(2) * 100
)

In [None]:
X_train['age'].describe()

In [None]:
X_train['age'].isna().sum()

In [None]:
# --- Calculate the median value ---
age_median = X_train['age'].median()

In [None]:
# --- Plot the KDE plot for comparison ---
sns.kdeplot(X_train['age'], label="original")
sns.kdeplot(X_train['age'].fillna(age_median), label="imputed")
plt.legend()
plt.show()

In [None]:

# --- Separate features based on imputation strategy ---
features_to_fill_mode = ['multiple_deliveries', 'festival', 'city_type']
features_to_fill_missing = [
    col for col in nominal_cat_cols if col not in features_to_fill_mode
]
features_to_fill_missing

In [None]:
simple_imputer = ColumnTransformer(
    transformers=[
        (
            "mode_imputer",
            SimpleImputer(strategy="most_frequent"),
            features_to_fill_mode,
        ),
        (
            "missing_imputer",
            SimpleImputer(strategy="constant", fill_value="missing"),
            features_to_fill_missing,
        ),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)

In [None]:
simple_imputer

In [None]:
simple_imputer.fit_transform(X_train)

In [None]:
simple_imputer.fit_transform(X_train).isna().sum()

In [None]:
from sklearn.impute import KNNImputer
# --- KNN Imputer ---
knn_imputer = KNNImputer(n_neighbors=5)

In [None]:
# --- Do basic preprocessing: Group columns by type ---
num_cols = ["age", "ratings", "pickup_time_minutes", "distance"]

nominal_cat_cols = [
    'weather', 'type_of_order', 'type_of_vehicle', 'festival',
    "city_type", "city_name", "order_month", "order_day_of_week",
    "is_weekend", "order_time_of_day"
]

ordinal_cat_cols = ["traffic", "distance_type"]

In [None]:
# --- Generate order for ordinal encoding ---
traffic_order = ["low", "medium", "high", "jam"]
distance_type_order = ["short", "medium", "long", "very_long"]

In [None]:
# --- Generate order for ordinal encoding ---
traffic_order = ["low", "medium", "high", "jam"]
distance_type_order = ["short", "medium", "long", "very_long"]

In [None]:
# --- Check unique categories in the ordinal columns ---
for col in ordinal_cat_cols:
    print(col, X_train[col].unique())

In [None]:
# --- Build a preprocessor ---
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", MinMaxScaler(), num_cols),
        (
            "nominal_encode",
            OneHotEncoder(drop='first',handle_unknown="ignore", sparse_output=False),
            nominal_cat_cols,
        ),
        (
            "ordinal_encode",
            OrdinalEncoder(
                categories=[traffic_order, distance_type_order],
                encoded_missing_value=-999,
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
            ordinal_cat_cols,
        ),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)
preprocessor

In [None]:
preprocessor.fit_transform(X_train)

In [None]:
# --- Inspect the value counts of a transformed column ---
preprocessor.fit_transform(X_train)["distance_type"].value_counts()

In [None]:
# --- Verify that no columns have missing values after transformation ---
preprocessor.fit_transform(X_train).isna().sum().loc[lambda ser: ser.ge(1)]

In [None]:
# build the pipeline
processing_pipeline = Pipeline(
    steps=[
        ("simple_imputer", simple_imputer),
        ("preprocess", preprocessor),
        ("knn_imputer", knn_imputer),
    ]
)
processing_pipeline

In [None]:
# fit and transform the pipeline on X_train
processing_pipeline.fit_transform(X_train)

In [None]:
# Initialize the model
lr = LinearRegression()

# Create the final model pipeline
model_pipe = Pipeline(
    steps=[
        ("preprocessing", processing_pipeline),
        ("model", lr)
    ]
)

model_pipe

In [None]:
model_pipe.fit(X_train,y_train_pt)

In [None]:
# --- Get the predictions ---
y_pred_train = model_pipe.predict(X_train)
y_pred_test = model_pipe.predict(X_test)

In [None]:
y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1, 1))
y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1, 1))


In [None]:
# --- Evaluate the model using Mean Absolute Error ---
print(f"The train error is {mean_absolute_error(y_train, y_pred_train_org):.2f} minutes")
print(f"The test error is {mean_absolute_error(y_test, y_pred_test_org):.2f} minutes")


In [None]:
# --- Evaluate the model using R2 Score ---
print(f"The train r2 score is {r2_score(y_train, y_pred_train_org):.2f}")
print(f"The test r2 score is {r2_score(y_test, y_pred_test_org):.2f}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Create the final model pipeline
model_pipe = Pipeline(
    steps=[
        ("preprocessing", processing_pipeline),
        ("model", rf)
    ]
)
model_pipe.fit(X_train,y_train_pt.values.ravel())

In [None]:
# --- Get the predictions ---
y_pred_train = model_pipe.predict(X_train)
y_pred_test = model_pipe.predict(X_test)

In [None]:
y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1, 1))
y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1, 1))


In [None]:
# --- Evaluate the model using Mean Absolute Error ---
print(f"The train error is {mean_absolute_error(y_train, y_pred_train_org):.2f} minutes")
print(f"The test error is {mean_absolute_error(y_test, y_pred_test_org):.2f} minutes")


In [None]:
# --- Evaluate the model using R2 Score ---
print(f"The train r2 score is {r2_score(y_train, y_pred_train_org):.2f}")
print(f"The test r2 score is {r2_score(y_test, y_pred_test_org):.2f}")