In [1]:
## Importing all the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings

## Feature engineering
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder
## Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
## Train Test Split
from sklearn.model_selection import train_test_split
## Model Training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [2]:
## Ingest the training dataset from a local CSV file into the DataFrame `df`.
## NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
## runs where that exact directory exists — consider a configurable data directory.
df = pd.read_csv('D:\\Ineuron\\Machine_Learning\\Delivery_time_Prediction\\Dataset\\finalTrain.csv')

In [66]:
df.head(5)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [67]:
df.shape

## Total no. of rows and columns in the dataset

(45584, 20)

In [68]:
df.info()

## Checking the data types of all the dependent and independent features.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45584 non-null  object 
 1   Delivery_person_ID           45584 non-null  object 
 2   Delivery_person_Age          43730 non-null  float64
 3   Delivery_person_Ratings      43676 non-null  float64
 4   Restaurant_latitude          45584 non-null  float64
 5   Restaurant_longitude         45584 non-null  float64
 6   Delivery_location_latitude   45584 non-null  float64
 7   Delivery_location_longitude  45584 non-null  float64
 8   Order_Date                   45584 non-null  object 
 9   Time_Orderd                  43853 non-null  object 
 10  Time_Order_picked            45584 non-null  object 
 11  Weather_conditions           44968 non-null  object 
 12  Road_traffic_density         44983 non-null  object 
 13  Vehicle_conditio

In [3]:
## Drop 'Delivery_person_ID' and 'ID': these features are judged to have no
## relevance to the target feature.
## `df` is rebound (instead of using inplace=True) and errors='ignore' is passed so
## that re-running this cell is a no-op rather than a KeyError on the missing columns.
df = df.drop(columns=['Delivery_person_ID', 'ID'], errors='ignore')

In [70]:
df.head(2)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23


In [None]:
## Drop the order date / time columns (stored as strings): they gave no clear
## insight in relation to the target feature/variable.
## `df` is rebound (instead of using inplace=True) and errors='ignore' is passed so
## that re-running this cell is a no-op rather than a KeyError on the missing columns.
df = df.drop(columns=['Order_Date', 'Time_Orderd', 'Time_Order_picked'], errors='ignore')

In [12]:
df.isnull().sum()

## Total null values in all the features.

Delivery_person_Age            1278
Delivery_person_Ratings        1330
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Weather_conditions                0
Road_traffic_density              0
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries               0
Festival                          0
City                           1128
Time_taken (min)                  0
dtype: int64

In [None]:
## Drop every row that has a NULL value in any of the four columns listed in `subset`.
## This is acceptable because those rows are a small fraction of the whole dataset.
## NOTE: rows were dropped only for features with fewer than 1000 NULL values;
## the remaining NULLs in other columns are left in `df` (df is modified in place).
df.dropna(subset=['Festival', 'multiple_deliveries', 'Weather_conditions', 'Road_traffic_density'], inplace=True)

In [None]:
df[['Festival', 'multiple_deliveries', 'Weather_conditions', 'Road_traffic_density']].isnull().sum()

## No NULL values after dropping the rows. 

Festival                0
multiple_deliveries     0
Weather_conditions      0
Road_traffic_density    0
dtype: int64

In [16]:
## Segregate the independent features (X) from the dependent feature (Y).
## X is every column except the last one; Y is the last column of df.

X, Y = df.iloc[:, :-1], df.iloc[:, -1]

In [19]:
## Split the independent features by data type so they can be visualised and
## pre-processed separately in the later steps:
##   numerical_columns   -> column labels of X whose dtype is not 'object'
##   categorical_columns -> column labels of X whose dtype is 'object'

numerical_columns = X.select_dtypes(exclude='object').columns
categorical_columns = X.select_dtypes(include='object').columns

In [20]:
numerical_columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Vehicle_condition',
       'multiple_deliveries'],
      dtype='object')

In [21]:
categorical_columns

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

In [76]:
df.duplicated().sum()

## No duplicate rows found in the entire dataset.

0

In [77]:
## Descriptive statistics to understand more about the numerical features.

df.describe()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Vehicle_condition,multiple_deliveries,Time_taken (min)
count,43730.0,43676.0,45584.0,45584.0,45584.0,45584.0,45584.0,44591.0,45584.0
mean,29.566911,4.633774,17.017948,70.229684,17.46548,70.844161,1.023385,0.744635,26.293963
std,5.815064,0.334744,8.185674,22.885575,7.335562,21.120578,0.839055,0.57251,9.384298
min,15.0,1.0,-30.905562,-88.366217,0.01,0.01,0.0,0.0,10.0
25%,25.0,4.5,12.933284,73.17,12.988453,73.28,0.0,0.0,19.0
50%,30.0,4.7,18.55144,75.897963,18.633934,76.002574,1.0,1.0,26.0
75%,35.0,4.9,22.728163,78.044095,22.785049,78.107044,2.0,1.0,32.0
max,50.0,6.0,30.914057,88.433452,31.054057,88.563452,3.0,3.0,54.0


In [78]:
## Created a new object which will have all the numerical features including time taken (target feature). 

num_features_with_timetaken = df.select_dtypes(exclude='object').columns

In [None]:
## Plot a HEATMAP of the pairwise correlations between the numerical columns and
## the target variable (all columns in num_features_with_timetaken).
## sns.set(rc=...) sets the figure size to 10x8 through the plotting rc parameters;
## annot=True writes each correlation coefficient inside its cell.

sns.set(rc={"figure.figsize": (10, 8)})
sns.heatmap(df[num_features_with_timetaken].corr(), annot=True)

In [80]:
df.head(2)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23


In [None]:
## For every categorical feature, draw a line plot of 'Time_taken (min)' against
## that feature's categories. plt.show() inside the loop renders each feature as
## its own figure instead of overlaying all of them on one set of axes.
for columns in categorical_columns:
    # `columns` holds a single column label on each iteration
    sns.lineplot(x=columns, y=df['Time_taken (min)'], data=df)
    plt.show()

In [95]:
df.head(2)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,36.0,4.2,30.327968,78.046106,30.397968,78.116106,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,21.0,4.7,10.003064,76.307589,10.043064,76.347589,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23


In [89]:
df['Delivery_person_Ratings'].unique()

## Checking all the unique ratings available in Delivery_person_Ratings. 

array([4.2, 4.7, 4.3, 4.5, 4. , 4.1, 5. , 4.8, 4.9, 3.5, 4.6, nan, 4.4,
       3.8, 3.9, 3.7, 2.6, 2.5, 3.6, 3.1, 2.7, 3.2, 3.3, 3.4, 2.8, 2.9,
       3. ])

In [90]:
df['City'].unique()

array(['Metropolitian', 'Urban', 'Semi-Urban', nan], dtype=object)

In [30]:
## Numerical pipeline:
##   1. fill missing values with the column median
##   2. scale each column to unit variance (with_mean=False, so no centering)

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
])

## Categorical pipeline:
##   1. fill missing values with the most frequent category
##   2. one-hot encode; handle_unknown='ignore' means categories not seen during fit do not raise an error
##   3. scale the encoded columns to unit variance (with_mean=False, so no centering)

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('One_Hot_Encoder', OneHotEncoder(categories='auto', handle_unknown='ignore')),
    ('scaler', StandardScaler(with_mean=False)),
])

## Route each group of columns to its own pipeline.
## Both pipelines are merged into a single object so that numerical and
## categorical data are pre-processed together in one fit/transform call.

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns),
        ('cat_pipeline', cat_pipeline, categorical_columns),
    ]
)


In [157]:
categorical_columns

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

In [161]:
numerical_columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Vehicle_condition',
       'multiple_deliveries'],
      dtype='object')

In [31]:
preprocessor

In [24]:
## Train test split: hold out 30% of the rows as the test set.
## random_state is fixed so the same split is produced on every run.
## (train_test_split is already imported in the notebook's import cell,
## so it is not re-imported here.)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=30)

In [28]:
X_train.head(5)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City
13131,24.0,4.6,13.091809,80.219104,13.151809,80.279104,Fog,Low,0,Meal,motorcycle,1.0,No,Metropolitian
43567,30.0,4.7,18.530963,73.828972,18.540963,73.838972,Sandstorms,High,2,Buffet,motorcycle,1.0,No,Urban
33120,25.0,4.6,11.006686,76.951736,11.086686,77.031736,Stormy,Medium,2,Meal,scooter,0.0,No,Urban
33703,35.0,4.6,12.284747,76.625861,12.314747,76.655861,Fog,Medium,1,Meal,scooter,1.0,No,Metropolitian
10076,25.0,4.9,11.001852,76.976268,11.081852,77.056268,Fog,Low,2,Buffet,electric_scooter,1.0,No,Urban


In [26]:
X_test.shape

(13133, 14)

In [32]:
## Fit the preprocessor on the training split only, then apply the fitted
## transformation to the test split (the test data never influences the fit).
## The original row index is carried over to the new DataFrames so that
## X_train / X_test stay label-aligned with y_train / y_test; without it the
## frames get a fresh 0..n-1 index and any index-based pandas operation that
## combines features with the target would silently misalign rows.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [156]:
df['Weather_conditions'].unique()

array(['Fog', 'Stormy', 'Sandstorms', 'Windy', 'Cloudy', 'Sunny'],
      dtype=object)

In [133]:
X_train.head(2)

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Restaurant_latitude,num_pipeline__Restaurant_longitude,num_pipeline__Delivery_location_latitude,num_pipeline__Delivery_location_longitude,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,cat_pipeline__Weather_conditions_Cloudy,cat_pipeline__Weather_conditions_Fog,...,cat_pipeline__Type_of_order_Meal,cat_pipeline__Type_of_order_Snack,cat_pipeline__Type_of_vehicle_electric_scooter,cat_pipeline__Type_of_vehicle_motorcycle,cat_pipeline__Type_of_vehicle_scooter,cat_pipeline__Festival_No,cat_pipeline__Festival_Yes,cat_pipeline__City_Metropolitian,cat_pipeline__City_Semi-Urban,cat_pipeline__City_Urban
0,4.224172,14.856959,1.645144,3.682979,1.790794,3.800165,0.0,1.75278,0.0,2.662982,...,2.315708,0.0,0.0,2.028988,0.0,7.057785,0.0,2.392429,0.0,0.0
1,5.280215,15.179937,2.328639,3.389599,2.524599,3.495309,2.451552,1.75278,0.0,0.0,...,0.0,0.0,0.0,2.028988,0.0,7.057785,0.0,0.0,0.0,2.406376


In [132]:
X_test.head(2)

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Restaurant_latitude,num_pipeline__Restaurant_longitude,num_pipeline__Delivery_location_latitude,num_pipeline__Delivery_location_longitude,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,cat_pipeline__Weather_conditions_Cloudy,cat_pipeline__Weather_conditions_Fog,...,cat_pipeline__Type_of_order_Meal,cat_pipeline__Type_of_order_Snack,cat_pipeline__Type_of_vehicle_electric_scooter,cat_pipeline__Type_of_vehicle_motorcycle,cat_pipeline__Type_of_vehicle_scooter,cat_pipeline__Festival_No,cat_pipeline__Festival_Yes,cat_pipeline__City_Metropolitian,cat_pipeline__City_Semi-Urban,cat_pipeline__City_Urban
0,5.280215,13.56505,1.629878,3.564833,1.767444,3.675983,1.225776,1.75278,2.66891,0.0,...,2.315708,0.0,0.0,2.028988,0.0,7.057785,0.0,2.392429,0.0,0.0
1,4.048165,15.179937,2.803586,3.359387,3.044683,3.466053,1.225776,1.75278,0.0,2.662982,...,2.315708,0.0,0.0,2.028988,0.0,7.057785,0.0,2.392429,0.0,0.0


In [33]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mse, mae, rmse, r2_square)`` — note that MSE comes first, then MAE,
        then RMSE, then the R2 score.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE computed above; no need to recompute it
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mse, mae, rmse, r2_square

In [37]:
## Train multiple models on the training split and score each one on the test split.
## model_list collects the model names and r2_list the matching R2 scores
## (as fractions, not percentages), in the same order.

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}

model_list=[]
r2_list=[]

for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Make predictions on the held-out test split
    y_pred=model.predict(X_test)

    mse, mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # The metrics are computed from X_test / y_test, so they are labelled as
    # test performance (the previous "Training" label was misleading).
    print('Model Test Performance')
    print("MSE:",mse)
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",np.round(r2_square*100, 2))

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
MSE: 38.40261642605935
RMSE: 6.196984462305788
MAE: 4.908053662529506
R2 score 56.37


Lasso
Model Training Performance
MSE: 46.829656293422
RMSE: 6.843219731487657
MAE: 5.423437446389698
R2 score 46.79


Ridge
Model Training Performance
MSE: 38.36948802725511
RMSE: 6.194310940472323
MAE: 4.907020434149774
R2 score 56.4


Elasticnet
Model Training Performance
MSE: 45.91128091112634
RMSE: 6.775786368468706
MAE: 5.408633294813905
R2 score 47.83




In [36]:
## Check how well a plain LinearRegression fits the data it was trained on.
## reg.score returns the R2 score, shown here as a percentage with 2 decimals.

# fit() returns the fitted estimator, so it can be chained onto the constructor
reg = LinearRegression().fit(X_train, y_train)

# predictions for the training rows themselves
y_train_pred = reg.predict(X_train)

np.round(reg.score(X_train, y_train) * 100, 2)


56.23