In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [3]:
## Data Ingestions Step
df = pd.read_csv("finalTrain.csv")

In [4]:
df_copy = df.copy()

In [5]:
df_copy = df_copy.drop(labels=['ID', 'Delivery_person_ID', 'Order_Date', 'Time_Orderd', 'Time_Order_picked'],axis=1)
df_copy.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,36.0,4.2,30.327968,78.046106,30.397968,78.116106,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,21.0,4.7,10.003064,76.307589,10.043064,76.347589,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,23.0,4.7,18.56245,73.916619,18.65245,74.006619,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,34.0,4.3,30.899584,75.809346,30.919584,75.829346,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,24.0,4.7,26.463504,80.372929,26.593504,80.502929,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [6]:
## Missing Values
df_copy.isnull().sum()

Delivery_person_Age            1854
Delivery_person_Ratings        1908
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Weather_conditions              616
Road_traffic_density            601
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries             993
Festival                        228
City                           1200
Time_taken (min)                  0
dtype: int64

In [7]:
df_copy['Delivery_person_Age'] = df_copy['Delivery_person_Age'].fillna(df_copy['Delivery_person_Age'].mean())
df_copy['Delivery_person_Ratings']  = df_copy['Delivery_person_Ratings'].fillna(df_copy['Delivery_person_Ratings'].mean())
df_copy['Weather_conditions']  = df_copy['Weather_conditions'].fillna('Fog')
df_copy['Road_traffic_density']  = df_copy['Road_traffic_density'].fillna('Low')
df_copy['multiple_deliveries'] = df_copy['multiple_deliveries'].fillna(df_copy['multiple_deliveries'].median())
df_copy['Festival']  = df_copy['Festival'].fillna('No')
df_copy['City']  = df_copy['City'].fillna('Metropolitian')

In [8]:
df_copy.isnull().sum()

Delivery_person_Age            0
Delivery_person_Ratings        0
Restaurant_latitude            0
Restaurant_longitude           0
Delivery_location_latitude     0
Delivery_location_longitude    0
Weather_conditions             0
Road_traffic_density           0
Vehicle_condition              0
Type_of_order                  0
Type_of_vehicle                0
multiple_deliveries            0
Festival                       0
City                           0
Time_taken (min)               0
dtype: int64

In [9]:
R = 6371  ##The earth's radius (in km)

def deg_to_rad(degrees):
    return degrees * (np.pi/180)

## The haversine formula
def distcalculate(lat1, lon1, lat2, lon2):
    d_lat = deg_to_rad(lat2-lat1)
    d_lon = deg_to_rad(lon2-lon1)
    a1 = np.sin(d_lat/2)**2 + np.cos(deg_to_rad(lat1))
    a2 = np.cos(deg_to_rad(lat2)) * np.sin(d_lon/2)**2
    a = a1 * a2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

# Create distance column & calculate the distance
df_copy['distance'] = np.nan

for i in range(len(df_copy)):
  df_copy.loc[i, 'distance'] = distcalculate(df_copy.loc[i, 'Restaurant_latitude'], 
                                          df_copy.loc[i, 'Restaurant_longitude'], 
                                          df_copy.loc[i, 'Delivery_location_latitude'], 
                                          df_copy.loc[i, 'Delivery_location_longitude'])

In [10]:
df_copy = df_copy.drop(labels=['Restaurant_latitude',	'Restaurant_longitude',	'Delivery_location_latitude', 'Delivery_location_longitude'],axis=1)
df_copy.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,6.716044
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,4.379914
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,9.484418
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,1.908058
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,12.933369


These graphs shows that the weather condition, type of vehicle, the type of order, vehicle condition do not significantly affect delivery time

In [12]:
df_copy = df_copy.drop(labels=['Weather_conditions', 'Vehicle_condition', 'Type_of_vehicle','Type_of_order'],axis=1)
df_copy.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Road_traffic_density,multiple_deliveries,Festival,City,Time_taken (min),distance
0,36.0,4.2,Jam,3.0,No,Metropolitian,46,6.716044
1,21.0,4.7,High,1.0,No,Metropolitian,23,4.379914
2,23.0,4.7,Medium,1.0,No,Metropolitian,21,9.484418
3,34.0,4.3,Low,0.0,No,Metropolitian,20,1.908058
4,24.0,4.7,Jam,1.0,No,Metropolitian,41,12.933369


In [13]:
## Independent and dependent feature
X = df_copy.drop(labels=['Time_taken (min)'],axis=1)
Y = df_copy[['Time_taken (min)']]

In [14]:
# Define which columns should be ordinal-encoded and which should be scale
categorical_cols = X.select_dtypes(include='object') .columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [15]:
categorical_cols

Index(['Road_traffic_density', 'Festival', 'City'], dtype='object')

In [16]:
for i in categorical_cols:
    print('\n')
    print(df_copy[i].unique())



['Jam' 'High' 'Medium' 'Low']


['No' 'Yes']


['Metropolitian' 'Urban' 'Semi-Urban']


In [17]:
# Define the custom ranking for each ordinal variable
#Weather_condition_categories = ['Fog', 'Stormy', 'Sandstorms', 'Windy', 'Cloudy', 'Sunny']
Road_traffic_density_categories = ['Jam', 'High', 'Medium', 'Low']
#Type_of_order_categories = ['Snack', 'Meal', 'Drinks', 'Buffet']
#Type_of_vehicle_categories = ['motorcycle', 'scooter', 'electric_scooter', 'bicycle']
Festival_categories = ['No', 'Yes']
City_categories = ['Metropolitian', 'Urban', 'Semi-Urban']

In [18]:
# Dependent value
Y

Unnamed: 0,Time_taken (min)
0,46
1,23
2,21
3,20
4,41
...,...
45579,32
45580,36
45581,16
45582,26


In [19]:
from sklearn.impute import SimpleImputer  # Handling missing values
from sklearn.preprocessing import StandardScaler  # Handling feature  scaling
from sklearn.preprocessing import OrdinalEncoder  # ordinal encodind
## Pipelines 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [20]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[Road_traffic_density_categories, Festival_categories, City_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [21]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__multiple_deliveries,num_pipeline__distance,cat_pipeline__Road_traffic_density,cat_pipeline__Festival,cat_pipeline__City
0,1.127187,-1.616514,-1.318236,-0.058522,-1.308057,-0.142953,-0.532682
1,-0.103466,-3.137444,3.943714,-0.05955,-1.308057,6.995304,1.805257
2,0.248149,0.512787,0.435747,-0.063728,1.093916,-0.142953,-0.532682
3,0.599764,-1.312329,0.435747,-0.051777,-1.308057,-0.142953,-0.532682
4,0.775572,-1.616514,0.435747,-0.063617,1.093916,-0.142953,1.805257


In [22]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [23]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [24]:
regression.coef_

array([[ 2.20909766, -2.4156753 ,  2.27297352,  0.07876257, -3.15367838,
         1.76229546, -0.83762666]])

In [25]:
regression.intercept_

array([26.31769462])

In [26]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [27]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 6.987259125992553
MAE: 5.469572183358936
R2 score 43.55530662405218


Lasso
Model Training Performance
RMSE: 7.260363453782937
MAE: 5.7514887191909345
R2 score 39.056675294657495


Ridge
Model Training Performance
RMSE: 6.987256813731662
MAE: 5.46957138448085
R2 score 43.55534398200129


Elasticnet
Model Training Performance
RMSE: 7.356230480519613
MAE: 5.835705139974187
R2 score 37.43663857214218




In [None]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']

In [28]:
features = np.array([[1.127187,	-1.616514,	-1.318236,	-0.058522,	-1.308057,	-0.142953,	-0.532682]])
print("Delivery Time Prediction in Minutes = ", model.predict(features))

Delivery Time Prediction in Minutes =  [30.70626202]


