In [1]:
import pandas as pd

In [2]:
# Read the training data from the project-relative data folder and preview it.
TRAIN_CSV_PATH = './data/finalTrain.csv'
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [3]:
# Remove the columns that are excluded from the feature matrix built later.
columns_to_drop = [
    'ID',
    'Delivery_person_ID',
    'Order_Date',
    'Time_Orderd',
    'Time_Order_picked',
]
df = df.drop(columns=columns_to_drop)

In [4]:
## Independent (X) and dependent (Y) features
# The target column name is spelled once so X and Y cannot drift apart.
TARGET_COLUMN = 'Time_taken (min)'
X = df.drop(columns=[TARGET_COLUMN])
# Double brackets keep Y as a single-column DataFrame rather than a Series.
Y = df[[TARGET_COLUMN]]

In [5]:
# Display the target frame; Y holds only the 'Time_taken (min)' column.
Y

Unnamed: 0,Time_taken (min)
0,46
1,23
2,21
3,20
4,41
...,...
45579,32
45580,36
45581,16
45582,26


In [6]:
# Partition the feature columns by dtype: object columns go through the
# ordinal-encoding pipeline, every other column goes through the numeric one.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [7]:
# Show the object-dtype feature columns (the ones that get ordinal-encoded).
categorical_cols

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

In [8]:
# Show the non-object feature columns (the ones that get imputed and scaled only).
numerical_cols

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Vehicle_condition',
       'multiple_deliveries'],
      dtype='object')

In [19]:
# Custom ranking for each ordinal variable.
# These lists are handed to OrdinalEncoder(categories=...), which encodes a
# label as its position in the list, so the order here defines the ranking.
Weather_conditions_categories = [
    'Sunny',
    'Cloudy',
    'Windy',
    'Stormy',
    'Fog',
    'Sandstorms',
]
Road_traffic_density_categories = [
    'Low',
    'Medium',
    'High',
    'Jam',
]
Type_of_order_categories = [
    'Drinks',
    'Snack',
    'Meal',
    'Buffet',
]
Type_of_vehicle_categories = [
    'bicycle',
    'electric_scooter',
    'scooter',
    'motorcycle',
]
Festival_categories = [
    'No',
    'Yes',
]
City_categories = [
    'Semi-Urban',
    'Urban',
    'Metropolitian',
]

In [20]:
from sklearn.impute import SimpleImputer ## Handling missing values
from sklearn.preprocessing import StandardScaler # Handling feature scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal encoding
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [21]:
## Numerical Pipeline
# Fill missing values with the column median, then standardise.
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

# Rankings keyed by column name. OrdinalEncoder matches `categories` to the
# input columns by position, so the list passed to it is built from
# `categorical_cols` itself; a change in column order in the CSV can then no
# longer pair a ranking with the wrong column. A categorical column without
# an entry here raises KeyError instead of being encoded incorrectly.
ordinal_categories={
    'Weather_conditions':Weather_conditions_categories,
    'Road_traffic_density':Road_traffic_density_categories,
    'Type_of_order':Type_of_order_categories,
    'Type_of_vehicle':Type_of_vehicle_categories,
    'Festival':Festival_categories,
    'City':City_categories,
}

# Categorical Pipeline
# Fill missing values with the most frequent label, encode each label as its
# rank, then standardise the resulting integer codes.
cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[ordinal_categories[col] for col in categorical_cols])),
        ('scaler',StandardScaler())
    ]
)

# Route each column group through its own pipeline; the output columns are
# the numerical features followed by the categorical ones.
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [22]:
## Train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
split_parts = train_test_split(X, Y, test_size=0.30, random_state=30)
X_train, X_test, y_train, y_test = split_parts

In [23]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split.
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)

# Both frames share the fitted transformer's output column names.
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

In [24]:
# Preview the first rows of the preprocessed training features.
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Restaurant_latitude,num_pipeline__Restaurant_longitude,num_pipeline__Delivery_location_latitude,num_pipeline__Delivery_location_longitude,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,cat_pipeline__Weather_conditions,cat_pipeline__Road_traffic_density,cat_pipeline__Type_of_order,cat_pipeline__Type_of_vehicle,cat_pipeline__Festival,cat_pipeline__City
0,1.124025,-1.62344,0.647373,0.131167,0.669961,0.116679,-1.223528,-1.318236,1.454123,1.308057,1.351245,0.77763,-0.142953,0.532682
1,-0.106505,-3.143059,0.647397,0.131248,0.668629,0.116295,-1.223528,3.943714,-1.489512,1.308057,1.351245,0.77763,6.995304,-1.805257
2,0.245075,0.504025,1.156141,0.443938,1.228657,0.45241,1.160323,0.435747,-0.312058,-1.093916,0.453809,-0.753919,-0.142953,0.532682
3,0.596655,-1.319517,-0.226656,0.165206,-0.293358,0.156297,1.160323,0.435747,-0.312058,1.308057,0.453809,-0.753919,-0.142953,0.532682
4,0.772445,-1.62344,0.258322,0.116402,0.230746,0.098366,-1.223528,0.435747,-0.312058,-1.093916,0.453809,0.77763,-0.142953,-1.805257


In [25]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [26]:
# Baseline: ordinary least squares fitted on the preprocessed training split.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [27]:
# Fitted coefficients of the baseline LinearRegression, one per preprocessed
# feature column. The array is 2-D because the target was passed as a
# single-column DataFrame.
regression.coef_

array([[ 2.25996138, -2.33439181, -0.0061067 , -0.55809097,  0.05791941,
         0.48713551, -1.74376348,  2.10067236,  0.85541195,  3.17988454,
         0.01260889,  0.02511437,  1.63199115,  0.76933256]])

In [28]:
# Fitted intercept term of the baseline LinearRegression.
regression.intercept_

array([26.31769462])

In [29]:
import numpy as np
def evaluate_model(true, predicted):
    """Score regression predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted target values, in the same row order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is evaluated once and used only for the root; the earlier version
    # computed it twice and kept an unused copy.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [30]:
## Train multiple models

# Candidate regressors, all with default hyperparameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]  # fitted estimators, in the same order as model_list
model_list=[]          # model names
r2_list=[]             # R^2 on the test split, as a fraction (not a percentage)

# Iterate over name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) for every lookup.
for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Make predictions on the held-out split
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)
    # Keep the fitted estimator; this list was previously declared but never filled.
    trained_model_list.append(model)

    # NOTE: the heading below says "Training", but the metrics are computed on
    # X_test / y_test. The text is left unchanged to match the recorded output.
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)
    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 6.72718944638382
MAE: 5.30358059435583
R2 score 47.678915748419456


Lasso
Model Training Performance
RMSE: 7.11089617983112
MAE: 5.660438847716622
R2 score 41.54009640339824


Ridge
Model Training Performance
RMSE: 6.7271863337230124
MAE: 5.303579106468125
R2 score 47.678964166187974


Elasticnet
Model Training Performance
RMSE: 7.171928072217933
MAE: 5.724337329044686
R2 score 40.53228249531302


