In [36]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
import geopy.distance
from sklearn.ensemble import RandomForestRegressor

In [4]:
df=pd.read_csv('data/order.csv')
df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [5]:
import geopy.distance

def distance(l1,o1,l2,o2):
    return (geopy.distance.distance((l1,o1), (l2,o2)).km)

df['distance']=df.apply(lambda x:distance(x.Restaurant_latitude,x.Restaurant_longitude,x.Delivery_location_latitude,x.Delivery_location_longitude), axis = 1)

Below columns can be dropped out as there is very low coorelation with target veriable or no impact , while some column been transfomed into the required one like the distance

'ID','Delivery_person_ID','Time_Orderd','Order_Date','Restaurant_latitude','Restaurant_longitude',
'Delivery_location_latitude','Delivery_location_longitude','Time_Order_picked','city','type_of_vehicle,
type_of_order,weather_condition)

In [6]:
df=df.drop(labels=['ID','Delivery_person_ID','Time_Orderd','Order_Date','Restaurant_latitude','Restaurant_longitude','Delivery_location_latitude','Delivery_location_longitude','Time_Order_picked'],axis=1)

In [7]:
X=df.drop(labels=['Time_taken (min)'],axis=1)
Y=df['Time_taken (min)']

In [20]:
# Define which columns should be ordinal-encoded and which should be scaled
#categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns
categorical_cols=['Road_traffic_density','Type_of_vehicle','Festival']

In [19]:
df['Type_of_order'].value_counts()
categorical_cols

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

In [21]:

#Weather_cat=['Fog','Stormy','Cloudy','Sandstorms','Windy','Sunny']
traffic_cat=['Low','Jam','Medium','High']
#order_cat=['Snack','Meal','Drinks','Buffet']
vehicle_cat=['motorcycle','scooter','electric_scooter','bicycle']
festival_cat=['Yes','No']
#city_cat=['Metropolitian','Urban','Semi-Urban']




In [22]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[traffic_cat,vehicle_cat,festival_cat])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [23]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [24]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [38]:

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [26]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [27]:
regression.coef_

array([ 2.21003234, -2.57035998, -1.69703164,  2.47326006,  0.17674408,
        1.80275568, -0.02720466, -1.93125881])

In [28]:
regression.intercept_

26.317694622038356

In [29]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [37]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    "RandomForest": RandomForestRegressor()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 7.2607302501915605
MAE: 5.717072442568198
R2 score 39.05051737778875


Lasso
Model Training Performance
RMSE: 7.552187184558426
MAE: 6.00545364008033
R2 score 34.05909394307843


Ridge
Model Training Performance
RMSE: 7.260729000937313
MAE: 5.717073569653293
R2 score 39.05053835126962


Elasticnet
Model Training Performance
RMSE: 7.62110309174192
MAE: 6.084817730172728
R2 score 32.85014304363587


RandomForest
Model Training Performance
RMSE: 5.910124890920461
MAE: 4.6205369576294
R2 score 59.616613482270154




Since we had very low accuracy from the Linear Regression model, SO We will go with Random forest model with below accuracy score
RandomForest
Model Training Performance
RMSE: 5.910124890920461
MAE: 4.6205369576294
R2 score 59.616613482270154
