# Model Training

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
import pandas as pd

In [39]:
# Read the training data from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='finalTrain.csv')
df.head(n=5)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [40]:
def haversine_vectorize(lat1, lon1, lat2, lon2, radius=6367):
    """Great-circle distance between two points via the haversine formula.

    Every operation is a NumPy ufunc, so the inputs may be scalars, NumPy
    arrays or pandas Series; array-like inputs are processed element-wise.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
    radius : float, default 6367
        Sphere radius used to scale the central angle. The default yields
        kilometres; pass 3958 for miles.

    Returns
    -------
    float or array-like
        Distance(s) expressed in the same unit as ``radius``.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    newlon = lon2 - lon1
    newlat = lat2 - lat1

    haver_formula = np.sin(newlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(newlon/2.0)**2
    # Floating-point rounding can push the term marginally outside [0, 1]
    # (e.g. near-antipodal points), which would turn sqrt/arcsin into NaN.
    haver_formula = np.clip(haver_formula, 0.0, 1.0)

    dist = 2 * np.arcsin(np.sqrt(haver_formula))
    return radius * dist

In [41]:
# Coordinate columns, listed as (start lat, start lon, end lat, end lon).
dist_cols = [
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
]

In [42]:
# haversine_vectorize is built from NumPy ufuncs, so whole columns can be
# passed in one call instead of looping over rows with apply(axis=1).
# This also avoids positional x[0]..x[3] look-ups on a label-indexed Series,
# which pandas has deprecated.
# dist_cols is ordered (restaurant lat, restaurant lon, delivery lat,
# delivery lon), matching the function's (lat1, lon1, lat2, lon2) signature.
df['Distance'] = haversine_vectorize(*(df[col] for col in dist_cols))

In [43]:
# Remove the identifier, date/time and raw coordinate columns.
df = df.drop(
    columns=[
        'ID',
        'Delivery_person_ID',
        'Order_Date',
        'Time_Orderd',
        'Time_Order_picked',
        'Restaurant_latitude',
        'Restaurant_longitude',
        'Delivery_location_latitude',
        'Delivery_location_longitude',
    ]
)

In [44]:
# Preview the frame after the column drop and the added Distance column.
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Distance
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.274127
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.238399
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.779204
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.928418
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.38444


In [45]:
# Split the frame into the target (Y) and the remaining predictor columns (X).
Y = df['Time_taken (min)']
X = df.drop(columns='Time_taken (min)')

In [46]:
# Partition the predictor columns by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [47]:
# Show which columns were picked up as categorical.
categorical_cols

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

In [48]:
# Explicit category order for each categorical column. These lists are
# handed to OrdinalEncoder(categories=...) further down, so a value's
# position in its list becomes its integer code.
Weather_conditions_cat = [
    'Fog', 'Stormy', 'Sandstorms', 'Windy', 'Cloudy', 'Sunny',
]
Road_traffic_density_cat = ['Jam', 'High', 'Medium', 'Low']
Type_of_order_cat = ['Snack', 'Meal', 'Drinks', 'Buffet']
Type_of_vehicle_cat = [
    'motorcycle', 'scooter', 'electric_scooter', 'bicycle',
]
Festival_cat = ['No', 'Yes']
City_cat = ['Metropolitian', 'Urban', 'Semi-Urban']

In [49]:
from sklearn.impute import SimpleImputer ##handling missing values
from sklearn.preprocessing import StandardScaler ##handling feature scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal encoding

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [50]:
# Numerical branch: fill missing values with the column median, then
# standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# map categories to integers using the explicit orderings, then standardise.
# The order of the `categories` lists must match the column order in
# categorical_cols.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories=[
        Weather_conditions_cat,
        Road_traffic_density_cat,
        Type_of_order_cat,
        Type_of_vehicle_cat,
        Festival_cat,
        City_cat,
    ])),
    ('scaler', StandardScaler()),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])

In [51]:
# Display the numerical pipeline object.
num_pipeline

In [52]:
## Train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; random_state is fixed at 30.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [53]:
# Fits the preprocessor on the training split and displays the transformed
# array; the result is not assigned to a variable here.
preprocessor.fit_transform(X_train)

array([[ 1.12402521, -1.6234405 , -1.2235278 , ..., -0.77763048,
        -0.14295305, -0.532682  ],
       [-0.10650539, -3.14305853, -1.2235278 , ..., -0.77763048,
         6.99530359,  1.8052571 ],
       [ 0.24507478,  0.50402474,  1.16032303, ...,  0.75391902,
        -0.14295305, -0.532682  ],
       ...,
       [-1.68861615,  0.20010114,  1.16032303, ...,  0.75391902,
        -0.14295305, -0.532682  ],
       [ 0.77244504, -0.10382247, -0.03160238, ..., -0.77763048,
        -0.14295305, -0.532682  ],
       [-1.33703598,  0.20010114, -1.2235278 , ..., -0.77763048,
        -0.14295305, -0.532682  ]])

In [54]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split.
# The original row index is kept so the transformed frames stay label-aligned
# with y_train / y_test (a bare pd.DataFrame(array) would reset it to 0..n-1).
# NOTE: this cell overwrites X_train / X_test with differently named columns,
# so it cannot be re-run without first re-running the train/test split.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=X_train.columns,
    index=X_test.index,
)

In [55]:
# Preview the transformed training features.
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__Distance,cat_pipeline__Weather_conditions,cat_pipeline__Road_traffic_density,cat_pipeline__Type_of_order,cat_pipeline__Type_of_vehicle,cat_pipeline__Festival,cat_pipeline__City
0,1.124025,-1.62344,-1.223528,-1.318236,-0.079476,-0.261197,-1.308057,1.351642,-0.77763,-0.142953,-0.532682
1,-0.106505,-3.143059,-1.223528,3.943714,-0.080867,1.482131,-1.308057,1.351642,-0.77763,6.995304,1.805257
2,0.245075,0.504025,1.160323,0.435747,-0.086475,0.319912,1.093916,-0.437167,0.753919,-0.142953,-0.532682
3,0.596655,-1.319517,1.160323,0.435747,-0.070761,0.319912,-1.308057,-0.437167,0.753919,-0.142953,-0.532682
4,0.772445,-1.62344,-1.223528,0.435747,-0.086406,0.319912,1.093916,-0.437167,-0.77763,-0.142953,1.805257


In [56]:
#model training
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [58]:
# Fit a plain linear regression on the transformed training split.
regression = LinearRegression()
regression.fit(X_train, y_train)


In [59]:
# Fitted coefficients, one per column of X_train.
regression.coef_


array([ 2.2652838 , -2.33876763, -1.77834742,  2.06927729,  0.18159268,
       -1.16256967, -3.20381037, -0.048139  , -0.02144378,  1.61084188,
       -0.75666133])

In [60]:
# Fitted intercept term.
regression.intercept_

26.317694622038353

In [68]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute MAE, RMSE and R^2 for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute MSE once and derive RMSE from it instead of recomputing it.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [69]:
## Train multiple models

# Candidate regressors, all with their default hyper-parameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

# Iterate over name/estimator pairs directly rather than rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: the header below says "Training", but the metrics are computed
    # on X_test / y_test.
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 6.680899444595755
MAE: 5.271594863101187
R2 score 48.39648446551805


Lasso
Model Training Performance
RMSE: 7.081947624124816
MAE: 5.638425369306291
R2 score 42.015109674383055


Ridge
Model Training Performance
RMSE: 6.680897096081736
MAE: 5.2715932951485565
R2 score 48.396520745530744


Elasticnet
Model Training Performance
RMSE: 7.138249710610972
MAE: 5.698084806403554
R2 score 41.089475177376045


