In [105]:
import pandas as pd
df =pd.read_csv('data/final_raw_data_to_process.csv')

In [106]:
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_vehicle,multiple_deliveries,Festival,City,Ordered_Date_Year,Ordered_Date_Month,Ordered_Date_Day,Time_OrderPicked_hours,Time_OrderPicked_mins,Time_Orderd_hours,Time_Orderd_mins,Time_taken (min),Distance_covered
0,36.0,4.2,Fog,Jam,2,motorcycle,3.0,No,Metropolitian,2022,2,12,22,10,21,55,46,10.280582
1,21.0,4.7,Stormy,High,1,motorcycle,1.0,No,Metropolitian,2022,2,13,15,5,14,55,23,6.242319
2,23.0,4.7,Sandstorms,Medium,1,scooter,1.0,No,Metropolitian,2022,3,4,17,40,17,30,21,13.78786
3,34.0,4.3,Sandstorms,Low,0,motorcycle,0.0,No,Metropolitian,2022,2,13,9,30,9,20,20,2.930258
4,24.0,4.7,Fog,Jam,1,scooter,1.0,No,Metropolitian,2022,2,14,20,5,19,50,41,19.396618


In [107]:
X=df.drop(['Time_taken (min)'],axis=1)
y=df[['Time_taken (min)']]

In [108]:
numerical_columns = X.select_dtypes(exclude='object').columns
categorical_columns  = X.select_dtypes(include='object').columns

In [109]:
Weather_conditions_Map=['Sunny','Sandstorms','Stormy','Windy','Cloudy','Fog']
Road_traffic_density_Map=['Low','Medium','High','Jam']
Type_of_vehicle_Map = ['scooter','electric_scooter','bicycle','motorcycle']
Festival_Map=['No','Yes']
City_Map=['Urban','Metropolitian','Semi-Urban']

In [110]:
from sklearn.impute import SimpleImputer ## to handle missing values with simple strategies
from sklearn.preprocessing import StandardScaler  ## handle feature scaling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal encoding
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [111]:
## Numerical pipeline 
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

## Categorical pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[Weather_conditions_Map,Road_traffic_density_Map,Type_of_vehicle_Map,Festival_Map,City_Map])),
        ('scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline',cat_pipeline,categorical_columns)
])

In [112]:
## Train,test,split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [113]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [114]:
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__Ordered_Date_Year,num_pipeline__Ordered_Date_Month,num_pipeline__Ordered_Date_Day,num_pipeline__Time_OrderPicked_hours,num_pipeline__Time_OrderPicked_mins,num_pipeline__Time_Orderd_hours,num_pipeline__Time_Orderd_mins,num_pipeline__Distance_covered,cat_pipeline__Weather_conditions,cat_pipeline__Road_traffic_density,cat_pipeline__Type_of_vehicle,cat_pipeline__Festival,cat_pipeline__City
0,0.954495,-0.108398,-0.028178,0.441723,0.0,0.03521,0.713479,-0.184367,-0.155852,-0.177397,-1.333471,-0.079651,0.256556,-0.293488,-1.298421,-0.141231,0.521193
1,1.480327,0.19471,1.165832,0.441723,0.0,0.03521,-1.006928,-1.122162,0.521381,-1.117931,-0.215874,-0.084175,-0.906668,0.507865,0.836596,-0.141231,0.521193
2,1.129772,0.497818,1.165832,-1.324947,0.0,0.03521,-0.662846,0.987877,0.521381,0.998272,0.156658,-0.071488,-0.325056,-1.09484,-0.586748,-0.141231,-1.842734
3,-0.447724,1.104034,1.165832,-1.324947,0.0,0.03521,0.598785,-0.418816,0.521381,-0.41253,0.156658,-0.084163,-0.325056,-0.293488,0.836596,-0.141231,0.521193
4,1.480327,0.19471,-1.222187,0.441723,0.0,-1.791606,0.484091,0.518979,1.537231,0.528004,1.274254,-0.077272,0.838168,1.309217,0.836596,-0.141231,0.521193


In [115]:
from sklearn.linear_model import LogisticRegression,LinearRegression,ElasticNet,Lasso,Ridge
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [116]:
# Fitting Random Forest Regression to the dataset
# import the regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae,rmse,r2_square

In [117]:
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet(),
    'Random Forest Regressor':RandomForestRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor()
}
# trained_model_list=[]
# model_list=[]
# r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    ## Make predictions
    y_pred=model.predict(X_test)
    mae,rmse,r2_square = evaluate_model(y_pred,y_test)

    print(list(models.keys())[i])

    print('Model Performance')
    print('Mean Absolute Error = ',mae)
    print('Root mean squarred error = ',rmse)
    print('R2 Score = ',r2_square*100)
    print('='*35)
    print('\n')

LinearRegression
Model Performance
Mean Absolute Error =  5.108631445475199
Root mean squarred error =  6.408379745361034
R2 Score =  13.39228164612929


Lasso
Model Performance
Mean Absolute Error =  5.543392594798884
Root mean squarred error =  6.91241009379246
R2 Score =  -131.3461535359213


Ridge
Model Performance
Mean Absolute Error =  5.108633053318918
Root mean squarred error =  6.40838177924673
R2 Score =  13.388358148449985


ElasticNet
Model Performance
Mean Absolute Error =  5.624953076949281
Root mean squarred error =  6.981929097443579
R2 Score =  -174.82117253299165




  return fit_method(estimator, *args, **kwargs)


Random Forest Regressor
Model Performance
Mean Absolute Error =  3.183627827337428
Root mean squarred error =  4.027204022536157
R2 Score =  78.38309883129376


Decision Tree Regressor
Model Performance
Mean Absolute Error =  4.173661889441357
Root mean squarred error =  5.531127846644189
R2 Score =  65.82292029946876


