## Model Training

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/clean_data.csv')
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,multiple_deliveries,Festival,City,Time_taken (min),distance
0,36.0,4.2,Fog,Jam,2,3.0,0.0,Metropolitian,46,10.280582
1,21.0,4.7,Stormy,High,1,1.0,0.0,Metropolitian,23,6.242319
2,23.0,4.7,Sandstorms,Medium,1,1.0,0.0,Metropolitian,21,13.78786
3,34.0,4.3,Sandstorms,Low,0,0.0,0.0,Metropolitian,20,2.930258
4,24.0,4.7,Fog,Jam,1,1.0,0.0,Metropolitian,41,19.396618


In [3]:
## Independent and dependent features
X = df.drop(labels=['Time_taken (min)'],axis=1)
Y = df[['Time_taken (min)']]

In [4]:
X

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,multiple_deliveries,Festival,City,distance
0,36.0,4.2,Fog,Jam,2,3.0,0.0,Metropolitian,10.280582
1,21.0,4.7,Stormy,High,1,1.0,0.0,Metropolitian,6.242319
2,23.0,4.7,Sandstorms,Medium,1,1.0,0.0,Metropolitian,13.787860
3,34.0,4.3,Sandstorms,Low,0,0.0,0.0,Metropolitian,2.930258
4,24.0,4.7,Fog,Jam,1,1.0,0.0,Metropolitian,19.396618
...,...,...,...,...,...,...,...,...,...
40183,35.0,4.2,Windy,Jam,2,1.0,0.0,Metropolitian,16.600272
40184,30.0,4.8,Windy,High,1,0.0,0.0,Metropolitian,1.489846
40185,30.0,4.9,Cloudy,Low,1,0.0,0.0,Metropolitian,4.657195
40186,20.0,4.7,Cloudy,High,0,1.0,0.0,Metropolitian,6.232393


In [5]:
Y

Unnamed: 0,Time_taken (min)
0,46
1,23
2,21
3,20
4,41
...,...
40183,33
40184,32
40185,16
40186,26


In [6]:
# Define which columns should be ordinal-encoded. One Hot encoded and which should be scaled

numerical_cols = X.select_dtypes(exclude='object').columns
categorical_cols = ["Road_traffic_density"]
categorical_cols1 = ["Weather_conditions", "City"]

In [7]:
# Define the custom ranking for each ordinal variable

traffic_categories = ['Low', 'Medium', 'High','Jam']

In [8]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder # Ordinal Encoding and OneHot Encoding

## pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[traffic_categories])),
    ('scaler',StandardScaler())
    ]
)

cat_pipeline1=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehotencoder',OneHotEncoder(handle_unknown = "ignore")),
    ('scaler',StandardScaler(with_mean=False))
    ]
)

# Combine
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols),
('cat_pipeline1',cat_pipeline1,categorical_cols1)
])


In [10]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.20,random_state=30)

In [11]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [12]:
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__Festival,num_pipeline__distance,cat_pipeline__Road_traffic_density,cat_pipeline1__Weather_conditions_Cloudy,cat_pipeline1__Weather_conditions_Fog,cat_pipeline1__Weather_conditions_Sandstorms,cat_pipeline1__Weather_conditions_Stormy,cat_pipeline1__Weather_conditions_Sunny,cat_pipeline1__Weather_conditions_Windy,cat_pipeline1__City_Metropolitian,cat_pipeline1__City_Semi-Urban,cat_pipeline1__City_Urban
0,0.601925,0.84708,1.22758,-1.316189,-0.140803,-0.089951,-0.308107,0.0,2.660144,0.0,0.0,0.0,0.0,0.0,0.0,2.409269
1,-0.791175,1.165041,0.000878,-1.316189,-0.140803,-0.110022,-0.308107,0.0,0.0,0.0,0.0,0.0,2.690804,2.396473,0.0,0.0
2,-0.4429,0.84708,0.000878,-1.316189,-0.140803,-1.460877,-1.111614,0.0,0.0,2.689994,0.0,0.0,0.0,2.396473,0.0,0.0
3,1.124338,-0.106801,-1.225825,3.956112,-0.140803,1.256454,1.298908,2.690399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.409269
4,1.298475,-0.424761,0.000878,0.441244,-0.140803,1.241112,-1.111614,2.690399,0.0,0.0,0.0,0.0,0.0,2.396473,0.0,0.0


In [13]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [14]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [15]:
regression.coef_

array([[ 2.21818257e+00, -2.30068044e+00, -1.84923110e+00,
         1.67320110e+00,  1.33196905e+00,  1.78285963e+00,
         2.88390064e+00, -9.10155700e+12, -9.20507219e+12,
        -9.10292736e+12, -9.21641116e+12, -9.02477293e+12,
        -9.10018619e+12, -5.87335360e+11, -8.10655346e+10,
        -5.84216030e+11]])

In [16]:
regression.intercept_

array([2.58943532e+13])

In [17]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [18]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score:",r2_square*100)
    print("Adjusted_R_sq:", (1- (1-r2_square)*(len(Y)-1)/(len(Y)- X.shape[1]-1))*100)

    r2_list.append(r2_square)
    
    print('='*35)

LinearRegression
Model Training Performance
RMSE: 6.031187492830925
MAE: 4.819442122574023
R2 score: 58.574566491063464
Adjusted_R_sq: 58.56528706198336
Lasso
Model Training Performance
RMSE: 6.670417191267719
MAE: 5.3214569612432125
R2 score: 49.328075764135136
Adjusted_R_sq: 49.31672509167452
Ridge
Model Training Performance
RMSE: 6.0305010642336745
MAE: 4.819440277916971
R2 score: 58.58399547468392
Adjusted_R_sq: 58.57471815772618
Elasticnet
Model Training Performance
RMSE: 6.646212338424474
MAE: 5.326622485390574
R2 score: 49.6951536036679
Adjusted_R_sq: 49.68388515781278
