## Model Training

In [48]:
import pandas as pd

In [49]:
df = pd.read_csv('./data/clean_data.csv')
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,multiple_deliveries,Festival,City,Time_taken (min),distance
0,36.0,4.2,Fog,Jam,2,3.0,0.0,Metropolitian,46,10.280582
1,21.0,4.7,Stormy,High,1,1.0,0.0,Metropolitian,23,6.242319
2,23.0,4.7,Sandstorms,Medium,1,1.0,0.0,Metropolitian,21,13.78786
3,34.0,4.3,Sandstorms,Low,0,0.0,0.0,Metropolitian,20,2.930258
4,24.0,4.7,Fog,Jam,1,1.0,0.0,Metropolitian,41,19.396618


In [50]:
## Independent and dependent features
X = df.drop(labels=['Time_taken (min)'],axis=1)
Y = df[['Time_taken (min)']]

In [51]:
X

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,multiple_deliveries,Festival,City,distance
0,36.0,4.2,Fog,Jam,2,3.0,0.0,Metropolitian,10.280582
1,21.0,4.7,Stormy,High,1,1.0,0.0,Metropolitian,6.242319
2,23.0,4.7,Sandstorms,Medium,1,1.0,0.0,Metropolitian,13.787860
3,34.0,4.3,Sandstorms,Low,0,0.0,0.0,Metropolitian,2.930258
4,24.0,4.7,Fog,Jam,1,1.0,0.0,Metropolitian,19.396618
...,...,...,...,...,...,...,...,...,...
40183,35.0,4.2,Windy,Jam,2,1.0,0.0,Metropolitian,16.600272
40184,30.0,4.8,Windy,High,1,0.0,0.0,Metropolitian,1.489846
40185,30.0,4.9,Cloudy,Low,1,0.0,0.0,Metropolitian,4.657195
40186,20.0,4.7,Cloudy,High,0,1.0,0.0,Metropolitian,6.232393


In [52]:
Y

Unnamed: 0,Time_taken (min)
0,46
1,23
2,21
3,20
4,41
...,...
40183,33
40184,32
40185,16
40186,26


In [53]:
# Define which columns should be ordinal-encoded. One Hot encoded and which should be scaled

numerical_cols = X.select_dtypes(exclude='object').columns
categorical_cols = ["Road_traffic_density"]
categorical_cols1 = ["Weather_conditions", "City"]

In [54]:
# Define the custom ranking for each ordinal variable

traffic_categories = ['Low', 'Medium', 'High','Jam']

In [55]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder # Ordinal Encoding and OneHot Encoding

## pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [56]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[traffic_categories]))
    ]
)

cat_pipeline1=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehotencoder',OneHotEncoder(handle_unknown = "ignore"))
    ]
)

# Combine
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols),
('cat_pipeline1',cat_pipeline1,categorical_cols1)
])


In [57]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=42)

In [58]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [59]:
X_train

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__Festival,num_pipeline__distance,cat_pipeline__Road_traffic_density,cat_pipeline1__Weather_conditions_Cloudy,cat_pipeline1__Weather_conditions_Fog,cat_pipeline1__Weather_conditions_Sandstorms,cat_pipeline1__Weather_conditions_Stormy,cat_pipeline1__Weather_conditions_Sunny,cat_pipeline1__Weather_conditions_Windy,cat_pipeline1__City_Metropolitian,cat_pipeline1__City_Semi-Urban,cat_pipeline1__City_Urban
0,0.075307,-0.430527,1.227443,0.445548,-0.140073,1.878683,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-1.669248,0.209814,0.003452,0.445548,-0.140073,-0.337149,3.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.796970,0.529985,0.003452,0.445548,-0.140073,-1.456563,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.598674,0.850155,0.003452,0.445548,-0.140073,-0.107654,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,-1.669248,-2.031380,0.003452,0.445548,-0.140073,0.456096,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30136,-1.320337,-0.110356,1.227443,0.445548,-0.140073,0.700647,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
30137,1.296496,0.209814,1.227443,0.445548,-0.140073,-1.453764,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
30138,0.075307,-0.750698,-1.220539,0.445548,-0.140073,1.330456,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
30139,0.075307,0.529985,0.003452,0.445548,-0.140073,-1.190634,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [60]:
X_train.shape

(30141, 16)

In [61]:
y_train.shape

(30141, 1)

In [62]:
X_test.shape

(10047, 16)

In [63]:
y_test.shape

(10047, 1)

In [64]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
import xgboost as xgb

# Create a dictionary to store the models
models = {}

# Decision Tree Regression
dt_params = {
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [2, 4, 6]
}
dt_model = DecisionTreeRegressor(random_state=42)
dt_random = RandomizedSearchCV(dt_model, dt_params, scoring='r2', cv=5, n_iter=5, random_state=42)
dt_random.fit(X_train, y_train)
dt_best_model = dt_random.best_estimator_
models['Decision Tree'] = dt_best_model

# Random Forest Regression
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, None],
    'min_samples_leaf': [1, 2, 4]
}
rf_model = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(rf_model, rf_params, scoring='r2', cv=5, n_iter=5, random_state=42)
rf_random.fit(X_train, y_train)
rf_best_model = rf_random.best_estimator_
models['Random Forest'] = rf_best_model

# Bagging Regression
bagging_model = BaggingRegressor(base_estimator=dt_best_model, n_estimators=10, random_state=42)
models['Bagging'] = bagging_model

# XGBoost Regression
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.005, 0.1, 0.2]
}
xgb_random = RandomizedSearchCV(xgb_model, xgb_params, scoring='r2', cv=5, n_iter=5, random_state=42)
xgb_random.fit(X_train, y_train)
xgb_best_model = xgb_random.best_estimator_
models['XGBoost'] = xgb_best_model

# Evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    train_rmse = mean_squared_error(y_train, train_pred, squared=False)
    test_rmse = mean_squared_error(y_test, test_pred, squared=False)
    print(f"Model: {model_name}")
    print(f"Train R^2 Score: {train_r2*100:.4f}")
    print(f"Test R^2 Score: {test_r2*100:.4f}")
    print(f"Train RMSE: {train_rmse:.4f}")
    print(f"Test RMSE: {test_rmse:.4f}")
    print("Adjusted_R_sq:", (1- (1-train_r2)*(len(y_train)-1)/(len(y_train)- X_train.shape[1]-1))*100)
    print("Adjusted_R_sq:", (1- (1-test_r2)*(len(y_test)-1)/(len(y_test)- X_test.shape[1]-1))*100)
    print("------------------------------------")

# Find the best model based on the R-squared score
best_model_name = max(models, key=lambda x: models[x].score(X_test, y_test))
best_model = models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Best Model R^2 Score: {best_model.score(X_test, y_test):.4f}")



  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Model: Decision Tree
Train R^2 Score: 71.6223
Test R^2 Score: 71.3579
Train RMSE: 4.9885
Test RMSE: 5.0340
Adjusted_R_sq: 71.60726030598217
Adjusted_R_sq: 71.31224081768372
------------------------------------


  model.fit(X_train, y_train)


Model: Random Forest
Train R^2 Score: 84.3315
Test R^2 Score: 83.1071
Train RMSE: 3.7068
Test RMSE: 3.8660
Adjusted_R_sq: 84.32315992234128
Adjusted_R_sq: 83.08010413256869
------------------------------------


  return column_or_1d(y, warn=True)


Model: Bagging
Train R^2 Score: 73.9574
Test R^2 Score: 73.6336
Train RMSE: 4.7788
Test RMSE: 4.8299
Adjusted_R_sq: 73.94358316955075
Adjusted_R_sq: 73.59158856590462
------------------------------------
Model: XGBoost
Train R^2 Score: 84.3905
Test R^2 Score: 83.0380
Train RMSE: 3.6998
Test RMSE: 3.8739
Adjusted_R_sq: 84.38222266774034
Adjusted_R_sq: 83.01092138134958
------------------------------------
Best Model: Random Forest
Best Model R^2 Score: 0.8311


In [65]:
print(xgb_random.best_params_)

{'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1}


In [66]:
print(rf_random.best_params_)

{'n_estimators': 200, 'min_samples_leaf': 2, 'max_depth': 10}


In [67]:
print(dt_random.best_params_)

{'min_samples_leaf': 6, 'max_depth': 7}
