In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [30]:
df = pd.read_csv("cleaned_mail_routing_data.csv")
df.head()

Unnamed: 0,Package Type,Source Location,Destination Location,Distance (km),Weight (kg),Urgency Level,Mode of Transmission,Estimated Cost (₹),Delivery Slot (Days),Actual Delivery Time (Days),Delay (Yes/No)
0,Document,Mumbai,Delhi,1400,0.5,High,Air,500.0,2,2,No
1,Parcel,Kolkata,Chennai,1670,5.0,Medium,Road,5000.0,4,6,Yes
2,Small Packet,Pune,Bangalore,835,1.2,Low,Rail,350.0,3,3,No
3,Heavy Parcel,Lucknow,Ahmedabad,1350,20.0,High,Air,5000.0,2,3,Yes
4,Letter,Jaipur,Mumbai,1180,0.1,Medium,Road,1573.333333,4,4,No


In [31]:
dummies = pd.get_dummies(df['Package Type'], drop_first= True).astype(int)

In [32]:
dummies.head()

Unnamed: 0,Heavy Parcel,Letter,Parcel,Small Packet
0,0,0,0,0
1,0,0,1,0
2,0,0,0,1
3,1,0,0,0
4,0,1,0,0


In [33]:
df1 = pd.concat([df, dummies], axis = 'columns')
df1.head()

Unnamed: 0,Package Type,Source Location,Destination Location,Distance (km),Weight (kg),Urgency Level,Mode of Transmission,Estimated Cost (₹),Delivery Slot (Days),Actual Delivery Time (Days),Delay (Yes/No),Heavy Parcel,Letter,Parcel,Small Packet
0,Document,Mumbai,Delhi,1400,0.5,High,Air,500.0,2,2,No,0,0,0,0
1,Parcel,Kolkata,Chennai,1670,5.0,Medium,Road,5000.0,4,6,Yes,0,0,1,0
2,Small Packet,Pune,Bangalore,835,1.2,Low,Rail,350.0,3,3,No,0,0,0,1
3,Heavy Parcel,Lucknow,Ahmedabad,1350,20.0,High,Air,5000.0,2,3,Yes,1,0,0,0
4,Letter,Jaipur,Mumbai,1180,0.1,Medium,Road,1573.333333,4,4,No,0,1,0,0


In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df1['Urgency Level'] = le.fit_transform(df1['Urgency Level'])

In [35]:
df1.head()

Unnamed: 0,Package Type,Source Location,Destination Location,Distance (km),Weight (kg),Urgency Level,Mode of Transmission,Estimated Cost (₹),Delivery Slot (Days),Actual Delivery Time (Days),Delay (Yes/No),Heavy Parcel,Letter,Parcel,Small Packet
0,Document,Mumbai,Delhi,1400,0.5,0,Air,500.0,2,2,No,0,0,0,0
1,Parcel,Kolkata,Chennai,1670,5.0,2,Road,5000.0,4,6,Yes,0,0,1,0
2,Small Packet,Pune,Bangalore,835,1.2,1,Rail,350.0,3,3,No,0,0,0,1
3,Heavy Parcel,Lucknow,Ahmedabad,1350,20.0,0,Air,5000.0,2,3,Yes,1,0,0,0
4,Letter,Jaipur,Mumbai,1180,0.1,2,Road,1573.333333,4,4,No,0,1,0,0


In [36]:
df1['Mode of Transmission'] = le.fit_transform(df1['Mode of Transmission'])
df1.head()

Unnamed: 0,Package Type,Source Location,Destination Location,Distance (km),Weight (kg),Urgency Level,Mode of Transmission,Estimated Cost (₹),Delivery Slot (Days),Actual Delivery Time (Days),Delay (Yes/No),Heavy Parcel,Letter,Parcel,Small Packet
0,Document,Mumbai,Delhi,1400,0.5,0,0,500.0,2,2,No,0,0,0,0
1,Parcel,Kolkata,Chennai,1670,5.0,2,2,5000.0,4,6,Yes,0,0,1,0
2,Small Packet,Pune,Bangalore,835,1.2,1,1,350.0,3,3,No,0,0,0,1
3,Heavy Parcel,Lucknow,Ahmedabad,1350,20.0,0,0,5000.0,2,3,Yes,1,0,0,0
4,Letter,Jaipur,Mumbai,1180,0.1,2,2,1573.333333,4,4,No,0,1,0,0


In [37]:
columns = ['Distance (km)', 'Weight (kg)']
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df1[columns] = scaler.fit_transform(df1[columns])

In [38]:
X = df1.drop(['Package Type', 'Source Location', 'Destination Location', 'Mode of Transmission', 'Estimated Cost (₹)', 'Delivery Slot (Days)', 'Actual Delivery Time (Days)', 'Delay (Yes/No)'], axis = 'columns')

In [39]:
X.head()

Unnamed: 0,Distance (km),Weight (kg),Urgency Level,Heavy Parcel,Letter,Parcel,Small Packet
0,-0.047689,-1.541494,0,0,0,0,0
1,0.287445,-1.226911,2,0,0,1,0
2,-0.74899,-1.492559,1,0,0,0,1
3,-0.109751,-0.1783,0,1,0,0,0
4,-0.320762,-1.569457,2,0,1,0,0


In [40]:
y1 = df1['Mode of Transmission']

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size = 0.2, random_state = 2)

In [42]:
X_train.shape

(540, 7)

In [43]:
X_test.shape

(135, 7)

In [44]:
from xgboost import XGBClassifier 
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8]
}

# Initialize XGBoost classifier
xgb_model = XGBClassifier(random_state=42)

# Use GridSearchCV to search for the best combination of hyperparameters
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y1_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Train the best model
best_model = grid_search.best_estimator_
y1_pred = best_model.predict(X_test)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}


In [45]:
score = best_model.score(X_test, y1_test)
score

0.35555555555555557

In [46]:
from sklearn.metrics import classification_report
print(classification_report(y1_test, y1_pred))

              precision    recall  f1-score   support

           0       0.32      0.20      0.24        41
           1       0.36      0.50      0.42        50
           2       0.38      0.34      0.36        44

    accuracy                           0.36       135
   macro avg       0.35      0.35      0.34       135
weighted avg       0.35      0.36      0.34       135



In [47]:
y2 = df1['Estimated Cost (₹)']

In [48]:
X2_train, X2_test, y2_train,y2_test = train_test_split(X, y2, test_size=0.2)

In [49]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [50]:
xgb_reg = XGBRegressor(
    learning_rate=0.05,   # The learning rate
    max_depth=5,          # The maximum depth of each tree
    n_estimators=200,     # Number of trees (boosting rounds)
    subsample=0.8,        # Fraction of samples used for fitting each tree
    colsample_bytree=0.7, # Fraction of features used for each tree
    random_state=42       # Set random state for reproducibility
)

In [51]:
xgb_reg.fit(X2_train, y2_train)

In [52]:
y2_pred = xgb_reg.predict(X2_test)

In [53]:
mse = mean_squared_error(y2_test, y2_pred)

In [54]:
mse

np.float64(540116613.8511986)