In [None]:
# Potential Use Cases: 1.Predict delivery delays using status/distance 2.Optimize carrier selection based on cost/transit time  
# 3.Forecast shipping expenses by route

# QUESTION 1: Predict delivery delays using status

In [1]:
import numpy as np
import pandas as pd

In [3]:
df=pd.read_csv("logistics_shipments_dataset.csv")
df.head(3)

Unnamed: 0,Shipment_ID,Origin_Warehouse,Destination,Carrier,Shipment_Date,Delivery_Date,Weight_kg,Cost,Status,Distance_miles,Transit_Days
0,SH10000,Warehouse_MIA,San Francisco,UPS,2023-10-02,2023-10-04,25.7,67.46,Delivered,291,2
1,SH10001,Warehouse_MIA,Atlanta,DHL,2023-12-06,2023-12-09,38.9,268.85,Delivered,1225,3
2,SH10002,Warehouse_LA,Houston,DHL,2023-09-18,2023-09-20,37.2,74.35,Delivered,220,2


In [7]:
df.isnull().sum()

Shipment_ID          0
Origin_Warehouse     0
Destination          0
Carrier              0
Shipment_Date        0
Delivery_Date       32
Weight_kg            0
Cost                41
Status               0
Distance_miles       0
Transit_Days         0
dtype: int64

In [9]:
# Convert dates
df["Shipment_Date"] = pd.to_datetime(df["Shipment_Date"])
df["Delivery_Date"] = pd.to_datetime(df["Delivery_Date"])

#features from dates
df["ship_dayofweek"] = df["Shipment_Date"].dt.dayofweek
df["ship_month"] = df["Shipment_Date"].dt.month
df["delivery_dayofweek"] = df["Delivery_Date"].dt.dayofweek

# From Shipment_Date, we take both day of week and month to capture cycles (weekdays vs weekends, seasonal effects).
# From Delivery_Date, we only take day of week, since it tells if deliveries fall on weekends/holidays.
# Delivery month is skipped because it overlaps with Shipment month + Delivery duration.
# This keeps features informative but avoids redundancy and multicollinearity. ✅

# Alternative
# You could also extract delivery_month for completeness, but:
# It might be highly correlated with ship_month + Delivery_Duration.
# Logistic regression doesn’t like too much multicollinearity.

In [11]:
df["Delivery_Duration"] = (df["Delivery_Date"] - df["Shipment_Date"]).dt.days
df["Delayed_Flag"] = df["Status"].map({"Delayed": 1, "Delivered": 0, "Lost": 0, "In Transit": 0})

#Option 2: Using np.where() (vectorized, fast)
#df["Delayed_Flag"] = np.where(df["Status"] == "Delayed", 1, 0)
#Option 3: Using replace() (also simple)
#df["Delayed_Flag"] = df["Status"].replace({"Delayed": 1, "Delivered": 0, "Lost": 0, "In Transit": 0})
#Option 4: Using map() (clean & short)
#df["Delayed_Flag"] = df["Status"].map({"Delayed": 1, "Delivered": 0, "Lost": 0, "In Transit": 0})

In [13]:
df.head(3)

Unnamed: 0,Shipment_ID,Origin_Warehouse,Destination,Carrier,Shipment_Date,Delivery_Date,Weight_kg,Cost,Status,Distance_miles,Transit_Days,ship_dayofweek,ship_month,delivery_dayofweek,Delivery_Duration,Delayed_Flag
0,SH10000,Warehouse_MIA,San Francisco,UPS,2023-10-02,2023-10-04,25.7,67.46,Delivered,291,2,0,10,2.0,2.0,0.0
1,SH10001,Warehouse_MIA,Atlanta,DHL,2023-12-06,2023-12-09,38.9,268.85,Delivered,1225,3,2,12,5.0,3.0,0.0
2,SH10002,Warehouse_LA,Houston,DHL,2023-09-18,2023-09-20,37.2,74.35,Delivered,220,2,0,9,2.0,2.0,0.0


In [15]:
df.isnull().sum()

Shipment_ID            0
Origin_Warehouse       0
Destination            0
Carrier                0
Shipment_Date          0
Delivery_Date         32
Weight_kg              0
Cost                  41
Status                 0
Distance_miles         0
Transit_Days           0
ship_dayofweek         0
ship_month             0
delivery_dayofweek    32
Delivery_Duration     32
Delayed_Flag          32
dtype: int64

In [17]:
df=df.drop(columns="Delivery_Date")
df=df.drop(columns="Shipment_Date")
df=df.drop(columns="Shipment_ID")

In [19]:
df.head(4)

Unnamed: 0,Origin_Warehouse,Destination,Carrier,Weight_kg,Cost,Status,Distance_miles,Transit_Days,ship_dayofweek,ship_month,delivery_dayofweek,Delivery_Duration,Delayed_Flag
0,Warehouse_MIA,San Francisco,UPS,25.7,67.46,Delivered,291,2,0,10,2.0,2.0,0.0
1,Warehouse_MIA,Atlanta,DHL,38.9,268.85,Delivered,1225,3,2,12,5.0,3.0,0.0
2,Warehouse_LA,Houston,DHL,37.2,74.35,Delivered,220,2,0,9,2.0,2.0,0.0
3,Warehouse_BOS,Seattle,OnTrac,42.6,187.04,Delivered,1156,9,3,1,5.0,9.0,0.0


In [21]:
df.isnull().sum()

Origin_Warehouse       0
Destination            0
Carrier                0
Weight_kg              0
Cost                  41
Status                 0
Distance_miles         0
Transit_Days           0
ship_dayofweek         0
ship_month             0
delivery_dayofweek    32
Delivery_Duration     32
Delayed_Flag          32
dtype: int64

In [23]:
#Filling Missing values
from sklearn.impute import SimpleImputer
si_cost = SimpleImputer()
df["Cost"]=si_cost.fit_transform(df[["Cost"]])
si_delivery_dayofweek  = SimpleImputer(strategy="most_frequent")
df["delivery_dayofweek"] = si_delivery_dayofweek .fit_transform(df[["delivery_dayofweek"]])
si_Delivery_Duration  = SimpleImputer(strategy="most_frequent")
df["Delivery_Duration"] = si_Delivery_Duration.fit_transform(df[["Delivery_Duration"]])
si_Delayed_Flag   = SimpleImputer(strategy="most_frequent")
df["Delayed_Flag"] = si_Delayed_Flag.fit_transform(df[["Delayed_Flag"]])


In [25]:
df.isnull().sum()

Origin_Warehouse      0
Destination           0
Carrier               0
Weight_kg             0
Cost                  0
Status                0
Distance_miles        0
Transit_Days          0
ship_dayofweek        0
ship_month            0
delivery_dayofweek    0
Delivery_Duration     0
Delayed_Flag          0
dtype: int64

In [27]:
print(df.columns)


Index(['Origin_Warehouse', 'Destination', 'Carrier', 'Weight_kg', 'Cost',
       'Status', 'Distance_miles', 'Transit_Days', 'ship_dayofweek',
       'ship_month', 'delivery_dayofweek', 'Delivery_Duration',
       'Delayed_Flag'],
      dtype='object')


In [29]:
df.isnull().sum()

Origin_Warehouse      0
Destination           0
Carrier               0
Weight_kg             0
Cost                  0
Status                0
Distance_miles        0
Transit_Days          0
ship_dayofweek        0
ship_month            0
delivery_dayofweek    0
Delivery_Duration     0
Delayed_Flag          0
dtype: int64

In [31]:
df=pd.get_dummies(df,columns=["Origin_Warehouse"])
df=pd.get_dummies(df,columns=["Destination"])
df=pd.get_dummies(df,columns=["Carrier"])

In [33]:
x=df.drop(columns=["Status","Delayed_Flag"])
y=df["Delayed_Flag"]

In [35]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [37]:
df["Delayed_Flag"].value_counts()

Delayed_Flag
0.0    1801
1.0     199
Name: count, dtype: int64

In [39]:
from imblearn.over_sampling import RandomOverSampler
randomsamp=RandomOverSampler(random_state=42)
x_resampled,y_resampled=randomsamp.fit_resample(x_train,y_train)

In [41]:
# After resampling
from collections import Counter

print("Before Resampling:", Counter(y_train))
print("After Resampling:", Counter(y_resampled))

Before Resampling: Counter({0.0: 1437, 1.0: 163})
After Resampling: Counter({0.0: 1437, 1.0: 1437})


In [45]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(max_iter=1000, random_state=42)
lr.fit(x_resampled,y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
y_pred=lr.predict(x_test)

In [49]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9825

In [51]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average='weighted')


0.9819067011463827

In [53]:
param_grid = [
    {'penalty':['l1','l2','elasticnet','none'],
    'C' : np.logspace(-4,4,20),
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter'  : [10,50,100,150,500,1000,2000,3000]
}
]

In [55]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(lr,param_grid = param_grid, cv = 3, verbose=True,n_jobs=-1)
gridsearch

In [67]:
bestgridsearch = gridsearch.fit(x_resampled,y_resampled)
bestgridsearch.best_estimator_

Fitting 3 folds for each of 3200 candidates, totalling 9600 fits


6240 fits failed out of a total of 9600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
480 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [69]:
y_pred_test = bestgridsearch.predict(x_test)

In [71]:
print(f'Accuracy : {bestgridsearch.score(x_test,y_test):.3f}')

Accuracy : 0.985


In [73]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[363,   1],
       [  6,  30]], dtype=int64)

# QUESTION 2: Optimize carrier selection based on cost/transit time

In [1]:
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv("logistics_shipments_dataset.csv")
df.head(3)

Unnamed: 0,Shipment_ID,Origin_Warehouse,Destination,Carrier,Shipment_Date,Delivery_Date,Weight_kg,Cost,Status,Distance_miles,Transit_Days
0,SH10000,Warehouse_MIA,San Francisco,UPS,2023-10-02,2023-10-04,25.7,67.46,Delivered,291,2
1,SH10001,Warehouse_MIA,Atlanta,DHL,2023-12-06,2023-12-09,38.9,268.85,Delivered,1225,3
2,SH10002,Warehouse_LA,Houston,DHL,2023-09-18,2023-09-20,37.2,74.35,Delivered,220,2


In [5]:
# Convert dates
df["Shipment_Date"] = pd.to_datetime(df["Shipment_Date"])
df["Delivery_Date"] = pd.to_datetime(df["Delivery_Date"])

In [7]:
# Extract features from dates
df["ship_dayofweek"] = df["Shipment_Date"].dt.dayofweek
df["ship_month"] = df["Shipment_Date"].dt.month
df["delivery_dayofweek"] = df["Delivery_Date"].dt.dayofweek
df["Delivery_Duration"] = (df["Delivery_Date"] - df["Shipment_Date"]).dt.days

In [9]:
# Drop unnecessary columns
df = df.drop(columns=["Shipment_Date", "Delivery_Date", "Shipment_ID", "Status"])
df.head(3)

Unnamed: 0,Origin_Warehouse,Destination,Carrier,Weight_kg,Cost,Distance_miles,Transit_Days,ship_dayofweek,ship_month,delivery_dayofweek,Delivery_Duration
0,Warehouse_MIA,San Francisco,UPS,25.7,67.46,291,2,0,10,2.0,2.0
1,Warehouse_MIA,Atlanta,DHL,38.9,268.85,1225,3,2,12,5.0,3.0
2,Warehouse_LA,Houston,DHL,37.2,74.35,220,2,0,9,2.0,2.0


In [11]:
# Fill missing values
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy="most_frequent")
df["Delivery_Duration"] = si.fit_transform(df[["Delivery_Duration"]])
df["delivery_dayofweek"] = si.fit_transform(df[["delivery_dayofweek"]])
si_cost=SimpleImputer()
df["Cost"] = si_cost.fit_transform(df[["Cost"]])

In [13]:
df = pd.get_dummies(df, columns=["Origin_Warehouse","Destination"])
df.head(3)

Unnamed: 0,Carrier,Weight_kg,Cost,Distance_miles,Transit_Days,ship_dayofweek,ship_month,delivery_dayofweek,Delivery_Duration,Origin_Warehouse_Warehouse_ATL,...,Destination_Detroit,Destination_Houston,Destination_Los Angeles,Destination_Miami,Destination_Minneapolis,Destination_New York,Destination_Phoenix,Destination_Portland,Destination_San Francisco,Destination_Seattle
0,UPS,25.7,67.46,291,2,0,10,2.0,2.0,False,...,False,False,False,False,False,False,False,False,True,False
1,DHL,38.9,268.85,1225,3,2,12,5.0,3.0,False,...,False,False,False,False,False,False,False,False,False,False
2,DHL,37.2,74.35,220,2,0,9,2.0,2.0,False,...,False,True,False,False,False,False,False,False,False,False


In [15]:
X = df.drop(columns=["Carrier"])
y = df["Carrier"]

In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [21]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=500)
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
y_pred = lr.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.17

In [27]:
param_grid = [
    {'penalty':['l1','l2','elasticnet','none'],
    'C' : np.logspace(-4,4,20),
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter'  : [10,50,100,150,500,1000,2000,3000]
}
]

In [29]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(lr,param_grid = param_grid, cv = 3, verbose=True,n_jobs=-1)
gridsearch

In [31]:
bestgridsearch = gridsearch.fit(X,y)
bestgridsearch.best_estimator_

Fitting 3 folds for each of 3200 candidates, totalling 9600 fits


6240 fits failed out of a total of 9600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
480 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [35]:
print(f'Accuracy : {bestgridsearch.score(X,y):.3f}')

Accuracy : 0.234


In [93]:
df["Carrier"].value_counts()

Carrier
LaserShip           303
OnTrac              299
FedEx               295
USPS                292
DHL                 281
Amazon Logistics    274
UPS                 256
Name: count, dtype: int64

# Question 3: Forecast shipping expenses by route

In [186]:
import numpy as np
import pandas as pd

In [188]:
df = pd.read_csv("logistics_shipments_dataset.csv")

In [190]:
df.head(3)

Unnamed: 0,Shipment_ID,Origin_Warehouse,Destination,Carrier,Shipment_Date,Delivery_Date,Weight_kg,Cost,Status,Distance_miles,Transit_Days
0,SH10000,Warehouse_MIA,San Francisco,UPS,2023-10-02,2023-10-04,25.7,67.46,Delivered,291,2
1,SH10001,Warehouse_MIA,Atlanta,DHL,2023-12-06,2023-12-09,38.9,268.85,Delivered,1225,3
2,SH10002,Warehouse_LA,Houston,DHL,2023-09-18,2023-09-20,37.2,74.35,Delivered,220,2


In [192]:
df["Shipment_Date"] = pd.to_datetime(df["Shipment_Date"])
df["Delivery_Date"] = pd.to_datetime(df["Delivery_Date"])

In [194]:
# feature engineering
df["Delivery_Duration"] = (df["Delivery_Date"] - df["Shipment_Date"]).dt.days

In [196]:
df = df.drop(columns=["Shipment_ID", "Shipment_Date", "Delivery_Date", "Status"])

In [198]:
df.isnull().sum()

Origin_Warehouse      0
Destination           0
Carrier               0
Weight_kg             0
Cost                 41
Distance_miles        0
Transit_Days          0
Delivery_Duration    32
dtype: int64

In [200]:
from sklearn.impute import SimpleImputer
si_cost = SimpleImputer()
df["Cost"] = si_cost.fit_transform(df[["Cost"]])
si_duration = SimpleImputer()
df["Delivery_Duration"] = si_duration.fit_transform(df[["Delivery_Duration"]])

In [202]:
df.isnull().sum()

Origin_Warehouse     0
Destination          0
Carrier              0
Weight_kg            0
Cost                 0
Distance_miles       0
Transit_Days         0
Delivery_Duration    0
dtype: int64

In [204]:
from smogn import smoter
df_balanced = smoter(
    data=df,
    y='Cost'  # target column
)


dist_matrix: 100%|###################################################################| 104/104 [00:04<00:00, 24.96it/s]
synth_matrix: 100%|##################################################################| 104/104 [00:02<00:00, 39.64it/s]
r_index: 100%|########################################################################| 63/63 [00:00<00:00, 317.16it/s]
1      Warehouse_ATL
2      Warehouse_ATL
3      Warehouse_ATL
4      Warehouse_ATL
           ...      
890    Warehouse_ATL
891    Warehouse_ATL
892    Warehouse_ATL
893    Warehouse_ATL
894    Warehouse_ATL
Name: 0, Length: 895, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  data_new.iloc[:, j] = data_new.iloc[:, j].replace(x, cat_list[x])
1      Houston
2      Houston
3      Houston
4      Houston
        ...   
890    Houston
891    Houston
892    Houston
893    Houston
894    Houston
Name: 1, Length: 895, dtype: object' has dtype incompatible with float64, please expli

In [206]:
df = pd.get_dummies(df, columns=["Origin_Warehouse", "Destination", "Carrier"])

In [172]:
X = df.drop(columns=["Cost"])
y = df["Cost"]

In [208]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [210]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

In [218]:
y_pred = lr.predict(X_test)

In [220]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.09601520070984304

In [222]:
df["Cost"].value_counts()

Cost
205.161598    41
107.880000     2
43.350000      2
326.900000     2
42.110000      2
              ..
94.630000      1
260.860000     1
220.620000     1
230.690000     1
360.740000     1
Name: count, Length: 1904, dtype: int64

In [148]:
pip install smogn

Collecting smognNote: you may need to restart the kernel to use updated packages.

  Downloading smogn-0.1.2-py3-none-any.whl.metadata (4.5 kB)
Downloading smogn-0.1.2-py3-none-any.whl (30 kB)
Installing collected packages: smogn
Successfully installed smogn-0.1.2


In [142]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [225]:
np.sqrt(mean_squared_error(y_test, y_pred))

322.700594016156

In [227]:
mean_absolute_error(y_test, y_pred)

48.29060581939759