# Logistics Shipment Delay Classification

In this notebook, we:
1. Generate (or load) a synthetic logistics dataset
2. Perform data exploration & preprocessing
3. Split the data into training and test sets
4. Train a Logistic Regression model
5. Evaluate the model’s performance
6. (Optionally) visualize the confusion matrix

In [None]:
# --------------------------------------------------
# 1. IMPORT LIBRARIES
# --------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fix the state of NumPy's global random generator so the random draws
# made through np.random.* are repeatable from run to run.
np.random.seed(seed=42)

In [None]:
# --------------------------------------------------
# 2. DATA GENERATION (SYNTHETIC) OR UPLOAD
# --------------------------------------------------
# Below, we create a synthetic DataFrame with columns relevant to a logistics scenario.

# Re-seed inside this cell so that re-running it on its own regenerates the
# same dataset instead of continuing from wherever the global RNG was left.
np.random.seed(42)

num_samples = 500
distances = np.random.randint(10, 3000, num_samples)  # distance in km (upper bound exclusive)
weights = np.random.randint(1, 200, num_samples)      # shipment weight in kg (upper bound exclusive)
shipping_modes = np.random.choice(['Air', 'Truck', 'Ship'], size=num_samples)
weekday_sent = np.random.randint(1, 8, num_samples)   # day of week (1=Monday,...7=Sunday)
weather = np.random.choice(['Clear', 'Rain', 'Storm'], size=num_samples)

# Target 'delayed': a shipment is always delayed when either rule holds:
#   * it travels more than 2000 km during a storm, or
#   * it goes by ship in anything other than clear weather.
long_haul_in_storm = (distances > 2000) & (weather == 'Storm')
ship_in_bad_weather = (shipping_modes == 'Ship') & (weather != 'Clear')
always_delayed = long_haul_in_storm | ship_in_bad_weather

# Every other shipment is delayed at random with probability 0.2.
# Random draws are made only for the rows not covered by the rules above.
is_delayed = np.ones(num_samples, dtype=np.int64)
is_delayed[~always_delayed] = np.random.choice(
    [0, 1], size=int((~always_delayed).sum()), p=[0.8, 0.2]
)

data = pd.DataFrame({
    'distance_km': distances,
    'weight_kg': weights,
    'shipping_mode': shipping_modes,
    'weekday_sent': weekday_sent,
    'weather': weather,
    'delayed': is_delayed
})

# Last expression of the cell: rich display of the first ten rows.
data.head(10)

In [None]:
# --------------------------------------------------
# 3. DATA EXPLORATION & PREPROCESSING
# --------------------------------------------------
# Summary statistics as reported by DataFrame.describe().
summary_stats = data.describe()
print("\nData Description:")
print(summary_stats)

# Number of rows per value of the 'delayed' target.
class_counts = data['delayed'].value_counts()
print("\nClass Distribution:")
print(class_counts)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the remaining indicator columns are not redundant.
categorical_cols = ['shipping_mode', 'weather']
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Scaling the numeric columns ('distance_km', 'weight_kg', 'weekday_sent')
# with sklearn.preprocessing.StandardScaler is optional and is not applied here.

data_encoded.head()

In [None]:
# --------------------------------------------------
# 4. SPLIT DATA INTO TRAIN/TEST
# --------------------------------------------------
# Target is the 'delayed' column; every other encoded column is a feature.
y = data_encoded['delayed']
X = data_encoded.drop(columns='delayed')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both parts; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# --------------------------------------------------
# 5. MODEL BUILDING & TRAINING
# --------------------------------------------------
# Create the classifier (allowing up to 1000 solver iterations) and fit it
# on the training split; fit() returns the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression model is trained.")

In [None]:
# --------------------------------------------------
# 6. EVALUATION
# --------------------------------------------------
# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Counts of true vs. predicted labels (rows = actual, columns = predicted).
conf_mat = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_mat)

# Per-class precision, recall and F1 as a formatted text table.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

In [None]:
# --------------------------------------------------
# 7. OPTIONAL VISUALIZATION
# --------------------------------------------------
# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()