In [5]:
# --- 0. Load Data (Run this cell first) ---
import pandas as pd
import os

# Define the path to your data file
file_path = '../Supply chain logistics problem.xlsx'

# List of sheet names to load
sheet_names = [
    "OrderList",
    "FreightRates",
    "WhCosts",
    "WhCapacities",
    "ProductsPerPlant",
    "VmiCustomers",
    "PlantPorts"
]

# Load all sheets into a dictionary of DataFrames
dfs = {}
try:
    for sheet in sheet_names:
        dfs[sheet.lower()] = pd.read_excel(file_path, sheet_name=sheet)
    # Strip whitespace from all column names for all DataFrames
    for df in dfs.values():
        df.columns = df.columns.str.strip()
    print("All data sheets loaded successfully.")
except Exception as e:
    print(f"An error occurred: {e}")

# --- 1. Create the Base Training DataFrame ---
# This process will create a comprehensive dataset where each order is matched
# with every possible plant, port, and carrier that could fulfill it.

# Merge orders with the plant-port mapping to find all possible origin ports for each order
orders_df = dfs['orderlist'].copy()
plant_ports_df = dfs['plantports'].copy()
# The 'Plant Code' in orders is the fulfillment plant, which determines the origin port
orders_with_ports = pd.merge(
    orders_df,
    plant_ports_df,
    on='Plant Code'
)
orders_with_ports.rename(columns={'Port': 'orig_port_cd'}, inplace=True)

# Merge this with the freight rates to pair each order with every possible carrier and rate tier on its route
freight_rates_df = dfs['freightrates'].copy()
# The destination for all orders is PORT09
orders_with_ports['dest_port_cd'] = 'PORT09'

# This merge creates a large table of all potential shipping options for every order
potential_shipments = pd.merge(
    orders_with_ports,
    freight_rates_df,
    on=['orig_port_cd', 'dest_port_cd']
)

# --- 2. Filter to Find the Correct Rate Tier ---
# The merge above created many rows per order (one for each price tier).
# Now, we filter to keep only the row with the correct price tier based on the order's weight.
# Replace 'Weight' with the actual column name for order weight if different
correct_tier_shipments = potential_shipments[
    (potential_shipments['Weight'] >= potential_shipments['minm_wgh_qty']) &
    (potential_shipments['Weight'] <= potential_shipments['max_wgh_qty'])
].copy()

# --- 3. Calculate the True Freight Cost (The Target Variable) ---
# Calculate the cost based on weight and rate
correct_tier_shipments['calculated_cost'] = correct_tier_shipments['Weight'] * correct_tier_shipments['rate']

# The final freight cost is the HIGHER of the calculated cost or the carrier's minimum charge
correct_tier_shipments['freight_cost'] = correct_tier_shipments[['calculated_cost', 'minimum cost']].max(axis=1)

# --- 4. Display the Final Training Set ---
# This is the dataset we will use to train our model.
# It includes the features (like Weight, Carrier, etc.) and the target ('freight_cost').
print("\n--- Sample of the Final Training Dataset for the ML Model ---")
# Select a few key columns to display for clarity
display(correct_tier_shipments.head())

# Save this dataframe for the next step
# We'll use this 'training_data_df' in the next code block to build the model

All data sheets loaded successfully.

--- Sample of the Final Training Dataset for the ML Model ---


Unnamed: 0,Order ID,Order Date,Origin Port,Carrier_x,TPT,Service Level,Ship ahead day count,Ship Late Day count,Customer,Product ID,...,minm_wgh_qty,max_wgh_qty,svc_cd,minimum cost,rate,mode_dsc,tpt_day_cnt,Carrier type,calculated_cost,freight_cost
2,1447296000.0,2013-05-26,PORT09,V44_3,1,CRF,3,0,V55555_53,1700106,...,0.0,99.99,DTD,17.6888,0.0332,AIR,14,V88888888_0,0.47476,17.6888
8,1447296000.0,2013-05-26,PORT09,V44_3,1,CRF,3,0,V55555_53,1700106,...,0.0,99.99,DTD,23.54,0.0476,AIR,5,V88888888_0,0.68068,23.54
12,1447296000.0,2013-05-26,PORT09,V44_3,1,CRF,3,0,V55555_53,1700106,...,0.0,99.99,DTD,25.584,0.0532,AIR,2,V88888888_0,0.76076,25.584
15,1447296000.0,2013-05-26,PORT09,V44_3,1,CRF,3,0,V55555_53,1700106,...,0.0,5000.0,DTD,31.2784,21.2784,GROUND,1,V88888888_0,304.28112,304.28112
16,1447296000.0,2013-05-26,PORT09,V44_3,1,CRF,3,0,V55555_53,1700106,...,0.0,5000.0,DTP,31.2784,20.2784,GROUND,0,V88888888_0,289.98112,289.98112


In [10]:
# ...existing code...
training_data_df = correct_tier_shipments.copy()
# ...existing code...

In [11]:
%pip install xgboost

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# --- 1. Define Features (X) and Target (y) ---
# The target is the 'freight_cost' we want to predict.
# The features are the inputs the model will learn from.
# You can adjust the features as needed for your data.
features = [
    'Weight',           # numerical
    'Carrier',          # categorical
    'orig_port_cd',     # categorical
    'dest_port_cd',     # categorical
    'svc_cd'            # categorical (if present)
    # Add other relevant features if available
]
target = 'freight_cost'

# Ensure 'training_data_df' exists in your notebook
X = training_data_df[features]
y = training_data_df[target]

# --- 2. Preprocess the Data for the Model ---
# XGBoost requires all input features to be numerical.
# We use one-hot encoding to convert categorical columns (like 'Carrier') into numbers.
categorical_cols = [col for col in ['Carrier', 'orig_port_cd', 'dest_port_cd', 'svc_cd'] if col in X.columns]
X_encoded = pd.get_dummies(X, columns=categorical_cols)

# --- 3. Split Data into Training and Testing Sets ---
# We'll train the model on 80% of the data and test its performance on the unseen 20%.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# --- 4. Train the XGBoost Model ---
# Instantiate the XGBoost Regressor model with common hyperparameters.
# 'n_jobs=-1' tells the model to use all available CPU cores to speed up training.
xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

print("Training the XGBoost model...")
# Train the model on the training data
xgbr.fit(X_train, y_train)
print("Model training complete.")

# --- 5. Evaluate Model Performance ---
# Make predictions on the unseen test data.
y_pred = xgbr.predict(X_test)

# Calculate key performance metrics.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n--- Model Performance on Test Set ---")
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"R-squared (R²): {r2:.4f}")

print("\nInterpretation:")
print(f"The model's predictions are, on average, within ${mae:.2f} of the actual freight cost. R² of {r2:.4f} indicates how well the model explains the variance in freight cost.")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


KeyError: "['Carrier'] not in index"