In [2]:
import pandas as pd

# Read the two Excel workbooks this notebook works from.
products_path = "../data/products.xlsx"
materials_path = "../data/packaging_materials.xlsx"

products = pd.read_excel(products_path)
materials = pd.read_excel(materials_path)

# Report (rows, columns) of each frame as a quick sanity check on the load.
for label, frame in (("Products shape:", products), ("Materials shape:", materials)):
    print(label, frame.shape)

# Preview the first rows of the products table.
products.head()


Products shape: (15000, 16)
Materials shape: (9000, 30)


Unnamed: 0,Order_ID,Date,Item_Name,Category,Weight_kg,Volumetric_Weight_kg,L_cm,W_cm,H_cm,Fragility,Moisture_Sens,Shipping_Mode,Distance_km,Packaging_Used,Cost_USD,CO2_Emission_kg
0,1,2025-05-17,Sneakers,Clothing,0.82,1.41,28,21,12,5,False,Air,1893,Kraft Paper Mailer,1.56,6.673
1,2,2025-09-22,Smartphone,Electronics,0.29,0.0,14,7,0,9,True,Air,2141,Mushroom Pkg (Mycelium),1.92,1.552
2,3,2025-11-12,Office Chair,Furniture,12.26,38.06,60,61,52,6,False,Road,1491,Wood Crate,16.42,28.374
3,4,2025-01-30,Office Chair,Furniture,11.56,38.27,65,64,46,5,False,Road,530,Wood Crate,16.31,10.142
4,5,2025-09-06,T-Shirt,Clothing,0.25,0.08,22,18,1,1,False,Air,1587,Kraft Paper Mailer,0.3,0.992


In [3]:
# Keep only the product attributes used downstream, plus the packaging label.
selected_columns = [
    "Category", "Weight_kg",
    "L_cm", "W_cm", "H_cm",
    "Fragility", "Moisture_Sens",
    "Shipping_Mode", "Distance_km",
    "Packaging_Used",
]

# .copy() yields an independent frame, so later column assignments on `data`
# do not write back into `products`.
data = products[selected_columns].copy()

data.head()


Unnamed: 0,Category,Weight_kg,L_cm,W_cm,H_cm,Fragility,Moisture_Sens,Shipping_Mode,Distance_km,Packaging_Used
0,Clothing,0.82,28,21,12,5,False,Air,1893,Kraft Paper Mailer
1,Electronics,0.29,14,7,0,9,True,Air,2141,Mushroom Pkg (Mycelium)
2,Furniture,12.26,60,61,52,6,False,Road,1491,Wood Crate
3,Furniture,11.56,65,64,46,5,False,Road,530,Wood Crate
4,Clothing,0.25,22,18,1,1,False,Air,1587,Kraft Paper Mailer


In [4]:
# Derive geometry and handling features from the raw box dimensions.
length, width, height = data["L_cm"], data["W_cm"], data["H_cm"]

# Box volume: L x W x H.
data["Volume_cm3"] = length * width * height

# Box surface area: twice the sum of the three distinct face areas.
face_area_sum = length * width + width * height + height * length
data["Surface_Area_cm2"] = 2 * face_area_sum

# Handling risk: weighted blend of 0.6 x (Fragility / 10) and
# 0.4 x (Distance_km / 3000).
fragility_term = data["Fragility"] / 10 * 0.6
distance_term = data["Distance_km"] / 3000 * 0.4
data["Handling_Risk"] = fragility_term + distance_term

data.head()


Unnamed: 0,Category,Weight_kg,L_cm,W_cm,H_cm,Fragility,Moisture_Sens,Shipping_Mode,Distance_km,Packaging_Used,Volume_cm3,Surface_Area_cm2,Handling_Risk
0,Clothing,0.82,28,21,12,5,False,Air,1893,Kraft Paper Mailer,7056,2352,0.5524
1,Electronics,0.29,14,7,0,9,True,Air,2141,Mushroom Pkg (Mycelium),0,196,0.825467
2,Furniture,12.26,60,61,52,6,False,Road,1491,Wood Crate,190320,19904,0.5588
3,Furniture,11.56,65,64,46,5,False,Road,530,Wood Crate,191360,20188,0.370667
4,Clothing,0.25,22,18,1,1,False,Air,1587,Kraft Paper Mailer,396,872,0.2716


In [5]:
from sklearn.preprocessing import LabelEncoder

# Maps each encoded column name to the LabelEncoder fitted on its raw values,
# so the same value -> integer mapping can be reused (and reversed) later.
encoders = {}

# columns to encode
cat_cols = ["Category", "Shipping_Mode", "Moisture_Sens", "Packaging_Used"]

for col in cat_cols:
    le = LabelEncoder()
    # Fit on the untouched raw values from `products` (aligned to the rows of
    # `data`) rather than on `data[col]`. This cell overwrites `data[col]` with
    # integer codes, so fitting on `data[col]` would, on a re-run, learn the
    # integer codes as classes and the original labels (e.g. packaging names)
    # would be lost from `encoders`. Reading from `products` keeps the cell
    # idempotent.
    raw_values = products.loc[data.index, col]
    data[col] = le.fit_transform(raw_values)
    encoders[col] = le

print("Encoding done")
data.head()


Encoding done


Unnamed: 0,Category,Weight_kg,L_cm,W_cm,H_cm,Fragility,Moisture_Sens,Shipping_Mode,Distance_km,Packaging_Used,Volume_cm3,Surface_Area_cm2,Handling_Risk
0,1,0.82,28,21,12,5,0,0,1893,4,7056,2352,0.5524
1,2,0.29,14,7,0,9,1,0,2141,5,0,196,0.825467
2,3,12.26,60,61,52,6,0,1,1491,9,190320,19904,0.5588
3,3,11.56,65,64,46,5,0,1,530,9,191360,20188,0.370667
4,1,0.25,22,18,1,1,0,0,1587,4,396,872,0.2716


In [7]:
# Model inputs: the encoded categorical columns together with the raw and
# engineered numeric columns.
feature_columns = [
    "Category",
    "Weight_kg",
    "Volume_cm3",
    "Surface_Area_cm2",
    "Fragility",
    "Handling_Risk",
    "Moisture_Sens",
    "Shipping_Mode",
    "Distance_km",
]
X = data[feature_columns]

# Target: the packaging column the model learns to predict.
y = data["Packaging_Used"]

# Show the dimensions of the feature matrix, then of the target vector.
for part in (X, y):
    print(part.shape)



(15000, 9)
(15000,)


In [8]:
# Split into training and testing sets: 20% of the rows are held out for
# testing, and the fixed random_state keeps the split reproducible.

from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the shape of each feature partition.
for label, part in (("Training:", X_train), ("Testing:", X_test)):
    print(label, part.shape)


Training: (12000, 9)
Testing: (3000, 9)


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest settings: 200 trees with depth capped at 18, a fixed seed for
# reproducible fits, and n_jobs=-1 for parallel training.
forest_params = dict(
    n_estimators=200,
    max_depth=18,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestClassifier(**forest_params)

# Fit on the training partition only; the test rows stay unseen.
model.fit(X_train, y_train)

print("Material recommendation model trained")


Material recommendation model trained


In [10]:
from sklearn.metrics import accuracy_score

# Predict a packaging class for every held-out row.
pred = model.predict(X_test)

# Accuracy: share of test rows whose predicted class equals the true class.
acc = accuracy_score(y_true=y_test, y_pred=pred)
print("Model Accuracy:", acc)


Model Accuracy: 0.95


In [11]:
from pathlib import Path

import joblib

# joblib.dump does not create missing parent folders, so make sure the output
# directory exists first; otherwise this cell fails with FileNotFoundError on a
# fresh checkout. exist_ok=True makes re-running the cell harmless.
Path("../models").mkdir(parents=True, exist_ok=True)

# save trained model
joblib.dump(model, "../models/material_recommender.pkl")

# also save the fitted encoders: they hold the value -> integer mappings used
# to build the training features and are required to decode predicted
# Packaging_Used codes back into their original labels
joblib.dump(encoders, "../models/encoders.pkl")

print("Recommender model saved")


Recommender model saved
