In [1]:
import pandas as pd
# Load materials dataset
materials = pd.read_excel("../data/packaging_materials.xlsx")
# Load product dataset
products = pd.read_excel("../data/products.xlsx")
print("Materials Dataset Shape:", materials.shape)
print("Products Dataset Shape:", products.shape)

Materials Dataset Shape: (9000, 30)
Products Dataset Shape: (15000, 16)


In [2]:
print("Materials Columns:\n", materials.columns.tolist())
print("\nProducts Columns:\n", products.columns.tolist())

Materials Columns:
 ['Material_ID', 'Material_Type', 'Tensile_Strength_MPa', 'Weight_Capacity_kg', 'Thickness_Micrometers', 'Production_Cost_per_kg_INR', 'Shelf_Life_Days', 'Biodegradability_Score', 'CO2_Emission_Score', 'Recyclability_Percentage', 'Moisture_Barrier_g_m2_day', 'Oxygen_Barrier_cc_m2_day', 'Processing_Temperature_C', 'Density_g_cm3', 'Biodegradation_Time_Days', 'Water_Absorption_Percentage', 'Thermal_Stability_C', 'Environmental_Impact_Score', 'Application_Type', 'Manufacturing_Method', 'Additives_Used', 'Supplier_Region_India', 'Food_Product_Type', 'Storage_Temperature', 'Certification', 'Market_State', 'Barrier_Property', 'Testing_Date', 'GST_Applicable', 'Indian_Standard']

Products Columns:
 ['Order_ID', 'Date', 'Item_Name', 'Category', 'Weight_kg', 'Volumetric_Weight_kg', 'L_cm', 'W_cm', 'H_cm', 'Fragility', 'Moisture_Sens', 'Shipping_Mode', 'Distance_km', 'Packaging_Used', 'Cost_USD', 'CO2_Emission_kg']


In [3]:
materials_req = materials[[
 'Material_Type',
 'Tensile_Strength_MPa',
 'Weight_Capacity_kg',
 'Production_Cost_per_kg_INR',
 'Biodegradability_Score',
 'CO2_Emission_Score',
 'Recyclability_Percentage',
 'Moisture_Barrier_g_m2_day',
 'Oxygen_Barrier_cc_m2_day',
 'Thermal_Stability_C',
 'Density_g_cm3',
 'Application_Type'
]]
materials_req.head()

Unnamed: 0,Material_Type,Tensile_Strength_MPa,Weight_Capacity_kg,Production_Cost_per_kg_INR,Biodegradability_Score,CO2_Emission_Score,Recyclability_Percentage,Moisture_Barrier_g_m2_day,Oxygen_Barrier_cc_m2_day,Thermal_Stability_C,Density_g_cm3,Application_Type
0,Alginate,39.94,116.87,669.99,83.17,93.8,94.33,51.55,193.82,168.01,0.93,Box/Carton (Shipping)
1,Alginate,90.95,75.88,1185.69,71.61,55.4,96.61,109.38,250.0,233.24,1.08,Flexible Bag (Flour/Grain)
2,Polybutylene Succinate (PBS),71.58,96.95,719.21,74.92,95.92,32.3,32.41,58.31,162.94,1.22,Coating Layer (Processed Food)
3,Seaweed/Kelp-based Polymer,59.78,66.45,337.05,86.54,68.2,53.56,30.01,277.88,212.45,1.0,Beverage Bottle (Water/Juice)
4,Blended Starch,20.59,86.94,480.16,94.12,77.48,88.94,82.18,90.92,230.32,1.3,Rigid Container (Vegetable)


In [4]:
products_req = products[[
 'Category',
 'Weight_kg',
 'L_cm',
 'W_cm',
 'H_cm',
 'Fragility',
 'Moisture_Sens',
 'Distance_km',
 'Packaging_Used',
 'Cost_USD',
 'CO2_Emission_kg'
]]
products_req.head()

Unnamed: 0,Category,Weight_kg,L_cm,W_cm,H_cm,Fragility,Moisture_Sens,Distance_km,Packaging_Used,Cost_USD,CO2_Emission_kg
0,Clothing,0.82,28,21,12,5,False,1893,Kraft Paper Mailer,1.56,6.673
1,Electronics,0.29,14,7,0,9,True,2141,Mushroom Pkg (Mycelium),1.92,1.552
2,Furniture,12.26,60,61,52,6,False,1491,Wood Crate,16.42,28.374
3,Furniture,11.56,65,64,46,5,False,530,Wood Crate,16.31,10.142
4,Clothing,0.25,22,18,1,1,False,1587,Kraft Paper Mailer,0.3,0.992


In [5]:
# Volume of product (cm³)
products_req["Volume_cm3"] = (
 products_req["L_cm"] *
 products_req["W_cm"] *
 products_req["H_cm"]
)
products_req[["L_cm","W_cm","H_cm","Volume_cm3"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  products_req["Volume_cm3"] = (


Unnamed: 0,L_cm,W_cm,H_cm,Volume_cm3
0,28,21,12,7056
1,14,7,0,0
2,60,61,52,190320
3,65,64,46,191360
4,22,18,1,396


In [6]:
# Surface Area of box = 2(LW + LH + WH)
products_req["Surface_Area_cm2"] = 2 * (
 (products_req["L_cm"] * products_req["W_cm"]) +
 (products_req["L_cm"] * products_req["H_cm"]) +
 (products_req["W_cm"] * products_req["H_cm"])
)
products_req[["Surface_Area_cm2"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  products_req["Surface_Area_cm2"] = 2 * (


Unnamed: 0,Surface_Area_cm2
0,2352
1,196
2,19904
3,20188
4,872


In [7]:
# 20% packaging allowance
products_req["Adjusted_Packaging_Area_cm2"] = products_req["Surface_Area_cm2"] * 1.20

products_req[["Surface_Area_cm2","Adjusted_Packaging_Area_cm2"]].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  products_req["Adjusted_Packaging_Area_cm2"] = products_req["Surface_Area_cm2"] * 1.20


Unnamed: 0,Surface_Area_cm2,Adjusted_Packaging_Area_cm2
0,2352,2822.4
1,196,235.2
2,19904,23884.8
3,20188,24225.6
4,872,1046.4


In [8]:
products_req = products_req.copy()
materials_req = materials_req.copy()


In [9]:
# Risk Handling capacity calculation

# Normalize distance (0–1 scale)
products_req["Distance_Score"] = products_req["Distance_km"] / products_req["Distance_km"].max()

# Normalize fragility (assuming scale up to 10)
products_req["Fragility_Score"] = products_req["Fragility"] / 10

# Combined handling risk
products_req["Handling_Risk"] = (
    0.6 * products_req["Fragility_Score"] +
    0.4 * products_req["Distance_Score"]
)

products_req[["Fragility","Distance_km","Handling_Risk"]].head()


Unnamed: 0,Fragility,Distance_km,Handling_Risk
0,5,1893,0.5524
1,9,2141,0.825467
2,6,1491,0.5588
3,5,530,0.370667
4,1,1587,0.2716


In [10]:
# Moisture sensitivity → higher need for protective eco materials
products_req["Moisture_Score"] = products_req["Moisture_Sens"].astype(int)

products_req[["Moisture_Sens","Moisture_Score"]].head()


Unnamed: 0,Moisture_Sens,Moisture_Score
0,False,0
1,True,1
2,False,0
3,False,0
4,False,0


In [11]:
#Convert Material Properties to Comparable Scale

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

material_numeric_cols = [
    'Tensile_Strength_MPa',
    'Weight_Capacity_kg',
    'Biodegradability_Score',
    'CO2_Emission_Score',
    'Recyclability_Percentage',
    'Moisture_Barrier_g_m2_day',
    'Oxygen_Barrier_cc_m2_day',
    'Thermal_Stability_C',
    'Density_g_cm3'
]

materials_req[material_numeric_cols] = scaler.fit_transform(materials_req[material_numeric_cols])

materials_req.head()


Unnamed: 0,Material_Type,Tensile_Strength_MPa,Weight_Capacity_kg,Production_Cost_per_kg_INR,Biodegradability_Score,CO2_Emission_Score,Recyclability_Percentage,Moisture_Barrier_g_m2_day,Oxygen_Barrier_cc_m2_day,Thermal_Stability_C,Density_g_cm3,Application_Type
0,Alginate,0.374647,0.913612,669.99,0.645684,0.864818,0.916618,0.320867,0.644843,0.1801,0.06,Box/Carton (Shipping)
1,Alginate,0.950966,0.525339,1185.69,0.402316,0.026207,0.950147,0.719914,0.832792,0.8324,0.36,Flexible Bag (Flour/Grain)
2,Polybutylene Succinate (PBS),0.732121,0.724922,719.21,0.472,0.911116,0.004412,0.188794,0.191496,0.1294,0.64,Coating Layer (Processed Food)
3,Seaweed/Kelp-based Polymer,0.598802,0.436014,337.05,0.716632,0.305744,0.317059,0.172233,0.926065,0.6245,0.2,Beverage Bottle (Water/Juice)
4,Blended Starch,0.156028,0.630103,480.16,0.876211,0.508408,0.837353,0.532225,0.300592,0.8032,0.8,Rigid Container (Vegetable)


In [12]:
# Create Material Protection Capability Score
materials_req["Protection_Score"] = (
    0.35 * materials_req["Tensile_Strength_MPa"] +
    0.25 * materials_req["Weight_Capacity_kg"] +
    0.20 * materials_req["Moisture_Barrier_g_m2_day"] +
    0.20 * materials_req["Thermal_Stability_C"]
)

materials_req[["Material_Type","Protection_Score"]].head()


Unnamed: 0,Material_Type,Protection_Score
0,Alginate,0.459723
1,Alginate,0.774636
2,Polybutylene Succinate (PBS),0.501111
3,Seaweed/Kelp-based Polymer,0.477931
4,Blended Starch,0.47922


In [13]:
# Create Sustainability score
materials_req["Sustainability_Score"] = (
    0.4 * materials_req["Biodegradability_Score"] +
    0.4 * materials_req["Recyclability_Percentage"] +
    0.2 * (1 - materials_req["CO2_Emission_Score"])
)

materials_req[["Material_Type","Sustainability_Score"]].head()


Unnamed: 0,Material_Type,Sustainability_Score
0,Alginate,0.651957
1,Alginate,0.735744
2,Polybutylene Succinate (PBS),0.208342
3,Seaweed/Kelp-based Polymer,0.552327
4,Blended Starch,0.783744


In [14]:
#Create Product Requirement Score

# Product packaging requirement
products_req["Requirement_Score"] = (
    0.5 * products_req["Handling_Risk"] +
    0.3 * products_req["Moisture_Score"] +
    0.2 * (products_req["Weight_kg"] / products_req["Weight_kg"].max())
)

products_req[["Handling_Risk","Moisture_Score","Requirement_Score"]].head()


Unnamed: 0,Handling_Risk,Moisture_Score,Requirement_Score
0,0.5524,0,0.278251
1,0.825467,1,0.713459
2,0.5588,0,0.310069
3,0.370667,0,0.214251
4,0.2716,0,0.136425


In [15]:
#Create Material Efficiency Score
materials_req["Material_Efficiency"] = (
    0.6 * materials_req["Protection_Score"] +
    0.4 * materials_req["Sustainability_Score"]
)

materials_req[["Material_Type","Material_Efficiency"]].head()



Unnamed: 0,Material_Type,Material_Efficiency
0,Alginate,0.536617
1,Alginate,0.759079
2,Polybutylene Succinate (PBS),0.384003
3,Seaweed/Kelp-based Polymer,0.50769
4,Blended Starch,0.60103


In [16]:
# Take one sample product
sample_product = products_req.iloc[0]

# calculate suitability score for all materials
materials_req["Suitability"] = 1 - abs(materials_req["Material_Efficiency"] - sample_product["Requirement_Score"])

# top recommendations
top_materials = materials_req.sort_values(by="Suitability", ascending=False)[
    ["Material_Type","Suitability"]
].head(5)

top_materials


Unnamed: 0,Material_Type,Suitability
4741,Thermoplastic Starch (TPS),0.999977
2611,Natural Fiber Composite,0.999843
5860,Hemp Fiber Composite,0.999828
909,Polycaprolactone (PCL),0.999804
168,Paper-based Composite,0.999695


In [17]:
# Module 3 — Machine Learning Dataset Preparation

# Prepare Training Features

# ML input features
ml_features = products_req[[
    "Weight_kg",
    "Volume_cm3",
    "Adjusted_Packaging_Area_cm2",
    "Handling_Risk",
    "Moisture_Score",
    "Distance_km"
]]

ml_features.head()


Unnamed: 0,Weight_kg,Volume_cm3,Adjusted_Packaging_Area_cm2,Handling_Risk,Moisture_Score,Distance_km
0,0.82,7056,2822.4,0.5524,0,1893
1,0.29,0,235.2,0.825467,1,2141
2,12.26,190320,23884.8,0.5588,0,1491
3,11.56,191360,24225.6,0.370667,0,530
4,0.25,396,1046.4,0.2716,0,1587


In [18]:
# Targets What AI should predict 

# Targets
cost_target = products_req["Cost_USD"]
co2_target = products_req["CO2_Emission_kg"]

print(cost_target.head())
print(co2_target.head())


0     1.56
1     1.92
2    16.42
3    16.31
4     0.30
Name: Cost_USD, dtype: float64
0     6.673
1     1.552
2    28.374
3    10.142
4     0.992
Name: CO2_Emission_kg, dtype: float64


In [19]:
#Split Training and Testing Data

from sklearn.model_selection import train_test_split

X_train, X_test, y_cost_train, y_cost_test = train_test_split(
    ml_features, cost_target, test_size=0.2, random_state=42
)

X_train2, X_test2, y_co2_train, y_co2_test = train_test_split(
    ml_features, co2_target, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape)


(12000, 6) (3000, 6)


In [20]:
# Module 4 — AI Recommendation Model

# Training the cost prediction model
from sklearn.ensemble import RandomForestRegressor

# Create model
cost_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Train model
cost_model.fit(X_train, y_cost_train)

print("Cost model training complete")


Cost model training complete


In [21]:
# testing the trained model accuracy

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict
cost_pred = cost_model.predict(X_test)

# Metrics
mae = mean_absolute_error(y_cost_test, cost_pred)
rmse = np.sqrt(mean_squared_error(y_cost_test, cost_pred))
r2 = r2_score(y_cost_test, cost_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R2 Score:", r2)


MAE: 0.05386026666666682
RMSE: 0.09973230182176122
R2 Score: 0.9999892241074791


In [22]:
## Training the co2 prediction model

from xgboost import XGBRegressor

# Create model
co2_model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.08,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train
co2_model.fit(X_train2, y_co2_train)

print("CO2 model training complete")


CO2 model training complete


In [23]:
# testinf co2 model accuracy

# Predict
co2_pred = co2_model.predict(X_test2)

# Metrics
mae2 = mean_absolute_error(y_co2_test, co2_pred)
rmse2 = np.sqrt(mean_squared_error(y_co2_test, co2_pred))
r22 = r2_score(y_co2_test, co2_pred)

print("MAE:", mae2)
print("RMSE:", rmse2)
print("R2 Score:", r22)


MAE: 0.865158705252819
RMSE: 2.957451191709725
R2 Score: 0.9980109703816978


In [24]:
#Save the Trained Models

import joblib

# Save cost model
joblib.dump(cost_model, "../models/cost_model.pkl")

# Save CO2 model
joblib.dump(co2_model, "../models/co2_model.pkl")

print("Models saved successfully")


Models saved successfully


In [25]:
# Only keep required columns
material_model_data = materials_req[[
    "Material_Type",
    "Material_Efficiency"
]]

# Save for Flask
material_model_data.to_csv("../models/material_data.csv", index=False)

print("Material data exported")


Material data exported


In [26]:
import pandas as pd

# Load original dataset (your excel file)
materials = pd.read_excel("../data/packaging_materials.xlsx")

# Keep only needed columns for AI engine
materials_required = materials[[
    "Material_Type",
    "Tensile_Strength_MPa",
    "Weight_Capacity_kg",
    "Thickness_Micrometers",
    "Thermal_Stability_C",
    "Biodegradability_Score",
    "Recyclability_Percentage",
    "CO2_Emission_Score",
    "Environmental_Impact_Score"
]]

# Save for backend
materials_required.to_csv("../models/material_data.csv", index=False)

print("Material database exported correctly!")


Material database exported correctly!


In [27]:
materials_req.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Material_Type               9000 non-null   object 
 1   Tensile_Strength_MPa        9000 non-null   float64
 2   Weight_Capacity_kg          9000 non-null   float64
 3   Production_Cost_per_kg_INR  9000 non-null   float64
 4   Biodegradability_Score      9000 non-null   float64
 5   CO2_Emission_Score          9000 non-null   float64
 6   Recyclability_Percentage    9000 non-null   float64
 7   Moisture_Barrier_g_m2_day   9000 non-null   float64
 8   Oxygen_Barrier_cc_m2_day    9000 non-null   float64
 9   Thermal_Stability_C         9000 non-null   float64
 10  Density_g_cm3               9000 non-null   float64
 11  Application_Type            9000 non-null   object 
 12  Protection_Score            9000 non-null   float64
 13  Sustainability_Score        9000 

In [28]:
products_req.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Category                     15000 non-null  object 
 1   Weight_kg                    15000 non-null  float64
 2   L_cm                         15000 non-null  int64  
 3   W_cm                         15000 non-null  int64  
 4   H_cm                         15000 non-null  int64  
 5   Fragility                    15000 non-null  int64  
 6   Moisture_Sens                15000 non-null  bool   
 7   Distance_km                  15000 non-null  int64  
 8   Packaging_Used               15000 non-null  object 
 9   Cost_USD                     15000 non-null  float64
 10  CO2_Emission_kg              15000 non-null  float64
 11  Volume_cm3                   15000 non-null  int64  
 12  Surface_Area_cm2             15000 non-null  int64  
 13  Adjusted_Packagi

In [29]:
from sklearn.preprocessing import LabelEncoder

le_category = LabelEncoder()
products_req["Category_enc"] = le_category.fit_transform(products_req["Category"])

le_packaging = LabelEncoder()
products_req["Packaging_Used_enc"] = le_packaging.fit_transform(products_req["Packaging_Used"])


In [30]:
products_req[["Category","Category_enc","Packaging_Used","Packaging_Used_enc"]].head()


Unnamed: 0,Category,Category_enc,Packaging_Used,Packaging_Used_enc
0,Clothing,1,Kraft Paper Mailer,4
1,Electronics,2,Mushroom Pkg (Mycelium),5
2,Furniture,3,Wood Crate,9
3,Furniture,3,Wood Crate,9
4,Clothing,1,Kraft Paper Mailer,4


In [31]:
products_req["Moisture_Sens"] = products_req["Moisture_Sens"].astype(int)


In [32]:
# Normalize distance
products_req["Distance_norm"] = products_req["Distance_km"] / products_req["Distance_km"].max()

# Shipping risk score
products_req["Shipping_Risk"] = (
    products_req["Fragility"] * 0.6 +
    products_req["Distance_norm"] * 0.4
)


In [33]:
products_req[["Fragility","Distance_km","Shipping_Risk"]].head()


Unnamed: 0,Fragility,Distance_km,Shipping_Risk
0,5,1893,3.2524
1,9,2141,5.685467
2,6,1491,3.7988
3,5,530,3.070667
4,1,1587,0.8116


In [34]:
materials_req["Sustainability_Index"] = (
    (materials_req["Biodegradability_Score"] * 0.4) +
    (materials_req["Recyclability_Percentage"] * 0.3) +
    ((100 - materials_req["CO2_Emission_Score"]) * 0.3)
)


In [35]:
materials_req[["Material_Type","Sustainability_Index"]].head()


Unnamed: 0,Material_Type,Sustainability_Index
0,Alginate,30.273814
1,Alginate,30.438108
2,Polybutylene Succinate (PBS),29.916789
3,Seaweed/Kelp-based Polymer,30.290047
4,Blended Starch,30.449168


In [36]:
features = products_req[[
    "Weight_kg",
    "Volume_cm3",
    "Surface_Area_cm2",
    "Shipping_Risk",
    "Moisture_Sens",
    "Category_enc"
]]

target = products_req["Packaging_Used_enc"]


In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42
)


In [38]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8
)

model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [39]:
from sklearn.metrics import accuracy_score

pred = model.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print("Model Accuracy:", accuracy)


Model Accuracy: 0.949


In [40]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.00      0.00      0.00        16
           2       0.93      0.76      0.84        85
           3       0.94      0.96      0.95       352
           4       0.94      0.98      0.96       648
           5       0.96      0.99      0.97      1141
           6       0.96      0.96      0.96       225
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00        21
           9       0.95      0.97      0.96       477

    accuracy                           0.95      3000
   macro avg       0.57      0.56      0.56      3000
weighted avg       0.93      0.95      0.94      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [41]:
from collections import Counter

class_counts = Counter(y_train)
print(class_counts)


Counter({5: 4681, 4: 2636, 9: 1816, 3: 1360, 6: 916, 2: 342, 8: 75, 0: 63, 7: 60, 1: 51})


In [42]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.08,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    eval_metric="mlogloss",
    scale_pos_weight=1
)

model.fit(X_train, y_train)


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [43]:
pred = model.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.00      0.00      0.00        16
           2       0.93      0.76      0.84        85
           3       0.94      0.96      0.95       352
           4       0.94      0.98      0.96       648
           5       0.96      0.99      0.97      1141
           6       0.96      0.96      0.96       225
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00        21
           9       0.95      0.97      0.96       477

    accuracy                           0.95      3000
   macro avg       0.57      0.56      0.56      3000
weighted avg       0.93      0.95      0.94      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [44]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42, k_neighbors=3)

X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", X_train.shape)
print("After SMOTE:", X_train_bal.shape)


Before SMOTE: (12000, 6)
After SMOTE: (46810, 6)


In [45]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=220,
    max_depth=8,
    learning_rate=0.07,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    eval_metric="mlogloss"
)

model.fit(X_train_bal, y_train_bal)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [46]:
pred = model.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.00      0.00      0.00        16
           2       0.87      0.76      0.81        85
           3       0.95      0.93      0.94       352
           4       0.94      0.90      0.92       648
           5       0.96      0.93      0.94      1141
           6       0.96      0.95      0.95       225
           7       0.00      0.00      0.00        13
           8       0.03      0.10      0.05        21
           9       0.95      0.91      0.93       477

    accuracy                           0.90      3000
   macro avg       0.56      0.55      0.55      3000
weighted avg       0.92      0.90      0.91      3000



In [47]:
import joblib

joblib.dump(model, "../models/packaging_recommendation_model.pkl")
joblib.dump(le_category, "../models/category_encoder.pkl")
joblib.dump(le_packaging, "../models/packaging_encoder.pkl")

print("MODEL SAVED")


MODEL SAVED
