In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

try:
    from fuzzywuzzy import fuzz, process
except ImportError:
    try:
        from thefuzz import fuzz, process
    except ImportError:
        print("Warning: fuzzywuzzy or thefuzz not found. Install with: pip install fuzzywuzzy python-Levenshtein")
        def process_extractOne(text, choices, scorer=None):
            for choice in choices:
                if text.lower() in choice.lower() or choice.lower() in text.lower():
                    return (choice, 100)
            return None
        process = type('obj', (object,), {'extractOne': process_extractOne})
        fuzz = type('obj', (object,), {'token_sort_ratio': lambda x, y: 0})

import warnings
warnings.filterwarnings('ignore')

try:
    plt.style.use('seaborn-v0_8-darkgrid')
except:
    plt.style.use('seaborn')
sns.set_palette("husl")

## 1. Data Loading & Cleaning

In [2]:
history_df = pd.read_csv('real_packaging_history (1).csv')
materials_df = pd.read_csv('materials_database_600 (1).csv')

print("History Data Shape:", history_df.shape)
print("Materials Data Shape:", materials_df.shape)
print("\nHistory Columns:", history_df.columns.tolist())
print("\nMaterials Columns:", materials_df.columns.tolist())
print("\nFirst few rows of history:")
history_df.head()

History Data Shape: (15000, 16)
Materials Data Shape: (600, 8)

History Columns: ['Order_ID', 'Date', 'Item_Name', 'Category', 'Weight_kg', 'Volumetric_Weight_kg', 'L_cm', 'W_cm', 'H_cm', 'Fragility', 'Moisture_Sens', 'Shipping_Mode', 'Distance_km', 'Packaging_Used', 'Cost_USD', 'CO2_Emission_kg']

Materials Columns: ['Material_ID', 'Material_Name', 'Category', 'Density_kg_m3', 'Tensile_Strength_MPa', 'CO2_Emission_kg', 'Cost_per_kg', 'Biodegradable']

First few rows of history:


Unnamed: 0,Order_ID,Date,Item_Name,Category,Weight_kg,Volumetric_Weight_kg,L_cm,W_cm,H_cm,Fragility,Moisture_Sens,Shipping_Mode,Distance_km,Packaging_Used,Cost_USD,CO2_Emission_kg
0,1,2025-05-17,Sneakers,Clothing,0.82,1.41,28,21,12,5,False,Air,1893,Kraft Paper Mailer,1.56,6.673
1,2,2025-09-22,Smartphone,Electronics,0.29,0.0,14,7,0,9,True,Air,2141,Mushroom Pkg (Mycelium),1.92,1.552
2,3,2025-11-12,Office Chair,Furniture,12.26,38.06,60,61,52,6,False,Road,1491,Wood Crate,16.42,28.374
3,4,2025-01-30,Office Chair,Furniture,11.56,38.27,65,64,46,5,False,Road,530,Wood Crate,16.31,10.142
4,5,2025-09-06,T-Shirt,Clothing,0.25,0.08,22,18,1,1,False,Air,1587,Kraft Paper Mailer,0.3,0.992


In [3]:
print("Missing values in history data:")
print(history_df.isnull().sum())
print("\nMissing values in materials data:")
print(materials_df.isnull().sum())

Missing values in history data:
Order_ID                0
Date                    0
Item_Name               0
Category                0
Weight_kg               0
Volumetric_Weight_kg    0
L_cm                    0
W_cm                    0
H_cm                    0
Fragility               0
Moisture_Sens           0
Shipping_Mode           0
Distance_km             0
Packaging_Used          0
Cost_USD                0
CO2_Emission_kg         0
dtype: int64

Missing values in materials data:
Material_ID             0
Material_Name           0
Category                0
Density_kg_m3           0
Tensile_Strength_MPa    0
CO2_Emission_kg         0
Cost_per_kg             0
Biodegradable           0
dtype: int64


In [4]:
for col in ['Cost_USD', 'CO2_Emission_kg']:
    if history_df[col].isnull().sum() > 0:
        category_medians = history_df.groupby('Category')[col].median()
        history_df[col] = history_df.apply(
            lambda row: category_medians[row['Category']] if pd.isnull(row[col]) else row[col],
            axis=1
        )
        print(f"Imputed {history_df[col].isnull().sum()} missing values in {col}")

print("\nMissing values after imputation:")
print(history_df[['Cost_USD', 'CO2_Emission_kg']].isnull().sum())


Missing values after imputation:
Cost_USD           0
CO2_Emission_kg    0
dtype: int64


In [5]:
for dim in ['L_cm', 'W_cm', 'H_cm']:
    invalid_mask = (history_df[dim] == 0) | (history_df[dim].isnull())
    if invalid_mask.sum() > 0:
        category_means = history_df.groupby('Category')[dim].mean()
        history_df[dim] = history_df.apply(
            lambda row: category_means[row['Category']] if (row[dim] == 0 or pd.isnull(row[dim])) else row[dim],
            axis=1
        )
        print(f"Fixed {invalid_mask.sum()} invalid values in {dim}")

print("\nInvalid dimensions (zeros) after fixing:")
print((history_df[['L_cm', 'W_cm', 'H_cm']] == 0).sum())

Fixed 1128 invalid values in H_cm

Invalid dimensions (zeros) after fixing:
L_cm    0
W_cm    0
H_cm    0
dtype: int64


In [6]:
history_df['Product_Volume_m3'] = (history_df['L_cm'] * history_df['W_cm'] * history_df['H_cm']) / 1_000_000

print("Product_Volume_m3 calculated successfully!")
print(f"Volume statistics:\n{history_df['Product_Volume_m3'].describe()}")

Product_Volume_m3 calculated successfully!
Volume statistics:
count    15000.000000
mean         0.133259
std          0.381661
min          0.000016
25%          0.000475
50%          0.005850
75%          0.028910
max          1.822824
Name: Product_Volume_m3, dtype: float64


## 2. Material Mapping

In [7]:
packaging_used = history_df['Packaging_Used'].unique()
material_names = materials_df['Material_Name'].unique()

print(f"Unique packaging types in history: {len(packaging_used)}")
print(f"Unique materials in database: {len(material_names)}")
print("\nSample packaging types:", packaging_used[:10])
print("\nSample material names:", material_names[:10])

Unique packaging types in history: 10
Unique materials in database: 600

Sample packaging types: ['Kraft Paper Mailer' 'Mushroom Pkg (Mycelium)' 'Wood Crate'
 'PLA Bioplastic' 'Honeycomb Paper' 'Recycled PET Box'
 'Bubble Wrap (LDPE)' 'Corrugated Cardboard' 'Styrofoam (EPS)'
 'Cornstarch Foam']

Sample material names: ['Recycled Palm Leaf' 'Waterproof Palm Leaf' 'Single-Ply Kraft Paper'
 'Single-Ply Aluminum Foil' 'Fire-Retardant Nylon Fabric'
 'Commercial-Grade Recycled Denim' 'Food-Grade Polycarbonate'
 'Lightweight PET Plastic' 'Single-Ply Borosilicate Glass'
 'Food-Grade Mushroom Mycelium']


In [8]:
def map_packaging_to_material(packaging_name, material_list, threshold=60):
    packaging_lower = packaging_name.lower()
    
    if 'mushroom' in packaging_lower or 'mycelium' in packaging_lower:
        matches = [m for m in material_list if 'mushroom' in m.lower() or 'mycelium' in m.lower()]
        if matches:
            return matches[0]
    
    if 'wood' in packaging_lower or 'crate' in packaging_lower:
        matches = [m for m in material_list if 'plywood' in m.lower()]
        if matches:
            return matches[0]
    
    if 'kraft' in packaging_lower:
        matches = [m for m in material_list if 'kraft' in m.lower()]
        if matches:
            return matches[0]
    
    if 'pla' in packaging_lower or 'bioplastic' in packaging_lower:
        matches = [m for m in material_list if 'pla' in m.lower() or 'bioplastic' in m.lower()]
        if matches:
            return matches[0]
    
    if 'bubble' in packaging_lower or 'ldpe' in packaging_lower:
        matches = [m for m in material_list if 'bubble' in m.lower() or 'ldpe' in m.lower()]
        if matches:
            return matches[0]
    
    if 'pet' in packaging_lower and 'recycled' in packaging_lower:
        matches = [m for m in material_list if 'pet' in m.lower()]
        if matches:
            return matches[0]
    
    if 'honeycomb' in packaging_lower or 'paper' in packaging_lower:
        matches = [m for m in material_list if 'paper' in m.lower() or 'honeycomb' in m.lower()]
        if matches:
            return matches[0]
    
    if 'corrugated' in packaging_lower or 'cardboard' in packaging_lower:
        matches = [m for m in material_list if 'cardboard' in m.lower() or 'corrugated' in m.lower()]
        if matches:
            return matches[0]
    
    if 'styrofoam' in packaging_lower or 'eps' in packaging_lower:
        matches = [m for m in material_list if 'foam' in m.lower() and 'polyurethane' in m.lower()]
        if matches:
            return matches[0]
    
    if 'cornstarch' in packaging_lower:
        matches = [m for m in material_list if 'cornstarch' in m.lower()]
        if matches:
            return matches[0]
    
    try:
        best_match = process.extractOne(packaging_name, material_list, scorer=fuzz.token_sort_ratio)
        if best_match and best_match[1] >= threshold:
            return best_match[0]
    except:
        for material in material_list:
            if packaging_lower in material.lower() or material.lower() in packaging_lower:
                return material
    
    return None

packaging_to_material = {}
for packaging in packaging_used:
    mapped = map_packaging_to_material(packaging, material_names)
    packaging_to_material[packaging] = mapped
    if mapped:
        print(f"'{packaging}' -> '{mapped}'")
    else:
        print(f"'{packaging}' -> NOT FOUND")

print(f"\nSuccessfully mapped {sum(1 for v in packaging_to_material.values() if v is not None)}/{len(packaging_to_material)} packaging types")

'Kraft Paper Mailer' -> 'Single-Ply Kraft Paper'
'Mushroom Pkg (Mycelium)' -> 'Food-Grade Mushroom Mycelium'
'Wood Crate' -> 'Lightweight Plywood'
'PLA Bioplastic' -> 'Lightweight PET Plastic'
'Honeycomb Paper' -> 'Single-Ply Kraft Paper'
'Recycled PET Box' -> 'Lightweight PET Plastic'
'Bubble Wrap (LDPE)' -> 'Standard Bubble Wrap (LDPE)'
'Corrugated Cardboard' -> 'Industrial-Grade Corrugated Cardboard'
'Styrofoam (EPS)' -> 'UV-Stabilized Polyurethane Foam'
'Cornstarch Foam' -> 'Double-Wall Cornstarch Foam'

Successfully mapped 10/10 packaging types


In [9]:
history_df['Material_Name'] = history_df['Packaging_Used'].map(packaging_to_material)

print(f"Mapped rows: {history_df['Material_Name'].notna().sum()}/{len(history_df)}")
print(f"Unmapped rows: {history_df['Material_Name'].isna().sum()}")

if history_df['Material_Name'].isna().sum() > 0:
    unmapped = history_df[history_df['Material_Name'].isna()]['Packaging_Used'].unique()
    print(f"\nUnmapped packaging types: {unmapped}")

Mapped rows: 15000/15000
Unmapped rows: 0


In [10]:
materials_merge = materials_df[['Material_Name', 'Density_kg_m3', 'Cost_per_kg', 'CO2_Emission_kg']].copy()
materials_merge = materials_merge.rename(columns={'CO2_Emission_kg': 'Material_CO2_Factor'})

history_df = history_df.merge(
    materials_merge,
    on='Material_Name',
    how='left',
    suffixes=('', '_material')
)

history_df = history_df.rename(columns={'Density_kg_m3': 'Material_Density'})

print("Merged material properties:")
print(f"Rows with material properties: {history_df['Material_Density'].notna().sum()}/{len(history_df)}")
print("\nSample merged data:")
history_df[['Packaging_Used', 'Material_Name', 'Material_Density', 'Material_CO2_Factor', 'Cost_per_kg']].head(10)

Merged material properties:
Rows with material properties: 15000/15000

Sample merged data:


Unnamed: 0,Packaging_Used,Material_Name,Material_Density,Material_CO2_Factor,Cost_per_kg
0,Kraft Paper Mailer,Single-Ply Kraft Paper,742,0.742,0.91
1,Mushroom Pkg (Mycelium),Food-Grade Mushroom Mycelium,146,0.486,5.1
2,Wood Crate,Lightweight Plywood,515,0.535,2.23
3,Wood Crate,Lightweight Plywood,515,0.535,2.23
4,Kraft Paper Mailer,Single-Ply Kraft Paper,742,0.742,0.91
5,Wood Crate,Lightweight Plywood,515,0.535,2.23
6,Kraft Paper Mailer,Single-Ply Kraft Paper,742,0.742,0.91
7,Mushroom Pkg (Mycelium),Food-Grade Mushroom Mycelium,146,0.486,5.1
8,Mushroom Pkg (Mycelium),Food-Grade Mushroom Mycelium,146,0.486,5.1
9,Mushroom Pkg (Mycelium),Food-Grade Mushroom Mycelium,146,0.486,5.1


In [11]:
initial_rows = len(history_df)
history_df = history_df[history_df['Material_Density'].notna()].copy()
removed_rows = initial_rows - len(history_df)

print(f"Removed {removed_rows} rows without material mapping")
print(f"Final dataset size: {len(history_df)} rows")

Removed 0 rows without material mapping
Final dataset size: 15000 rows


## 3. Model Training 

In [12]:
feature_cols = ['Weight_kg', 'Distance_km', 'Shipping_Mode', 
                'Material_CO2_Factor', 'Material_Density']

X = history_df[feature_cols].copy()
y = history_df['CO2_Emission_kg'].copy()

print(f"Feature shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nMissing values in features:")
print(X.isnull().sum())

Feature shape: (15000, 5)
Target shape: (15000,)

Missing values in features:
Weight_kg              0
Distance_km            0
Shipping_Mode          0
Material_CO2_Factor    0
Material_Density       0
dtype: int64


In [13]:
encoder = OneHotEncoder(sparse_output=False, drop='first')
shipping_encoded = encoder.fit_transform(X[['Shipping_Mode']])
shipping_feature_names = encoder.get_feature_names_out(['Shipping_Mode'])

X_encoded = pd.DataFrame(
    shipping_encoded,
    columns=shipping_feature_names,
    index=X.index
)

X_encoded['Weight_kg'] = X['Weight_kg']
X_encoded['Distance_km'] = X['Distance_km']
X_encoded['Material_CO2_Factor'] = X['Material_CO2_Factor']
X_encoded['Material_Density'] = X['Material_Density']

print("Features after encoding:")
print(X_encoded.columns.tolist())
print(f"\nFeature shape: {X_encoded.shape}")
X_encoded.head()

Features after encoding:
['Shipping_Mode_Road', 'Weight_kg', 'Distance_km', 'Material_CO2_Factor', 'Material_Density']

Feature shape: (15000, 5)


Unnamed: 0,Shipping_Mode_Road,Weight_kg,Distance_km,Material_CO2_Factor,Material_Density
0,0.0,0.82,1893,0.742,742
1,0.0,0.29,2141,0.486,146
2,1.0,12.26,1491,0.535,515
3,1.0,11.56,530,0.535,515
4,0.0,0.25,1587,0.742,742


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 12000 samples
Test set: 3000 samples


In [15]:
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("Training RandomForestRegressor...")
rf_model.fit(X_train, y_train)

print("Training completed!")

Training RandomForestRegressor...
Training completed!


In [16]:
y_pred = rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE on Test Set: {rmse:.4f}")
print(f"Mean CO2 Emission: {y_test.mean():.4f}")
print(f"RMSE as % of mean: {(rmse/y_test.mean())*100:.2f}%")

RMSE on Test Set: 7.8353
Mean CO2 Emission: 24.0177
RMSE as % of mean: 32.62%


In [17]:
def recommend_packaging(product_name, history_df, materials_df, model, encoder, feature_column_order, top_n=5):
    
    product_data = history_df[history_df['Item_Name'].str.contains(product_name, case=False, na=False)]
    
    if len(product_data) == 0:
        print(f"No historical data found for product: {product_name}")
        return None
    
    avg_weight = product_data['Weight_kg'].mean()
    avg_distance = product_data['Distance_km'].mean()
    most_common_shipping = product_data['Shipping_Mode'].mode()[0] if len(product_data['Shipping_Mode'].mode()) > 0 else 'Road'
    
    print(f"Product: {product_name}")
    print(f"Average Weight: {avg_weight:.3f} kg")
    print(f"Average Distance: {avg_distance:.1f} km")
    print(f"Most Common Shipping Mode: {most_common_shipping}")
    print("\nSimulating all materials...")
    
    shipping_encoded = encoder.transform([[most_common_shipping]])
    shipping_feature_names = encoder.get_feature_names_out(['Shipping_Mode'])
    
    shipping_dict = dict(zip(shipping_feature_names, shipping_encoded[0]))
    
    results = []
    
    for _, material_row in materials_df.iterrows():
        feature_dict = {
            **shipping_dict,  # Shipping mode features first
            'Weight_kg': avg_weight,
            'Distance_km': avg_distance,
            'Material_CO2_Factor': material_row['CO2_Emission_kg'],
            'Material_Density': material_row['Density_kg_m3']
        }
        
  
        feature_array = np.array([[feature_dict[col] for col in feature_column_order]])
        
        predicted_co2 = model.predict(feature_array)[0]
        
        results.append({
            'Material_Name': material_row['Material_Name'],
            'Category': material_row['Category'],
            'Density_kg_m3': material_row['Density_kg_m3'],
            'Material_CO2_Factor': material_row['CO2_Emission_kg'],
            'Cost_per_kg': material_row['Cost_per_kg'],
            'Biodegradable': material_row['Biodegradable'],
            'Predicted_CO2_kg': predicted_co2
        })
    
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('Predicted_CO2_kg').head(top_n)
    
    return results_df


def recommend_packaging_from_specs(product_name, weight_kg, distance_km, shipping_mode, 
                                    materials_df, model, encoder, feature_column_order, top_n=5):
    
    print(f"Product: {product_name}")
    print(f"Weight: {weight_kg:.3f} kg")
    print(f"Distance: {distance_km:.1f} km")
    print(f"Shipping Mode: {shipping_mode}")
    print("\nSimulating all materials...")
    
    shipping_encoded = encoder.transform([[shipping_mode]])
    shipping_feature_names = encoder.get_feature_names_out(['Shipping_Mode'])
    
    shipping_dict = dict(zip(shipping_feature_names, shipping_encoded[0]))
    
    results = []
    
    for _, material_row in materials_df.iterrows():
        feature_dict = {
            **shipping_dict,  
            'Weight_kg': weight_kg,
            'Distance_km': distance_km,
            'Material_CO2_Factor': material_row['CO2_Emission_kg'],
            'Material_Density': material_row['Density_kg_m3']
        }
        

        feature_array = np.array([[feature_dict[col] for col in feature_column_order]])
        

        predicted_co2 = model.predict(feature_array)[0]
        
        results.append({
            'Material_Name': material_row['Material_Name'],
            'Category': material_row['Category'],
            'Density_kg_m3': material_row['Density_kg_m3'],
            'Material_CO2_Factor': material_row['CO2_Emission_kg'],
            'Cost_per_kg': material_row['Cost_per_kg'],
            'Biodegradable': material_row['Biodegradable'],
            'Predicted_CO2_kg': predicted_co2
        })
    
    # Create results dataframe and sort by predicted CO2
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('Predicted_CO2_kg').head(top_n)
    
    return results_df


In [18]:

print("="*60)
print("PACKAGING RECOMMENDATION SYSTEM")
print("="*60)
print("\nPlease enter the following product specifications:\n")


product_name = input("Product Name: ").strip()
weight_kg = float(input("Average Weight (kg): "))
distance_km = float(input("Average Distance (km): "))
shipping_mode = input("Shipping Mode (Air/Road): ").strip().capitalize()

if shipping_mode not in ['Air', 'Road']:
    print(f"Warning: '{shipping_mode}' is not a valid shipping mode. Defaulting to 'Road'.")
    shipping_mode = 'Road'

print("\n" + "="*60)
print("GENERATING RECOMMENDATIONS...")
print("="*60 + "\n")

recommendations = recommend_packaging_from_specs(
    product_name=product_name,
    weight_kg=weight_kg,
    distance_km=distance_km,
    shipping_mode=shipping_mode,
    materials_df=materials_df,
    model=rf_model,
    encoder=encoder,
    feature_column_order=X_encoded.columns.tolist(),
    top_n=5
)

if recommendations is not None and len(recommendations) > 0:
    print("\n" + "="*60)
    print(f"TOP 5 RECOMMENDATIONS FOR {product_name.upper()} (Lowest CO2 Emissions):")
    print("="*60)
    print(recommendations.to_string(index=False))
    print("\n" + "="*60)
    
    

PACKAGING RECOMMENDATION SYSTEM

Please enter the following product specifications:


GENERATING RECOMMENDATIONS...

Product: laptop
Weight: 5.000 kg
Distance: 55.0 km
Shipping Mode: Air

Simulating all materials...

TOP 5 RECOMMENDATIONS FOR LAPTOP (Lowest CO2 Emissions):
                       Material_Name Category  Density_kg_m3  Material_CO2_Factor  Cost_per_kg Biodegradable  Predicted_CO2_kg
Industrial-Grade Bagasse (Sugarcane)      Eco            128                0.320         1.60           Yes          0.319868
       Laminated Bagasse (Sugarcane)      Eco            138                0.409         2.20           Yes          0.319868
                 Recycled Balsa Wood     Wood            156                0.234         3.06           Yes          0.319868
  Fire-Retardant Bagasse (Sugarcane)      Eco            129                0.380         2.19           Yes          0.319868
                 Standard Balsa Wood     Wood            166                0.415         3