# Model Training

This notebook performs the following steps:
1. Load Processed Data (`cleaned_data.csv`) and the materials database (`materials_database_600 (1).csv`)
2. Train Sustainability Predictor (XGBoost)
3. Train Cost Predictor (Random Forest)
4. Save Models for Deployment

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import joblib
import os
from datetime import datetime

# Set display options: None removes the column limit, so wide DataFrames
# are rendered with every column instead of being truncated with "...".
pd.set_option('display.max_columns', None)

## 1. Load Data

In [2]:
# Load the cleaned shipment history and the materials database.
# Every later cell needs both frames, so a missing file is reported and then
# re-raised: swallowing the error would only postpone the failure to a
# confusing NameError on history_df / materials_df further down.
try:
    history_df = pd.read_csv('cleaned_data.csv')
    materials_df = pd.read_csv('materials_database_600 (1).csv')
except FileNotFoundError as exc:
    # exc.filename is the path that could not be opened, so the message names
    # the file that is actually missing instead of always cleaned_data.csv.
    missing_file = exc.filename or "A required input CSV"
    print(f"‚ùå {missing_file} not found! "
          "cleaned_data.csv is produced by data_cleaning.ipynb; run it first.")
    raise
else:
    # Only reached when both reads succeeded
    print("‚úÖ Data loaded successfully!")
    print(f"History Data Shape: {history_df.shape}")
    print(f"Materials Database Shape: {materials_df.shape}")

‚úÖ Data loaded successfully!
History Data Shape: (15000, 21)
Materials Database Shape: (600, 8)


## 2. Model Training (Sustainability Predictor)

In [3]:
# Assemble the inputs for the CO2 model.
#   Predictors: Weight_kg, Distance_km, Shipping_Mode (still categorical here,
#               one-hot encoded afterwards), Material_CO2_Factor, Material_Density
#   Response:   CO2_Emission_kg
feature_cols = [
    'Weight_kg',
    'Distance_km',
    'Shipping_Mode',
    'Material_CO2_Factor',
    'Material_Density',
]

# Work on copies so nothing done to X / y writes back into history_df
X = history_df.loc[:, feature_cols].copy()
y = history_df.loc[:, 'CO2_Emission_kg'].copy()

# Quick sanity report: dimensions and per-column null counts
print(f"Feature shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nMissing values in features:")
print(X.isna().sum())

Feature shape: (15000, 5)
Target shape: (15000,)

Missing values in features:
Weight_kg              0
Distance_km            0
Shipping_Mode          0
Material_CO2_Factor    0
Material_Density       0
dtype: int64


In [4]:
# Turn Shipping_Mode into dummy columns. drop='first' leaves out the first
# category, so the remaining indicator column(s) are not redundant.
encoder = OneHotEncoder(sparse_output=False, drop='first')
shipping_encoded = encoder.fit_transform(X[['Shipping_Mode']])
shipping_feature_names = encoder.get_feature_names_out(['Shipping_Mode'])

# Final design matrix: dummy columns first, then the numeric predictors in
# this exact order (the column order is what the model is trained on).
co2_numeric_cols = ['Weight_kg', 'Distance_km', 'Material_CO2_Factor', 'Material_Density']
X_encoded = pd.concat(
    [
        pd.DataFrame(shipping_encoded, columns=shipping_feature_names, index=X.index),
        X[co2_numeric_cols],
    ],
    axis=1,
)

print("Features after encoding:")
print(X_encoded.columns.tolist())
print(f"\nFeature shape: {X_encoded.shape}")
X_encoded.head()

Features after encoding:
['Shipping_Mode_Road', 'Weight_kg', 'Distance_km', 'Material_CO2_Factor', 'Material_Density']

Feature shape: (15000, 5)


Unnamed: 0,Shipping_Mode_Road,Weight_kg,Distance_km,Material_CO2_Factor,Material_Density
0,0.0,0.82,1893,0.742,742
1,0.0,0.29,2141,0.486,146
2,1.0,12.26,1491,0.535,515
3,1.0,11.56,530,0.535,515
4,0.0,0.25,1587,0.742,742


In [5]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

Training set: 12000 samples
Test set: 3000 samples


In [6]:
# Fit the XGBoost regressor that predicts CO2 emissions.
# Hyperparameters are gathered in one mapping so they are easy to review.
xgb_co2_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}
xgb_co2_model = xgb.XGBRegressor(**xgb_co2_params)

print("Training XGBoost for CO2 Prediction...")
xgb_co2_model.fit(X_train, y_train)
print("XGBoost CO2 Prediction Training completed!")

Training XGBoost for CO2 Prediction...
XGBoost CO2 Prediction Training completed!


In [7]:
# Score the CO2 model on the held-out rows
y_pred_xgb = xgb_co2_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

# The test-set mean gives the absolute errors a scale to be read against
mean_co2_test = y_test.mean()

print(f"XGBoost CO2 Prediction RMSE on Test Set: {rmse_xgb:.4f}")
print(f"XGBoost CO2 Prediction MAE on Test Set: {mae_xgb:.4f}")
print(f"Mean CO2 Emission: {mean_co2_test:.4f}")
print(f"RMSE as % of mean: {(rmse_xgb / mean_co2_test) * 100:.2f}%")
print(f"MAE as % of mean: {(mae_xgb / mean_co2_test) * 100:.2f}%")

XGBoost CO2 Prediction RMSE on Test Set: 10.6824
XGBoost CO2 Prediction MAE on Test Set: 2.8192
Mean CO2 Emission: 24.0177
RMSE as % of mean: 44.48%
MAE as % of mean: 11.74%


## 3. Cost Predictor Training

In [8]:
# Assemble the inputs for the cost model.
#   Predictors: Weight_kg, Distance_km, Shipping_Mode (still categorical here,
#               one-hot encoded afterwards), Material_Density, Cost_per_kg,
#               Product_Volume_m3
#   Response:   Cost_USD
cost_feature_cols = [
    'Weight_kg',
    'Distance_km',
    'Shipping_Mode',
    'Material_Density',
    'Cost_per_kg',
    'Product_Volume_m3',
]

# Work on copies so nothing done to X_cost / y_cost writes back into history_df
X_cost = history_df.loc[:, cost_feature_cols].copy()
y_cost = history_df.loc[:, 'Cost_USD'].copy()

# Quick sanity report: dimensions and per-column null counts
print(f"Cost Prediction Feature shape: {X_cost.shape}")
print(f"Cost Prediction Target shape: {y_cost.shape}")
print(f"\nMissing values in cost features:")
print(X_cost.isna().sum())

Cost Prediction Feature shape: (15000, 6)
Cost Prediction Target shape: (15000,)

Missing values in cost features:
Weight_kg            0
Distance_km          0
Shipping_Mode        0
Material_Density     0
Cost_per_kg          0
Product_Volume_m3    0
dtype: int64


In [9]:
# Turn Shipping_Mode into dummy columns for the cost model. A separate encoder
# instance is fitted here so the cost pipeline carries its own encoder.
encoder_cost = OneHotEncoder(sparse_output=False, drop='first')
shipping_encoded_cost = encoder_cost.fit_transform(X_cost[['Shipping_Mode']])
shipping_feature_names_cost = encoder_cost.get_feature_names_out(['Shipping_Mode'])

# Final design matrix: dummy columns first, then the numeric predictors in
# this exact order (the column order is what the model is trained on).
cost_numeric_cols = [
    'Weight_kg',
    'Distance_km',
    'Material_Density',
    'Cost_per_kg',
    'Product_Volume_m3',
]
X_cost_encoded = pd.concat(
    [
        pd.DataFrame(
            shipping_encoded_cost,
            columns=shipping_feature_names_cost,
            index=X_cost.index,
        ),
        X_cost[cost_numeric_cols],
    ],
    axis=1,
)

print("Cost Prediction Features after encoding:")
print(X_cost_encoded.columns.tolist())
print(f"\nCost Prediction Feature shape: {X_cost_encoded.shape}")
X_cost_encoded.head()

Cost Prediction Features after encoding:
['Shipping_Mode_Road', 'Weight_kg', 'Distance_km', 'Material_Density', 'Cost_per_kg', 'Product_Volume_m3']

Cost Prediction Feature shape: (15000, 6)


Unnamed: 0,Shipping_Mode_Road,Weight_kg,Distance_km,Material_Density,Cost_per_kg,Product_Volume_m3
0,0.0,0.82,1893,742,0.91,0.007056
1,0.0,0.29,2141,146,5.1,0.00017
2,1.0,12.26,1491,515,2.23,0.19032
3,1.0,11.56,530,515,2.23,0.19136
4,0.0,0.25,1587,742,0.91,0.000396


In [10]:
# Hold out 20% of the rows for testing the cost model; the fixed random_state
# makes the shuffle (and therefore the split) repeatable between runs.
X_cost_train, X_cost_test, y_cost_train, y_cost_test = train_test_split(
    X_cost_encoded,
    y_cost,
    test_size=0.2,
    random_state=42,
)

print(f"Cost Prediction Training set: {len(X_cost_train)} samples")
print(f"Cost Prediction Test set: {len(X_cost_test)} samples")

Cost Prediction Training set: 12000 samples
Cost Prediction Test set: 3000 samples


In [11]:
# Fit the Random Forest regressor that predicts shipping cost.
# Hyperparameters are gathered in one mapping so they are easy to review.
rf_cost_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_cost_model = RandomForestRegressor(**rf_cost_params)

print("Training RandomForestRegressor for Cost Prediction...")
rf_cost_model.fit(X_cost_train, y_cost_train)
print("Cost Prediction Training completed!")

Training RandomForestRegressor for Cost Prediction...
Cost Prediction Training completed!


In [12]:
# Score the cost model on the held-out rows
y_cost_pred = rf_cost_model.predict(X_cost_test)
cost_mae = mean_absolute_error(y_cost_test, y_cost_pred)
cost_rmse = np.sqrt(mean_squared_error(y_cost_test, y_cost_pred))

# The test-set mean gives the absolute errors a scale to be read against
mean_cost_test = y_cost_test.mean()

print(f"Cost Prediction RMSE on Test Set: ${cost_rmse:.4f}")
print(f"Cost Prediction MAE on Test Set: ${cost_mae:.4f}")
print(f"Mean Cost: ${mean_cost_test:.4f}")
print(f"RMSE as % of mean: {(cost_rmse / mean_cost_test) * 100:.2f}%")
print(f"MAE as % of mean: {(cost_mae / mean_cost_test) * 100:.2f}%")

Cost Prediction RMSE on Test Set: $0.1567
Cost Prediction MAE on Test Set: $0.1240
Mean Cost: $12.7634
RMSE as % of mean: 1.23%
MAE as % of mean: 0.97%


## 4. Save Models

In [13]:
def save_models_for_deployment(deployment_dir="deployment_models"):
    """Save all trained models and required components for web deployment.

    Reads the notebook-level objects created by the earlier cells
    (``xgb_co2_model``, ``rf_cost_model``, ``encoder``, ``encoder_cost``,
    ``X_encoded``, ``X_cost_encoded``, ``materials_df`` and the four
    evaluation metrics) and writes them, together with a metadata dict, a
    ``requirements.txt`` and a ``README.md``, into ``deployment_dir``.

    Parameters
    ----------
    deployment_dir : str, optional
        Directory that receives the artifacts; created when it does not
        exist. Defaults to ``"deployment_models"``.

    Returns
    -------
    None

    Raises
    ------
    NameError
        If one of the notebook-level objects listed above is not defined
        (i.e. an earlier cell was not run).
    OSError
        If the directory cannot be created or a file cannot be written.
    """
    # Local import: the notebook only imports sklearn submodules, and the
    # package itself is needed solely to record its version below.
    import sklearn

    print("=" * 80)
    print("üíæ SAVING MODELS FOR WEB DEPLOYMENT")
    print("=" * 80)

    # Create deployment directory. exist_ok avoids the check-then-create race;
    # the "created" message is still printed only for a new directory.
    directory_existed = os.path.isdir(deployment_dir)
    os.makedirs(deployment_dir, exist_ok=True)
    if not directory_existed:
        print(f"üìÅ Created directory: {deployment_dir}")

    # Column order each model was trained on; inference must reproduce it.
    co2_feature_order = X_encoded.columns.tolist()
    cost_feature_order = X_cost_encoded.columns.tolist()

    # Build the metadata before anything is written, so a missing metric or
    # column aborts the export instead of leaving a half-written package.
    metadata = {
        'model_version': '1.0',
        'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'co2_model_type': 'XGBoost Regressor',
        'cost_model_type': 'Random Forest Regressor',
        'co2_rmse': float(rmse_xgb),
        'co2_mae': float(mae_xgb),
        'cost_rmse': float(cost_rmse),
        'cost_mae': float(cost_mae),
        'total_materials': len(materials_df),
        'categories': materials_df['Category'].unique().tolist(),
        'feature_order_co2': co2_feature_order,
        'feature_order_cost': cost_feature_order
    }

    # (label used in the status message, file name, object to persist).
    # NOTE(review): the two "*_label_encoder" files actually hold fitted
    # OneHotEncoder objects; the file names are kept unchanged because the
    # deployment code presumably loads them by these names - confirm before
    # renaming.
    artifacts = [
        ("CO2 Prediction Model", "co2_prediction_model.joblib", xgb_co2_model),
        ("Cost Prediction Model", "cost_prediction_model.joblib", rf_cost_model),
        ("CO2 Label Encoder", "co2_label_encoder.joblib", encoder),
        ("Cost Label Encoder", "cost_label_encoder.joblib", encoder_cost),
        ("CO2 Feature Order", "co2_feature_order.joblib", co2_feature_order),
        ("Cost Feature Order", "cost_feature_order.joblib", cost_feature_order),
        ("Materials Database", "materials_database.joblib", materials_df),
        ("Model Metadata", "model_metadata.joblib", metadata),
    ]
    for label, file_name, artifact in artifacts:
        artifact_path = os.path.join(deployment_dir, file_name)
        joblib.dump(artifact, artifact_path)
        print(f"‚úÖ {label} saved: {artifact_path}")

    # Create requirements.txt for deployment.
    # The joblib files are pickles, which are only guaranteed to load with the
    # library versions that produced them, so the training-time versions are
    # pinned exactly. (The previous "scikit-learn>=1.0.0" also allowed versions
    # older than 1.2, which do not know OneHotEncoder's sparse_output argument.)
    requirements = [
        f"scikit-learn=={sklearn.__version__}",
        f"xgboost=={xgb.__version__}",
        f"pandas=={pd.__version__}",
        f"numpy=={np.__version__}",
        f"joblib=={joblib.__version__}",
        "flask>=2.0.0"
    ]

    requirements_path = os.path.join(deployment_dir, "requirements.txt")
    with open(requirements_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(requirements) + '\n')
    print(f"‚úÖ Requirements file created: {requirements_path}")

    readme_path = os.path.join(deployment_dir, "README.md")
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write("Model artifacts for Eco Packaging Recommendation")
    print(f"‚úÖ README documentation created: {readme_path}")

    print(f"\nüéâ DEPLOYMENT PACKAGE CREATED SUCCESSFULLY!")
    print(f"üìÅ Location: {deployment_dir}/")

# Save all models and deployment files
save_models_for_deployment()

üíæ SAVING MODELS FOR WEB DEPLOYMENT
‚úÖ CO2 Prediction Model saved: deployment_models\co2_prediction_model.joblib
‚úÖ Cost Prediction Model saved: deployment_models\cost_prediction_model.joblib
‚úÖ CO2 Label Encoder saved: deployment_models\co2_label_encoder.joblib
‚úÖ Cost Label Encoder saved: deployment_models\cost_label_encoder.joblib
‚úÖ CO2 Feature Order saved: deployment_models\co2_feature_order.joblib
‚úÖ Cost Feature Order saved: deployment_models\cost_feature_order.joblib
‚úÖ Materials Database saved: deployment_models\materials_database.joblib
‚úÖ Model Metadata saved: deployment_models\model_metadata.joblib
‚úÖ Requirements file created: deployment_models\requirements.txt
‚úÖ README documentation created: deployment_models\README.md

üéâ DEPLOYMENT PACKAGE CREATED SUCCESSFULLY!
üìÅ Location: deployment_models/
