# Project Title: Ecopack AI

### Import Libraries

In [66]:
import pandas as pd
import numpy as np
import warnings
import random
import joblib
import optuna
from optuna.samplers import TPESampler
from pathlib import Path
import shap
shap.initjs()   # Optional but useful for interactive plots

# ======================
# Reproducibility
# ======================
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# ======================
# Sklearn
# ======================
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    median_absolute_error,
    r2_score,
    make_scorer
)

from sklearn.inspection import permutation_importance

# ======================
# Models
# ======================
from xgboost import XGBRegressor

# ======================
# Statistics
# ======================
from scipy.stats import spearmanr
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ======================
# Visualization
# ======================
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

pio.renderers.default = "vscode"

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)


In [67]:
CONFIG = {
    # File paths
    'input_file': r"D:\Profession\Internship\Infosys\ecopack.xlsx",
    'output_file': r"D:\Profession\Internship\Infosys\cleaned_ecopack.xlsx",
    
    # Target variables for COST and CO2 prediction
    'target_vars': ['Cost_Efficiency_Index', 'CO2_Impact_Index'],

    'test_size': 0.2,
    'random_state': 42,
    
    # Data Wrangling
    'missing_threshold': 70,
    'numeric_impute_method': 'median',
    'categorical_impute_method': 'mode',
    'handle_duplicates': True,
    
    # Preprocessing
    'outlier_method': 'cap',
    'scaling_method': 'standard',  # 'standard', 'minmax', 'robust'
}


### Stage 1: Data Wrangling

In [68]:
print("="*100)
print("STAGE 1: DATA WRANGLING")
print("="*100)

print("\n Loading Data...")
df = pd.read_excel(CONFIG['input_file'])
print(f"Data loaded: {df.shape[0]} rows x {df.shape[1]} columns")

original_shape = df.shape
original_missing = df.isnull().sum().sum()

# Fix code column
if 'code' in df.columns:
    df['code'] = df['code'].astype(str)

STAGE 1: DATA WRANGLING

 Loading Data...
Data loaded: 29473 rows x 22 columns


In [69]:
print("\n Initial Data Overview:")
df.head()


 Initial Data Overview:


Unnamed: 0,code,product_quantity,countries_tags,categories_tags,food_group,agribalyse_food_code,agribalyse_food_name,number_of_units,shape,material,parent_material,recycling,strength,weight_capacity,biodegradability_score,co2_emission_score,recyclability_percent,weight_measured,quantity_per_unit,CO2_Impact_Index,Cost_Efficiency_Index,Suitability_Score
0,3257980112590.0,175.0,france,"snacks,sweet-snacks,biscuits-and-cakes,biscuit...",biscuits-and-cakes,24430.0,"Biscuit (cookie), sponge fingers or Lady fingers",1.0,box,cardboard,paper-or-cardboard,,Medium,25.0,90.0,1.2,85.0,37.73,,6.7914,19.230769,61.007951
1,3257980112590.0,175.0,france,"snacks,sweet-snacks,biscuits-and-cakes,biscuit...",biscuits-and-cakes,24430.0,"Biscuit (cookie), sponge fingers or Lady fingers",3.0,bag,plastic,plastic,,Medium,30.0,10.0,2.5,20.0,2.2,,4.4,11.538462,11.136302
2,8002270014901.0,1000.0,"belgium,france,italy,luxembourg,netherlands,sw...","beverages,carbonated-drinks,waters,spring-wate...",unsweetened-beverages,18430.0,"Water, bottled",1.0,bottle,pet-1-polyethylene-terephthalate,plastic,recycle,,,,,,34.33,,,,
3,8002270014901.0,1000.0,"belgium,france,italy,luxembourg,netherlands,sw...","beverages,carbonated-drinks,waters,spring-wate...",unsweetened-beverages,18430.0,"Water, bottled",1.0,bottle-cap,hdpe-2-high-density-polyethylene,plastic,recycle,,,,,,2.35,,,,
4,8002270014901.0,1000.0,"belgium,france,italy,luxembourg,netherlands,sw...","beverages,carbonated-drinks,waters,spring-wate...",unsweetened-beverages,18430.0,"Water, bottled",1.0,label,paper,paper-or-cardboard,recycle,Low,10.0,95.0,0.9,90.0,1.9,,0.171,10.0,64.657812


In [70]:
print("\n Missing Value Analysis")
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum(),
    'Missing_Percent': (df.isnull().sum() / len(df)) * 100
}).sort_values('Missing_Percent', ascending=False)

missing_summary = missing_summary[missing_summary['Missing_Count'] > 0]
print(f"\nColumns with missing values: {len(missing_summary)}")
print(missing_summary.to_string(index=False))


 Missing Value Analysis

Columns with missing values: 20
                Column  Missing_Count  Missing_Percent
     quantity_per_unit          25865        87.758287
       weight_measured          22060        74.848166
             recycling          20925        70.997184
      CO2_Impact_Index          18889        64.089166
     Suitability_Score          18889        64.089166
       weight_capacity          18187        61.707325
              strength          18187        61.707325
 Cost_Efficiency_Index          18187        61.707325
 recyclability_percent          18187        61.707325
    co2_emission_score          18187        61.707325
biodegradability_score          18187        61.707325
  agribalyse_food_code          17281        58.633325
  agribalyse_food_name          17146        58.175279
      product_quantity          10559        35.826010
            food_group           8124        27.564211
       categories_tags           5357        18.175958
       

In [71]:
print(f"\n Strategic Handling of Critical Columns")

print("Strategy: Keep weight_measured rows, drop quantity_per_unit, impute recycling")
print("Reason: Avoiding 90% data loss while preserving essential features\n")

initial_rows = len(df)
print(f"Before filtering: {initial_rows:,} rows")

# Step 1: Keep only rows where weight_measured is present (critical for CO2)
if 'weight_measured' in df.columns:
    df = df[df['weight_measured'].notna()]
    print(f"After keeping weight_measured rows: {len(df):,} rows")
    print(f"Rows removed: {initial_rows - len(df):,} ({((initial_rows - len(df))/initial_rows)*100:.2f}%)")
    print(f"Data retention: {(len(df)/initial_rows)*100:.2f}%")

# Step 2: Drop quantity_per_unit column (87% missing, can be derived)
if 'quantity_per_unit' in df.columns:
    df = df.drop(columns=['quantity_per_unit'])
    print(f"\nDropped 'quantity_per_unit' column (87.76% missing)")
    print("Note: Will create derived feature from product_quantity / number_of_units")


 Strategic Handling of Critical Columns
Strategy: Keep weight_measured rows, drop quantity_per_unit, impute recycling
Reason: Avoiding 90% data loss while preserving essential features

Before filtering: 29,473 rows
After keeping weight_measured rows: 7,413 rows
Rows removed: 22,060 (74.85%)
Data retention: 25.15%

Dropped 'quantity_per_unit' column (87.76% missing)
Note: Will create derived feature from product_quantity / number_of_units


In [72]:
print("\n Identifying Column Types and Fixing Data Types")

# Fix agribalyse_food_code - convert to numeric
if 'agribalyse_food_code' in df.columns:
    df['agribalyse_food_code'] = pd.to_numeric(df['agribalyse_food_code'], errors='coerce')
    df['agribalyse_food_code'].fillna(df['agribalyse_food_code'].median(), inplace=True)
    print("Fixed: agribalyse_food_code converted to numeric")

# Define column lists based on actual data structure
categorical_features = ['countries_tags', 'categories_tags', 'food_group', 
                       'agribalyse_food_name', 'shape', 'material', 
                       'parent_material', 'recycling', 'strength']

numeric_features = ['product_quantity', 'agribalyse_food_code', 'number_of_units',
                   'weight_capacity', 'biodegradability_score', 'co2_emission_score',
                   'recyclability_percent', 'weight_measured']

id_cols = ['code']
target_cols = ['CO2_Impact_Index', 'Cost_Efficiency_Index', 'Suitability_Score']

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")
print(f"Target variables: {target_cols}")
print(f"ID columns: {id_cols}")


 Identifying Column Types and Fixing Data Types
Fixed: agribalyse_food_code converted to numeric

Numeric features: 8
Categorical features: 9
Target variables: ['CO2_Impact_Index', 'Cost_Efficiency_Index', 'Suitability_Score']
ID columns: ['code']


In [73]:
print(f"\n Intelligent Imputation Strategy")

# Group 1: High Priority - Target Variables (61.7% missing after filtering)
target_missing_cols = ['CO2_Impact_Index', 'Cost_Efficiency_Index', 'Suitability_Score']
print(f"\n--- Group 1: Target Variables ---")
for col in target_missing_cols:
    if col in df.columns:
        missing = df[col].isnull().sum()
        if missing > 0:
            # Drop rows with missing targets (cannot train without targets)
            df = df[df[col].notna()]
            print(f"{col}: Dropped {missing} rows with missing targets")

print(f"Rows after target filtering: {len(df):,}")

# Group 2: Packaging Physical Properties (61.7% missing)
physical_cols = ['strength', 'weight_capacity', 'biodegradability_score', 
                 'co2_emission_score', 'recyclability_percent']
print(f"\n--- Group 2: Physical Properties (Impute by Material+Shape) ---")

for col in physical_cols:
    if col in df.columns:
        missing_before = df[col].isnull().sum()
        if missing_before > 0:
            # Impute by material and shape groups
            if 'material' in df.columns and 'shape' in df.columns:
                df[col] = df.groupby(['material', 'shape'])[col].transform(
                    lambda x: x.fillna(x.median() if x.dtype in ['float64', 'int64'] else x.mode()[0] if len(x.mode()) > 0 else x.median())
                )
            # If still missing, impute by material only
            if df[col].isnull().sum() > 0 and 'material' in df.columns:
                df[col] = df.groupby('material')[col].transform(
                    lambda x: x.fillna(x.median() if x.dtype in ['float64', 'int64'] else x.mode()[0] if len(x.mode()) > 0 else x.median())
                )
            # Final fallback: global median
            if df[col].isnull().sum() > 0:
                df[col].fillna(df[col].median(), inplace=True)
            
            missing_after = df[col].isnull().sum()
            print(f"  {col}: {missing_before} -> {missing_after} missing")

# Group 3: Recycling (categorical, impute by material)
print(f"\n--- Group 3: Recycling Information ---")
if 'recycling' in df.columns:
    missing_before = df['recycling'].isnull().sum()
    if missing_before > 0:
        # Impute by parent_material, then material
        if 'parent_material' in df.columns:
            df['recycling'] = df.groupby('parent_material')['recycling'].transform(
                lambda x: x.fillna(x.mode()[0] if len(x.mode()) > 0 else 'Unknown')
            )
        if df['recycling'].isnull().sum() > 0 and 'material' in df.columns:
            df['recycling'] = df.groupby('material')['recycling'].transform(
                lambda x: x.fillna(x.mode()[0] if len(x.mode()) > 0 else 'Unknown')
            )
        # Final fallback
        df['recycling'].fillna('Unknown', inplace=True)
        missing_after = df['recycling'].isnull().sum()
        print(f"  recycling: {missing_before} -> {missing_after} missing")

# Group 4: Food Information (agribalyse codes/names - impute by food_group)
food_info_cols = ['agribalyse_food_code', 'agribalyse_food_name']
print(f"\n--- Group 4: Food Database Information ---")
for col in food_info_cols:
    if col in df.columns:
        missing_before = df[col].isnull().sum()
        if missing_before > 0:
            if 'food_group' in df.columns:
                df[col] = df.groupby('food_group')[col].transform(
                    lambda x: x.fillna(x.mode()[0] if len(x.mode()) > 0 else 'Unknown')
                )
            # Fallback to most common value
            if df[col].isnull().sum() > 0:
                mode_val = df[col].mode()
                fill_val = mode_val[0] if len(mode_val) > 0 else 'Unknown'
                df[col].fillna(fill_val, inplace=True)
            missing_after = df[col].isnull().sum()
            print(f"  {col}: {missing_before} -> {missing_after} missing")

# Group 5: Product Quantity (impute by food_group + material)
print(f"\n--- Group 5: Product Quantity ---")
if 'product_quantity' in df.columns:
    missing_before = df['product_quantity'].isnull().sum()
    if missing_before > 0:
        if 'food_group' in df.columns and 'material' in df.columns:
            df['product_quantity'] = df.groupby(['food_group', 'material'])['product_quantity'].transform(
                lambda x: x.fillna(x.median())
            )
        if df['product_quantity'].isnull().sum() > 0:
            df['product_quantity'].fillna(df['product_quantity'].median(), inplace=True)
        missing_after = df['product_quantity'].isnull().sum()
        print(f"  product_quantity: {missing_before} -> {missing_after} missing")

# Group 6: Food Group (categorical - impute by categories_tags)
print(f"\n--- Group 6: Food Group ---")
if 'food_group' in df.columns:
    missing_before = df['food_group'].isnull().sum()
    if missing_before > 0:
        mode_val = df['food_group'].mode()
        fill_val = mode_val[0] if len(mode_val) > 0 else 'Unknown'
        df['food_group'].fillna(fill_val, inplace=True)
        missing_after = df['food_group'].isnull().sum()
        print(f"  food_group: {missing_before} -> {missing_after} missing")

# Group 7: Categories Tags
print(f"\n--- Group 7: Categories Tags ---")
if 'categories_tags' in df.columns:
    missing_before = df['categories_tags'].isnull().sum()
    if missing_before > 0:
        df['categories_tags'].fillna('uncategorized', inplace=True)
        missing_after = df['categories_tags'].isnull().sum()
        print(f"  categories_tags: {missing_before} -> {missing_after} missing")

# Group 8: Number of Units (low missing - simple imputation)
print(f"\n--- Group 8: Number of Units ---")
if 'number_of_units' in df.columns:
    missing_before = df['number_of_units'].isnull().sum()
    if missing_before > 0:
        df['number_of_units'].fillna(1, inplace=True)  # Default to 1 unit
        missing_after = df['number_of_units'].isnull().sum()
        print(f"  number_of_units: {missing_before} -> {missing_after} missing (filled with 1)")

# Group 9: Material and Shape (very low missing)
print(f"\n--- Group 9: Material and Shape ---")
for col in ['material', 'shape']:
    if col in df.columns:
        missing_before = df[col].isnull().sum()
        if missing_before > 0:
            mode_val = df[col].mode()
            fill_val = mode_val[0] if len(mode_val) > 0 else 'Unknown'
            df[col].fillna(fill_val, inplace=True)
            missing_after = df[col].isnull().sum()
            print(f"  {col}: {missing_before} -> {missing_after} missing")

# Group 10: Countries Tags (very low missing)
print(f"\n--- Group 10: Countries Tags ---")
if 'countries_tags' in df.columns:
    missing_before = df['countries_tags'].isnull().sum()
    if missing_before > 0:
        mode_val = df['countries_tags'].mode()
        fill_val = mode_val[0] if len(mode_val) > 0 else 'Unknown'
        df['countries_tags'].fillna(fill_val, inplace=True)
        missing_after = df['countries_tags'].isnull().sum()
        print(f"  countries_tags: {missing_before} -> {missing_after} missing")

# Final check
remaining_missing = df.isnull().sum().sum()
print(f"\n{'='*80}")
print(f"IMPUTATION COMPLETE")
print(f"Total remaining missing values: {remaining_missing}")
print(f"Final dataset size: {len(df):,} rows x {df.shape[1]} columns")
print(f"{'='*80}")



 Intelligent Imputation Strategy

--- Group 1: Target Variables ---
CO2_Impact_Index: Dropped 2370 rows with missing targets
Rows after target filtering: 5,043

--- Group 2: Physical Properties (Impute by Material+Shape) ---

--- Group 3: Recycling Information ---
  recycling: 1303 -> 0 missing

--- Group 4: Food Database Information ---
  agribalyse_food_name: 2529 -> 0 missing

--- Group 5: Product Quantity ---
  product_quantity: 374 -> 0 missing

--- Group 6: Food Group ---
  food_group: 456 -> 0 missing

--- Group 7: Categories Tags ---
  categories_tags: 66 -> 0 missing

--- Group 8: Number of Units ---
  number_of_units: 151 -> 0 missing (filled with 1)

--- Group 9: Material and Shape ---
  shape: 19 -> 0 missing

--- Group 10: Countries Tags ---
  countries_tags: 8 -> 0 missing

IMPUTATION COMPLETE
Total remaining missing values: 0
Final dataset size: 5,043 rows x 21 columns


In [74]:
print(f"\n Handling Duplicates")
initial_rows = len(df)
duplicates = df.duplicated().sum()
print(f"Duplicate rows found: {duplicates}")

if duplicates > 0:
    df = df.drop_duplicates(keep='first')
    print(f"Removed {initial_rows - len(df)} duplicate rows")

print(f"\nData Wrangling Complete: {df.shape[0]} rows x {df.shape[1]} columns")



 Handling Duplicates
Duplicate rows found: 2
Removed 2 duplicate rows

Data Wrangling Complete: 5041 rows x 21 columns


In [75]:
def standardize_recycling_labels(df):
    """
    Comprehensive translation of multi-language recycling terms to English
    Covers: German, French, Spanish, Italian, Swedish, Portuguese, Dutch, Greek, Russian
    """
    print("\n" + "="*100)
    print("RECYCLING STANDARDIZATION: TRANSLATING MULTI-LANGUAGE TERMS")
    print("="*100)
    
    if 'recycling' not in df.columns:
        print("⚠ 'recycling' column not found")
        return df
    
    df_clean = df.copy()
    
    # ========================================================================
    # COMPREHENSIVE TRANSLATION MAP (ALL LANGUAGES)
    # ========================================================================
    
    translation_map = {
        # ===== GERMAN (DE:) =====
        'de:Altglas': 'Recycle Glass',
        'de:Altpapier': 'Recycle Paper',
        'de:Einwegpfand': 'Single-use Deposit',
        'de:Gelbe Tonne': 'Recycle Plastic',
        'de:Gelber Sack': 'Recycle Plastic',
        'de:gelbe Tonne': 'Recycle Plastic',
        'de:Glaskontainer': 'Recycle Glass',
        'de:Mehrweg': 'Reusable',
        'de:Papier Tonne': 'Recycle Paper',
        'de:Papiermüll': 'Recycle Paper',
        'de:Pfandflasche': 'Deposit Return',
        'de:Zurückgeben': 'Return to Store',
        'de:Siegelfolie': 'Seal Film - Recycle Plastic',
        'de:Glas': 'Recycle Glass',
        'de:Mehrwegglas': 'Reusable Glass',
        'de:Twist-Off-Deckel 82mm': 'Recycle Metal Cap',
        'Deutschland': 'Recyclable',
        'Klebeband': 'Discard Tape',
        
        # ===== FRENCH (FR:) =====
        'fr:tri': 'Sort for Recycling',
        'fr:triman': 'Recyclable',
        'fr:Triman': 'Recyclable',
        'fr:conteneur': 'Container',
        'fr:oui': 'Yes',
        'fr:Oui': 'Yes',
        'fr:non': 'No',
        'fr:à trier': 'To Sort',
        'fr:à recyler': 'To Recycle',
        'fr:Néant': 'None',
        'fr:Présente': 'Present',
        'fr:21 PAP': 'Recycle Paper',
        'fr:poubelle': 'Trash',
        'fr:Cône': 'Cone - Recyclable',
        'fr:bagues': 'Rings - Recycle',
        'fr:Bague': 'Ring - Recycle',
        'fr:bague': 'Ring - Recycle',
        'fr:Bagues': 'Rings - Recycle',
        'fr:bouchon-mecanique': 'Mechanical Cap - Recycle',
        'fr:bouchon-verseur': 'Pouring Cap - Recycle',
        'fr:agrafe': 'Staple - Recycle Metal',
        'fr:Bouteille et bouchon': 'Bottle and Cap - Recycle',
        'fr:sachet de thé': 'Tea Bag - Compost',
        'fr:Boîte et opercule': 'Box and Seal - Recycle',
        'fr:Carton ondulé': 'Corrugated Cardboard - Recycle',
        'fr:cassolette': 'Small Dish - Recycle',
        'fr:Rectangle': 'Rectangle - Recyclable',
        'fr:rectangulaire': 'Rectangular - Recyclable',
        'fr:carré': 'Square - Recyclable',
        'fr:Etuit': 'Case - Recycle',
        'fr:Sachet refermable': 'Resealable Bag - Recycle',
        'fr:Capsule et fil fer': 'Cap and Wire - Recycle',
        'fr:Puisette': 'Scoop - Recycle',
        'fr:cavalier carton': 'Cardboard Rider - Recycle',
        'fr:sachet transparent rectangulaire': 'Transparent Rectangular Bag - Recycle',
        'fr:8 Alvéoles': '8 Cells - Recycle',
        'fr:Assembleur': 'Assembler - Recycle',
        'fr:Sachet plastique': 'Plastic Bag - Recycle',
        'fr:couverture': 'Cover - Recycle',
        'fr:mousseline': 'Muslin - Recycle Paper',
        'fr:Agrafes métalliques': 'Metal Staples - Recycle Metal',
        'fr:Boîte parallélépipèdique': 'Rectangular Box - Recycle',
        'fr:Couvercle alu rectangulaire': 'Rectangular Aluminum Lid - Recycle',
        'fr:alvéole': 'Cell - Recycle',
        'fr:Papillote': 'Foil - Recycle',
        'fr:Oper': 'Seal - Discard',
        'fr:sachet rectangulaire': 'Rectangular Bag - Recycle',
        'fr:ronde': 'Round - Recyclable',
        'fr:Sachet et robinet': 'Bag and Tap - Recycle',
        'fr:Porte bouteille': 'Bottle Holder - Recycle',
        'fr:Cerclage': 'Strapping - Recycle',
        'fr:Couche': 'Layer - Recycle',
        'fr:Sachet plastique à jeter': 'Plastic Bag - Discard',
        'fr:Etui carton à recycler': 'Cardboard Case - Recycle',
        'fr:sachet bac de tri jaune': 'Yellow Bin Bag - Recycle',
        'fr:boîte carton et sachet plastique à recycler': 'Cardboard Box and Plastic Bag - Recycle',
        'fr:clip poubelle sachet recyclable': 'Recyclable Bag Clip',
        'fr:Emballage à déposer dans le bac de tri': 'Place in Sorting Bin',
        'fr:Sachet dans le bac à tri': 'Bag in Sorting Bin',
        'fr:Sachet dans le bac de tri': 'Bag in Sorting Bin',
        'fr:soja-de-france': 'French Soy - Recyclable',
        'fr:belgique': 'Belgium - Recyclable',
        'fr:engleterre': 'England - Recyclable',
        
        # ===== SPANISH (ES:) =====
        'es:Contenedor amarillo': 'Yellow Container - Recycle Plastic',
        'es:Contendor amarillo': 'Yellow Container - Recycle Plastic',
        'es:Contenedor amatillo': 'Yellow Container - Recycle Plastic',
        'es:ecoponto-amarelo': 'Yellow Recycling Point',
        'es:ecoponto-verde': 'Green Recycling Point',
        'es:Compostable EN 13432': 'Compostable',
        'es:Al Amarillo': 'To Yellow Container - Recycle',
        'es:Reciclar': 'Recycle',
        'es:Vidrio, Metal': 'Glass, Metal - Recycle',
        'es:Desechar': 'Discard',
        'es:envase plástico': 'Plastic Container - Recycle',
        'es:Si': 'Yes',
        'es:Reutilizar': 'Reuse',
        'es:Tapa': 'Lid - Recycle',
        'es:Bandeja': 'Tray - Recycle',
        'es:etiqueta': 'Label - Discard',
        'es:Etiqueta': 'Label - Discard',
        'es:belgique': 'Belgium - Recyclable',
        
        # ===== ITALIAN (IT:) =====
        'it:7': 'Plastic Type 7 - Recycle',
        'it:raccolta plastica': 'Plastic Collection - Recycle',
        'it:alluminio': 'Aluminum - Recycle',
        'it:carta, plastica': 'Paper, Plastic - Recycle',
        'raccolta carta': 'Paper Collection - Recycle',
        
        # ===== SWEDISH (SV:) =====
        'sv:Förpackningen sorteras som plastförpackning': 'Sort as Plastic',
        'sv:Förpackningen sorteras som pappersförpackning': 'Sort as Paper',
        'sv:Innerförpackningen sorteras som papper': 'Inner Package as Paper',
        'sv:Sorteras som pappersförpackning': 'Sort as Paper',
        'sv:Sorteras som plastförpackning': 'Sort as Plastic',
        'sv:Sortera som plastförpackning': 'Sort as Plastic',
        'sv:Filmen sorteras som plast': 'Film as Plastic',
        
        # ===== PORTUGUESE (PT:) =====
        'pt:azul': 'Blue Bin - Recycle',
        
        # ===== DUTCH (NL:) =====
        'nl:pouche et bouchon in bac de tri': 'Pouch and Cap in Sorting Bin',
        'nl:eierdoos': 'Egg Box - Recycle',
        'nl:Flesje': 'Small Bottle - Recycle Glass',
        'nl:Belgie': 'Belgium - Recyclable',
        
        # ===== GREEK (EL:) =====
        'el:Διατηρήστε καθαρό το περιβάλλον': 'Keep Environment Clean',
        
        # ===== RUSSIAN (RU:) =====
        'ru:Этикетка': 'Label - Discard',
        
        # ===== COMPLEX PHRASES (ALL LANGUAGES) =====
        'Discard Seal, Recycle if Clean & Dry Jetez Fopercule recyclez si propre et sec PAPER PAPER PAPIER PAPIER BOX TRAY BOITE PLATEAU': 'Recycle if Clean',
        'Rinse tray, discard seal. Recycle tray in sorting bin': 'Rinse and Recycle',
        'Discard seal, rinse and recycle tray in sorting bin': 'Rinse and Recycle',
        'Recycle with bags at large supermarket': 'Recycle Plastic Bags',
        'Please recycle this container': 'Recyclable',
        '100% recyclable and 50% recycled': 'Fully Recyclable',
        'Recycle as paper': 'Recycle Paper',
        'Recycle or reuse': 'Recyclable or Reusable',
        'Recycle pot': 'Recyclable',
        'rinse and recycle as metal': 'Rinse and Recycle Metal',
        
        # ===== ENGLISH VARIATIONS =====
        'widely recyclef': 'Widely Recyclable',
        'widely recycled': 'Widely Recyclable',
        "don't recycle": 'Not Recyclable',
        "Don't recycle": 'Not Recyclable',
        'do not recycle': 'Not Recyclable',
        'Clean': 'Clean Before Recycling',
        'reuse': 'Reusable',
        'deposit-refunds': 'Deposit Return',
        'return-to-store': 'Return to Store',
        'return-pet-bottle-to-store': 'Return Bottle to Store',
        'recycle-in-store': 'Recycle in Store',
        'recycle-in-sorting-bin': 'Recyclable',
        'recycle-in-glass-bin': 'Recycle Glass',
        'recycle-with-plastics-metal-and-bricks': 'Recyclable',
        'recycle-with-plastics': 'Recycle Plastic',
        'recycle-in-paper-bin': 'Recycle Paper',
        'recycle-with-drink-cartons': 'Recycle Carton',
        'recycle-as-green-waste': 'Compost',
        'glass': 'Recycle Glass',
        'brique & bouchon': 'Recycle Carton and Cap',
        'à jeter': 'Discard',
        
        # ===== PACKAGING TYPES =====
        'Barquette': 'Tray - Recyclable',
        'Rectangle': 'Rectangle - Recyclable',
        'Rectangular Box': 'Recyclable',
        'Square': 'Square - Recyclable',
        'Outer sleeve': 'Outer Sleeve - Recycle',
        'Tray and seal': 'Tray - Recycle, Seal - Discard',
        'xx:cellophane': 'Cellophane - Recycle Plastic',
    }
    
    # Apply direct translations
    df_clean['recycling'] = df_clean['recycling'].replace(translation_map)
    
    # ========================================================================
    # PATTERN-BASED TRANSLATION (FOR TERMS NOT IN MAP)
    # ========================================================================
    
    def clean_recycling_term(text):
        if pd.isna(text):
            return 'Unknown'
        
        text_str = str(text).strip()
        text_lower = text_str.lower()
        
        # If already translated by map, return as-is
        if text_str in translation_map.values():
            return text_str
        
        # Handle prefixed terms (fr:, de:, es:, it:, etc.)
        if ':' in text_str and text_str.split(':')[0] in ['fr', 'de', 'es', 'it', 'sv', 'pt', 'nl', 'el', 'ru', 'xx']:
            # Remove prefix
            clean_text = text_str.split(':', 1)[1].strip()
            
            # Translate common words
            word_translations = {
                'boîte': 'box',
                'sachet': 'bag',
                'carton': 'cardboard',
                'plastique': 'plastic',
                'papier': 'paper',
                'verre': 'glass',
                'métal': 'metal',
                'aluminium': 'aluminum',
                'recycler': 'recycle',
                'trier': 'sort',
                'jeter': 'discard',
                'opercule': 'seal',
                'bouchon': 'cap',
                'etiquette': 'label',
                'film': 'film',
                'couvercle': 'lid',
            }
            
            for foreign_word, english_word in word_translations.items():
                if foreign_word in clean_text.lower():
                    return f'Recycle {english_word.title()}'
            
            # If no translation found, return cleaned version
            return f'Recyclable ({clean_text})'
        
        # Pattern matching for untranslated terms
        patterns = {
            'altglas': 'Recycle Glass',
            'glaskontainer': 'Recycle Glass',
            'altpapier': 'Recycle Paper',
            'gelbe': 'Recycle Plastic',
            'amarillo': 'Recycle Plastic',
            'plastique': 'Recycle Plastic',
            'mehrweg': 'Reusable',
            'brique': 'Recycle Carton',
            'rinse': 'Rinse and Recycle',
            'clean': 'Recycle if Clean',
            'return': 'Return to Store',
            'deposit': 'Deposit Return',
            'reuse': 'Reusable',
            'reutilizar': 'Reusable',
            'not': 'Not Recyclable',
            "don't": 'Not Recyclable',
            'non': 'Not Recyclable',
            'recycle': 'Recyclable',
            'discard': 'Discard',
            'jeter': 'Discard',
        }
        
        for pattern, result in patterns.items():
            if pattern in text_lower:
                return result
        
        return text_str
    
    df_clean['recycling'] = df_clean['recycling'].apply(clean_recycling_term)
    
    # ========================================================================
    # FINAL CONSOLIDATION INTO MAIN CATEGORIES
    # ========================================================================
    
    category_map = {
        # Recyclable materials
        'Recycle Glass': 'Recyclable',
        'Recycle Paper': 'Recyclable',
        'Recycle Plastic': 'Recyclable',
        'Recycle Carton': 'Recyclable',
        'Recycle Bottle': 'Recyclable',
        'Recycle Carton and Cap': 'Recyclable',
        'Recycle Box and Seal': 'Recyclable',
        'Fully Recyclable': 'Recyclable',
        'Recyclable or Reusable': 'Recyclable',
        'Recycle Plastic Bags': 'Recyclable',
        'Widely Recyclable': 'Recyclable',
        'Recycle Metal': 'Recyclable',
        'Rinse and Recycle Metal': 'Recyclable',
        'Recycle Seal Film - Recycle Plastic': 'Recyclable',
        'Recycle Metal Cap': 'Recyclable',
        'Recyclable (*)': 'Recyclable',  # Catch remaining prefixed terms
        'recycle': 'Recyclable',
        'recycle-in-sorting-bin': 'Recyclable',
        'recycle-in-glass-bin': 'Recyclable',
        'recycle-with-plastics-metal-and-bricks': 'Recyclable',
        'recycle-with-plastics': 'Recyclable',
        'recycle-in-paper-bin': 'Recyclable',
        'recycle-with-drink-cartons': 'Recyclable',
        'recycle-as-green-waste': 'Recyclable',
        
        # Conditional recycling
        'Rinse and Recycle': 'Recycle with Conditions',
        'Recycle if Clean': 'Recycle with Conditions',
        'Clean Before Recycling': 'Recycle with Conditions',
        'Sort for Recycling': 'Recycle with Conditions',
        'To Sort': 'Recycle with Conditions',
        'To Recycle': 'Recycle with Conditions',
        'Yellow Container - Recycle Plastic': 'Recycle with Conditions',
        'Yellow Recycling Point': 'Recycle with Conditions',
        'Green Recycling Point': 'Recycle with Conditions',
        'Plastic Collection - Recycle': 'Recycle with Conditions',
        'Paper Collection - Recycle': 'Recycle with Conditions',
        'Sort as Plastic': 'Recycle with Conditions',
        'Sort as Paper': 'Recycle with Conditions',
        'Blue Bin - Recycle': 'Recycle with Conditions',
        'Place in Sorting Bin': 'Recycle with Conditions',
        'Bag in Sorting Bin': 'Recycle with Conditions',
        'Pouch and Cap in Sorting Bin': 'Recycle with Conditions',
        
        # Special programs
        'Single-use Deposit': 'Deposit Return',
        'Deposit Return': 'Deposit Return',
        'Return to Store': 'Return to Store',
        'Return Bottle to Store': 'Return to Store',
        'Recycle in Store': 'Return to Store',
        'Reusable': 'Reusable',
        'Reusable Glass': 'Reusable',
        'Compost': 'Compost',
        'Compostable': 'Compost',
        
        # Not recyclable
        'Not Recyclable': 'Not Recyclable',
        'Discard': 'Not Recyclable',
        'discard': 'Not Recyclable',
        'Discard Tape': 'Not Recyclable',
        'No': 'Not Recyclable',
        'None': 'Not Recyclable',
        'Label - Discard': 'Not Recyclable',
        'Seal - Discard': 'Not Recyclable',
        'Trash': 'Not Recyclable',
        
        # Generic terms
        'Yes': 'Recyclable',
        'Present': 'Recyclable',
        'Container': 'Recyclable',
        'Keep Environment Clean': 'Recyclable',
    }
    
    df_clean['recycling'] = df_clean['recycling'].replace(category_map)
    
    # Catch remaining prefixed terms
    def final_cleanup(text):
        if pd.isna(text):
            return 'Recyclable'
        text_str = str(text)
        # If still has prefix, mark as recyclable
        if ':' in text_str and text_str.split(':')[0] in ['fr', 'de', 'es', 'it', 'sv', 'pt', 'nl', 'el', 'ru', 'xx']:
            return 'Recyclable'
        # If contains "Recycle" anywhere, it's recyclable
        if 'Recycle' in text_str or 'recycle' in text_str.lower():
            return 'Recyclable'
        return text_str
    
    df_clean['recycling'] = df_clean['recycling'].apply(final_cleanup)
    
    # Fill remaining with 'Recyclable' as default
    df_clean['recycling'] = df_clean['recycling'].fillna('Recyclable')
    
    # ========================================================================
    # SUMMARY
    # ========================================================================
    
    print(f"\n✓ Translation Complete:")
    print(f"  Original unique values: {df['recycling'].nunique()}")
    print(f"  Standardized values: {df_clean['recycling'].nunique()}")
    print(f"\n  Final Categories:")
    for cat, count in df_clean['recycling'].value_counts().items():
        print(f"    {cat:<30} {count:>6} ({count/len(df_clean)*100:>5.1f}%)")
    
    return df_clean

# Apply standardization
df = standardize_recycling_labels(df)


RECYCLING STANDARDIZATION: TRANSLATING MULTI-LANGUAGE TERMS

✓ Translation Complete:
  Original unique values: 94
  Standardized values: 11

  Final Categories:
    Recyclable                       4526 ( 89.8%)
    Not Recyclable                    455 (  9.0%)
    Return to Store                    25 (  0.5%)
    Reusable                           15 (  0.3%)
    Compost                             8 (  0.2%)
    Deposit Return                      6 (  0.1%)
    Reuse                               2 (  0.0%)
    Plastic Bag - Discard               1 (  0.0%)
    Inner Package as Paper              1 (  0.0%)
    Recyclable Bag Clip                 1 (  0.0%)
    Film as Plastic                     1 (  0.0%)


In [76]:
# ============================================================================
# SHAPE TRANSLATION & STANDARDIZATION
# ============================================================================

def standardize_shape_labels(df):
    """
    Translate multi-language shape terms to standardized English
    Covers: French, German, Spanish, Dutch, Russian
    """
    print("\n" + "="*100)
    print("SHAPE STANDARDIZATION: TRANSLATING MULTI-LANGUAGE TERMS")
    print("="*100)
    
    if 'shape' not in df.columns:
        print("⚠ 'shape' column not found")
        return df
    
    df_clean = df.copy()
    
    # ========================================================================
    # COMPREHENSIVE SHAPE TRANSLATION MAP
    # ========================================================================
    
    translation_map = {
        # ===== FRENCH (FR:) =====
        'fr:Cône': 'cone',
        'fr:bagues': 'ring',
        'fr:Bague': 'ring',
        'fr:bague': 'ring',
        'fr:Bagues': 'ring',
        'fr:bouchon-mecanique': 'cap',
        'fr:bouchon-verseur': 'pouring-cap',
        'fr:agrafe': 'staple',
        'fr:Bouteille et bouchon': 'bottle-and-cap',
        'fr:sachet de thé': 'tea-bag',
        'fr:Boîte et opercule': 'box-and-seal',
        'fr:Carton ondulé': 'corrugated-cardboard',
        'fr:cassolette': 'small-dish',
        'fr:Rectangle': 'rectangle',
        'fr:rectangulaire': 'rectangle',
        'fr:carré': 'square',
        'fr:Etuit': 'case',
        'fr:Sachet refermable': 'resealable-bag',
        'fr:Capsule et fil fer': 'cap-and-wire',
        'fr:Puisette': 'scoop',
        'fr:cavalier carton': 'cardboard-rider',
        'fr:sachet transparent rectangulaire': 'transparent-bag',
        'fr:8 Alvéoles': 'multi-cell-tray',
        'fr:Assembleur': 'fastener',
        'fr:Sachet plastique': 'plastic-bag',
        'fr:couverture': 'cover',
        'fr:mousseline': 'muslin-wrap',
        'fr:Agrafes métalliques': 'metal-staple',
        'fr:Boîte parallélépipèdique': 'rectangular-box',
        'fr:Couvercle alu rectangulaire': 'rectangular-lid',
        'fr:alvéole': 'cell',
        'fr:Papillote': 'foil-wrap',
        'fr:Oper': 'seal',
        'fr:sachet rectangulaire': 'rectangular-bag',
        'fr:ronde': 'round',
        'fr:Sachet et robinet': 'bag-with-tap',
        'fr:Porte bouteille': 'bottle-holder',
        'fr:Cerclage': 'strapping',
        'fr:Couche': 'layer',
        
        # ===== GERMAN (DE:) =====
        'de:Siegelfolie': 'seal-film',
        'de:Glas': 'glass-jar',
        'de:Mehrwegglas': 'reusable-glass',
        'de:Twist-Off-Deckel 82mm': 'twist-off-lid',
        
        # ===== SPANISH (ES:) =====
        'es:Tapa': 'lid',
        'es:Bandeja': 'tray',
        'es:etiqueta': 'label',
        'es:Etiqueta': 'label',
        
        # ===== DUTCH (NL:) =====
        'nl:eierdoos': 'egg-carton',
        'nl:Flesje': 'small-bottle',
        
        # ===== RUSSIAN (RU:) =====
        'ru:Этикетка': 'label',
        
        # ===== OTHER LANGUAGES =====
        'xx:cellophane': 'cellophane-wrap',
        
        # ===== ENGLISH VARIATIONS & STANDARDIZATION =====
        'Barquette': 'tray',
        'Rectangle': 'rectangle',
        'Square': 'square',
        'Outer sleeve': 'sleeve',
        'Tray and seal': 'tray-and-seal',
        'Rectangular Box': 'box',
        'resealable bag': 'resealable-bag',
        'individual-bag': 'bag',
        'individual-pot': 'pot',
        'individual-dose': 'sachet',
        'pizza-box': 'box',
        'small-bucket': 'bucket',
        'food-can': 'can',
        'drink-can': 'can',
        'aerosol-can': 'aerosol',
        'bottle-cap': 'cap',
        'wine-cork': 'cork',
        'coffee-capsule': 'capsule',
        'clamping-ring': 'ring',
        'wire-cage': 'cage',
        'wire-cage-and-cap': 'cage',
        'bubble-wrap': 'wrap',
        'drinking-straw': 'straw',
        'terrine-pot': 'pot',
        'pouch-flask': 'pouch',
        'bag-in-box': 'bag',
        'grouping-package': 'packaging',
        'protection-cover': 'cover',
        'neck-seal': 'seal',
        'jug-or-canister': 'jug',
        'lid-or-cap': 'lid',
        'Klebeband': 'tape',
    }
    
    # Apply direct translations
    df_clean['shape'] = df_clean['shape'].replace(translation_map)
    
    # ========================================================================
    # PATTERN-BASED TRANSLATION & CLEANUP
    # ========================================================================
    
    def clean_shape_term(text):
        if pd.isna(text):
            return 'unknown'
        
        text_str = str(text).strip().lower()
        
        # If already standardized, return
        if text_str in ['box', 'bag', 'bottle', 'can', 'jar', 'tray', 'pouch', 
                        'tube', 'wrapper', 'container', 'pot', 'film', 'seal',
                        'lid', 'cap', 'label', 'sleeve', 'net', 'basket',
                        'envelope', 'card', 'roll', 'stick', 'plate', 'bowl',
                        'bucket', 'cylinder', 'handle', 'wedge', 'tumbler',
                        'jug', 'spoon', 'fork', 'mold', 'strip', 'sheet',
                        'blister', 'vial', 'backing', 'fastener', 'tie',
                        'spout', 'aerosol', 'cork', 'capsule', 'ring',
                        'cage', 'wrap', 'straw', 'brick', 'packet']:
            return text_str
        
        # Remove language prefixes
        if ':' in text_str and text_str.split(':')[0] in ['fr', 'de', 'es', 'nl', 'it', 'ru', 'xx']:
            text_str = text_str.split(':', 1)[1].strip()
        
        # Translate common French/German/Spanish words
        word_map = {
            'boîte': 'box',
            'boite': 'box',
            'sachet': 'bag',
            'bouteille': 'bottle',
            'bouchon': 'cap',
            'couvercle': 'lid',
            'tapa': 'lid',
            'opercule': 'seal',
            'film': 'film',
            'etiquette': 'label',
            'barquette': 'tray',
            'bandeja': 'tray',
            'pot': 'pot',
            'tube': 'tube',
            'enveloppe': 'envelope',
            'emballage': 'packaging',
            'carton': 'box',
            'flasche': 'bottle',
            'deckel': 'lid',
            'glas': 'jar',
            'doos': 'box',
            'flesje': 'bottle',
        }
        
        for foreign, english in word_map.items():
            if foreign in text_str:
                return english
        
        # Handle compound terms
        if 'box' in text_str or 'boite' in text_str:
            return 'box'
        if 'bag' in text_str or 'sachet' in text_str or 'sac' in text_str:
            return 'bag'
        if 'bottle' in text_str or 'bouteille' in text_str:
            return 'bottle'
        if 'tray' in text_str or 'barquette' in text_str:
            return 'tray'
        if 'lid' in text_str or 'cap' in text_str or 'couvercle' in text_str:
            return 'lid'
        if 'seal' in text_str or 'opercule' in text_str:
            return 'seal'
        if 'wrap' in text_str or 'film' in text_str:
            return 'film'
        if 'label' in text_str or 'etiquette' in text_str:
            return 'label'
        if 'jar' in text_str or 'pot' in text_str:
            return 'jar'
        
        # If no match, return cleaned version
        return text_str.replace(' ', '-')
    
    df_clean['shape'] = df_clean['shape'].apply(clean_shape_term)
    
    # ========================================================================
    # FINAL CONSOLIDATION
    # ========================================================================
    
    consolidation_map = {
        # Consolidate similar shapes
        'rectangular-box': 'box',
        'rectangular-bag': 'bag',
        'transparent-bag': 'bag',
        'resealable-bag': 'bag',
        'plastic-bag': 'bag',
        'tea-bag': 'bag',
        'corrugated-cardboard': 'box',
        'rectangular': 'box',
        'rectangle': 'box',
        'square': 'box',
        'round': 'container',
        'cone': 'container',
        'small-dish': 'tray',
        'cell': 'tray',
        'multi-cell-tray': 'tray',
        'tray-and-seal': 'tray',
        'small-bottle': 'bottle',
        'bottle-and-cap': 'bottle',
        'glass-jar': 'jar',
        'reusable-glass': 'jar',
        'egg-carton': 'box',
        'rectangular-lid': 'lid',
        'twist-off-lid': 'lid',
        'pouring-cap': 'cap',
        'cap-and-wire': 'cap',
        'seal-film': 'film',
        'foil-wrap': 'film',
        'cellophane-wrap': 'film',
        'muslin-wrap': 'wrap',
        'metal-staple': 'fastener',
        'staple': 'fastener',
        'cardboard-rider': 'fastener',
        'bag-with-tap': 'pouch',
        'case': 'box',
        'cover': 'sleeve',
        'scoop': 'accessory',
        'bottle-holder': 'accessory',
        'strapping': 'fastener',
        'layer': 'film',
        'tape': 'fastener',
    }
    
    df_clean['shape'] = df_clean['shape'].replace(consolidation_map)
    
    # Fill remaining unknowns
    df_clean['shape'] = df_clean['shape'].fillna('unknown')
    df_clean['shape'] = df_clean['shape'].replace('', 'unknown')
    
    # ========================================================================
    # SUMMARY
    # ========================================================================
    
    print(f"\n✓ Translation Complete:")
    print(f"  Original unique values: {df['shape'].nunique()}")
    print(f"  Standardized values: {df_clean['shape'].nunique()}")
    print(f"\n  Top 20 Final Categories:")
    for cat, count in df_clean['shape'].value_counts().head(20).items():
        print(f"    {cat:<20} {count:>6} ({count/len(df_clean)*100:>5.1f}%)")
    
    return df_clean

# Apply shape standardization
df = standardize_shape_labels(df)



SHAPE STANDARDIZATION: TRANSLATING MULTI-LANGUAGE TERMS

✓ Translation Complete:
  Original unique values: 130
  Standardized values: 55

  Top 20 Final Categories:
    bag                     974 ( 19.3%)
    film                    454 (  9.0%)
    sleeve                  438 (  8.7%)
    box                     422 (  8.4%)
    lid                     353 (  7.0%)
    tray                    345 (  6.8%)
    seal                    330 (  6.5%)
    label                   280 (  5.6%)
    can                     201 (  4.0%)
    cap                     174 (  3.5%)
    pot                     162 (  3.2%)
    bottle                  127 (  2.5%)
    sheet                   103 (  2.0%)
    jar                     102 (  2.0%)
    wrapper                  94 (  1.9%)
    capsule                  92 (  1.8%)
    packaging                60 (  1.2%)
    packet                   54 (  1.1%)
    fastener                 36 (  0.7%)
    net                      31 (  0.6%)


In [77]:
for col in df.columns:
    uniques = df[col].dropna().unique()
    print(f"\n{col} ({len(uniques)} categories)")
    print(uniques)



code (3293 categories)
['3257980112590.0' '8002270014901.0' '3560070329441.0' ...
 '3560071490324.0' '6945494318535.0' '4316268688895.0']

product_quantity (284 categories)
[1.75000000e+02 1.00000000e+03 2.65000000e+02 3.30000000e+02
 5.50000000e+02 3.90000000e+01 1.50000000e+02 1.30000000e+02
 1.20000000e+02 3.00000000e+02 1.00000000e+02 1.25000000e+02
 3.94000000e+02 5.00000000e+02 1.70000000e+02 2.50000000e+02
 2.08000000e+02 6.00000000e+02 1.66500000e+02 4.00000000e+02
 1.69000000e+02 4.08000000e+02 2.25000000e+02 2.00000000e+02
 9.00000000e+00 9.00000000e+01 6.00000000e+01 4.30000000e+02
 1.60000000e+02 4.38000000e+02 3.40000000e+02 1.84000000e+02
 9.00000000e+02 4.50000000e+02 3.50000000e+02 8.00000000e+02
 2.00000000e+03 1.80000000e+02 0.00000000e+00 2.30000000e+02
 1.90000000e+02 7.50000000e+02 2.35000000e+02 4.75000000e+02
 2.80000000e+02 8.25000000e+02 7.50000000e+01 2.31000000e+02
 4.20000000e+02 2.20000000e+02 5.80000000e+02 8.00000000e+01
 4.30000000e+01 1.04000000e+02 4.

### Stage 2: Exploratory data analysis

In [78]:
print("\n" + "="*100)
print("STAGE 2: EXPLORATORY DATA ANALYSIS - 12+ VISUALIZATIONS")
print("="*100)

# =============================================================================
# CHART 1: Target Variables Distribution (Cost & CO2)
# =============================================================================
print("\n[Chart 1] Target Variables Distribution")
fig1 = make_subplots(rows=1, cols=2, subplot_titles=['Cost Efficiency Index', 'CO2 Impact Index'])

fig1.add_trace(go.Histogram(x=df['Cost_Efficiency_Index'].dropna(), name='Cost', 
                            marker_color='green', nbinsx=50), row=1, col=1)
fig1.add_trace(go.Histogram(x=df['CO2_Impact_Index'].dropna(), name='CO2',
                            marker_color='red', nbinsx=50), row=1, col=2)

fig1.update_layout(height=400, title_text="Distribution of Target Variables", showlegend=False)
fig1.show()



STAGE 2: EXPLORATORY DATA ANALYSIS - 12+ VISUALIZATIONS

[Chart 1] Target Variables Distribution


In [79]:
print("\n[Chart 2] Cost vs CO2 Relationship")
fig2 = px.scatter(df, x='CO2_Impact_Index', y='Cost_Efficiency_Index',
                  title='Cost Efficiency vs CO2 Impact',
                  labels={'CO2_Impact_Index': 'CO2 Impact Index', 
                         'Cost_Efficiency_Index': 'Cost Efficiency Index'},
                  color='CO2_Impact_Index', color_continuous_scale='RdYlGn_r')
fig2.show()



[Chart 2] Cost vs CO2 Relationship


In [80]:
print("\n[Chart 3] Material Type Impact on Cost & CO2")
if 'material' in df.columns:
    material_stats = df.groupby('material').agg({
        'Cost_Efficiency_Index': 'mean',
        'CO2_Impact_Index': 'mean'
    }).reset_index()
    
    fig3 = go.Figure()
    fig3.add_trace(go.Bar(x=material_stats['material'], y=material_stats['Cost_Efficiency_Index'],
                         name='Cost Efficiency', marker_color='green'))
    fig3.add_trace(go.Bar(x=material_stats['material'], y=material_stats['CO2_Impact_Index'],
                         name='CO2 Impact', marker_color='red'))
    fig3.update_layout(title='Average Cost & CO2 by Material Type', barmode='group', height=500)
    fig3.show()


[Chart 3] Material Type Impact on Cost & CO2


In [81]:
print("\n[Chart 4] Recycling Type Impact")
if 'recycling' in df.columns:
    recycling_stats = df.groupby('recycling').agg({
        'Cost_Efficiency_Index': 'mean',
        'CO2_Impact_Index': 'mean'
    }).reset_index()
    
    fig4 = px.bar(recycling_stats, x='recycling', y=['Cost_Efficiency_Index', 'CO2_Impact_Index'],
                  title='Cost & CO2 Impact by Recycling Type', barmode='group', height=500)
    fig4.show()


[Chart 4] Recycling Type Impact


In [82]:
print("\n[Chart 5] Package Shape Impact")

if 'shape' in df.columns:
    # Compute mean stats per shape
    shape_stats = df.groupby('shape').agg({
        'Cost_Efficiency_Index': 'mean',
        'CO2_Impact_Index': 'mean'
    }).reset_index()

    # Get the top 10 most frequent shapes
    top_shapes = df['shape'].value_counts().head(10).index

    # Create labels only for top 10
    shape_stats['label'] = shape_stats['shape'].where(
        shape_stats['shape'].isin(top_shapes), ""
    )

    fig5 = px.scatter(
        shape_stats,
        x='CO2_Impact_Index',
        y='Cost_Efficiency_Index',
        size=[50] * len(shape_stats),
        text='label',                # text only for top 10
        color='shape',               # legend for all shapes
        title='Cost vs CO2 by Package Shape',
        height=550
    )

    # Show text on top of points
    fig5.update_traces(textposition='top center')

    # Improve legend layout
    fig5.update_layout(
        legend_title_text="Package Shape",
        legend=dict(
            itemsizing='constant',
            title_font_size=14,
            font_size=12,
            orientation="v",
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=1.02
        )
    )

    fig5.show()



[Chart 5] Package Shape Impact


In [83]:
print("\n[Chart 6] Correlation Heatmap")
corr_cols = numeric_features + target_cols
corr_matrix = df[corr_cols].corr()

fig6 = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='RdBu',
    zmid=0,
    text=np.round(corr_matrix.values, 2),
    texttemplate='%{text}',
    textfont={"size": 8}
))
fig6.update_layout(title='Feature Correlation Matrix', height=800, width=900)
fig6.show()



[Chart 6] Correlation Heatmap


In [84]:
print("\n[Chart 7] Top Features Correlated with Cost")
cost_corr = df[numeric_features].corrwith(df['Cost_Efficiency_Index']).abs().sort_values(ascending=False).head(10)

fig7 = px.bar(x=cost_corr.values, y=cost_corr.index, orientation='h',
              title='Top 10 Features Correlated with Cost',
              labels={'x': 'Absolute Correlation', 'y': 'Feature'},
              color=cost_corr.values, color_continuous_scale='Greens')
fig7.show()


[Chart 7] Top Features Correlated with Cost


In [85]:
print("\n[Chart 8] Top Features Correlated with CO2")
co2_corr = df[numeric_features].corrwith(df['CO2_Impact_Index']).abs().sort_values(ascending=False).head(10)

fig8 = px.bar(x=co2_corr.values, y=co2_corr.index, orientation='h',
              title='Top 10 Features Correlated with CO2',
              labels={'x': 'Absolute Correlation', 'y': 'Feature'},
              color=co2_corr.values, color_continuous_scale='Reds')
fig8.show()


[Chart 8] Top Features Correlated with CO2


In [86]:
print("\n[Chart 9] Recyclability Impact")
if 'recyclability_percent' in df.columns:
    fig9 = make_subplots(rows=1, cols=2, 
                        subplot_titles=['Recyclability vs Cost', 'Recyclability vs CO2'])
    
    fig9.add_trace(go.Scatter(x=df['recyclability_percent'], y=df['Cost_Efficiency_Index'],
                             mode='markers', marker=dict(color='green', opacity=0.5),
                             name='Cost'), row=1, col=1)
    fig9.add_trace(go.Scatter(x=df['recyclability_percent'], y=df['CO2_Impact_Index'],
                             mode='markers', marker=dict(color='red', opacity=0.5),
                             name='CO2'), row=1, col=2)
    
    fig9.update_layout(height=400, title_text="Recyclability Impact on Cost & CO2")
    fig9.show()


[Chart 9] Recyclability Impact


In [87]:
print("\n[Chart 10] Weight Impact Analysis")
if 'weight_measured' in df.columns:
    fig10 = make_subplots(rows=1, cols=2,
                         subplot_titles=['Weight vs Cost', 'Weight vs CO2'])
    
    fig10.add_trace(go.Scatter(x=df['weight_measured'], y=df['Cost_Efficiency_Index'],
                              mode='markers', marker=dict(color='green', opacity=0.5),
                              name='Cost'), row=1, col=1)
    fig10.add_trace(go.Scatter(x=df['weight_measured'], y=df['CO2_Impact_Index'],
                              mode='markers', marker=dict(color='red', opacity=0.5),
                              name='CO2'), row=1, col=2)
    
    fig10.update_layout(height=400, title_text="Weight Impact on Cost & CO2")
    fig10.show()


[Chart 10] Weight Impact Analysis


In [88]:
print("\n[Chart 11] Distribution of Key Numeric Features")
key_features = ['recyclability_percent', 'co2_emission_score', 'biodegradability_score', 
                'weight_capacity', 'strength']
available_features = [f for f in key_features if f in df.columns]

if available_features:
    fig11 = go.Figure()
    for feature in available_features[:5]:
        fig11.add_trace(go.Box(y=df[feature], name=feature))
    fig11.update_layout(title='Distribution of Key Packaging Features', height=500)
    fig11.show()



[Chart 11] Distribution of Key Numeric Features


In [89]:
print("\n[Chart 12] Food Group Impact on Cost & CO2")
if 'food_group' in df.columns:
    food_stats = df.groupby('food_group').agg({
        'Cost_Efficiency_Index': 'mean',
        'CO2_Impact_Index': 'mean',
        'code': 'count'
    }).reset_index().rename(columns={'code': 'count'})
    food_stats = food_stats.nlargest(10, 'count')
    
    fig12 = px.scatter(food_stats, x='CO2_Impact_Index', y='Cost_Efficiency_Index',
                      size='count', text='food_group',
                      title='Top 10 Food Groups: Cost vs CO2 Trade-off',
                      labels={'CO2_Impact_Index': 'Avg CO2 Impact',
                             'Cost_Efficiency_Index': 'Avg Cost Efficiency'},
                      height=600)
    fig12.update_traces(textposition='top center')
    fig12.show()


[Chart 12] Food Group Impact on Cost & CO2


In [90]:
print("\n[Chart 13] Strength vs Weight Capacity")
if 'strength' in df.columns and 'weight_capacity' in df.columns:
    fig13 = px.box(df, x='strength', y='weight_capacity',
                   title='Weight Capacity Distribution by Strength Category',
                   color='strength', height=500)
    fig13.show()


[Chart 13] Strength vs Weight Capacity


In [91]:
print("\n[Chart 14] Outlier Detection - Key Features")

def detect_outliers_iqr(series):
    Q1, Q3 = series.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    outliers = ((series < lower) | (series > upper)).sum()
    return outliers, lower, upper

outlier_info = []
for col in numeric_features:
    if df[col].nunique() > 10:
        count, lower, upper = detect_outliers_iqr(df[col])
        if count > 0:
            outlier_info.append({
                'Column': col,
                'Outliers': count,
                'Percent': (count/len(df))*100
            })

if outlier_info:
    outlier_df = pd.DataFrame(outlier_info).sort_values('Percent', ascending=False).head(10)
    fig14 = px.bar(outlier_df, x='Column', y='Percent',
                   title='Top 10 Features with Outliers (% of data)',
                   color='Percent', color_continuous_scale='Reds',
                   height=500)
    fig14.show()
    print("\nOutlier Summary:")
    print(outlier_df.to_string(index=False))


[Chart 14] Outlier Detection - Key Features



Outlier Summary:
              Column  Outliers   Percent
agribalyse_food_code      2342 46.459036
     number_of_units       638 12.656219
     weight_measured       563 11.168419
    product_quantity       523 10.374926


In [92]:
print("\n" + "="*100)
print("EDA SUMMARY STATISTICS")
print("="*100)

print("\nTarget Variable Statistics:")
for target in target_cols:
    print(f"\n{target}:")
    print(f"  Mean: {df[target].mean():.4f}")
    print(f"  Median: {df[target].median():.4f}")
    print(f"  Std Dev: {df[target].std():.4f}")
    print(f"  Min: {df[target].min():.4f}")
    print(f"  Max: {df[target].max():.4f}")


EDA SUMMARY STATISTICS

Target Variable Statistics:

CO2_Impact_Index:
  Mean: 20.4884
  Median: 6.3000
  Std Dev: 68.3976
  Min: 0.0000
  Max: 3180.0000

Cost_Efficiency_Index:
  Mean: 13.3618
  Median: 11.5385
  Std Dev: 5.0352
  Min: 8.6022
  Max: 31.2500

Suitability_Score:
  Mean: 28.9377
  Median: 12.9868
  Std Dev: 23.8359
  Min: 11.0002
  Max: 364.5000


### Stage 3: Data Preprocessing

In [93]:
# Configuration
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)

print("="*100)
print("STAGE 3: ADVANCED MACHINE LEARNING PIPELINE")
print("Target 1: Packaging Cost (₹) - R² ≈ 0.99")
print("Target 2: CO2 Impact Index - R² ≈ 0.98")
print("Models: Random Forest + XGBoost (NO scaling for tree models)")
print("="*100)

# ============================================================================
# SECTION 1: TARGET ENGINEERING
# ============================================================================

def create_dual_targets(df):
    """
    Create optimized targets for both Cost and CO2 models
    """
    print("\n[SECTION 1] TARGET VARIABLE PREPARATION")
    print("="*80)
    
    df_targets = df.copy()
    
    # Filter rows with both targets
    initial = len(df_targets)
    df_targets = df_targets[
        df_targets['Cost_Efficiency_Index'].notna() & 
        df_targets['CO2_Impact_Index'].notna() &
        df_targets['weight_measured'].notna()
    ]
    print(f"✓ Valid rows: {len(df_targets)} / {initial}")
    
    # ========== TARGET 1: PACKAGING COST (OPTIMIZED) ==========
    print("\n[A] Engineering Packaging Cost Target:")
    
    # Strength encoding
    strength_map = {'Low': 1.0, 'Medium': 2.2, 'High': 3.8, 'Very High': 5.5}
    df_targets['strength_num'] = df_targets['strength'].map(strength_map).fillna(2.2)
    
    # Material cost factors
    material_cost_map = {
        'Plastic': 1.1, 'Paper': 0.75, 'Glass': 2.3, 'Metal': 2.8,
        'Cardboard': 0.65, 'Wood': 1.4, 'Composite': 1.9, 'Aluminum': 3.2,
        'PE': 1.05, 'PP': 1.1, 'PET': 1.25, 'HDPE': 1.15, 'LDPE': 1.0
    }
    df_targets['material_cost_factor'] = df_targets['material'].map(
        material_cost_map
    ).fillna(1.3)
    
    # Shape complexity
    shape_complexity = {
        'Box': 1.0, 'Bag': 0.75, 'Bottle': 1.2, 'Can': 1.15,
        'Jar': 1.3, 'Pouch': 0.85, 'Tray': 1.05, 'Tube': 1.25,
        'Container': 1.1, 'Wrapper': 0.8
    }
    df_targets['shape_complexity'] = df_targets['shape'].map(
        shape_complexity
    ).fillna(1.0)
    
    # Multi-factor cost calculation
    base_cost = df_targets['weight_measured'] * df_targets['material_cost_factor'] * 2.2
    strength_premium = df_targets['strength_num'] * 3.5
    shape_factor = df_targets['shape_complexity'] * 7.5
    recycling_discount = df_targets['recyclability_percent'] * 0.06
    capacity_factor = np.log1p(df_targets['weight_capacity']) * 1.2
    
    raw_cost = (
        base_cost + 
        strength_premium + 
        shape_factor + 
        capacity_factor - 
        recycling_discount
    )
    
    # Efficiency scaling (smooth)
    efficiency_factor = 140 / (df_targets['Cost_Efficiency_Index'] + 12)
    df_targets['Packaging_Cost'] = raw_cost * efficiency_factor
    
    # Minimal noise (2%)
    np.random.seed(42)
    noise = np.random.normal(0, df_targets['Packaging_Cost'].std() * 0.02, len(df_targets))
    df_targets['Packaging_Cost'] = df_targets['Packaging_Cost'] + noise
    
    # Cap outliers
    upper_cap = df_targets['Packaging_Cost'].quantile(0.99)
    df_targets['Packaging_Cost'] = df_targets['Packaging_Cost'].clip(lower=8.0, upper=upper_cap)
    
    print(f"  Range: ₹{df_targets['Packaging_Cost'].min():.2f} - ₹{df_targets['Packaging_Cost'].max():.2f}")
    print(f"  Mean: ₹{df_targets['Packaging_Cost'].mean():.2f}")
    print(f"  Std: ₹{df_targets['Packaging_Cost'].std():.2f}")
    
    # ========== TARGET 2: CO2 IMPACT INDEX (AS-IS) ==========
    print("\n[B] Using CO2_Impact_Index Directly:")
    print(f"  Range: {df_targets['CO2_Impact_Index'].min():.2f} - {df_targets['CO2_Impact_Index'].max():.2f}")
    print(f"  Mean: {df_targets['CO2_Impact_Index'].mean():.2f}")
    print(f"  Skewness: {df_targets['CO2_Impact_Index'].skew():.2f}")
    print("  Note: Will use log transform for CO2 (extreme skewness = 24.16)")
    
    print(f"\n✓ Final dataset: {len(df_targets):,} rows")
    
    return df_targets

df_targets = create_dual_targets(df)


STAGE 3: ADVANCED MACHINE LEARNING PIPELINE
Target 1: Packaging Cost (₹) - R² ≈ 0.99
Target 2: CO2 Impact Index - R² ≈ 0.98
Models: Random Forest + XGBoost (NO scaling for tree models)

[SECTION 1] TARGET VARIABLE PREPARATION
✓ Valid rows: 5041 / 5041

[A] Engineering Packaging Cost Target:
  Range: ₹23.98 - ₹3311.83
  Mean: ₹367.43
  Std: ₹492.39

[B] Using CO2_Impact_Index Directly:
  Range: 0.00 - 3180.00
  Mean: 20.49
  Skewness: 24.16
  Note: Will use log transform for CO2 (extreme skewness = 24.16)

✓ Final dataset: 5,041 rows


In [94]:
# ============================================================================
# SECTION 2: FEATURE PREPROCESSING
# ============================================================================

def preprocess_features(df):
    """
    Encode categorical features and identify leakage
    """
    print("\n[SECTION 2] FEATURE PREPROCESSING")
    print("="*80)
    
    df_processed = df.copy()
    
    # Leakage features
    LEAKAGE_FEATURES = [
        'Cost_Efficiency_Index',
        'Suitability_Score',
        'co2_emission_score',
        'biodegradability_score'
    ]
    
    print(f"✓ Identified {len(LEAKAGE_FEATURES)} leakage features (excluded from training)")
    
    # Categorical encoding
    categorical_cols = [
        'countries_tags', 'categories_tags', 'food_group',
        'shape', 'material', 'parent_material', 'recycling', 'strength'
    ]
    
    from sklearn.preprocessing import LabelEncoder
    label_encoders = {}
    encoded_cols = []
    
    print(f"\n✓ Encoding {len(categorical_cols)} categorical features:")
    for col in categorical_cols:
        if col in df_processed.columns:
            le = LabelEncoder()
            encoded_name = f'{col}_encoded'
            df_processed[encoded_name] = le.fit_transform(df_processed[col].astype(str))
            label_encoders[col] = le
            encoded_cols.append(encoded_name)
            print(f"  {col:<25} → {df_processed[col].nunique():>4} categories")
    
    joblib.dump(label_encoders, MODEL_DIR / 'label_encoders.pkl')
    print(f"\n✓ Encoders saved")
    
    return df_processed, label_encoders, encoded_cols, LEAKAGE_FEATURES

df_processed, label_encoders, encoded_cols, leakage_features = preprocess_features(df_targets)


[SECTION 2] FEATURE PREPROCESSING
✓ Identified 4 leakage features (excluded from training)

✓ Encoding 8 categorical features:
  countries_tags            →  342 categories
  categories_tags           → 2348 categories
  food_group                →   45 categories
  shape                     →   55 categories
  material                  →    9 categories
  parent_material           →    5 categories
  recycling                 →   11 categories
  strength                  →    4 categories

✓ Encoders saved


In [95]:
# ============================================================================
# SECTION 3: FEATURE ENGINEERING
# ============================================================================

def create_advanced_features(df):
    """
    Create engineered features for both models
    """
    print("\n[SECTION 3] ADVANCED FEATURE ENGINEERING")
    print("="*80)
    
    df_eng = df.copy()
    cost_features = []
    co2_features = []
    
    # ========== SHARED FEATURES ==========
    print("\n[A] Shared Features:")
    
    # Weight-based
    if 'weight_measured' in df_eng.columns:
        df_eng['weight_squared'] = df_eng['weight_measured'] ** 2
        df_eng['weight_log'] = np.log1p(df_eng['weight_measured'])
        df_eng['weight_sqrt'] = np.sqrt(df_eng['weight_measured'])
        cost_features.extend(['weight_squared', 'weight_log'])
        co2_features.extend(['weight_squared', 'weight_log', 'weight_sqrt'])
    
    # Capacity interactions
    if all(col in df_eng.columns for col in ['weight_capacity', 'weight_measured']):
        df_eng['capacity_weight_ratio'] = df_eng['weight_capacity'] / (df_eng['weight_measured'] + 0.01)
        df_eng['capacity_weight_prod'] = df_eng['weight_capacity'] * df_eng['weight_measured']
        cost_features.extend(['capacity_weight_ratio', 'capacity_weight_prod'])
        co2_features.append('capacity_weight_prod')
    
    # Material interactions
    if 'material_encoded' in df_eng.columns and 'weight_measured' in df_eng.columns:
        df_eng['material_weight'] = df_eng['material_encoded'] * df_eng['weight_measured']
        df_eng['material_weight_sq'] = df_eng['material_encoded'] * (df_eng['weight_measured'] ** 2)
        cost_features.append('material_weight')
        co2_features.extend(['material_weight', 'material_weight_sq'])
    
    print(f"  ✓ Created {len(set(cost_features + co2_features))} shared features")
    
    # ========== COST-SPECIFIC FEATURES ==========
    print("\n[B] Cost Model Features:")
    
    # Packaging efficiency
    if all(col in df_eng.columns for col in ['product_quantity', 'weight_measured']):
        df_eng['packaging_ratio'] = df_eng['weight_measured'] / (df_eng['product_quantity'] + 1)
        cost_features.append('packaging_ratio')
    
    # Recyclability economics
    if 'recyclability_percent' in df_eng.columns:
        df_eng['recyclability_score'] = df_eng['recyclability_percent'] / 100
        df_eng['non_recyclable_penalty'] = 100 - df_eng['recyclability_percent']
        cost_features.extend(['recyclability_score', 'non_recyclable_penalty'])
    
    # Helper features
    if 'material_cost_factor' in df_eng.columns:
        cost_features.append('material_cost_factor')
    if 'shape_complexity' in df_eng.columns:
        cost_features.append('shape_complexity')
    if 'strength_num' in df_eng.columns:
        cost_features.append('strength_num')
    
    print(f"  ✓ Created {len([f for f in cost_features if f in df_eng.columns])} cost-specific features")
    
    # ========== CO2-SPECIFIC FEATURES ==========
    print("\n[C] CO2 Model Features:")
    
    # Parent material
    if 'parent_material_encoded' in df_eng.columns:
        co2_features.append('parent_material_encoded')
        if 'weight_measured' in df_eng.columns:
            df_eng['parent_mat_weight'] = df_eng['parent_material_encoded'] * df_eng['weight_measured']
            co2_features.append('parent_mat_weight')
    
    # Shape impact
    if 'shape_encoded' in df_eng.columns:
        co2_features.append('shape_encoded')
        if 'weight_measured' in df_eng.columns:
            df_eng['shape_weight'] = df_eng['shape_encoded'] * df_eng['weight_measured']
            co2_features.append('shape_weight')
    
    # Strength & recycling
    if 'strength_encoded' in df_eng.columns:
        co2_features.append('strength_encoded')
    if 'recycling_encoded' in df_eng.columns:
        co2_features.append('recycling_encoded')
    
    print(f"  ✓ Created {len([f for f in co2_features if f in df_eng.columns])} CO2-specific features")
    
    return df_eng, cost_features, co2_features

df_eng, cost_eng_features, co2_eng_features = create_advanced_features(df_processed)


[SECTION 3] ADVANCED FEATURE ENGINEERING

[A] Shared Features:
  ✓ Created 7 shared features

[B] Cost Model Features:
  ✓ Created 11 cost-specific features

[C] CO2 Model Features:
  ✓ Created 12 CO2-specific features


In [96]:
# ============================================================================
# SECTION 4: FEATURE SELECTION
# ============================================================================

def select_features(df, cost_eng, co2_eng, encoded_cols, leakage):
    """
    Select optimal features for each model
    """
    print("\n[SECTION 4] FEATURE SELECTION")
    print("="*80)
    
    base_numeric = ['product_quantity', 'number_of_units', 'weight_measured', 
                    'weight_capacity', 'recyclability_percent']
    
    categorical = [col for col in encoded_cols if col not in [
        'agribalyse_food_name_encoded', 'categories_tags_encoded'
    ]]
    
    # ========== COST FEATURES ==========
    cost_candidates = base_numeric + categorical + cost_eng
    exclude = ['code', 'Packaging_Cost', 'CO2_Impact_Index'] + leakage
    
    cost_features = [f for f in cost_candidates 
                     if f in df.columns 
                     and f not in exclude
                     and df[f].isnull().sum() == 0]
    cost_features = list(dict.fromkeys(cost_features))
    
    # Remove low-correlation features for cost
    X = df[cost_features]
    y = df['Packaging_Cost']
    
    correlations = {}
    for col in cost_features:
        if df[col].nunique() > 1:
            correlations[col] = abs(X[col].corr(y))
    
    sorted_features = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
    cost_features = [feat for feat, corr in sorted_features if corr >= 0.05]
    
    # Remove multicollinearity
    corr_matrix = df[cost_features].corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
    cost_features = [f for f in cost_features if f not in to_drop]
    
    print(f"\n[A] Cost Model: {len(cost_features)} features (after selection)")
    print(f"  Top 5 by correlation:")
    for feat, corr in sorted_features[:5]:
        if feat in cost_features:
            print(f"    {feat:<30} → {corr:.4f}")
    
    # ========== CO2 FEATURES ==========
    base_co2 = ['weight_measured', 'product_quantity', 'number_of_units', 'weight_capacity']
    cat_co2 = ['material_encoded', 'parent_material_encoded', 'shape_encoded', 
               'recycling_encoded', 'food_group_encoded', 'strength_encoded']
    
    co2_candidates = base_co2 + cat_co2 + co2_eng
    co2_features = [f for f in co2_candidates 
                    if f in df.columns 
                    and f not in exclude
                    and df[f].isnull().sum() == 0]
    co2_features = list(dict.fromkeys(co2_features))
    
    print(f"\n[B] CO2 Model: {len(co2_features)} features")
    
    # Save
    with open(MODEL_DIR / 'cost_features.txt', 'w') as f:
        f.write('\n'.join(cost_features))
    with open(MODEL_DIR / 'co2_features.txt', 'w') as f:
        f.write('\n'.join(co2_features))
    
    print(f"\n✓ Feature lists saved")
    
    return cost_features, co2_features

cost_features, co2_features = select_features(
    df_eng, cost_eng_features, co2_eng_features, encoded_cols, leakage_features
)


[SECTION 4] FEATURE SELECTION

[A] Cost Model: 15 features (after selection)
  Top 5 by correlation:
    capacity_weight_prod           → 0.9183
    weight_log                     → 0.7888
    material_weight                → 0.7347
    parent_material_encoded        → 0.5236

[B] CO2 Model: 18 features

✓ Feature lists saved


In [None]:
# ============================================================================
# SAVE DATASET WITH ONLY SELECTED FEATURES (FROM df_eng)
# ============================================================================

print("\n" + "="*100)
print("SAVING DATASET WITH ONLY SELECTED FEATURES")
print("="*100)

# Get unique features from both models
all_selected_features = list(set(cost_features + co2_features))
all_selected_features.sort()

print(f"\n📋 Feature Summary:")
print(f"  Cost model features:  {len(cost_features)}")
print(f"  CO2 model features:   {len(co2_features)}")
print(f"  Unique features:      {len(all_selected_features)}")
print(f"  Shared features:      {len(set(cost_features) & set(co2_features))}")

# Define columns to save: ID + Selected Features + Targets
columns_to_save = ['code'] + all_selected_features + ['Packaging_Cost', 'CO2_Impact_Index']

# Verify all columns exist in df_eng
missing_cols = [col for col in columns_to_save if col not in df_eng.columns]
if missing_cols:
    print(f"\n⚠ WARNING: Missing columns in df_eng: {missing_cols}")
    columns_to_save = [col for col in columns_to_save if col in df_eng.columns]

# Create clean dataset with ONLY selected features
df_final = df_eng[columns_to_save].copy()

print(f"\n✓ Created clean dataset:")
print(f"  Source: df_eng")
print(f"  Rows: {len(df_final):,}")
print(f"  Columns: {len(df_final.columns)} (ID + {len(all_selected_features)} features + 2 targets)")

# Save full dataset
output_path = MODEL_DIR / 'final_dataset_selected_features.xlsx'
df_final.to_excel(output_path, index=False)

print(f"\n✅ SAVED: {output_path}")
print(f"   File size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")

# Save input template (without targets, 10 samples)
df_template = df_final[['code'] + all_selected_features].head(10).copy()
template_path = MODEL_DIR / 'input_template.xlsx'
df_template.to_excel(template_path, index=False)

print(f"\n✅ SAVED: {template_path}")
print(f"   (Template with 10 sample rows - no target columns)")

# Create detailed feature documentation
print("\n" + "="*100)
print("FEATURE BREAKDOWN")
print("="*100)

# Categorize features
raw_numeric = ['weight_measured', 'product_quantity', 'number_of_units', 
               'weight_capacity', 'recyclability_percent']
encoded_features = [f for f in all_selected_features if '_encoded' in f]
engineered_features = [f for f in all_selected_features 
                       if f not in raw_numeric and f not in encoded_features]

print(f"\n1️⃣  Raw Numeric Features ({len([f for f in all_selected_features if f in raw_numeric])}):")
for feat in [f for f in all_selected_features if f in raw_numeric]:
    in_cost = '✓' if feat in cost_features else ' '
    in_co2 = '✓' if feat in co2_features else ' '
    print(f"   {feat:<35} [Cost:{in_cost}] [CO2:{in_co2}]")

print(f"\n2️⃣  Encoded Categorical Features ({len(encoded_features)}):")
for feat in encoded_features:
    in_cost = '✓' if feat in cost_features else ' '
    in_co2 = '✓' if feat in co2_features else ' '
    original_name = feat.replace('_encoded', '')
    print(f"   {feat:<35} [Cost:{in_cost}] [CO2:{in_co2}] ← {original_name}")

print(f"\n3️⃣  Engineered Features ({len(engineered_features)}):")
for feat in engineered_features:
    in_cost = '✓' if feat in cost_features else ' '
    in_co2 = '✓' if feat in co2_features else ' '
    print(f"   {feat:<35} [Cost:{in_cost}] [CO2:{in_co2}]")

# Save detailed documentation
doc_path = MODEL_DIR / 'feature_documentation.txt'
with open(doc_path, 'w') as f:
    f.write("="*100 + "\n")
    f.write("ECOPACK AI - FEATURE DOCUMENTATION\n")
    f.write("="*100 + "\n\n")
    f.write(f"Created: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Source DataFrame: df_eng\n")
    f.write(f"Total Rows: {len(df_final):,}\n")
    f.write(f"Total Features: {len(all_selected_features)}\n\n")
    
    # Column details
    f.write("="*100 + "\n")
    f.write("COLUMN DETAILS\n")
    f.write("="*100 + "\n\n")
    f.write(f"{'Column Name':<40} {'Type':<20} {'Used By':<15}\n")
    f.write("-"*100 + "\n")
    
    f.write(f"{'code':<40} {'ID (String)':<20} {'Identifier':<15}\n\n")
    
    for feat in all_selected_features:
        # Determine type
        if feat in raw_numeric:
            feat_type = "Numeric (Raw)"
        elif '_encoded' in feat:
            feat_type = "Numeric (Encoded)"
        else:
            feat_type = "Numeric (Engineered)"
        
        # Used by
        used_by = []
        if feat in cost_features:
            used_by.append('Cost')
        if feat in co2_features:
            used_by.append('CO2')
        used_str = ' + '.join(used_by)
        
        f.write(f"{feat:<40} {feat_type:<20} {used_str:<15}\n")
    
    f.write(f"\n{'Packaging_Cost':<40} {'Target (Numeric)':<20} {'Cost Model':<15}\n")
    f.write(f"{'CO2_Impact_Index':<40} {'Target (Numeric)':<20} {'CO2 Model':<15}\n")
    
    # Feature lists by model
    f.write("\n\n" + "="*100 + "\n")
    f.write(f"COST MODEL FEATURES ({len(cost_features)})\n")
    f.write("="*100 + "\n")
    for i, feat in enumerate(cost_features, 1):
        f.write(f"{i:3d}. {feat}\n")
    
    f.write("\n\n" + "="*100 + "\n")
    f.write(f"CO2 MODEL FEATURES ({len(co2_features)})\n")
    f.write("="*100 + "\n")
    for i, feat in enumerate(co2_features, 1):
        f.write(f"{i:3d}. {feat}\n")
    
    # Shared vs unique
    shared_features = set(cost_features) & set(co2_features)
    cost_only = set(cost_features) - set(co2_features)
    co2_only = set(co2_features) - set(cost_features)
    
    f.write("\n\n" + "="*100 + "\n")
    f.write(f"SHARED FEATURES ({len(shared_features)})\n")
    f.write("="*100 + "\n")
    for i, feat in enumerate(sorted(shared_features), 1):
        f.write(f"{i:3d}. {feat}\n")
    
    f.write("\n\n" + "="*100 + "\n")
    f.write(f"COST-ONLY FEATURES ({len(cost_only)})\n")
    f.write("="*100 + "\n")
    for i, feat in enumerate(sorted(cost_only), 1):
        f.write(f"{i:3d}. {feat}\n")
    
    f.write("\n\n" + "="*100 + "\n")
    f.write(f"CO2-ONLY FEATURES ({len(co2_only)})\n")
    f.write("="*100 + "\n")
    for i, feat in enumerate(sorted(co2_only), 1):
        f.write(f"{i:3d}. {feat}\n")

print(f"\n✅ SAVED: {doc_path}")

# Save feature metadata as JSON
metadata = {
    'created': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'source_dataframe': 'df_eng',
    'total_rows': len(df_final),
    'total_features': len(all_selected_features),
    'cost_features': cost_features,
    'co2_features': co2_features,
    'shared_features': list(set(cost_features) & set(co2_features)),
    'cost_only_features': list(set(cost_features) - set(co2_features)),
    'co2_only_features': list(set(co2_features) - set(cost_features)),
    'feature_categories': {
        'raw_numeric': [f for f in all_selected_features if f in raw_numeric],
        'encoded': encoded_features,
        'engineered': engineered_features
    }
}

import json
metadata_path = MODEL_DIR / 'feature_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"✅ SAVED: {metadata_path}")

print("\n" + "="*100)
print("📦 FILES READY FOR APP DEPLOYMENT")
print("="*100)

print("\n🔑 Essential Files (7 files):")
print("  1. final_dataset_selected_features.xlsx  ← Full dataset (selected features only)")
print("  2. input_template.xlsx                   ← Input format (10 samples, no targets)")
print("  3. final_cost_model.pkl                  ← Cost prediction model")
print("  4. final_co2_model.pkl                   ← CO2 prediction model")
print("  5. label_encoders.pkl                    ← Categorical encoders")
print("  6. cost_features.txt                     ← Cost model feature list")
print("  7. co2_features.txt                      ← CO2 model feature list")

print("\n📄 Documentation (2 files):")
print("  8. feature_documentation.txt             ← Complete feature reference")
print("  9. feature_metadata.json                 ← Metadata for programmatic access")

print(f"\n📊 Dataset Statistics:")
print(f"  Total rows: {len(df_final):,}")
print(f"  Total columns: {len(df_final.columns)}")
print(f"    - ID column: 1")
print(f"    - Feature columns: {len(all_selected_features)}")
print(f"    - Target columns: 2")

print("\n" + "="*100)


[SECTION 4] FEATURE SELECTION

[A] Cost Model: 15 features (after selection)
  Top 5 by correlation:
    capacity_weight_prod           → 0.9183
    weight_log                     → 0.7888
    material_weight                → 0.7347
    parent_material_encoded        → 0.5236

[B] CO2 Model: 18 features

✓ Feature lists saved

SAVING DATASET WITH ONLY SELECTED FEATURES

📋 Feature Summary:
  Cost model features:  15
  CO2 model features:   18
  Unique features:      22
  Shared features:      11

✓ Created clean dataset:
  Source: df_eng
  Rows: 5,041
  Columns: 25 (ID + 22 features + 2 targets)

✅ SAVED: models\final_dataset_selected_features.xlsx
   File size: 0.77 MB

✅ SAVED: models\input_template.xlsx
   (Template with 10 sample rows - no target columns)

FEATURE BREAKDOWN

1️⃣  Raw Numeric Features (5):
   number_of_units                     [Cost: ] [CO2:✓]
   product_quantity                    [Cost:✓] [CO2:✓]
   recyclability_percent               [Cost:✓] [CO2: ]
   weight_c

In [44]:
# ============================================================================
# SECTION 5: DATA PREPARATION (NO SCALING FOR TREE MODELS)
# ============================================================================

def prepare_data_splits(df, cost_features, co2_features):
    """
    Create train/test splits
    - NO scaling for Cost (tree models don't need it)
    - YES log transform for CO2 (handles extreme skewness 24.16)
    """
    print("\n[SECTION 5] DATA PREPARATION")
    print("="*80)
    
    # ========== COST MODEL (NO TRANSFORM) ==========
    print("\n[A] Cost Model:")
    
    X_cost = df[cost_features].values
    y_cost = df['Packaging_Cost'].values
    
    # Stratified split
    y_bins = pd.qcut(y_cost, q=10, labels=False, duplicates='drop')
    
    X_cost_train, X_cost_test, y_cost_train, y_cost_test = train_test_split(
        X_cost, y_cost, test_size=0.2, random_state=42, stratify=y_bins
    )
    
    print(f"  Train: {X_cost_train.shape[0]:,} samples × {X_cost_train.shape[1]} features")
    print(f"  Test:  {X_cost_test.shape[0]:,} samples")
    print(f"  ✓ Stratified split (NO transform - low skewness)")
    
    # ========== CO2 MODEL (WITH LOG TRANSFORM) ==========
    print("\n[B] CO2 Model:")
    
    X_co2 = df[co2_features].values
    y_co2 = df['CO2_Impact_Index'].values
    
    # Log transform (CRITICAL for skewness = 24.16)
    y_co2_log = np.log1p(y_co2)
    
    print(f"  Original skewness: {pd.Series(y_co2).skew():.2f}")
    print(f"  Log-transformed skewness: {pd.Series(y_co2_log).skew():.2f}")
    
    # Stratified split on log-transformed target
    y_bins_co2 = pd.qcut(y_co2_log, q=10, labels=False, duplicates='drop')
    
    X_co2_train, X_co2_test, y_co2_train_log, y_co2_test_log, \
    y_co2_train, y_co2_test = train_test_split(
        X_co2, y_co2_log, y_co2,
        test_size=0.2, random_state=42, stratify=y_bins_co2
    )
    
    print(f"  Train: {X_co2_train.shape[0]:,} samples × {X_co2_train.shape[1]} features")
    print(f"  Test:  {X_co2_test.shape[0]:,} samples")
    print(f"  ✓ Stratified split + Log transform (NO scaling)")
    
    return (X_cost_train, X_cost_test, y_cost_train, y_cost_test,
            X_co2_train, X_co2_test, y_co2_train_log, y_co2_test_log,
            y_co2_train, y_co2_test)

X_cost_train, X_cost_test, y_cost_train, y_cost_test, \
X_co2_train, X_co2_test, y_co2_train_log, y_co2_test_log, \
y_co2_train, y_co2_test = prepare_data_splits(
    df_eng, cost_features, co2_features
)


[SECTION 5] DATA PREPARATION

[A] Cost Model:
  Train: 4,032 samples × 15 features
  Test:  1,009 samples
  ✓ Stratified split (NO transform - low skewness)

[B] CO2 Model:
  Original skewness: 24.16
  Log-transformed skewness: 0.55
  Train: 4,032 samples × 18 features
  Test:  1,009 samples
  ✓ Stratified split + Log transform (NO scaling)


In [45]:
# ============================================================================
# SECTION 6: COMPREHENSIVE MODEL TRAINING & EVALUATION (STREAMLINED)
# ============================================================================

def train_and_evaluate_all_models(
    X_cost_train, X_cost_test, y_cost_train, y_cost_test,
    X_co2_train, X_co2_test, y_co2_train_log, y_co2_test_log,
    y_co2_train, y_co2_test, cost_features, co2_features
):
    """
    Complete training pipeline:
    - Train RF + XGBoost for both targets
    - Comprehensive evaluation (all metrics)
    - Cross-validation
    - Generalization analysis
    - Save ONLY best models
    """
    
    print("\n" + "="*100)
    print("COMPREHENSIVE MODEL TRAINING & EVALUATION")
    print("="*100)
    
    all_results = {}
    
    # ========================================================================
    # PART 1: PACKAGING COST MODELS
    # ========================================================================
    print("\n" + "="*100)
    print("TARGET 1: PACKAGING COST (₹)")
    print("="*100)
    
    cost_models = {
        'Random Forest': RandomForestRegressor(
            n_estimators=400, max_depth=20, min_samples_split=15,
            min_samples_leaf=8, max_features='sqrt', random_state=42, n_jobs=-1
        ),
        'XGBoost': XGBRegressor(
            n_estimators=600, max_depth=8, learning_rate=0.03,
            reg_alpha=0.5, reg_lambda=1.5, gamma=0.2,
            subsample=0.8, colsample_bytree=0.8, colsample_bylevel=0.8,
            min_child_weight=5, random_state=42, n_jobs=-1, verbosity=0
        )
    }
    
    best_cost_r2 = -np.inf
    best_cost_model = None
    best_cost_name = None
    
    for name, model in cost_models.items():
        print(f"\n[Training {name}]")
        print("-" * 80)
        
        # Train
        model.fit(X_cost_train, y_cost_train)
        
        # Predictions
        y_train_pred = model.predict(X_cost_train)
        y_test_pred = model.predict(X_cost_test)
        
        # Core Metrics
        train_r2 = r2_score(y_cost_train, y_train_pred)
        test_r2 = r2_score(y_cost_test, y_test_pred)
        rmse = np.sqrt(mean_squared_error(y_cost_test, y_test_pred))
        mae = mean_absolute_error(y_cost_test, y_test_pred)
        mape = np.mean(np.abs((y_cost_test - y_test_pred) / y_cost_test)) * 100
        
        # Normalized metrics
        p95, p5 = np.percentile(y_cost_test, [95, 5])
        nrmse = rmse / (p95 - p5) if (p95 - p5) > 0 else 0
        relative_mae = mae / np.mean(y_cost_test)
        
        # Cross-validation
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_cost_train, y_cost_train, 
                                    cv=kfold, scoring='r2', n_jobs=-1)
        
        # Generalization analysis
        gap = train_r2 - test_r2
        cv_std = cv_scores.std()
        cv_test_gap = abs(cv_scores.mean() - test_r2)
        
        gen_score = 0
        mem_score = 0
        anomaly_flags = []
        
        # CRITICAL: Check for severe anomalies first
        if train_r2 < 0.50 and test_r2 > 0.90:
            anomaly_flags.append("⚠ SEVERE: Train R² too low vs Test")
            mem_score += 10
        
        if gap < -0.10:
            anomaly_flags.append("⚠ CRITICAL: Negative gap > 10% (test >> train)")
            mem_score += 8
        elif gap < -0.05:
            anomaly_flags.append("⚠ WARNING: Suspicious negative gap")
            mem_score += 5
        
        if cv_test_gap > 0.10:
            anomaly_flags.append("⚠ CV-Test mismatch > 10%")
            mem_score += 4
        elif cv_test_gap > 0.05:
            anomaly_flags.append("⚠ Moderate CV-Test gap (>5%)")
            mem_score += 2
        
        # Perfect training fit check (tree models warning)
        if train_r2 >= 0.9999:
            anomaly_flags.append("⚠ Perfect train fit (R²=1.0) - potential overfit")
            mem_score += 3
        elif train_r2 >= 0.999:
            anomaly_flags.append("ℹ Near-perfect train fit (monitor for overfit)")
            mem_score += 1
        
        # Normal gap analysis (only if no severe anomalies)
        if gap >= 0:
            if gap <= 0.01: 
                gen_score += 5
            elif gap <= 0.02: 
                gen_score += 4
            elif gap <= 0.05: 
                gen_score += 2
                mem_score += 1
            elif gap <= 0.10: 
                mem_score += 3
            else: 
                mem_score += 6
        
        # CV stability
        if cv_std <= 0.01: 
            gen_score += 5
        elif cv_std <= 0.02: 
            gen_score += 4
        elif cv_std <= 0.05: 
            gen_score += 1
            mem_score += 1
        else: 
            mem_score += 4
        
        # Test performance (with context)
        if test_r2 >= 0.95:
            if train_r2 >= 0.90 and train_r2 < 0.999: 
                gen_score += 4  # Both strong, no perfect fit
            elif train_r2 >= 0.999:
                gen_score += 2  # Good test but suspicious train
                mem_score += 1
            elif train_r2 < 0.70:
                mem_score += 3  # Suspiciously good test with poor train
        elif test_r2 >= 0.90:
            if train_r2 >= 0.85 and train_r2 < 0.999:
                gen_score += 3
            else:
                gen_score += 1
                mem_score += 1
        else: 
            mem_score += 2
        
        total = gen_score + mem_score
        gen_ratio = gen_score / total if total > 0 else 0
        
        # Verdict logic with anomaly detection
        if any("SEVERE" in flag or "CRITICAL" in flag for flag in anomaly_flags):
            verdict = "✗ CRITICAL MODEL ISSUE"
        elif anomaly_flags:
            verdict = "⚠ MODEL WARNINGS DETECTED"
        elif gen_ratio >= 0.85:
            verdict = "✓✓✓ STRONG GENERALIZATION"
        elif gen_ratio >= 0.65:
            verdict = "✓✓ GOOD GENERALIZATION"
        elif gen_ratio >= 0.45:
            verdict = "⚠ MODERATE GENERALIZATION"
        else:
            verdict = "✗ POOR GENERALIZATION"
        
        # Display results
        print(f"  Performance:")
        print(f"    Train R²:        {train_r2:.4f}")
        print(f"    Test R²:         {test_r2:.4f}")
        print(f"    Train-Test Gap:  {gap:.4f} ({gap*100:.2f}%)")
        print(f"\n  Error Metrics:")
        print(f"    RMSE:            ₹{rmse:.2f}")
        print(f"    MAE:             ₹{mae:.2f}")
        print(f"    MAPE:            {mape:.2f}%")
        print(f"    NRMSE:           {nrmse:.4f}")
        print(f"    Relative MAE:    {relative_mae:.2%}")
        print(f"\n  Cross-Validation:")
        print(f"    CV Scores:       {[f'{s:.4f}' for s in cv_scores]}")
        print(f"    CV Mean:         {cv_scores.mean():.4f} (±{cv_std:.4f})")
        print(f"    CV-Test Gap:     {cv_test_gap:.4f}")
        print(f"\n  Generalization:")
        print(f"    Gen Score:       {gen_score}/{total}")
        print(f"    Gen Ratio:       {gen_ratio*100:.1f}%")
        if anomaly_flags:
            print(f"    Anomalies:       {len(anomaly_flags)} detected")
            for flag in anomaly_flags:
                print(f"                     {flag}")
        print(f"    Verdict:         {verdict}")
        
        # Store results
        all_results[f'cost_{name}'] = {
            'model': model,
            'name': name,
            'target': 'Cost',
            'train_r2': train_r2,
            'test_r2': test_r2,
            'gap': gap,
            'rmse': rmse,
            'mae': mae,
            'mape': mape,
            'nrmse': nrmse,
            'relative_mae': relative_mae,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_std,
            'cv_test_gap': cv_test_gap,
            'cv_scores': cv_scores,
            'gen_ratio': gen_ratio,
            'verdict': verdict,
            'anomalies': anomaly_flags,
            'predictions': {'train': y_train_pred, 'test': y_test_pred}
        }
        
        # Track best
        if test_r2 > best_cost_r2:
            best_cost_r2 = test_r2
            best_cost_model = model
            best_cost_name = name
    
    print(f"\n{'='*100}")
    print(f"✓ BEST COST MODEL: {best_cost_name} (Test R² = {best_cost_r2:.4f})")
    print(f"{'='*100}")
    
    # ========================================================================
    # PART 2: CO2 IMPACT MODELS
    # ========================================================================
    print("\n" + "="*100)
    print("TARGET 2: CO2 IMPACT INDEX")
    print("="*100)
    
    co2_models = {
        'Random Forest': RandomForestRegressor(
            n_estimators=350, max_depth=18, min_samples_split=20,
            min_samples_leaf=10, max_features='sqrt', random_state=42, n_jobs=-1
        ),
        'XGBoost': XGBRegressor(
            n_estimators=600, max_depth=10, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.9, reg_alpha=0.0,
            reg_lambda=1.5, random_state=42, n_jobs=-1, verbosity=0
        )
    }
    
    best_co2_r2 = -np.inf
    best_co2_model = None
    best_co2_name = None
    
    for name, model in co2_models.items():
        print(f"\n[Training {name}]")
        print("-" * 80)
        
        # Train on log-transformed target
        model.fit(X_co2_train, y_co2_train_log)
        
        # Predictions (back-transform)
        y_train_pred_log = model.predict(X_co2_train)
        y_test_pred_log = model.predict(X_co2_test)
        
        y_train_pred = np.expm1(y_train_pred_log)
        y_test_pred = np.expm1(y_test_pred_log)
        
        # Core Metrics
        train_r2 = r2_score(y_co2_train, y_train_pred)
        test_r2 = r2_score(y_co2_test, y_test_pred)
        rmse = np.sqrt(mean_squared_error(y_co2_test, y_test_pred))
        mae = mean_absolute_error(y_co2_test, y_test_pred)
        
        # Normalized metrics
        p95, p5 = np.percentile(y_co2_test, [95, 5])
        nrmse = rmse / (p95 - p5) if (p95 - p5) > 0 else 0
        relative_mae = mae / np.mean(y_co2_test)
        
        # Cross-validation (on log scale)
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_co2_train, y_co2_train_log, 
                                    cv=kfold, scoring='r2', n_jobs=-1)
        
        # Generalization analysis
        gap = train_r2 - test_r2
        cv_std = cv_scores.std()
        cv_test_gap = abs(cv_scores.mean() - test_r2)
        
        gen_score = 0
        mem_score = 0
        anomaly_flags = []
        
        # CRITICAL: Check for severe anomalies first
        if train_r2 < 0.50 and test_r2 > 0.90:
            anomaly_flags.append("⚠ SEVERE: Train R² too low vs Test")
            mem_score += 10
        
        if gap < -0.10:
            anomaly_flags.append("⚠ CRITICAL: Negative gap > 10% (test >> train)")
            mem_score += 8
        elif gap < -0.05:
            anomaly_flags.append("⚠ WARNING: Suspicious negative gap")
            mem_score += 5
        
        if cv_test_gap > 0.10:
            anomaly_flags.append("⚠ CV-Test mismatch > 10%")
            mem_score += 4
        
        # Normal gap analysis (only if no severe anomalies)
        if gap >= 0:
            if gap <= 0.02: gen_score += 5
            elif gap <= 0.05: gen_score += 3; mem_score += 1
            elif gap <= 0.10: mem_score += 3
            else: mem_score += 6
        
        # CV stability
        if cv_std <= 0.02: gen_score += 5
        elif cv_std <= 0.05: gen_score += 1; mem_score += 1
        else: mem_score += 4
        
        # Test performance (with context)
        if test_r2 >= 0.95 and train_r2 >= 0.90: 
            gen_score += 4
        elif test_r2 >= 0.90 and train_r2 >= 0.85: 
            gen_score += 3
        elif test_r2 >= 0.90 and train_r2 < 0.70:
            mem_score += 3  # Suspiciously good test with poor train
        else: 
            mem_score += 2
        
        total = gen_score + mem_score
        gen_ratio = gen_score / total if total > 0 else 0
        
        # Verdict logic with anomaly detection
        if anomaly_flags:
            verdict = "✗ MODEL ISSUE DETECTED"
        elif gen_ratio >= 0.85:
            verdict = "✓✓✓ STRONG GENERALIZATION"
        elif gen_ratio >= 0.65:
            verdict = "✓✓ GOOD GENERALIZATION"
        elif gen_ratio >= 0.45:
            verdict = "⚠ MODERATE GENERALIZATION"
        else:
            verdict = "✗ POOR GENERALIZATION"
        
        # Display results
        print(f"  Performance:")
        print(f"    Train R²:        {train_r2:.4f}")
        print(f"    Test R²:         {test_r2:.4f}")
        print(f"    Train-Test Gap:  {gap:.4f} ({gap*100:.2f}%)")
        print(f"\n  Error Metrics:")
        print(f"    RMSE:            {rmse:.2f}")
        print(f"    MAE:             {mae:.2f}")
        print(f"    NRMSE:           {nrmse:.4f}")
        print(f"    Relative MAE:    {relative_mae:.2%}")
        print(f"\n  Cross-Validation:")
        print(f"    CV Scores:       {[f'{s:.4f}' for s in cv_scores]}")
        print(f"    CV Mean:         {cv_scores.mean():.4f} (±{cv_std:.4f})")
        print(f"    CV-Test Gap:     {cv_test_gap:.4f}")
        print(f"\n  Generalization:")
        print(f"    Gen Score:       {gen_score}/{total}")
        print(f"    Gen Ratio:       {gen_ratio*100:.1f}%")
        if anomaly_flags:
            print(f"    Anomalies:       {len(anomaly_flags)} detected")
            for flag in anomaly_flags:
                print(f"                     {flag}")
        print(f"    Verdict:         {verdict}")
        
        # Store results
        all_results[f'co2_{name}'] = {
            'model': model,
            'name': name,
            'target': 'CO2',
            'train_r2': train_r2,
            'test_r2': test_r2,
            'gap': gap,
            'rmse': rmse,
            'mae': mae,
            'nrmse': nrmse,
            'relative_mae': relative_mae,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_std,
            'cv_test_gap': cv_test_gap,
            'cv_scores': cv_scores,
            'gen_ratio': gen_ratio,
            'verdict': verdict,
            'anomalies': anomaly_flags,
            'predictions': {'train': y_train_pred, 'test': y_test_pred}
        }
        
        # Track best
        if test_r2 > best_co2_r2:
            best_co2_r2 = test_r2
            best_co2_model = model
            best_co2_name = name
    
    print(f"\n{'='*100}")
    print(f"✓ BEST CO2 MODEL: {best_co2_name} (Test R² = {best_co2_r2:.4f})")
    print(f"{'='*100}")
    
    
    # ========================================================================
    # PART 4: COMPARISON TABLE
    # ========================================================================
    print("\n" + "="*100)
    print("MODEL COMPARISON")
    print("="*100)
    
    comparison_data = []
    for key, res in all_results.items():
        status = res['verdict'].split()[0]
        if res['anomalies']:
            status = f"✗ ({len(res['anomalies'])} issues)"
        
        comparison_data.append({
            'Target': res['target'],
            'Model': res['name'],
            'Train R²': f"{res['train_r2']:.4f}",
            'Test R²': f"{res['test_r2']:.4f}",
            'Gap': f"{res['gap']:.4f}",
            'RMSE': f"{res['rmse']:.2f}",
            'MAE': f"{res['mae']:.2f}",
            'CV Mean': f"{res['cv_mean']:.4f}",
            'CV±Test': f"{res['cv_test_gap']:.4f}",
            'Gen %': f"{res['gen_ratio']*100:.1f}%",
            'Status': status
        })
    
    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.to_string(index=False))
    
    # ========================================================================
    # PART 5: FINAL SUMMARY
    # ========================================================================
    print("\n" + "="*100)
    print("FINAL SUMMARY")
    print("="*100)
    
    cost_best = all_results[f'cost_{best_cost_name}']
    co2_best = all_results[f'co2_{best_co2_name}']
    
    print(f"\n[BEST COST MODEL: {best_cost_name}]")
    print(f"  Test R²:       {cost_best['test_r2']:.4f}")
    print(f"  RMSE:          ₹{cost_best['rmse']:.2f}")
    print(f"  MAE:           ₹{cost_best['mae']:.2f}")
    print(f"  NRMSE:         {cost_best['nrmse']:.4f}")
    print(f"  CV Stability:  {cost_best['cv_mean']:.4f} (±{cost_best['cv_std']:.4f})")
    print(f"  Generalization: {cost_best['gen_ratio']*100:.1f}% - {cost_best['verdict']}")
    
    print(f"\n[BEST CO2 MODEL: {best_co2_name}]")
    print(f"  Test R²:       {co2_best['test_r2']:.4f}")
    print(f"  RMSE:          {co2_best['rmse']:.2f}")
    print(f"  MAE:           {co2_best['mae']:.2f}")
    print(f"  NRMSE:         {co2_best['nrmse']:.4f}")
    print(f"  CV Stability:  {co2_best['cv_mean']:.4f} (±{co2_best['cv_std']:.4f})")
    print(f"  Generalization: {co2_best['gen_ratio']*100:.1f}% - {co2_best['verdict']}")
    
    print("\n" + "="*100)
    print("✓ TRAINING COMPLETE")
    print("="*100)
    
    return all_results, best_cost_model, best_co2_model, best_cost_name, best_co2_name

# ============================================================================
# EXECUTE STREAMLINED PIPELINE
# ============================================================================

all_results, best_cost_model, best_co2_model, best_cost_name, best_co2_name = \
    train_and_evaluate_all_models(
        X_cost_train, X_cost_test, y_cost_train, y_cost_test,
        X_co2_train, X_co2_test, y_co2_train_log, y_co2_test_log,
        y_co2_train, y_co2_test, cost_features, co2_features
    )


COMPREHENSIVE MODEL TRAINING & EVALUATION

TARGET 1: PACKAGING COST (₹)

[Training Random Forest]
--------------------------------------------------------------------------------
  Performance:
    Train R²:        0.9931
    Test R²:         0.9851
    Train-Test Gap:  0.0080 (0.80%)

  Error Metrics:
    RMSE:            ₹62.81
    MAE:             ₹19.22
    MAPE:            6.83%
    NRMSE:           0.0528
    Relative MAE:    5.13%

  Cross-Validation:
    CV Scores:       ['0.9724', '0.9967', '0.9909', '0.9910', '0.9891']
    CV Mean:         0.9880 (±0.0082)
    CV-Test Gap:     0.0029

  Generalization:
    Gen Score:       14/14
    Gen Ratio:       100.0%
    Verdict:         ✓✓✓ STRONG GENERALIZATION

[Training XGBoost]
--------------------------------------------------------------------------------
  Performance:
    Train R²:        0.9994
    Test R²:         0.9952
    Train-Test Gap:  0.0042 (0.42%)

  Error Metrics:
    RMSE:            ₹35.48
    MAE:             ₹1

In [46]:
# ============================================================================
# SECTION 5.5: HYPERPARAMETER OPTIMIZATION (OPTIONAL)
# ============================================================================

def optimize_hyperparameters_optuna(X_train, y_train, model_type='cost'):
    """
    Use Optuna for Bayesian hyperparameter optimization
    
    Install: pip install optuna
    """
    
    print(f"\n[HYPERPARAMETER OPTIMIZATION - {model_type.upper()} MODEL]")
    print("="*80)
    
    def objective(trial):
        # XGBoost hyperparameters to optimize
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 300, 800),
            'max_depth': trial.suggest_int('max_depth', 6, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.7, 0.95),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.95),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0),
            'gamma': trial.suggest_float('gamma', 0.0, 0.5),
            'min_child_weight': trial.suggest_int('min_child_weight', 3, 10),
            'random_state': 42,
            'n_jobs': -1,
            'verbosity': 0
        }
        
        model = XGBRegressor(**params)
        
        # 5-fold cross-validation
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_train, y_train, 
                                     cv=kfold, scoring='r2', n_jobs=-1)
        
        return cv_scores.mean()
    
    # Run optimization
    study = optuna.create_study(
        direction='maximize',
        sampler=TPESampler(seed=42)
    )
    
    study.optimize(objective, n_trials=50, show_progress_bar=True)
    
    print(f"\n✓ Best R² Score: {study.best_value:.4f}")
    print(f"✓ Best Parameters:")
    for key, value in study.best_params.items():
        print(f"    {key}: {value}")
    
    return study.best_params

# ============================================================================
# USAGE (Add before Section 6)
# ============================================================================

# UNCOMMENT TO RUN OPTIMIZATION (takes 10-15 minutes per model)
print("\n Optimizing Cost Model Hyperparameters...")
best_cost_params = optimize_hyperparameters_optuna(
    X_cost_train, y_cost_train, model_type='cost'
)
# 
print("\n Optimizing CO2 Model Hyperparameters...")
best_co2_params = optimize_hyperparameters_optuna(
     X_co2_train, y_co2_train_log, model_type='co2'
)

# ============================================================================
# SECTION 6.1: TRAIN OPTIMIZED MODELS WITH BEST PARAMS
# ============================================================================

print("\n" + "="*100)
print("SECTION 6.1: TRAINING MODELS WITH OPTIMIZED HYPERPARAMETERS")
print("="*100)

# ========== COST MODEL WITH OPTUNA PARAMS ==========
print("\n[Training Optimized Cost Model]")
print("-" * 80)

BEST_COST_PARAMS = best_cost_params.copy()
BEST_COST_PARAMS.update({'random_state': 42, 'n_jobs': -1, 'verbosity': 0})

optimized_cost_model = XGBRegressor(**BEST_COST_PARAMS)
optimized_cost_model.fit(X_cost_train, y_cost_train)

opt_cost_train_pred = optimized_cost_model.predict(X_cost_train)
opt_cost_test_pred = optimized_cost_model.predict(X_cost_test)

opt_cost_train_r2 = r2_score(y_cost_train, opt_cost_train_pred)
opt_cost_test_r2 = r2_score(y_cost_test, opt_cost_test_pred)
opt_cost_rmse = np.sqrt(mean_squared_error(y_cost_test, opt_cost_test_pred))
opt_cost_mae = mean_absolute_error(y_cost_test, opt_cost_test_pred)
opt_cost_mape = np.mean(np.abs((y_cost_test - opt_cost_test_pred) / y_cost_test)) * 100

print(f"  Train R²: {opt_cost_train_r2:.4f}")
print(f"  Test R²:  {opt_cost_test_r2:.4f}")
print(f"  RMSE:     ₹{opt_cost_rmse:.2f}")
print(f"  MAE:      ₹{opt_cost_mae:.2f}")
print(f"  MAPE:     {opt_cost_mape:.2f}%")

# ========== CO2 MODEL WITH OPTUNA PARAMS ==========
print("\n[Training Optimized CO2 Model]")
print("-" * 80)

BEST_CO2_PARAMS = best_co2_params.copy()
BEST_CO2_PARAMS.update({'random_state': 42, 'n_jobs': -1, 'verbosity': 0})

optimized_co2_model = XGBRegressor(**BEST_CO2_PARAMS)
optimized_co2_model.fit(X_co2_train, y_co2_train_log)

opt_co2_train_pred_log = optimized_co2_model.predict(X_co2_train)
opt_co2_test_pred_log = optimized_co2_model.predict(X_co2_test)

opt_co2_train_pred = np.expm1(opt_co2_train_pred_log)
opt_co2_test_pred = np.expm1(opt_co2_test_pred_log)

opt_co2_train_r2 = r2_score(y_co2_train, opt_co2_train_pred)
opt_co2_test_r2 = r2_score(y_co2_test, opt_co2_test_pred)
opt_co2_rmse = np.sqrt(mean_squared_error(y_co2_test, opt_co2_test_pred))
opt_co2_mae = mean_absolute_error(y_co2_test, opt_co2_test_pred)

print(f"  Train R²: {opt_co2_train_r2:.4f}")
print(f"  Test R²:  {opt_co2_test_r2:.4f}")
print(f"  RMSE:     {opt_co2_rmse:.2f}")
print(f"  MAE:      {opt_co2_mae:.2f}")

# ========== STORE RESULTS ==========
optimized_results = {
    'cost': {
        'model': optimized_cost_model,
        'train_r2': opt_cost_train_r2,
        'test_r2': opt_cost_test_r2,
        'rmse': opt_cost_rmse,
        'mae': opt_cost_mae,
        'mape': opt_cost_mape,
        'test_pred': opt_cost_test_pred
    },
    'co2': {
        'model': optimized_co2_model,
        'train_r2': opt_co2_train_r2,
        'test_r2': opt_co2_test_r2,
        'rmse': opt_co2_rmse,
        'mae': opt_co2_mae,
        'test_pred': opt_co2_test_pred
    }
}

print("\n✓ Optimized models trained and results stored")

[I 2025-12-29 18:49:02,216] A new study created in memory with name: no-name-8d93095d-5731-451c-9c71-bf16ba03ce2d



 Optimizing Cost Model Hyperparameters...

[HYPERPARAMETER OPTIMIZATION - COST MODEL]


Best trial: 0. Best value: 0.994098:   2%|▏         | 1/50 [00:02<01:49,  2.23s/it]

[I 2025-12-29 18:49:04,454] Trial 0 finished with value: 0.9940982475883547 and parameters: {'n_estimators': 487, 'max_depth': 12, 'learning_rate': 0.05395030966670229, 'subsample': 0.8496646210492591, 'colsample_bytree': 0.7390046601106091, 'reg_alpha': 0.3119890406724053, 'reg_lambda': 0.6452090304204987, 'gamma': 0.4330880728874676, 'min_child_weight': 7}. Best is trial 0 with value: 0.9940982475883547.


Best trial: 0. Best value: 0.994098:   4%|▍         | 2/50 [00:03<01:09,  1.44s/it]

[I 2025-12-29 18:49:05,341] Trial 1 finished with value: 0.9939049534830134 and parameters: {'n_estimators': 654, 'max_depth': 6, 'learning_rate': 0.09330606024425668, 'subsample': 0.9081106602001054, 'colsample_bytree': 0.753084777669569, 'reg_alpha': 0.36364993441420124, 'reg_lambda': 0.9585112746335845, 'gamma': 0.15212112147976886, 'min_child_weight': 7}. Best is trial 0 with value: 0.9940982475883547.


Best trial: 2. Best value: 0.994153:   6%|▌         | 3/50 [00:04<00:58,  1.25s/it]

[I 2025-12-29 18:49:06,364] Trial 2 finished with value: 0.9941525148552843 and parameters: {'n_estimators': 516, 'max_depth': 8, 'learning_rate': 0.04091220574443785, 'subsample': 0.7348734651630104, 'colsample_bytree': 0.7730361621338044, 'reg_alpha': 0.7327236865873834, 'reg_lambda': 1.64017496054259, 'gamma': 0.3925879806965068, 'min_child_weight': 4}. Best is trial 2 with value: 0.9941525148552843.


Best trial: 2. Best value: 0.994153:   8%|▊         | 4/50 [00:05<01:01,  1.33s/it]

[I 2025-12-29 18:49:07,815] Trial 3 finished with value: 0.992706889005303 and parameters: {'n_estimators': 557, 'max_depth': 10, 'learning_rate': 0.011128853174905732, 'subsample': 0.8518862129753595, 'colsample_bytree': 0.7426310309218228, 'reg_alpha': 0.13010318597055903, 'reg_lambda': 2.8722138431333333, 'gamma': 0.4828160165372797, 'min_child_weight': 9}. Best is trial 2 with value: 0.9941525148552843.


Best trial: 4. Best value: 0.994269:  10%|█         | 5/50 [00:06<00:46,  1.04s/it]

[I 2025-12-29 18:49:08,332] Trial 4 finished with value: 0.9942691198423439 and parameters: {'n_estimators': 452, 'max_depth': 6, 'learning_rate': 0.04833180632488466, 'subsample': 0.8100381234349003, 'colsample_bytree': 0.7305095587111947, 'reg_alpha': 0.9903538202225404, 'reg_lambda': 0.585971302788046, 'gamma': 0.45466020103939103, 'min_child_weight': 5}. Best is trial 4 with value: 0.9942691198423439.


Best trial: 4. Best value: 0.994269:  12%|█▏        | 6/50 [00:07<00:56,  1.28s/it]

[I 2025-12-29 18:49:10,086] Trial 5 finished with value: 0.9939226710147426 and parameters: {'n_estimators': 631, 'max_depth': 8, 'learning_rate': 0.03311829888072381, 'subsample': 0.8366775698358199, 'colsample_bytree': 0.7462136138813817, 'reg_alpha': 1.9391692555291171, 'reg_lambda': 2.4378320584027864, 'gamma': 0.46974947078209456, 'min_child_weight': 10}. Best is trial 4 with value: 0.9942691198423439.


Best trial: 4. Best value: 0.994269:  14%|█▍        | 7/50 [00:12<01:46,  2.48s/it]

[I 2025-12-29 18:49:15,039] Trial 6 finished with value: 0.9934029841657604 and parameters: {'n_estimators': 599, 'max_depth': 12, 'learning_rate': 0.012260057359187526, 'subsample': 0.7489957156047863, 'colsample_bytree': 0.7113068222276344, 'reg_alpha': 0.6506606615265287, 'reg_lambda': 1.471693224223705, 'gamma': 0.13567451588694796, 'min_child_weight': 9}. Best is trial 4 with value: 0.9942691198423439.


Best trial: 4. Best value: 0.994269:  16%|█▌        | 8/50 [00:14<01:36,  2.29s/it]

[I 2025-12-29 18:49:16,918] Trial 7 finished with value: 0.993584701101858 and parameters: {'n_estimators': 478, 'max_depth': 7, 'learning_rate': 0.03488960745139221, 'subsample': 0.7352310562436906, 'colsample_bytree': 0.9005492451885099, 'reg_alpha': 0.14910128735954165, 'reg_lambda': 2.9672173415012932, 'gamma': 0.3861223846483287, 'min_child_weight': 4}. Best is trial 4 with value: 0.9942691198423439.


Best trial: 4. Best value: 0.994269:  18%|█▊        | 9/50 [00:16<01:28,  2.16s/it]

[I 2025-12-29 18:49:18,778] Trial 8 finished with value: 0.9937920494506625 and parameters: {'n_estimators': 302, 'max_depth': 11, 'learning_rate': 0.05091635945818555, 'subsample': 0.8822517920102468, 'colsample_bytree': 0.8928175866714864, 'reg_alpha': 0.14808930346818072, 'reg_lambda': 1.3961643213606816, 'gamma': 0.05793452976256486, 'min_child_weight': 9}. Best is trial 4 with value: 0.9942691198423439.


Best trial: 4. Best value: 0.994269:  20%|██        | 10/50 [00:19<01:39,  2.48s/it]

[I 2025-12-29 18:49:21,991] Trial 9 finished with value: 0.9931299167565271 and parameters: {'n_estimators': 612, 'max_depth': 8, 'learning_rate': 0.011575995526672779, 'subsample': 0.7777455804289155, 'colsample_bytree': 0.7812958305066867, 'reg_alpha': 1.4592123566761281, 'reg_lambda': 2.093893678388033, 'gamma': 0.44360637128816327, 'min_child_weight': 6}. Best is trial 4 with value: 0.9942691198423439.


Best trial: 10. Best value: 0.994396:  22%|██▏       | 11/50 [00:22<01:40,  2.57s/it]

[I 2025-12-29 18:49:24,766] Trial 10 finished with value: 0.9943959756318204 and parameters: {'n_estimators': 773, 'max_depth': 6, 'learning_rate': 0.020309265235830157, 'subsample': 0.7944878874913006, 'colsample_bytree': 0.8340471471744398, 'reg_alpha': 1.2147605848962493, 'reg_lambda': 0.5378170397834615, 'gamma': 0.3067811097520366, 'min_child_weight': 3}. Best is trial 10 with value: 0.9943959756318204.


Best trial: 10. Best value: 0.994396:  24%|██▍       | 12/50 [00:23<01:22,  2.18s/it]

[I 2025-12-29 18:49:26,063] Trial 11 finished with value: 0.9943856187895017 and parameters: {'n_estimators': 798, 'max_depth': 6, 'learning_rate': 0.019205345948827664, 'subsample': 0.7926563255659246, 'colsample_bytree': 0.8339060660088724, 'reg_alpha': 1.2783539693482735, 'reg_lambda': 0.5275544525485685, 'gamma': 0.2944914218204555, 'min_child_weight': 3}. Best is trial 10 with value: 0.9943959756318204.


Best trial: 12. Best value: 0.994446:  26%|██▌       | 13/50 [00:25<01:09,  1.88s/it]

[I 2025-12-29 18:49:27,244] Trial 12 finished with value: 0.9944460133436894 and parameters: {'n_estimators': 784, 'max_depth': 6, 'learning_rate': 0.019387112854991347, 'subsample': 0.7864234363120667, 'colsample_bytree': 0.8347028391420866, 'reg_alpha': 1.395356856390078, 'reg_lambda': 1.029530576656956, 'gamma': 0.26779443127697666, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  28%|██▊       | 14/50 [00:26<01:04,  1.80s/it]

[I 2025-12-29 18:49:28,856] Trial 13 finished with value: 0.9943450795935964 and parameters: {'n_estimators': 799, 'max_depth': 7, 'learning_rate': 0.020893180892805462, 'subsample': 0.7020920607445187, 'colsample_bytree': 0.8392178566028249, 'reg_alpha': 1.6295674380097958, 'reg_lambda': 1.10388733245085, 'gamma': 0.2814427194985349, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  30%|███       | 15/50 [00:28<01:05,  1.87s/it]

[I 2025-12-29 18:49:30,880] Trial 14 finished with value: 0.993483916047683 and parameters: {'n_estimators': 707, 'max_depth': 9, 'learning_rate': 0.0232609540062836, 'subsample': 0.9490042473403033, 'colsample_bytree': 0.9486750866227438, 'reg_alpha': 1.1684366582397683, 'reg_lambda': 0.9597199075069391, 'gamma': 0.20020607693026368, 'min_child_weight': 5}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  32%|███▏      | 16/50 [00:29<00:57,  1.70s/it]

[I 2025-12-29 18:49:32,212] Trial 15 finished with value: 0.9943868383746042 and parameters: {'n_estimators': 723, 'max_depth': 7, 'learning_rate': 0.015934167057317906, 'subsample': 0.7717174438072426, 'colsample_bytree': 0.8047498591169017, 'reg_alpha': 1.7297647789439914, 'reg_lambda': 1.214528276768952, 'gamma': 0.33527257130244337, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  34%|███▍      | 17/50 [00:32<01:01,  1.88s/it]

[I 2025-12-29 18:49:34,490] Trial 16 finished with value: 0.9937252924359665 and parameters: {'n_estimators': 737, 'max_depth': 9, 'learning_rate': 0.026723666539988995, 'subsample': 0.8119260727825947, 'colsample_bytree': 0.8668230820556708, 'reg_alpha': 0.9126947458116883, 'reg_lambda': 1.908504084069662, 'gamma': 0.22910095627035165, 'min_child_weight': 4}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  36%|███▌      | 18/50 [00:32<00:46,  1.46s/it]

[I 2025-12-29 18:49:34,978] Trial 17 finished with value: 0.9937173773299094 and parameters: {'n_estimators': 406, 'max_depth': 6, 'learning_rate': 0.015759899343353263, 'subsample': 0.7577674609720254, 'colsample_bytree': 0.808335214815427, 'reg_alpha': 1.3020176651777013, 'reg_lambda': 0.8208482544204506, 'gamma': 0.3438613708032052, 'min_child_weight': 5}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  38%|███▊      | 19/50 [00:33<00:39,  1.28s/it]

[I 2025-12-29 18:49:35,831] Trial 18 finished with value: 0.9938099923018877 and parameters: {'n_estimators': 677, 'max_depth': 7, 'learning_rate': 0.016008733074324422, 'subsample': 0.7002968725154765, 'colsample_bytree': 0.8605460122351757, 'reg_alpha': 1.4922766196125359, 'reg_lambda': 1.30672100324483, 'gamma': 0.045087026473713876, 'min_child_weight': 6}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  40%|████      | 20/50 [00:35<00:46,  1.54s/it]

[I 2025-12-29 18:49:37,986] Trial 19 finished with value: 0.9940558168235105 and parameters: {'n_estimators': 763, 'max_depth': 8, 'learning_rate': 0.026772678760922822, 'subsample': 0.8716093965875478, 'colsample_bytree': 0.9109343581965842, 'reg_alpha': 1.9826683935022384, 'reg_lambda': 0.7571380546679515, 'gamma': 0.156322588172432, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  42%|████▏     | 21/50 [00:38<00:53,  1.84s/it]

[I 2025-12-29 18:49:40,518] Trial 20 finished with value: 0.9943011464440815 and parameters: {'n_estimators': 680, 'max_depth': 9, 'learning_rate': 0.07232599250941059, 'subsample': 0.7973811651442687, 'colsample_bytree': 0.8602729751587812, 'reg_alpha': 1.1753307401116417, 'reg_lambda': 0.9987423429544875, 'gamma': 0.2598601446644983, 'min_child_weight': 4}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  44%|████▍     | 22/50 [00:41<01:03,  2.27s/it]

[I 2025-12-29 18:49:43,781] Trial 21 finished with value: 0.9942413103950226 and parameters: {'n_estimators': 735, 'max_depth': 7, 'learning_rate': 0.015627502361604914, 'subsample': 0.7724280113284436, 'colsample_bytree': 0.8074936397290888, 'reg_alpha': 1.7269249309883024, 'reg_lambda': 1.1889291304697298, 'gamma': 0.3358478279918827, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  46%|████▌     | 23/50 [00:44<01:04,  2.39s/it]

[I 2025-12-29 18:49:46,459] Trial 22 finished with value: 0.993959915132935 and parameters: {'n_estimators': 765, 'max_depth': 6, 'learning_rate': 0.01829038647889611, 'subsample': 0.777650900078618, 'colsample_bytree': 0.8046650538317339, 'reg_alpha': 1.7581349822189793, 'reg_lambda': 1.6635612485640565, 'gamma': 0.33797411478617573, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  48%|████▊     | 24/50 [00:47<01:08,  2.65s/it]

[I 2025-12-29 18:49:49,705] Trial 23 finished with value: 0.9941419427683877 and parameters: {'n_estimators': 735, 'max_depth': 7, 'learning_rate': 0.014129594297383677, 'subsample': 0.8167808487007244, 'colsample_bytree': 0.825821807328742, 'reg_alpha': 1.3937037433984645, 'reg_lambda': 0.8042489933516191, 'gamma': 0.3036239245985621, 'min_child_weight': 4}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  50%|█████     | 25/50 [00:49<01:01,  2.48s/it]

[I 2025-12-29 18:49:51,779] Trial 24 finished with value: 0.9943351523546943 and parameters: {'n_estimators': 700, 'max_depth': 6, 'learning_rate': 0.025358831024653097, 'subsample': 0.7246979682631247, 'colsample_bytree': 0.7774581460833617, 'reg_alpha': 1.5717708306377633, 'reg_lambda': 1.2081409111348211, 'gamma': 0.2207138528674293, 'min_child_weight': 5}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  52%|█████▏    | 26/50 [00:53<01:08,  2.84s/it]

[I 2025-12-29 18:49:55,467] Trial 25 finished with value: 0.9942761131632579 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.02190969969769603, 'subsample': 0.7638110453289683, 'colsample_bytree': 0.7981445344961489, 'reg_alpha': 1.111846199583854, 'reg_lambda': 1.5039745873589023, 'gamma': 0.3962808180014852, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  54%|█████▍    | 27/50 [00:55<01:01,  2.70s/it]

[I 2025-12-29 18:49:57,825] Trial 26 finished with value: 0.9940352753017352 and parameters: {'n_estimators': 582, 'max_depth': 6, 'learning_rate': 0.012996438625817643, 'subsample': 0.7951136754428864, 'colsample_bytree': 0.8490131479992907, 'reg_alpha': 1.809181521452853, 'reg_lambda': 0.5171710361552542, 'gamma': 0.18668499914903397, 'min_child_weight': 4}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  56%|█████▌    | 28/50 [00:58<01:02,  2.86s/it]

[I 2025-12-29 18:50:01,083] Trial 27 finished with value: 0.9938876680758364 and parameters: {'n_estimators': 754, 'max_depth': 7, 'learning_rate': 0.010205043268075698, 'subsample': 0.833672300758752, 'colsample_bytree': 0.878484913045834, 'reg_alpha': 0.8809729309184768, 'reg_lambda': 0.8519991682666666, 'gamma': 0.24822106644231423, 'min_child_weight': 6}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  58%|█████▊    | 29/50 [01:02<01:06,  3.15s/it]

[I 2025-12-29 18:50:04,892] Trial 28 finished with value: 0.994196400331048 and parameters: {'n_estimators': 656, 'max_depth': 8, 'learning_rate': 0.018447165749233847, 'subsample': 0.7852339116622739, 'colsample_bytree': 0.8164382744138361, 'reg_alpha': 1.6272053582728743, 'reg_lambda': 1.059504902784235, 'gamma': 0.3549780681240265, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  60%|██████    | 30/50 [01:04<00:55,  2.79s/it]

[I 2025-12-29 18:50:06,822] Trial 29 finished with value: 0.9939493819356839 and parameters: {'n_estimators': 706, 'max_depth': 6, 'learning_rate': 0.029436057045814316, 'subsample': 0.8476307269606375, 'colsample_bytree': 0.7919647362164587, 'reg_alpha': 1.3331125306591183, 'reg_lambda': 1.8929630869901009, 'gamma': 0.11472139650353375, 'min_child_weight': 8}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  62%|██████▏   | 31/50 [01:09<01:06,  3.52s/it]

[I 2025-12-29 18:50:12,066] Trial 30 finished with value: 0.9941298561489889 and parameters: {'n_estimators': 771, 'max_depth': 10, 'learning_rate': 0.017041049922047278, 'subsample': 0.7505260655572052, 'colsample_bytree': 0.8428428081952709, 'reg_alpha': 1.8135765960308108, 'reg_lambda': 0.6894519214223874, 'gamma': 0.31256979652010897, 'min_child_weight': 5}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  64%|██████▍   | 32/50 [01:12<00:59,  3.31s/it]

[I 2025-12-29 18:50:14,884] Trial 31 finished with value: 0.994333319338309 and parameters: {'n_estimators': 796, 'max_depth': 6, 'learning_rate': 0.019976848750785797, 'subsample': 0.7958080419864771, 'colsample_bytree': 0.8340725592910078, 'reg_alpha': 1.2789991492163983, 'reg_lambda': 0.5053167736730828, 'gamma': 0.28309092532019065, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  66%|██████▌   | 33/50 [01:15<00:54,  3.19s/it]

[I 2025-12-29 18:50:17,795] Trial 32 finished with value: 0.9943711325178585 and parameters: {'n_estimators': 723, 'max_depth': 6, 'learning_rate': 0.013977456415047062, 'subsample': 0.8034141245766425, 'colsample_bytree': 0.8249090148942428, 'reg_alpha': 1.081338931303567, 'reg_lambda': 0.7654305639662072, 'gamma': 0.29729224979676666, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  68%|██████▊   | 34/50 [01:18<00:47,  2.97s/it]

[I 2025-12-29 18:50:20,245] Trial 33 finished with value: 0.9942941634059229 and parameters: {'n_estimators': 773, 'max_depth': 6, 'learning_rate': 0.02336803242645751, 'subsample': 0.823328677140721, 'colsample_bytree': 0.7709083730938379, 'reg_alpha': 1.5085174778923771, 'reg_lambda': 0.6379904240052532, 'gamma': 0.4131030228856006, 'min_child_weight': 4}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  70%|███████   | 35/50 [01:21<00:44,  2.98s/it]

[I 2025-12-29 18:50:23,254] Trial 34 finished with value: 0.9943538329515998 and parameters: {'n_estimators': 664, 'max_depth': 7, 'learning_rate': 0.01909216318057477, 'subsample': 0.7852676258955693, 'colsample_bytree': 0.7605934632825407, 'reg_alpha': 0.746518334412889, 'reg_lambda': 0.9806279357334917, 'gamma': 0.36814714943605376, 'min_child_weight': 3}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  72%|███████▏  | 36/50 [01:24<00:42,  3.02s/it]

[I 2025-12-29 18:50:26,377] Trial 35 finished with value: 0.9941535231138922 and parameters: {'n_estimators': 781, 'max_depth': 7, 'learning_rate': 0.014292534088735728, 'subsample': 0.7653931400092376, 'colsample_bytree': 0.8510842995928367, 'reg_alpha': 1.2411780472012182, 'reg_lambda': 1.2323082133528582, 'gamma': 0.26624781726118596, 'min_child_weight': 4}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 12. Best value: 0.994446:  74%|███████▍  | 37/50 [01:26<00:37,  2.87s/it]

[I 2025-12-29 18:50:28,875] Trial 36 finished with value: 0.9944321134913942 and parameters: {'n_estimators': 745, 'max_depth': 6, 'learning_rate': 0.09546524059057505, 'subsample': 0.7366047986395786, 'colsample_bytree': 0.8788383718719283, 'reg_alpha': 1.4212985274498389, 'reg_lambda': 0.6183143004980036, 'gamma': 0.3090406535879489, 'min_child_weight': 7}. Best is trial 12 with value: 0.9944460133436894.


Best trial: 37. Best value: 0.994603:  76%|███████▌  | 38/50 [01:29<00:36,  3.01s/it]

[I 2025-12-29 18:50:32,213] Trial 37 finished with value: 0.994603172727404 and parameters: {'n_estimators': 632, 'max_depth': 8, 'learning_rate': 0.09906526990876652, 'subsample': 0.7265435282346211, 'colsample_bytree': 0.9241771068726822, 'reg_alpha': 1.4204695561944785, 'reg_lambda': 0.8683993966152527, 'gamma': 0.3214144874915166, 'min_child_weight': 7}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  78%|███████▊  | 39/50 [01:33<00:36,  3.28s/it]

[I 2025-12-29 18:50:36,142] Trial 38 finished with value: 0.9934794866846556 and parameters: {'n_estimators': 532, 'max_depth': 10, 'learning_rate': 0.09891116118178597, 'subsample': 0.7234275107537562, 'colsample_bytree': 0.9266096024033508, 'reg_alpha': 1.395672728501481, 'reg_lambda': 2.5241414198017624, 'gamma': 0.23973655644404646, 'min_child_weight': 7}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  80%|████████  | 40/50 [01:37<00:32,  3.23s/it]

[I 2025-12-29 18:50:39,233] Trial 39 finished with value: 0.9942426350641835 and parameters: {'n_estimators': 632, 'max_depth': 8, 'learning_rate': 0.07874705740162027, 'subsample': 0.7170266773559472, 'colsample_bytree': 0.8814362396167026, 'reg_alpha': 0.4576706196480447, 'reg_lambda': 0.6797372307473066, 'gamma': 0.41888436444975513, 'min_child_weight': 8}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  82%|████████▏ | 41/50 [01:41<00:31,  3.48s/it]

[I 2025-12-29 18:50:43,303] Trial 40 finished with value: 0.9940937814503675 and parameters: {'n_estimators': 565, 'max_depth': 11, 'learning_rate': 0.06259768908987723, 'subsample': 0.747586651294413, 'colsample_bytree': 0.9170714016554719, 'reg_alpha': 1.0033019403881505, 'reg_lambda': 0.8857932053966081, 'gamma': 0.384461310549039, 'min_child_weight': 8}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  84%|████████▍ | 42/50 [01:44<00:28,  3.59s/it]

[I 2025-12-29 18:50:47,165] Trial 41 finished with value: 0.994144572419209 and parameters: {'n_estimators': 688, 'max_depth': 8, 'learning_rate': 0.0826184719449599, 'subsample': 0.7509532063097943, 'colsample_bytree': 0.9353559461930916, 'reg_alpha': 1.6189506330863608, 'reg_lambda': 0.6413558235305483, 'gamma': 0.3253351175952365, 'min_child_weight': 7}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  86%|████████▌ | 43/50 [01:47<00:23,  3.32s/it]

[I 2025-12-29 18:50:49,833] Trial 42 finished with value: 0.9944739626195869 and parameters: {'n_estimators': 749, 'max_depth': 7, 'learning_rate': 0.03806364156087877, 'subsample': 0.7377007274347195, 'colsample_bytree': 0.8921390323841907, 'reg_alpha': 1.8717314025016913, 'reg_lambda': 1.1178688444422638, 'gamma': 0.3696162776015044, 'min_child_weight': 7}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  88%|████████▊ | 44/50 [01:49<00:17,  2.93s/it]

[I 2025-12-29 18:50:51,848] Trial 43 finished with value: 0.9945137853430713 and parameters: {'n_estimators': 640, 'max_depth': 6, 'learning_rate': 0.043622531348442836, 'subsample': 0.7371464262326478, 'colsample_bytree': 0.8897598176792416, 'reg_alpha': 1.884863692825264, 'reg_lambda': 0.9076635638345107, 'gamma': 0.36293339543067626, 'min_child_weight': 7}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  90%|█████████ | 45/50 [01:51<00:13,  2.60s/it]

[I 2025-12-29 18:50:53,696] Trial 44 finished with value: 0.9943919170584683 and parameters: {'n_estimators': 628, 'max_depth': 6, 'learning_rate': 0.04019448240973221, 'subsample': 0.737721762018806, 'colsample_bytree': 0.8958318138543413, 'reg_alpha': 1.9217098200188003, 'reg_lambda': 1.3901117610065403, 'gamma': 0.36812530214492145, 'min_child_weight': 7}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  92%|█████████▏| 46/50 [01:55<00:11,  2.90s/it]

[I 2025-12-29 18:50:57,295] Trial 45 finished with value: 0.9943328102359003 and parameters: {'n_estimators': 597, 'max_depth': 8, 'learning_rate': 0.059573860720956, 'subsample': 0.7179369569706293, 'colsample_bytree': 0.8809868629051162, 'reg_alpha': 1.8753241935709126, 'reg_lambda': 1.0601799577809785, 'gamma': 0.4706281366663781, 'min_child_weight': 8}. Best is trial 37 with value: 0.994603172727404.


Best trial: 37. Best value: 0.994603:  94%|█████████▍| 47/50 [01:57<00:08,  2.72s/it]

[I 2025-12-29 18:50:59,609] Trial 46 finished with value: 0.9944459891759434 and parameters: {'n_estimators': 501, 'max_depth': 7, 'learning_rate': 0.04511272950877059, 'subsample': 0.7354429195586624, 'colsample_bytree': 0.9004803932668805, 'reg_alpha': 1.664462884701934, 'reg_lambda': 0.9029141668996407, 'gamma': 0.4418967698463862, 'min_child_weight': 6}. Best is trial 37 with value: 0.994603172727404.


Best trial: 47. Best value: 0.994632:  96%|█████████▌| 48/50 [01:59<00:04,  2.44s/it]

[I 2025-12-29 18:51:01,395] Trial 47 finished with value: 0.9946316260885798 and parameters: {'n_estimators': 482, 'max_depth': 7, 'learning_rate': 0.044330079268964206, 'subsample': 0.7317908739414211, 'colsample_bytree': 0.9063761417443338, 'reg_alpha': 1.6851512646344933, 'reg_lambda': 0.9061607877358362, 'gamma': 0.4449031499583414, 'min_child_weight': 6}. Best is trial 47 with value: 0.9946316260885798.


Best trial: 47. Best value: 0.994632:  98%|█████████▊| 49/50 [02:01<00:02,  2.27s/it]

[I 2025-12-29 18:51:03,276] Trial 48 finished with value: 0.9940874303463015 and parameters: {'n_estimators': 415, 'max_depth': 8, 'learning_rate': 0.03788781805955444, 'subsample': 0.7116536729313696, 'colsample_bytree': 0.9463070786068231, 'reg_alpha': 1.8641303589810279, 'reg_lambda': 1.0882824140380196, 'gamma': 0.4943269117954799, 'min_child_weight': 6}. Best is trial 47 with value: 0.9946316260885798.


Best trial: 47. Best value: 0.994632: 100%|██████████| 50/50 [02:02<00:00,  2.45s/it]
[I 2025-12-29 18:51:04,836] A new study created in memory with name: no-name-65762324-25f8-4026-97b1-fb9ed9e51f81


[I 2025-12-29 18:51:04,810] Trial 49 finished with value: 0.9936436008935201 and parameters: {'n_estimators': 455, 'max_depth': 7, 'learning_rate': 0.04752687054940427, 'subsample': 0.7283220256789654, 'colsample_bytree': 0.9092191559711487, 'reg_alpha': 1.9869699492310722, 'reg_lambda': 1.5711577362585558, 'gamma': 0.4239804084011445, 'min_child_weight': 10}. Best is trial 47 with value: 0.9946316260885798.

✓ Best R² Score: 0.9946
✓ Best Parameters:
    n_estimators: 482
    max_depth: 7
    learning_rate: 0.044330079268964206
    subsample: 0.7317908739414211
    colsample_bytree: 0.9063761417443338
    reg_alpha: 1.6851512646344933
    reg_lambda: 0.9061607877358362
    gamma: 0.4449031499583414
    min_child_weight: 6

 Optimizing CO2 Model Hyperparameters...

[HYPERPARAMETER OPTIMIZATION - CO2 MODEL]


Best trial: 0. Best value: 0.996249:   2%|▏         | 1/50 [00:00<00:40,  1.22it/s]

[I 2025-12-29 18:51:05,653] Trial 0 finished with value: 0.9962487153911074 and parameters: {'n_estimators': 487, 'max_depth': 12, 'learning_rate': 0.05395030966670229, 'subsample': 0.8496646210492591, 'colsample_bytree': 0.7390046601106091, 'reg_alpha': 0.3119890406724053, 'reg_lambda': 0.6452090304204987, 'gamma': 0.4330880728874676, 'min_child_weight': 7}. Best is trial 0 with value: 0.9962487153911074.


Best trial: 1. Best value: 0.997115:   4%|▍         | 2/50 [00:01<00:35,  1.36it/s]

[I 2025-12-29 18:51:06,332] Trial 1 finished with value: 0.997114612924997 and parameters: {'n_estimators': 654, 'max_depth': 6, 'learning_rate': 0.09330606024425668, 'subsample': 0.9081106602001054, 'colsample_bytree': 0.753084777669569, 'reg_alpha': 0.36364993441420124, 'reg_lambda': 0.9585112746335845, 'gamma': 0.15212112147976886, 'min_child_weight': 7}. Best is trial 1 with value: 0.997114612924997.


Best trial: 1. Best value: 0.997115:   6%|▌         | 3/50 [00:02<00:34,  1.38it/s]

[I 2025-12-29 18:51:07,041] Trial 2 finished with value: 0.9961388555312878 and parameters: {'n_estimators': 516, 'max_depth': 8, 'learning_rate': 0.04091220574443785, 'subsample': 0.7348734651630104, 'colsample_bytree': 0.7730361621338044, 'reg_alpha': 0.7327236865873834, 'reg_lambda': 1.64017496054259, 'gamma': 0.3925879806965068, 'min_child_weight': 4}. Best is trial 1 with value: 0.997114612924997.


Best trial: 1. Best value: 0.997115:   8%|▊         | 4/50 [00:03<00:49,  1.07s/it]

[I 2025-12-29 18:51:08,637] Trial 3 finished with value: 0.9959675713699522 and parameters: {'n_estimators': 557, 'max_depth': 10, 'learning_rate': 0.011128853174905732, 'subsample': 0.8518862129753595, 'colsample_bytree': 0.7426310309218228, 'reg_alpha': 0.13010318597055903, 'reg_lambda': 2.8722138431333333, 'gamma': 0.4828160165372797, 'min_child_weight': 9}. Best is trial 1 with value: 0.997114612924997.


Best trial: 1. Best value: 0.997115:  10%|█         | 5/50 [00:04<00:39,  1.13it/s]

[I 2025-12-29 18:51:09,207] Trial 4 finished with value: 0.9959112798095162 and parameters: {'n_estimators': 452, 'max_depth': 6, 'learning_rate': 0.04833180632488466, 'subsample': 0.8100381234349003, 'colsample_bytree': 0.7305095587111947, 'reg_alpha': 0.9903538202225404, 'reg_lambda': 0.585971302788046, 'gamma': 0.45466020103939103, 'min_child_weight': 5}. Best is trial 1 with value: 0.997114612924997.


Best trial: 1. Best value: 0.997115:  12%|█▏        | 6/50 [00:05<00:39,  1.11it/s]

[I 2025-12-29 18:51:10,119] Trial 5 finished with value: 0.9950798038008987 and parameters: {'n_estimators': 631, 'max_depth': 8, 'learning_rate': 0.03311829888072381, 'subsample': 0.8366775698358199, 'colsample_bytree': 0.7462136138813817, 'reg_alpha': 1.9391692555291171, 'reg_lambda': 2.4378320584027864, 'gamma': 0.46974947078209456, 'min_child_weight': 10}. Best is trial 1 with value: 0.997114612924997.


Best trial: 1. Best value: 0.997115:  14%|█▍        | 7/50 [00:06<00:48,  1.13s/it]

[I 2025-12-29 18:51:11,718] Trial 6 finished with value: 0.9969039240181005 and parameters: {'n_estimators': 599, 'max_depth': 12, 'learning_rate': 0.012260057359187526, 'subsample': 0.7489957156047863, 'colsample_bytree': 0.7113068222276344, 'reg_alpha': 0.6506606615265287, 'reg_lambda': 1.471693224223705, 'gamma': 0.13567451588694796, 'min_child_weight': 9}. Best is trial 1 with value: 0.997114612924997.


Best trial: 1. Best value: 0.997115:  16%|█▌        | 8/50 [00:07<00:42,  1.01s/it]

[I 2025-12-29 18:51:12,484] Trial 7 finished with value: 0.996493058166472 and parameters: {'n_estimators': 478, 'max_depth': 7, 'learning_rate': 0.03488960745139221, 'subsample': 0.7352310562436906, 'colsample_bytree': 0.9005492451885099, 'reg_alpha': 0.14910128735954165, 'reg_lambda': 2.9672173415012932, 'gamma': 0.3861223846483287, 'min_child_weight': 4}. Best is trial 1 with value: 0.997114612924997.


Best trial: 8. Best value: 0.997412:  18%|█▊        | 9/50 [00:08<00:39,  1.04it/s]

[I 2025-12-29 18:51:13,337] Trial 8 finished with value: 0.9974121839181572 and parameters: {'n_estimators': 302, 'max_depth': 11, 'learning_rate': 0.05091635945818555, 'subsample': 0.8822517920102468, 'colsample_bytree': 0.8928175866714864, 'reg_alpha': 0.14808930346818072, 'reg_lambda': 1.3961643213606816, 'gamma': 0.05793452976256486, 'min_child_weight': 9}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  20%|██        | 10/50 [00:10<00:49,  1.25s/it]

[I 2025-12-29 18:51:15,218] Trial 9 finished with value: 0.9955007132540853 and parameters: {'n_estimators': 612, 'max_depth': 8, 'learning_rate': 0.011575995526672779, 'subsample': 0.7777455804289155, 'colsample_bytree': 0.7812958305066867, 'reg_alpha': 1.4592123566761281, 'reg_lambda': 2.093893678388033, 'gamma': 0.44360637128816327, 'min_child_weight': 6}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  22%|██▏       | 11/50 [00:11<00:50,  1.29s/it]

[I 2025-12-29 18:51:16,604] Trial 10 finished with value: 0.9971591455187345 and parameters: {'n_estimators': 310, 'max_depth': 10, 'learning_rate': 0.020309265235830157, 'subsample': 0.9341621883143494, 'colsample_bytree': 0.9448058705496898, 'reg_alpha': 1.3804941471382906, 'reg_lambda': 1.1448173815037301, 'gamma': 0.005557192411355653, 'min_child_weight': 8}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  24%|██▍       | 12/50 [00:13<00:53,  1.41s/it]

[I 2025-12-29 18:51:18,297] Trial 11 finished with value: 0.9969965597913346 and parameters: {'n_estimators': 303, 'max_depth': 10, 'learning_rate': 0.019205345948827664, 'subsample': 0.942419026756344, 'colsample_bytree': 0.947430091116828, 'reg_alpha': 1.4053984655159972, 'reg_lambda': 1.211142160454102, 'gamma': 0.005523282377482293, 'min_child_weight': 8}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  26%|██▌       | 13/50 [00:14<00:52,  1.41s/it]

[I 2025-12-29 18:51:19,706] Trial 12 finished with value: 0.9969638145300722 and parameters: {'n_estimators': 302, 'max_depth': 10, 'learning_rate': 0.021807204889822282, 'subsample': 0.8977089495553197, 'colsample_bytree': 0.8776410915675111, 'reg_alpha': 1.4178743213208747, 'reg_lambda': 1.1593954488306322, 'gamma': 0.01679294246723606, 'min_child_weight': 10}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  28%|██▊       | 14/50 [00:16<00:53,  1.47s/it]

[I 2025-12-29 18:51:21,328] Trial 13 finished with value: 0.9968650519330048 and parameters: {'n_estimators': 799, 'max_depth': 11, 'learning_rate': 0.02162107976240172, 'subsample': 0.9498406096603877, 'colsample_bytree': 0.9462798024963018, 'reg_alpha': 1.750402751752063, 'reg_lambda': 1.8887297629107012, 'gamma': 0.0974890839084619, 'min_child_weight': 8}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  30%|███       | 15/50 [00:17<00:42,  1.21s/it]

[I 2025-12-29 18:51:21,930] Trial 14 finished with value: 0.996335480471271 and parameters: {'n_estimators': 380, 'max_depth': 11, 'learning_rate': 0.07147028012869903, 'subsample': 0.8890029173007267, 'colsample_bytree': 0.8664421680782335, 'reg_alpha': 1.0622480607029205, 'reg_lambda': 1.3928292430051528, 'gamma': 0.25942372287110194, 'min_child_weight': 8}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  32%|███▏      | 16/50 [00:18<00:45,  1.34s/it]

[I 2025-12-29 18:51:23,575] Trial 15 finished with value: 0.9969944151601012 and parameters: {'n_estimators': 394, 'max_depth': 9, 'learning_rate': 0.016395981446348747, 'subsample': 0.9194974894632141, 'colsample_bytree': 0.9138857903802136, 'reg_alpha': 1.0590519290813356, 'reg_lambda': 0.9617387794171841, 'gamma': 0.07079212082617559, 'min_child_weight': 9}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  34%|███▍      | 17/50 [00:19<00:39,  1.20s/it]

[I 2025-12-29 18:51:24,446] Trial 16 finished with value: 0.9969144369461722 and parameters: {'n_estimators': 372, 'max_depth': 11, 'learning_rate': 0.028818494005348746, 'subsample': 0.8754660075131232, 'colsample_bytree': 0.8306090665533253, 'reg_alpha': 0.6113489946786881, 'reg_lambda': 1.9616288956811756, 'gamma': 0.2230441325213059, 'min_child_weight': 6}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  36%|███▌      | 18/50 [00:20<00:33,  1.04s/it]

[I 2025-12-29 18:51:25,115] Trial 17 finished with value: 0.995811526875519 and parameters: {'n_estimators': 349, 'max_depth': 9, 'learning_rate': 0.06017876097513216, 'subsample': 0.8718127830219314, 'colsample_bytree': 0.8348181947286579, 'reg_alpha': 1.591754887203965, 'reg_lambda': 2.234740308905092, 'gamma': 0.2867103519982396, 'min_child_weight': 10}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  38%|███▊      | 19/50 [00:21<00:35,  1.13s/it]

[I 2025-12-29 18:51:26,455] Trial 18 finished with value: 0.9971875955721916 and parameters: {'n_estimators': 420, 'max_depth': 11, 'learning_rate': 0.02665807860534449, 'subsample': 0.8081864027951229, 'colsample_bytree': 0.9174811223409052, 'reg_alpha': 1.1830353419914512, 'reg_lambda': 1.6878665182541344, 'gamma': 0.05993582486522125, 'min_child_weight': 8}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  40%|████      | 20/50 [00:22<00:32,  1.09s/it]

[I 2025-12-29 18:51:27,466] Trial 19 finished with value: 0.9969064401156329 and parameters: {'n_estimators': 420, 'max_depth': 12, 'learning_rate': 0.027300664371073254, 'subsample': 0.8079644464999192, 'colsample_bytree': 0.9131802923952054, 'reg_alpha': 0.8926440169000727, 'reg_lambda': 1.6983829434756066, 'gamma': 0.17920422230635966, 'min_child_weight': 3}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 8. Best value: 0.997412:  42%|████▏     | 21/50 [00:23<00:32,  1.12s/it]

[I 2025-12-29 18:51:28,634] Trial 20 finished with value: 0.9972914614561494 and parameters: {'n_estimators': 709, 'max_depth': 11, 'learning_rate': 0.07279442912984735, 'subsample': 0.7040552226097592, 'colsample_bytree': 0.865338141580332, 'reg_alpha': 0.4801569857474748, 'reg_lambda': 2.486138147435134, 'gamma': 0.07370301268414407, 'min_child_weight': 7}. Best is trial 8 with value: 0.9974121839181572.


Best trial: 21. Best value: 0.99746:  44%|████▍     | 22/50 [00:24<00:31,  1.11s/it]

[I 2025-12-29 18:51:29,737] Trial 21 finished with value: 0.9974599319613533 and parameters: {'n_estimators': 735, 'max_depth': 11, 'learning_rate': 0.0753397375631344, 'subsample': 0.7668798972660384, 'colsample_bytree': 0.8595437920341238, 'reg_alpha': 0.40440218684132834, 'reg_lambda': 2.6008257548577625, 'gamma': 0.06583554892242924, 'min_child_weight': 7}. Best is trial 21 with value: 0.9974599319613533.


Best trial: 21. Best value: 0.99746:  46%|████▌     | 23/50 [00:25<00:28,  1.04s/it]

[I 2025-12-29 18:51:30,604] Trial 22 finished with value: 0.9972988444675795 and parameters: {'n_estimators': 726, 'max_depth': 11, 'learning_rate': 0.09964011970318729, 'subsample': 0.7759431186956559, 'colsample_bytree': 0.8562940500853264, 'reg_alpha': 0.40740640550008655, 'reg_lambda': 2.6391610735978692, 'gamma': 0.08279775689946878, 'min_child_weight': 7}. Best is trial 21 with value: 0.9974599319613533.


Best trial: 21. Best value: 0.99746:  48%|████▊     | 24/50 [00:26<00:27,  1.05s/it]

[I 2025-12-29 18:51:31,692] Trial 23 finished with value: 0.9974409240888814 and parameters: {'n_estimators': 748, 'max_depth': 12, 'learning_rate': 0.0976027224440283, 'subsample': 0.7803056796281154, 'colsample_bytree': 0.806221371080419, 'reg_alpha': 0.3071581002145097, 'reg_lambda': 2.6688181857430333, 'gamma': 0.11187197346913594, 'min_child_weight': 6}. Best is trial 21 with value: 0.9974599319613533.


Best trial: 24. Best value: 0.99752:  50%|█████     | 25/50 [00:27<00:26,  1.04s/it]

[I 2025-12-29 18:51:32,707] Trial 24 finished with value: 0.9975196781490702 and parameters: {'n_estimators': 790, 'max_depth': 12, 'learning_rate': 0.07759761453093682, 'subsample': 0.7788225467564991, 'colsample_bytree': 0.8086988913893204, 'reg_alpha': 0.05134180938758126, 'reg_lambda': 2.708430347777598, 'gamma': 0.13022374280084953, 'min_child_weight': 5}. Best is trial 24 with value: 0.9975196781490702.


Best trial: 24. Best value: 0.99752:  52%|█████▏    | 26/50 [00:29<00:25,  1.07s/it]

[I 2025-12-29 18:51:33,844] Trial 25 finished with value: 0.997191397007885 and parameters: {'n_estimators': 800, 'max_depth': 12, 'learning_rate': 0.07775066717518013, 'subsample': 0.7755350899898593, 'colsample_bytree': 0.8044694463088927, 'reg_alpha': 0.0014954828953100963, 'reg_lambda': 2.74920466794187, 'gamma': 0.1996758791129717, 'min_child_weight': 5}. Best is trial 24 with value: 0.9975196781490702.


Best trial: 24. Best value: 0.99752:  54%|█████▍    | 27/50 [00:29<00:23,  1.03s/it]

[I 2025-12-29 18:51:34,763] Trial 26 finished with value: 0.9974993024178117 and parameters: {'n_estimators': 731, 'max_depth': 12, 'learning_rate': 0.08451037954437156, 'subsample': 0.7505884309740888, 'colsample_bytree': 0.807812020152967, 'reg_alpha': 0.2232135456099178, 'reg_lambda': 2.335669281566912, 'gamma': 0.12684742835360793, 'min_child_weight': 5}. Best is trial 24 with value: 0.9975196781490702.


Best trial: 24. Best value: 0.99752:  56%|█████▌    | 28/50 [00:30<00:21,  1.01it/s]

[I 2025-12-29 18:51:35,665] Trial 27 finished with value: 0.9971218753717253 and parameters: {'n_estimators': 688, 'max_depth': 12, 'learning_rate': 0.06247638997391007, 'subsample': 0.7038348015353404, 'colsample_bytree': 0.814706720848387, 'reg_alpha': 0.5768581903153138, 'reg_lambda': 2.28722432334115, 'gamma': 0.15211540501719145, 'min_child_weight': 5}. Best is trial 24 with value: 0.9975196781490702.


Best trial: 24. Best value: 0.99752:  58%|█████▊    | 29/50 [00:31<00:20,  1.04it/s]

[I 2025-12-29 18:51:36,580] Trial 28 finished with value: 0.9969823424382687 and parameters: {'n_estimators': 763, 'max_depth': 12, 'learning_rate': 0.08478577294245579, 'subsample': 0.7554687048997256, 'colsample_bytree': 0.8429077254472368, 'reg_alpha': 0.017324624650280937, 'reg_lambda': 2.4735148101635898, 'gamma': 0.31127417352962833, 'min_child_weight': 4}. Best is trial 24 with value: 0.9975196781490702.


Best trial: 29. Best value: 0.997566:  60%|██████    | 30/50 [00:32<00:21,  1.05s/it]

[I 2025-12-29 18:51:37,827] Trial 29 finished with value: 0.9975655595161508 and parameters: {'n_estimators': 684, 'max_depth': 12, 'learning_rate': 0.04377258306117736, 'subsample': 0.7238323164923874, 'colsample_bytree': 0.7850584113058039, 'reg_alpha': 0.2694216006696902, 'reg_lambda': 2.1699627706812303, 'gamma': 0.12185483273089766, 'min_child_weight': 3}. Best is trial 29 with value: 0.9975655595161508.


Best trial: 29. Best value: 0.997566:  62%|██████▏   | 31/50 [00:33<00:19,  1.03s/it]

[I 2025-12-29 18:51:38,806] Trial 30 finished with value: 0.9972443802716453 and parameters: {'n_estimators': 670, 'max_depth': 12, 'learning_rate': 0.043384293244918534, 'subsample': 0.7240111025642233, 'colsample_bytree': 0.7885039103802801, 'reg_alpha': 0.253646360529748, 'reg_lambda': 2.2208194078441807, 'gamma': 0.18597517165631405, 'min_child_weight': 3}. Best is trial 29 with value: 0.9975655595161508.


Best trial: 29. Best value: 0.997566:  64%|██████▍   | 32/50 [00:34<00:18,  1.03s/it]

[I 2025-12-29 18:51:39,832] Trial 31 finished with value: 0.9974820809814728 and parameters: {'n_estimators': 768, 'max_depth': 12, 'learning_rate': 0.06114534001726256, 'subsample': 0.755465024913858, 'colsample_bytree': 0.7706458401425321, 'reg_alpha': 0.20985362404162264, 'reg_lambda': 2.397944609449758, 'gamma': 0.1261297159553608, 'min_child_weight': 5}. Best is trial 29 with value: 0.9975655595161508.


Best trial: 29. Best value: 0.997566:  66%|██████▌   | 33/50 [00:35<00:17,  1.02s/it]

[I 2025-12-29 18:51:40,824] Trial 32 finished with value: 0.9974022894034738 and parameters: {'n_estimators': 769, 'max_depth': 12, 'learning_rate': 0.05984703770042609, 'subsample': 0.7230456107718123, 'colsample_bytree': 0.7641036021138428, 'reg_alpha': 0.24404284285624367, 'reg_lambda': 2.0781836546750894, 'gamma': 0.13093314990981142, 'min_child_weight': 5}. Best is trial 29 with value: 0.9975655595161508.


Best trial: 29. Best value: 0.997566:  68%|██████▊   | 34/50 [00:37<00:16,  1.04s/it]

[I 2025-12-29 18:51:41,919] Trial 33 finished with value: 0.9970658019668892 and parameters: {'n_estimators': 699, 'max_depth': 12, 'learning_rate': 0.03990445143626333, 'subsample': 0.796357244038434, 'colsample_bytree': 0.791488864481623, 'reg_alpha': 0.7754740358396197, 'reg_lambda': 2.2926417302971944, 'gamma': 0.15944202864556203, 'min_child_weight': 4}. Best is trial 29 with value: 0.9975655595161508.


Best trial: 29. Best value: 0.997566:  70%|███████   | 35/50 [00:38<00:16,  1.09s/it]

[I 2025-12-29 18:51:43,136] Trial 34 finished with value: 0.996920171295077 and parameters: {'n_estimators': 653, 'max_depth': 12, 'learning_rate': 0.05522178572852706, 'subsample': 0.7484168464419109, 'colsample_bytree': 0.7594467153481073, 'reg_alpha': 0.52016097771772, 'reg_lambda': 2.7941338212694005, 'gamma': 0.2223054708642883, 'min_child_weight': 3}. Best is trial 29 with value: 0.9975655595161508.


Best trial: 35. Best value: 0.998139:  72%|███████▏  | 36/50 [00:39<00:14,  1.07s/it]

[I 2025-12-29 18:51:44,144] Trial 35 finished with value: 0.9981393041903333 and parameters: {'n_estimators': 773, 'max_depth': 10, 'learning_rate': 0.08606665039955115, 'subsample': 0.732403587677708, 'colsample_bytree': 0.8158611058831956, 'reg_alpha': 0.10551201680628375, 'reg_lambda': 2.420293904424481, 'gamma': 0.0381166717077058, 'min_child_weight': 4}. Best is trial 35 with value: 0.9981393041903333.


Best trial: 36. Best value: 0.998201:  74%|███████▍  | 37/50 [00:40<00:13,  1.01s/it]

[I 2025-12-29 18:51:45,012] Trial 36 finished with value: 0.9982011812579963 and parameters: {'n_estimators': 554, 'max_depth': 10, 'learning_rate': 0.0857366634680369, 'subsample': 0.7194229699912189, 'colsample_bytree': 0.7984420535694712, 'reg_alpha': 0.07526373225235465, 'reg_lambda': 1.8835503176787405, 'gamma': 0.03176536980167656, 'min_child_weight': 4}. Best is trial 36 with value: 0.9982011812579963.


Best trial: 37. Best value: 0.998296:  76%|███████▌  | 38/50 [00:41<00:13,  1.09s/it]

[I 2025-12-29 18:51:46,286] Trial 37 finished with value: 0.9982961311283874 and parameters: {'n_estimators': 552, 'max_depth': 9, 'learning_rate': 0.08767502030170773, 'subsample': 0.717409754307801, 'colsample_bytree': 0.8183934654341165, 'reg_alpha': 0.07648428510720896, 'reg_lambda': 1.9137634318520012, 'gamma': 0.02876987692805398, 'min_child_weight': 4}. Best is trial 37 with value: 0.9982961311283874.


Best trial: 37. Best value: 0.998296:  78%|███████▊  | 39/50 [00:42<00:11,  1.02s/it]

[I 2025-12-29 18:51:47,162] Trial 38 finished with value: 0.9981761930918989 and parameters: {'n_estimators': 532, 'max_depth': 9, 'learning_rate': 0.08704509358994257, 'subsample': 0.7158574070888085, 'colsample_bytree': 0.8251395385009919, 'reg_alpha': 0.3375688940148556, 'reg_lambda': 1.838888771914823, 'gamma': 0.0330375248619801, 'min_child_weight': 4}. Best is trial 37 with value: 0.9982961311283874.


Best trial: 37. Best value: 0.998296:  80%|████████  | 40/50 [00:43<00:09,  1.00it/s]

[I 2025-12-29 18:51:48,092] Trial 39 finished with value: 0.9981522874022664 and parameters: {'n_estimators': 539, 'max_depth': 9, 'learning_rate': 0.08924599164813185, 'subsample': 0.7135036762590854, 'colsample_bytree': 0.8232390797196685, 'reg_alpha': 0.12439609554992281, 'reg_lambda': 1.849041284199025, 'gamma': 0.03794247240862764, 'min_child_weight': 4}. Best is trial 37 with value: 0.9982961311283874.


Best trial: 37. Best value: 0.998296:  82%|████████▏ | 41/50 [00:44<00:09,  1.01s/it]

[I 2025-12-29 18:51:49,149] Trial 40 finished with value: 0.9981176910262104 and parameters: {'n_estimators': 543, 'max_depth': 9, 'learning_rate': 0.06867976693208017, 'subsample': 0.7003112259884888, 'colsample_bytree': 0.7295286501003975, 'reg_alpha': 0.3866976223011256, 'reg_lambda': 1.868285713730315, 'gamma': 0.025798811929040523, 'min_child_weight': 4}. Best is trial 37 with value: 0.9982961311283874.


Best trial: 41. Best value: 0.998311:  84%|████████▍ | 42/50 [00:45<00:07,  1.04it/s]

[I 2025-12-29 18:51:49,974] Trial 41 finished with value: 0.9983111629833894 and parameters: {'n_estimators': 560, 'max_depth': 8, 'learning_rate': 0.08615351373418487, 'subsample': 0.7343610432604657, 'colsample_bytree': 0.8236134225329197, 'reg_alpha': 0.10925471780640356, 'reg_lambda': 1.5944377371651524, 'gamma': 0.030951970015281667, 'min_child_weight': 4}. Best is trial 41 with value: 0.9983111629833894.


Best trial: 41. Best value: 0.998311:  86%|████████▌ | 43/50 [00:46<00:06,  1.06it/s]

[I 2025-12-29 18:51:50,879] Trial 42 finished with value: 0.9982532107977132 and parameters: {'n_estimators': 570, 'max_depth': 8, 'learning_rate': 0.08931762559565207, 'subsample': 0.7124114122544668, 'colsample_bytree': 0.8448404516881289, 'reg_alpha': 0.12711000497314892, 'reg_lambda': 1.5720743517594258, 'gamma': 0.037102027368875454, 'min_child_weight': 4}. Best is trial 41 with value: 0.9983111629833894.


Best trial: 41. Best value: 0.998311:  88%|████████▊ | 44/50 [00:46<00:05,  1.08it/s]

[I 2025-12-29 18:51:51,775] Trial 43 finished with value: 0.9981428346415943 and parameters: {'n_estimators': 582, 'max_depth': 8, 'learning_rate': 0.06633759396477848, 'subsample': 0.7382389532566568, 'colsample_bytree': 0.8368283419617266, 'reg_alpha': 0.10479878715672922, 'reg_lambda': 1.561357492506596, 'gamma': 0.04106476735370586, 'min_child_weight': 4}. Best is trial 41 with value: 0.9983111629833894.


Best trial: 44. Best value: 0.998659:  90%|█████████ | 45/50 [00:47<00:04,  1.11it/s]

[I 2025-12-29 18:51:52,622] Trial 44 finished with value: 0.9986589862539785 and parameters: {'n_estimators': 496, 'max_depth': 7, 'learning_rate': 0.08922016625819638, 'subsample': 0.7121753192299412, 'colsample_bytree': 0.8464343413070025, 'reg_alpha': 0.3658979138425145, 'reg_lambda': 1.562225881841901, 'gamma': 0.0022623500682253522, 'min_child_weight': 3}. Best is trial 44 with value: 0.9986589862539785.


Best trial: 45. Best value: 0.998776:  92%|█████████▏| 46/50 [00:48<00:03,  1.15it/s]

[I 2025-12-29 18:51:53,421] Trial 45 finished with value: 0.9987762278250816 and parameters: {'n_estimators': 499, 'max_depth': 7, 'learning_rate': 0.09851042715028153, 'subsample': 0.7126245285451007, 'colsample_bytree': 0.8535866248078255, 'reg_alpha': 0.19039569723489863, 'reg_lambda': 1.5138450425173606, 'gamma': 0.0034759299175698005, 'min_child_weight': 3}. Best is trial 45 with value: 0.9987762278250816.


Best trial: 46. Best value: 0.998844:  94%|█████████▍| 47/50 [00:49<00:02,  1.05it/s]

[I 2025-12-29 18:51:54,548] Trial 46 finished with value: 0.9988442186858624 and parameters: {'n_estimators': 501, 'max_depth': 7, 'learning_rate': 0.0964238000232465, 'subsample': 0.741097401636354, 'colsample_bytree': 0.8493180767737526, 'reg_alpha': 0.1732833301606589, 'reg_lambda': 1.5584937454215104, 'gamma': 0.0019098296861894543, 'min_child_weight': 3}. Best is trial 46 with value: 0.9988442186858624.


Best trial: 46. Best value: 0.998844:  96%|█████████▌| 48/50 [00:50<00:01,  1.01it/s]

[I 2025-12-29 18:51:55,621] Trial 47 finished with value: 0.9988371404152782 and parameters: {'n_estimators': 497, 'max_depth': 7, 'learning_rate': 0.09954414746342434, 'subsample': 0.7362446561344272, 'colsample_bytree': 0.8806627225977456, 'reg_alpha': 0.47126647051577736, 'reg_lambda': 1.2868963477811963, 'gamma': 0.0008693622226914554, 'min_child_weight': 3}. Best is trial 46 with value: 0.9988442186858624.


Best trial: 46. Best value: 0.998844:  98%|█████████▊| 49/50 [00:51<00:00,  1.02it/s]

[I 2025-12-29 18:51:56,594] Trial 48 finished with value: 0.9987132800972969 and parameters: {'n_estimators': 499, 'max_depth': 7, 'learning_rate': 0.09798998212988072, 'subsample': 0.7397107458690192, 'colsample_bytree': 0.8798614945644001, 'reg_alpha': 0.4594983003697475, 'reg_lambda': 1.2857049645115397, 'gamma': 0.003909709817648889, 'min_child_weight': 3}. Best is trial 46 with value: 0.9988442186858624.


Best trial: 46. Best value: 0.998844: 100%|██████████| 50/50 [00:52<00:00,  1.05s/it]


[I 2025-12-29 18:51:57,399] Trial 49 finished with value: 0.9985076006479356 and parameters: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.09858662880498881, 'subsample': 0.8314028170502427, 'colsample_bytree': 0.8848935241054242, 'reg_alpha': 0.7194545661460865, 'reg_lambda': 1.2617242350349227, 'gamma': 0.002820659945525006, 'min_child_weight': 3}. Best is trial 46 with value: 0.9988442186858624.

✓ Best R² Score: 0.9988
✓ Best Parameters:
    n_estimators: 501
    max_depth: 7
    learning_rate: 0.0964238000232465
    subsample: 0.741097401636354
    colsample_bytree: 0.8493180767737526
    reg_alpha: 0.1732833301606589
    reg_lambda: 1.5584937454215104
    gamma: 0.0019098296861894543
    min_child_weight: 3

SECTION 6.1: TRAINING MODELS WITH OPTIMIZED HYPERPARAMETERS

[Training Optimized Cost Model]
--------------------------------------------------------------------------------
  Train R²: 0.9994
  Test R²:  0.9961
  RMSE:     ₹32.30
  MAE:      ₹15.76
  MAPE:     6.4

In [47]:
# ============================================================================
# SECTION 6.5: COMPREHENSIVE MODEL COMPARISON (ORIGINAL vs OPTUNA)
# ============================================================================
# PASTE THIS AFTER THE OPTIMIZED MODEL TRAINING SECTION
# This compares both approaches and recommends the best model
# ============================================================================

print("\n" + "="*100)
print("SECTION 6.5: MODEL COMPARISON - ORIGINAL vs OPTUNA OPTIMIZED")
print("="*100)

def compare_original_vs_optuna(X_cost_train, X_cost_test, y_cost_train, y_cost_test,
                                X_co2_train, X_co2_test, y_co2_train_log, y_co2_test_log,
                                y_co2_train, y_co2_test,
                                optimized_results):
    """
    Train original models and compare with already-trained optimized models
    """
    
    comparison = {}
    
    # ========================================================================
    # PART 1: TRAIN ORIGINAL MODELS
    # ========================================================================
    print("\n" + "="*100)
    print("TRAINING ORIGINAL MODELS (Manual Hyperparameters)")
    print("="*100)
    
    # Original Cost Model
    print("\n[Training Original Cost Model]")
    print("-" * 80)
    
    original_cost_params = {
        'n_estimators': 600,
        'max_depth': 8,
        'learning_rate': 0.03,
        'reg_alpha': 0.5,
        'reg_lambda': 1.5,
        'gamma': 0.2,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 5,
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }
    
    original_cost_model = XGBRegressor(**original_cost_params)
    original_cost_model.fit(X_cost_train, y_cost_train)
    
    orig_cost_train_pred = original_cost_model.predict(X_cost_train)
    orig_cost_test_pred = original_cost_model.predict(X_cost_test)
    
    # Metrics
    orig_cost_metrics = {
        'train_r2': r2_score(y_cost_train, orig_cost_train_pred),
        'test_r2': r2_score(y_cost_test, orig_cost_test_pred),
        'rmse': np.sqrt(mean_squared_error(y_cost_test, orig_cost_test_pred)),
        'mae': mean_absolute_error(y_cost_test, orig_cost_test_pred),
        'mape': np.mean(np.abs((y_cost_test - orig_cost_test_pred) / y_cost_test)) * 100
    }
    orig_cost_metrics['gap'] = orig_cost_metrics['train_r2'] - orig_cost_metrics['test_r2']
    
    # Cross-validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    orig_cv_cost = cross_val_score(original_cost_model, X_cost_train, y_cost_train, 
                                    cv=kfold, scoring='r2', n_jobs=-1)
    orig_cost_metrics['cv_mean'] = orig_cv_cost.mean()
    orig_cost_metrics['cv_std'] = orig_cv_cost.std()
    
    print(f"  Train R²:     {orig_cost_metrics['train_r2']:.4f}")
    print(f"  Test R²:      {orig_cost_metrics['test_r2']:.4f}")
    print(f"  Gap:          {orig_cost_metrics['gap']:.4f}")
    print(f"  CV Mean:      {orig_cost_metrics['cv_mean']:.4f} (±{orig_cost_metrics['cv_std']:.4f})")
    
    # Original CO2 Model
    print("\n[Training Original CO2 Model]")
    print("-" * 80)
    
    original_co2_params = {
        'n_estimators': 600,
        'max_depth': 10,
        'learning_rate': 0.05,
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'reg_alpha': 0.0,
        'reg_lambda': 1.5,
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }
    
    original_co2_model = XGBRegressor(**original_co2_params)
    original_co2_model.fit(X_co2_train, y_co2_train_log)
    
    orig_co2_train_pred_log = original_co2_model.predict(X_co2_train)
    orig_co2_test_pred_log = original_co2_model.predict(X_co2_test)
    
    orig_co2_train_pred = np.expm1(orig_co2_train_pred_log)
    orig_co2_test_pred = np.expm1(orig_co2_test_pred_log)
    
    # Metrics
    orig_co2_metrics = {
        'train_r2': r2_score(y_co2_train, orig_co2_train_pred),
        'test_r2': r2_score(y_co2_test, orig_co2_test_pred),
        'rmse': np.sqrt(mean_squared_error(y_co2_test, orig_co2_test_pred)),
        'mae': mean_absolute_error(y_co2_test, orig_co2_test_pred)
    }
    orig_co2_metrics['gap'] = orig_co2_metrics['train_r2'] - orig_co2_metrics['test_r2']
    
    # Cross-validation (on log scale)
    orig_cv_co2 = cross_val_score(original_co2_model, X_co2_train, y_co2_train_log,
                                   cv=kfold, scoring='r2', n_jobs=-1)
    orig_co2_metrics['cv_mean'] = orig_cv_co2.mean()
    orig_co2_metrics['cv_std'] = orig_cv_co2.std()
    
    print(f"  Train R²:     {orig_co2_metrics['train_r2']:.4f}")
    print(f"  Test R²:      {orig_co2_metrics['test_r2']:.4f}")
    print(f"  Gap:          {orig_co2_metrics['gap']:.4f}")
    print(f"  CV Mean:      {orig_co2_metrics['cv_mean']:.4f} (±{orig_co2_metrics['cv_std']:.4f})")
    
    # ========================================================================
    # PART 2: COMPARISON TABLES
    # ========================================================================
    print("\n\n" + "="*100)
    print("DETAILED MODEL COMPARISON")
    print("="*100)
    
    # ========== COST MODEL COMPARISON ==========
    print("\n[COST MODEL COMPARISON]")
    print("="*80)
    
    cost_comparison = pd.DataFrame({
        'Metric': [
            'Train R²',
            'Test R²',
            'Train-Test Gap',
            'RMSE (₹)',
            'MAE (₹)',
            'MAPE (%)',
            'CV Mean R²',
            'CV Std Dev'
        ],
        'Original (Manual)': [
            f"{orig_cost_metrics['train_r2']:.4f}",
            f"{orig_cost_metrics['test_r2']:.4f}",
            f"{orig_cost_metrics['gap']:.4f}",
            f"{orig_cost_metrics['rmse']:.2f}",
            f"{orig_cost_metrics['mae']:.2f}",
            f"{orig_cost_metrics['mape']:.2f}",
            f"{orig_cost_metrics['cv_mean']:.4f}",
            f"{orig_cost_metrics['cv_std']:.4f}"
        ],
        'Optuna (Optimized)': [
            f"{optimized_results['cost']['train_r2']:.4f}",
            f"{optimized_results['cost']['test_r2']:.4f}",
            f"{optimized_results['cost']['train_r2'] - optimized_results['cost']['test_r2']:.4f}",
            f"{optimized_results['cost']['rmse']:.2f}",
            f"{optimized_results['cost']['mae']:.2f}",
            f"{optimized_results['cost']['mape']:.2f}",
            "0.9946",  # From Optuna output
            "0.0081"   # From Optuna output
        ]
    })
    
    print("\n" + cost_comparison.to_string(index=False))
    
    # Cost Model Analysis
    print("\n[ANALYSIS - COST MODEL]")
    print("-" * 80)
    
    cost_score_orig = 0
    cost_score_opt = 0
    
    # Test R² comparison
    test_r2_diff = optimized_results['cost']['test_r2'] - orig_cost_metrics['test_r2']
    if abs(test_r2_diff) < 0.002:
        print(f"✓ Test R²: Essentially EQUAL (diff: {test_r2_diff:.4f})")
    elif test_r2_diff > 0:
        print(f"✓ Optuna BETTER on Test R² by {test_r2_diff:.4f}")
        cost_score_opt += 2
    else:
        print(f"✓ Original BETTER on Test R² by {abs(test_r2_diff):.4f}")
        cost_score_orig += 2
    
    # Overfitting check
    if orig_cost_metrics['train_r2'] >= 0.999:
        print(f"⚠ Original: Near-perfect train fit ({orig_cost_metrics['train_r2']:.4f}) - OVERFITTING RISK")
        cost_score_opt += 2
    else:
        print(f"✓ Original: Reasonable train fit ({orig_cost_metrics['train_r2']:.4f})")
        cost_score_orig += 1
    
    if optimized_results['cost']['train_r2'] < 0.999:
        print(f"✓ Optuna: Healthy train fit ({optimized_results['cost']['train_r2']:.4f}) - Better generalization")
        cost_score_opt += 2
    
    # CV stability
    cv_diff = orig_cost_metrics['cv_std'] - 0.0081  # Optuna CV std
    if cv_diff > 0:
        stability_improvement = (cv_diff / orig_cost_metrics['cv_std']) * 100
        print(f"✓ Optuna MORE STABLE: {stability_improvement:.1f}% improvement in CV stability")
        cost_score_opt += 1
    else:
        print(f"✓ Original MORE STABLE in CV")
        cost_score_orig += 1
    
    # Gap comparison
    optuna_gap = optimized_results['cost']['train_r2'] - optimized_results['cost']['test_r2']
    if optuna_gap < orig_cost_metrics['gap']:
        print(f"✓ Optuna: Smaller train-test gap ({optuna_gap:.4f} vs {orig_cost_metrics['gap']:.4f})")
        cost_score_opt += 1
    else:
        print(f"✓ Original: Smaller train-test gap")
        cost_score_orig += 1
    
    print(f"\n Score: Original {cost_score_orig} | Optuna {cost_score_opt}")
    
    if cost_score_opt > cost_score_orig:
        print("RECOMMENDATION: Use OPTUNA Cost Model")
        cost_winner = 'optuna'
    elif cost_score_opt == cost_score_orig:
        print("TIE: Both models perform similarly")
        # Tiebreaker: prefer Optuna if more stable
        if 0.0081 < orig_cost_metrics['cv_std']:
            print("   Tiebreaker: Optuna (better CV stability)")
            cost_winner = 'optuna'
        else:
            print("   Tiebreaker: Original (better CV stability)")
            cost_winner = 'original'
    else:
        print("RECOMMENDATION: Use ORIGINAL Cost Model")
        cost_winner = 'original'
    
    # ========== CO2 MODEL COMPARISON ==========
    print("\n\n[CO2 MODEL COMPARISON]")
    print("="*80)
    
    co2_comparison = pd.DataFrame({
        'Metric': [
            'Train R²',
            'Test R²',
            'Train-Test Gap',
            'RMSE',
            'MAE',
            'CV Mean R²',
            'CV Std Dev'
        ],
        'Original (Manual)': [
            f"{orig_co2_metrics['train_r2']:.4f}",
            f"{orig_co2_metrics['test_r2']:.4f}",
            f"{orig_co2_metrics['gap']:.4f}",
            f"{orig_co2_metrics['rmse']:.2f}",
            f"{orig_co2_metrics['mae']:.2f}",
            f"{orig_co2_metrics['cv_mean']:.4f}",
            f"{orig_co2_metrics['cv_std']:.4f}"
        ],
        'Optuna (Optimized)': [
            f"{optimized_results['co2']['train_r2']:.4f}",
            f"{optimized_results['co2']['test_r2']:.4f}",
            f"{optimized_results['co2']['train_r2'] - optimized_results['co2']['test_r2']:.4f}",
            f"{optimized_results['co2']['rmse']:.2f}",
            f"{optimized_results['co2']['mae']:.2f}",
            "0.9985",  # From Optuna output
            "0.0011"   # From Optuna output
        ]
    })
    
    print("\n" + co2_comparison.to_string(index=False))
    
    # CO2 Model Analysis
    print("\n[ANALYSIS - CO2 MODEL]")
    print("-" * 80)
    
    co2_score_orig = 0
    co2_score_opt = 0
    
    # Test R² comparison
    test_r2_diff_co2 = optimized_results['co2']['test_r2'] - orig_co2_metrics['test_r2']
    if test_r2_diff_co2 > 0.001:
        print(f"✓ Optuna BETTER on Test R² by {test_r2_diff_co2:.4f}")
        co2_score_opt += 2
    elif test_r2_diff_co2 < -0.001:
        print(f"✓ Original BETTER on Test R² by {abs(test_r2_diff_co2):.4f}")
        co2_score_orig += 2
    else:
        print(f"✓ Test R²: Essentially EQUAL (diff: {test_r2_diff_co2:.4f})")
    
    # CV comparison
    if 0.9985 > orig_co2_metrics['cv_mean']:
        improvement = ((0.9985 - orig_co2_metrics['cv_mean']) / orig_co2_metrics['cv_mean']) * 100
        print(f"✓ Optuna: Better CV Mean by {improvement:.2f}%")
        co2_score_opt += 2
    else:
        print(f"✓ Original: Better CV Mean")
        co2_score_orig += 2
    
    # CV stability
    if 0.0011 < orig_co2_metrics['cv_std']:
        stability_improvement = ((orig_co2_metrics['cv_std'] - 0.0011) / orig_co2_metrics['cv_std']) * 100
        print(f"✓ Optuna: {stability_improvement:.1f}% MORE STABLE in CV")
        co2_score_opt += 1
    else:
        print(f"✓ Original: More stable in CV")
        co2_score_orig += 1
    
    print(f"\n Score: Original {co2_score_orig} | Optuna {co2_score_opt}")
    
    if co2_score_opt > co2_score_orig:
        print("RECOMMENDATION: Use OPTUNA CO2 Model")
        co2_winner = 'optuna'
    else:
        print("RECOMMENDATION: Use ORIGINAL CO2 Model")
        co2_winner = 'original'
    
    # ========================================================================
    # PART 3: FINAL VERDICT
    # ========================================================================
    print("\n\n" + "="*100)
    print("FINAL VERDICT & MODEL SELECTION")
    print("="*100)
    
    print(f"\n COST MODEL:  Use {'OPTUNA' if cost_winner == 'optuna' else 'ORIGINAL'}")
    print(f"CO2 MODEL:   Use {'OPTUNA' if co2_winner == 'optuna' else 'ORIGINAL'}")
    
    # Set final models
    if cost_winner == 'optuna':
        final_cost_model = optimized_results['cost']['model']
        final_cost_pred = optimized_results['cost']['test_pred']
        print(f"\n  Cost Model: Using Optuna (Test R²={optimized_results['cost']['test_r2']:.4f})")
    else:
        final_cost_model = original_cost_model
        final_cost_pred = orig_cost_test_pred
        print(f"\n  Cost Model: Using Original (Test R²={orig_cost_metrics['test_r2']:.4f})")
    
    if co2_winner == 'optuna':
        final_co2_model = optimized_results['co2']['model']
        final_co2_pred = optimized_results['co2']['test_pred']
        print(f"  CO2 Model: Using Optuna (Test R²={optimized_results['co2']['test_r2']:.4f})")
    else:
        final_co2_model = original_co2_model
        final_co2_pred = orig_co2_test_pred
        print(f"  CO2 Model: Using Original (Test R²={orig_co2_metrics['test_r2']:.4f})")
    
    # Save final models
    joblib.dump(final_cost_model, MODEL_DIR / 'final_cost_model.pkl')
    joblib.dump(final_co2_model, MODEL_DIR / 'final_co2_model.pkl')
    
    print("\n✓ Saved: final_cost_model.pkl")
    print("✓ Saved: final_co2_model.pkl")
    
    # Comprehensive summary
    if cost_winner == 'optuna' and co2_winner == 'optuna':
        print("\n OVERALL RECOMMENDATION: Use OPTUNA for BOTH models")
        print("\n   JUSTIFICATION:")
        print("   • Superior cross-validation performance")
        print("   • Better generalization (less overfitting)")
        print("   • More stable predictions")
        print("   • Optimized specifically for this dataset")
    elif cost_winner == 'optuna' or co2_winner == 'optuna':
        print("\n MIXED APPROACH: Use best model for each target")
        print("   Both approaches have strengths for different targets")
    else:
        print("\n ORIGINAL MODELS PREFERRED")
        print("   Manual tuning achieved excellent results")
    
    print("\n" + "="*100)
    print("✓ COMPARISON COMPLETE")
    print("="*100)
    
    # Store comparison results
    comparison['cost'] = {
        'original_metrics': orig_cost_metrics,
        'optuna_metrics': {
            'train_r2': optimized_results['cost']['train_r2'],
            'test_r2': optimized_results['cost']['test_r2'],
            'rmse': optimized_results['cost']['rmse'],
            'mae': optimized_results['cost']['mae'],
            'cv_mean': 0.9946,
            'cv_std': 0.0081
        },
        'winner': cost_winner,
        'final_model': final_cost_model,
        'final_predictions': final_cost_pred
    }
    
    comparison['co2'] = {
        'original_metrics': orig_co2_metrics,
        'optuna_metrics': {
            'train_r2': optimized_results['co2']['train_r2'],
            'test_r2': optimized_results['co2']['test_r2'],
            'rmse': optimized_results['co2']['rmse'],
            'mae': optimized_results['co2']['mae'],
            'cv_mean': 0.9985,
            'cv_std': 0.0011
        },
        'winner': co2_winner,
        'final_model': final_co2_model,
        'final_predictions': final_co2_pred
    }
    
    return comparison

# ============================================================================
# EXECUTE COMPARISON
# ============================================================================

comparison_results = compare_original_vs_optuna(
    X_cost_train, X_cost_test, y_cost_train, y_cost_test,
    X_co2_train, X_co2_test, y_co2_train_log, y_co2_test_log,
    y_co2_train, y_co2_test,
    optimized_results
)

# Update variables for downstream use (SHAP, visualizations, recommendations)
best_cost_model = comparison_results['cost']['final_model']
best_co2_model = comparison_results['co2']['final_model']
y_cost_pred = comparison_results['cost']['final_predictions']
y_co2_pred = comparison_results['co2']['final_predictions']

print("\n✓ Final models set for downstream analysis")
print("  - SHAP analysis will use selected models")
print("  - Visualizations will use selected predictions")
print("  - Recommendations will use selected predictions")


SECTION 6.5: MODEL COMPARISON - ORIGINAL vs OPTUNA OPTIMIZED

TRAINING ORIGINAL MODELS (Manual Hyperparameters)

[Training Original Cost Model]
--------------------------------------------------------------------------------
  Train R²:     0.9994
  Test R²:      0.9955
  Gap:          0.0040
  CV Mean:      0.9941 (±0.0058)

[Training Original CO2 Model]
--------------------------------------------------------------------------------
  Train R²:     1.0000
  Test R²:      0.9955
  Gap:          0.0045
  CV Mean:      0.9989 (±0.0009)


DETAILED MODEL COMPARISON

[COST MODEL COMPARISON]

        Metric Original (Manual) Optuna (Optimized)
      Train R²            0.9994             0.9994
       Test R²            0.9955             0.9961
Train-Test Gap            0.0040             0.0033
      RMSE (₹)             34.63              32.30
       MAE (₹)             16.00              15.76
      MAPE (%)              6.56               6.46
    CV Mean R²            0.9941        

In [48]:
# ============================================================================
# ENHANCED VISUALIZATIONS
# ============================================================================

print("\n" + "="*100)
print("SECTION 7: ENHANCED MODEL VISUALIZATIONS")
print("="*100)

# CHART 1: Prediction Accuracy
fig1 = make_subplots(
    rows=2, cols=2,
    subplot_titles=[
        f'Cost Predictions (R²={optimized_results["cost"]["test_r2"]:.4f})',
        f'CO2 Predictions (R²={optimized_results["co2"]["test_r2"]:.4f})',
        'Cost Residuals Distribution',
        'CO2 Residuals Distribution'
    ],
    specs=[[{'type': 'scatter'}, {'type': 'scatter'}],
           [{'type': 'histogram'}, {'type': 'histogram'}]]
)

# Cost scatter
fig1.add_trace(go.Scatter(
    x=y_cost_test, y=y_cost_pred,
    mode='markers',
    marker=dict(color='blue', opacity=0.6, size=6),
    name='Predictions'
), row=1, col=1)

fig1.add_trace(go.Scatter(
    x=[y_cost_test.min(), y_cost_test.max()],
    y=[y_cost_test.min(), y_cost_test.max()],
    mode='lines',
    line=dict(color='red', dash='dash', width=2),
    name='Perfect Fit',
    showlegend=False
), row=1, col=1)

# CO2 scatter
fig1.add_trace(go.Scatter(
    x=y_co2_test, y=y_co2_pred,
    mode='markers',
    marker=dict(color='green', opacity=0.6, size=6),
    name='Predictions',
    showlegend=False
), row=1, col=2)

fig1.add_trace(go.Scatter(
    x=[y_co2_test.min(), y_co2_test.max()],
    y=[y_co2_test.min(), y_co2_test.max()],
    mode='lines',
    line=dict(color='red', dash='dash', width=2),
    showlegend=False
), row=1, col=2)

# Residuals
cost_residuals = y_cost_pred - y_cost_test
co2_residuals = y_co2_pred - y_co2_test

fig1.add_trace(go.Histogram(
    x=cost_residuals,
    marker_color='blue',
    opacity=0.7,
    name='Cost Residuals',
    showlegend=False
), row=2, col=1)

fig1.add_trace(go.Histogram(
    x=co2_residuals,
    marker_color='green',
    opacity=0.7,
    name='CO2 Residuals',
    showlegend=False
), row=2, col=2)

fig1.update_xaxes(title_text="Actual Cost (₹)", row=1, col=1)
fig1.update_yaxes(title_text="Predicted Cost (₹)", row=1, col=1)
fig1.update_xaxes(title_text="Actual CO2", row=1, col=2)
fig1.update_yaxes(title_text="Predicted CO2", row=1, col=2)
fig1.update_xaxes(title_text="Residuals", row=2, col=1)
fig1.update_xaxes(title_text="Residuals", row=2, col=2)

fig1.update_layout(height=800, title_text="Model Performance Analysis")
fig1.show()

# CHART 2: Error Analysis by Prediction Range
print("\n[Error Analysis by Prediction Range]")

cost_bins = pd.qcut(y_cost_pred, q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
error_analysis = pd.DataFrame({
    'Range': cost_bins,
    'Actual': y_cost_test,
    'Predicted': y_cost_pred,
    'Error': np.abs(y_cost_test - y_cost_pred)
})

fig2 = px.box(error_analysis, x='Range', y='Error',
              title='Cost Prediction Error Distribution by Price Range',
              labels={'Error': 'Absolute Error (₹)', 'Range': 'Predicted Cost Range'},
              color='Range')
fig2.show()


SECTION 7: ENHANCED MODEL VISUALIZATIONS



[Error Analysis by Prediction Range]


In [49]:
# ============================================================================
# SHAP ANALYSIS - OPTIMIZED MODELS
# ============================================================================

print("\n" + "="*100)
print("SECTION 8: SHAP FEATURE IMPORTANCE ANALYSIS")
print("="*100)

def comprehensive_shap_analysis(model, X_test, feature_names, model_name, sample_size=200):
    """
    Complete SHAP analysis with multiple visualizations
    """
    print(f"\n[SHAP Analysis: {model_name}]")
    print("-" * 80)
    
    # Sample data
    np.random.seed(42)
    sample_idx = np.random.choice(len(X_test), min(sample_size, len(X_test)), replace=False)
    X_sample = X_test[sample_idx]
    
    # Calculate SHAP values
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_sample)
    
    # Feature importance
    feature_importance = np.abs(shap_values).mean(axis=0)
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importance
    }).sort_values('Importance', ascending=False)
    
    print(f"\nTop 10 Most Important Features:")
    print(importance_df.head(10).to_string(index=False))
    
    # VISUALIZATION 1: Bar Plot
    fig1 = go.Figure()
    top_n = 15
    top_features = importance_df.head(top_n)
    
    fig1.add_trace(go.Bar(
        y=top_features['Feature'][::-1],
        x=top_features['Importance'][::-1],
        orientation='h',
        marker=dict(
            color=top_features['Importance'][::-1],
            colorscale='Viridis',
            showscale=True
        )
    ))
    
    fig1.update_layout(
        title=f"{model_name} - Top {top_n} Features (SHAP Importance)",
        xaxis_title="Mean |SHAP Value|",
        yaxis_title="Feature",
        height=600,
        showlegend=False
    )
    fig1.show()
    
    # VISUALIZATION 2: Beeswarm Plot (Summary)
    shap_data = []
    for i, feat in enumerate(feature_names[:20]):
        for j in range(len(X_sample)):
            shap_data.append({
                'Feature': feat,
                'SHAP Value': shap_values[j, i],
                'Feature Value': X_sample[j, i]
            })
    
    shap_df = pd.DataFrame(shap_data)
    
    fig2 = px.scatter(
        shap_df,
        y='Feature',
        x='SHAP Value',
        color='Feature Value',
        color_continuous_scale='RdYlBu_r',
        title=f'{model_name} - SHAP Summary Plot',
        opacity=0.6,
        height=700
    )
    fig2.update_traces(marker=dict(size=5))
    fig2.show()
    
    return importance_df

# Run SHAP analysis
cost_shap = comprehensive_shap_analysis(
    best_cost_model, X_cost_test, cost_features, 
    "Cost Model", sample_size=200
)

co2_shap = comprehensive_shap_analysis(
    best_co2_model, X_co2_test, co2_features,
    "CO2 Model", sample_size=200
)


SECTION 8: SHAP FEATURE IMPORTANCE ANALYSIS

[SHAP Analysis: Cost Model]
--------------------------------------------------------------------------------

Top 10 Most Important Features:
                Feature  Importance
             weight_log  172.800018
   capacity_weight_prod   88.299416
         weight_squared   24.382204
        material_weight   23.845213
  capacity_weight_ratio   14.114363
  recyclability_percent    8.867025
parent_material_encoded    7.550686
        packaging_ratio    1.467634
       strength_encoded    1.394034
       material_encoded    1.387979



[SHAP Analysis: CO2 Model]
--------------------------------------------------------------------------------

Top 10 Most Important Features:
                Feature  Importance
        material_weight    0.498737
   capacity_weight_prod    0.422804
        weight_capacity    0.414574
        weight_measured    0.116948
      parent_mat_weight    0.058760
       material_encoded    0.040290
     material_weight_sq    0.024057
       strength_encoded    0.013750
         weight_squared    0.008063
parent_material_encoded    0.005133


In [50]:
# ============================================================================
# ENHANCED TEST SAMPLE RECOMMENDATION SYSTEM
# Provides detailed recommendations for individual test samples
# ============================================================================

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px


print("\n" + "="*100)
print("PREPARING TEST DATAFRAME FOR RECOMMENDATION SYSTEM")
print("="*100)

# Get the size of your test set
test_size = len(y_cost_test)


from sklearn.model_selection import train_test_split

# Filter df_eng to only valid rows (same as your original preparation)
df_valid = df_eng[
    df_eng['Packaging_Cost'].notna() & 
    df_eng['CO2_Impact_Index'].notna()
].copy()

print(f"Valid data: {len(df_valid)} rows")

# Recreate the split with SAME parameters to get matching indices
y_cost_full = df_valid['Packaging_Cost'].values
y_bins = pd.qcut(y_cost_full, q=10, labels=False, duplicates='drop')

# Split with SAME random_state to get same split
_, df_test, _, _ = train_test_split(
    df_valid, y_cost_full, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_bins
)

print(f"✓ Test dataframe created: {len(df_test)} rows × {len(df_test.columns)} columns")
print(f"✓ Matches test set size: {len(df_test) == len(y_cost_test)}")

# Verify key columns exist
key_cols = ['material', 'shape', 'strength', 'recycling', 'recyclability_percent']
available = [col for col in key_cols if col in df_test.columns]
missing = [col for col in key_cols if col not in df_test.columns]

print(f"✓ Available columns: {available}")
if missing:
    print(f"⚠ Missing columns: {missing}")

def recommend_for_test_sample(
    sample_idx,
    df_test,
    df_eng,
    y_cost_pred,
    y_co2_pred,
    best_cost_model,
    best_co2_model,
    cost_features,
    co2_features,
    top_n=10
):
    """
    Generate detailed recommendations for a specific test sample
    
    Parameters:
    -----------
    sample_idx : int
        Index of the test sample (0 to len(df_test)-1)
    df_test : DataFrame
        Test dataset
    df_eng : DataFrame
        Full engineered dataset with all materials
    y_cost_pred : array
        Cost predictions for test set
    y_co2_pred : array
        CO2 predictions for test set
    best_cost_model : model
        Trained cost prediction model
    best_co2_model : model
        Trained CO2 prediction model
    cost_features : list
        Features for cost model
    co2_features : list
        Features for CO2 model
    top_n : int
        Number of alternative recommendations
    
    Returns:
    --------
    DataFrame with alternative recommendations
    """
    
    print("="*100)
    print(f"DETAILED RECOMMENDATION FOR TEST SAMPLE #{sample_idx}")
    print("="*100)
    
    # Get the specific sample
    sample = df_test.iloc[sample_idx]
    
    # ========================================================================
    # STEP 1: ANALYZE CURRENT SAMPLE
    # ========================================================================
    
    print("\n[CURRENT PRODUCT PROFILE]")
    print("-" * 80)
    
    # Display key attributes
    print(f"\nProduct Characteristics:")
    if 'food_group' in sample.index:
        print(f"  Food Category: {sample['food_group']}")
    if 'product_quantity' in sample.index:
        print(f"  Product Quantity: {sample['product_quantity']:.1f}g")
    if 'weight_measured' in sample.index:
        print(f"  Package Weight: {sample['weight_measured']:.1f}g")
    if 'weight_capacity' in sample.index:
        print(f"  Weight Capacity: {sample['weight_capacity']:.1f}g")
    
    print(f"\nCurrent Packaging:")
    if 'material' in sample.index:
        print(f"  Material: {sample['material']}")
    if 'shape' in sample.index:
        print(f"  Shape: {sample['shape']}")
    if 'strength' in sample.index:
        print(f"  Strength: {sample['strength']}")
    if 'recycling' in sample.index:
        print(f"  Recycling Type: {sample['recycling']}")
    if 'recyclability_percent' in sample.index:
        print(f"  Recyclability: {sample['recyclability_percent']:.1f}%")
    
    print(f"\nPredicted Performance:")
    print(f"  Predicted Cost: ₹{y_cost_pred[sample_idx]:.2f}")
    print(f"  Predicted CO2 Impact: {y_co2_pred[sample_idx]:.2f}")
    
    if 'Packaging_Cost' in sample.index:
        print(f"  Actual Cost: ₹{sample['Packaging_Cost']:.2f}")
        cost_error = abs(y_cost_pred[sample_idx] - sample['Packaging_Cost'])
        print(f"  Cost Prediction Error: ₹{cost_error:.2f}")
    
    if 'CO2_Impact_Index' in sample.index:
        print(f"  Actual CO2: {sample['CO2_Impact_Index']:.2f}")
        co2_error = abs(y_co2_pred[sample_idx] - sample['CO2_Impact_Index'])
        print(f"  CO2 Prediction Error: {co2_error:.2f}")
    
    # ========================================================================
    # STEP 2: FIND SIMILAR PRODUCTS WITH BETTER PACKAGING
    # ========================================================================
    
    print("\n[FINDING ALTERNATIVE PACKAGING OPTIONS]")
    print("-" * 80)
    
    # Filter materials based on product requirements
    df_alternatives = df_eng.copy()
    
    # Constraint 1: Similar weight capacity (±20%)
    if 'weight_capacity' in sample.index and 'weight_capacity' in df_alternatives.columns:
        min_cap = sample['weight_capacity'] * 0.8
        max_cap = sample['weight_capacity'] * 1.3
        df_alternatives = df_alternatives[
            (df_alternatives['weight_capacity'] >= min_cap) &
            (df_alternatives['weight_capacity'] <= max_cap)
        ]
        print(f"  ✓ Filtered by weight capacity ({min_cap:.0f}g - {max_cap:.0f}g): {len(df_alternatives)} options")
    
    # Constraint 2: Same or better strength
    if 'strength' in sample.index and 'strength' in df_alternatives.columns:
        strength_hierarchy = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
        if sample['strength'] in strength_hierarchy:
            min_strength = strength_hierarchy[sample['strength']]
            df_alternatives['strength_num'] = df_alternatives['strength'].map(strength_hierarchy).fillna(1)
            df_alternatives = df_alternatives[df_alternatives['strength_num'] >= min_strength]
            print(f"  ✓ Filtered by strength (≥{sample['strength']}): {len(df_alternatives)} options")
    
    # Constraint 3: Same or better recyclability
    if 'recyclability_percent' in sample.index and 'recyclability_percent' in df_alternatives.columns:
        min_recycle = max(sample['recyclability_percent'] - 10, 0)
        df_alternatives = df_alternatives[df_alternatives['recyclability_percent'] >= min_recycle]
        print(f"  ✓ Filtered by recyclability (≥{min_recycle:.0f}%): {len(df_alternatives)} options")
    
    # Exclude current packaging
    if 'material' in sample.index and 'shape' in sample.index:
        df_alternatives = df_alternatives[
            ~((df_alternatives['material'] == sample['material']) & 
              (df_alternatives['shape'] == sample['shape']))
        ]
        print(f"  ✓ Excluded current packaging: {len(df_alternatives)} alternatives")
    
    if len(df_alternatives) == 0:
        print("\n⚠ No alternative materials found matching constraints")
        return None
    
    # ========================================================================
    # STEP 3: PREDICT COST & CO2 FOR ALTERNATIVES
    # ========================================================================
    
    print("\n[EVALUATING ALTERNATIVES]")
    print("-" * 80)
    
    # Ensure all required features exist
    missing_cost = [f for f in cost_features if f not in df_alternatives.columns]
    missing_co2 = [f for f in co2_features if f not in df_alternatives.columns]
    
    if missing_cost or missing_co2:
        print(f"⚠ Missing features - Cost: {missing_cost}, CO2: {missing_co2}")
        return None
    
    # Predict Cost
    X_cost_alt = df_alternatives[cost_features].values
    df_alternatives['Predicted_Cost'] = best_cost_model.predict(X_cost_alt)
    
    # Predict CO2 (with log transform)
    X_co2_alt = df_alternatives[co2_features].values
    y_co2_log_alt = best_co2_model.predict(X_co2_alt)
    df_alternatives['Predicted_CO2'] = np.expm1(y_co2_log_alt)
    
    print(f"  ✓ Generated predictions for {len(df_alternatives)} alternatives")
    
    # ========================================================================
    # STEP 4: CALCULATE IMPROVEMENT METRICS
    # ========================================================================
    
    current_cost = y_cost_pred[sample_idx]
    current_co2 = y_co2_pred[sample_idx]
    
    df_alternatives['Cost_Savings'] = current_cost - df_alternatives['Predicted_Cost']
    df_alternatives['Cost_Savings_Pct'] = (df_alternatives['Cost_Savings'] / current_cost * 100).round(2)
    
    df_alternatives['CO2_Reduction'] = current_co2 - df_alternatives['Predicted_CO2']
    df_alternatives['CO2_Reduction_Pct'] = (df_alternatives['CO2_Reduction'] / current_co2 * 100).round(2)
    
    # Composite improvement score
    cost_weight = 0.4
    co2_weight = 0.35
    recycle_weight = 0.25
    
    # Normalize improvements
    cost_norm = (df_alternatives['Cost_Savings_Pct'] - df_alternatives['Cost_Savings_Pct'].min()) / \
                (df_alternatives['Cost_Savings_Pct'].max() - df_alternatives['Cost_Savings_Pct'].min() + 0.01)
    
    co2_norm = (df_alternatives['CO2_Reduction_Pct'] - df_alternatives['CO2_Reduction_Pct'].min()) / \
               (df_alternatives['CO2_Reduction_Pct'].max() - df_alternatives['CO2_Reduction_Pct'].min() + 0.01)
    
    if 'recyclability_percent' in df_alternatives.columns:
        recycle_norm = df_alternatives['recyclability_percent'] / 100
    else:
        recycle_norm = 0.5
    
    df_alternatives['Improvement_Score'] = (
        cost_norm * cost_weight * 100 +
        co2_norm * co2_weight * 100 +
        recycle_norm * recycle_weight * 100
    ).round(2)
    
    # ========================================================================
    # STEP 5: RANK & CLASSIFY ALTERNATIVES
    # ========================================================================
    
    df_alternatives = df_alternatives.sort_values('Improvement_Score', ascending=False)
    
    def classify_alternative(row):
        if row['Cost_Savings_Pct'] > 15 and row['CO2_Reduction_Pct'] > 15:
            return '⭐ Excellent Upgrade'
        elif row['Cost_Savings_Pct'] > 10 or row['CO2_Reduction_Pct'] > 10:
            return '✓✓ Strong Upgrade'
        elif row['Cost_Savings_Pct'] > 5 or row['CO2_Reduction_Pct'] > 5:
            return '✓ Moderate Upgrade'
        elif row['Cost_Savings_Pct'] > 0 or row['CO2_Reduction_Pct'] > 0:
            return '○ Slight Upgrade'
        else:
            return '△ Similar Performance'
    
    df_alternatives['Classification'] = df_alternatives.apply(classify_alternative, axis=1)
    
    # ========================================================================
    # STEP 6: DISPLAY TOP RECOMMENDATIONS
    # ========================================================================
    
    top_alternatives = df_alternatives.head(top_n)
    
    print(f"\n{'='*100}")
    print(f"TOP {top_n} ALTERNATIVE PACKAGING OPTIONS")
    print(f"{'='*100}\n")
    
    for i, (idx, row) in enumerate(top_alternatives.iterrows(), 1):
        print(f"#{i}. {row['Classification']}")
        print(f"   Material: {row['material']} | Shape: {row['shape']} | Strength: {row['strength']}")
        
        print(f"\n   Financial Impact:")
        print(f"     Current Cost:     ₹{current_cost:.2f}")
        print(f"     Alternative Cost: ₹{row['Predicted_Cost']:.2f}")
        if row['Cost_Savings'] > 0:
            print(f"     💰 SAVES:         ₹{row['Cost_Savings']:.2f} ({row['Cost_Savings_Pct']:.1f}%)")
        else:
            print(f"     ⚠ INCREASES:      ₹{abs(row['Cost_Savings']):.2f} ({abs(row['Cost_Savings_Pct']):.1f}%)")
        
        print(f"\n   Environmental Impact:")
        print(f"     Current CO2:      {current_co2:.2f}")
        print(f"     Alternative CO2:  {row['Predicted_CO2']:.2f}")
        if row['CO2_Reduction'] > 0:
            print(f"     🌱 REDUCES:       {row['CO2_Reduction']:.2f} ({row['CO2_Reduction_Pct']:.1f}%)")
        else:
            print(f"     ⚠ INCREASES:      {abs(row['CO2_Reduction']):.2f} ({abs(row['CO2_Reduction_Pct']):.1f}%)")
        
        if 'recyclability_percent' in row.index:
            current_recycle = sample.get('recyclability_percent', 0)
            recycle_change = row['recyclability_percent'] - current_recycle
            print(f"\n   Recyclability:")
            print(f"     Current:  {current_recycle:.1f}%")
            print(f"     Alternative: {row['recyclability_percent']:.1f}%")
            if recycle_change > 0:
                print(f"     ♻️ IMPROVES:  +{recycle_change:.1f}%")
            elif recycle_change < 0:
                print(f"     ⚠ DECREASES: {recycle_change:.1f}%")
        
        print(f"\n   Overall Improvement Score: {row['Improvement_Score']:.1f}/100")
        print()
    

    # ========================================================================
    # STEP 8: SUMMARY STATISTICS
    # ========================================================================
    
    print(f"\n{'='*100}")
    print("RECOMMENDATION SUMMARY")
    print(f"{'='*100}")
    
    best = top_alternatives.iloc[0]
    
    print(f"\n🏆 BEST ALTERNATIVE:")
    print(f"   Material: {best['material']}")
    print(f"   Shape: {best['shape']}")
    print(f"   Classification: {best['Classification']}")
    print(f"   Improvement Score: {best['Improvement_Score']:.1f}/100")
    
    if best['Cost_Savings'] > 0 and best['CO2_Reduction'] > 0:
        print(f"\n   ✅ WIN-WIN SOLUTION:")
        print(f"      • {best['Cost_Savings_Pct']:.1f}% cost reduction")
        print(f"      • {best['CO2_Reduction_Pct']:.1f}% CO2 reduction")
    elif best['Cost_Savings'] > 0:
        print(f"\n   💰 COST-OPTIMIZED:")
        print(f"      • {best['Cost_Savings_Pct']:.1f}% cost reduction")
        print(f"      • {abs(best['CO2_Reduction_Pct']):.1f}% CO2 increase")
    elif best['CO2_Reduction'] > 0:
        print(f"\n   🌱 ECO-OPTIMIZED:")
        print(f"      • {abs(best['Cost_Savings_Pct']):.1f}% cost increase")
        print(f"      • {best['CO2_Reduction_Pct']:.1f}% CO2 reduction")
    
    print(f"\n 📊 ALTERNATIVES ANALYZED:")
    print(f"    Total Options: {len(df_alternatives)}")
    print(f"    Cost Savings: {(df_alternatives['Cost_Savings'] > 0).sum()} options")
    print(f"    CO2 Reduction: {(df_alternatives['CO2_Reduction'] > 0).sum()} options")
    print(f"    Win-Win: {((df_alternatives['Cost_Savings'] > 0) & (df_alternatives['CO2_Reduction'] > 0)).sum()} options")
    
    print(f"\n {'='*100}")
    
    return top_alternatives



# Example 1: Single sample recommendation
print("\n" + "="*100)
print("EXAMPLE 1: SINGLE SAMPLE ANALYSIS")
print("="*100)

single_rec = recommend_for_test_sample(
    sample_idx=5,
    df_test=df_test,
    df_eng=df_eng,
    y_cost_pred=y_cost_pred,
    y_co2_pred=y_co2_pred,
    best_cost_model=best_cost_model,
    best_co2_model=best_co2_model,
    cost_features=cost_features,
    co2_features=co2_features,
    top_n=10
)



PREPARING TEST DATAFRAME FOR RECOMMENDATION SYSTEM
Valid data: 5041 rows
✓ Test dataframe created: 1009 rows × 45 columns
✓ Matches test set size: True
✓ Available columns: ['material', 'shape', 'strength', 'recycling', 'recyclability_percent']

EXAMPLE 1: SINGLE SAMPLE ANALYSIS
DETAILED RECOMMENDATION FOR TEST SAMPLE #5

[CURRENT PRODUCT PROFILE]
--------------------------------------------------------------------------------

Product Characteristics:
  Food Category: biscuits-and-cakes
  Product Quantity: 300.0g
  Package Weight: 2.6g
  Weight Capacity: 30.0g

Current Packaging:
  Material: plastic
  Shape: film
  Strength: Medium
  Recycling Type: Recyclable
  Recyclability: 20.0%

Predicted Performance:
  Predicted Cost: ₹149.01
  Predicted CO2 Impact: 5.61
  Actual Cost: ₹190.23
  Cost Prediction Error: ₹41.22
  Actual CO2: 5.20
  CO2 Prediction Error: 0.41

[FINDING ALTERNATIVE PACKAGING OPTIONS]
--------------------------------------------------------------------------------
  