# Load the finalized dataset

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the processed base table from its relative path; the later cells operate on `df`.
df = pd.read_csv(filepath_or_buffer="../data/processed/ecopackai_final_base.csv")

# Report the (rows, columns) shape, then preview the first five rows.
print("Base dataset shape:", df.shape)
df.head(n=5)


Base dataset shape: (2600, 10)


Unnamed: 0,material_type,product_category,strength,weight_capacity,cost_per_unit,biodegradability_score,co2_emission_score,recyclability_percentage,fragility_level,shipping_type
0,paper,electronics,34.0,8.4,0.12,0.88,0.85,78.7,3.0,air
1,jute,home,44.0,8.5,0.28,0.87,1.04,69.9,1.0,ground
2,bagasse,electronics,39.0,5.3,0.24,0.94,0.85,85.7,3.0,air
3,plastic,electronics,72.0,11.4,0.23,0.11,9.41,34.8,3.0,air
4,glass,electronics,70.0,12.6,0.54,0.02,20.04,87.7,3.0,air


In [4]:
# Inspect column dtypes and non-null counts before deriving any features.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600 entries, 0 to 2599
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   material_type             2600 non-null   object 
 1   product_category          2600 non-null   object 
 2   strength                  2600 non-null   float64
 3   weight_capacity           2600 non-null   float64
 4   cost_per_unit             2600 non-null   float64
 5   biodegradability_score    2600 non-null   float64
 6   co2_emission_score        2600 non-null   float64
 7   recyclability_percentage  2600 non-null   float64
 8   fragility_level           2600 non-null   float64
 9   shipping_type             2600 non-null   object 
dtypes: float64(7), object(3)
memory usage: 203.2+ KB


# Feature Engineering

CO₂ Impact Index

In [5]:
def min_max(series: pd.Series) -> pd.Series:
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    series : pd.Series
        Numeric values to rescale.

    Returns
    -------
    pd.Series
        A new Series with the same index and name as ``series``. When every
        value is identical (max == min) the result is all 0.0, which avoids
        dividing by a zero range.
    """
    # Compute the bounds once; they are reused by the guard and the rescale.
    lo, hi = series.min(), series.max()
    if hi == lo:
        # Keep the input's name so both branches return consistently labelled Series.
        return pd.Series(0.0, index=series.index, name=series.name)
    return (series - lo) / (hi - lo)


In [6]:
# Rescale the raw emission score with min_max: 0 marks the lowest
# co2_emission_score in the dataset, 1 the highest.
df["co2_impact_index"] = df["co2_emission_score"].pipe(min_max)


Cost Efficiency Index

In [7]:
# Ratio of (strength + weight_capacity) to cost_per_unit, then min_max-rescaled.
# The 1e-6 added to the denominator keeps the division defined when cost_per_unit is 0.
# NOTE(review): strength and weight_capacity are summed on their raw scales — confirm that is intended.
df["cost_efficiency_index"] = (
    df["strength"]
    .add(df["weight_capacity"])
    .div(df["cost_per_unit"].add(1e-6))
    .pipe(min_max)
)


Material Suitability Score

In [8]:
# Weighted blend of the material properties. Positive weights reward strength,
# weight capacity, biodegradability and recyclability; the CO2 impact index and
# the rescaled cost are subtracted as penalties.
# Terms are accumulated in a fixed left-to-right order.
# NOTE(review): biodegradability_score is used without rescaling — presumably it is
# already on a 0-1 scale; confirm against the data source.
df["material_suitability_score"] = (
    min_max(df["strength"]).mul(0.30)
    .add(min_max(df["weight_capacity"]).mul(0.20))
    .add(df["biodegradability_score"].mul(0.20))
    .add(min_max(df["recyclability_percentage"]).mul(0.15))
    .sub(df["co2_impact_index"].mul(0.10))
    .sub(min_max(df["cost_per_unit"]).mul(0.05))
)


# Validate Data Quality

In [9]:
# Names of the three derived feature columns, reused by the validation cells.
engineered_cols = ["co2_impact_index", "cost_efficiency_index", "material_suitability_score"]

# Summary statistics (count, mean, std, quartiles, min/max) for the derived features.
df.loc[:, engineered_cols].describe()


Unnamed: 0,co2_impact_index,cost_efficiency_index,material_suitability_score
count,2600.0,2600.0,2600.0
mean,0.231837,0.31715,0.356056
std,0.285958,0.174945,0.090998
min,0.0,0.0,0.052856
25%,0.019761,0.182235,0.316311
50%,0.036229,0.287058,0.368374
75%,0.488678,0.418108,0.419854
max,1.0,1.0,0.670962


In [10]:
# Fail fast instead of relying on a visual check of the displayed values.
assert df[engineered_cols].notna().all().all(), "engineered features contain missing values"

# co2_impact_index and cost_efficiency_index are min_max outputs, so they must lie in [0, 1].
# material_suitability_score is a signed weighted sum that is never rescaled,
# so it gets no range check here.
for col in ("co2_impact_index", "cost_efficiency_index"):
    assert df[col].between(0.0, 1.0).all(), f"{col} falls outside [0, 1]"

# Display the per-column minimum and maximum of the engineered features.
df[engineered_cols].min(), df[engineered_cols].max()


(co2_impact_index              0.000000
 cost_efficiency_index         0.000000
 material_suitability_score    0.052856
 dtype: float64,
 co2_impact_index              1.000000
 cost_efficiency_index         1.000000
 material_suitability_score    0.670962
 dtype: float64)

In [11]:
# Output schema: the ten loaded columns followed by the three engineered features.
final_cols = [
    # categorical descriptors
    "material_type", "product_category",
    # raw numeric measurements
    "strength", "weight_capacity", "cost_per_unit",
    "biodegradability_score", "co2_emission_score", "recyclability_percentage",
    # handling / logistics
    "fragility_level", "shipping_type",
    # engineered features
    "co2_impact_index", "cost_efficiency_index", "material_suitability_score",
]

# Select the columns in the order listed, then preview the first five rows.
final_df = df.loc[:, final_cols]
final_df.head(n=5)


Unnamed: 0,material_type,product_category,strength,weight_capacity,cost_per_unit,biodegradability_score,co2_emission_score,recyclability_percentage,fragility_level,shipping_type,co2_impact_index,cost_efficiency_index,material_suitability_score
0,paper,electronics,34.0,8.4,0.12,0.88,0.85,78.7,3.0,air,0.010292,0.590077,0.359352
1,jute,home,44.0,8.5,0.28,0.87,1.04,69.9,1.0,ground,0.018114,0.190203,0.371503
2,bagasse,electronics,39.0,5.3,0.24,0.94,0.85,85.7,3.0,air,0.010292,0.18317,0.356799
3,plastic,electronics,72.0,11.4,0.23,0.11,9.41,34.8,3.0,air,0.362701,0.612446,0.277328
4,glass,electronics,70.0,12.6,0.54,0.02,20.04,87.7,3.0,air,0.800329,0.106924,0.309289


In [16]:
# Write the feature-engineered table to CSV; index=False leaves the row index out of the file.
output_path = "../data/processed/ecopackai_feature_engineered.csv"
final_df.to_csv(path_or_buf=output_path, index=False)

# Echo the destination so the run log records where the file was written.
print("Saved:", output_path)


Saved: ../data/processed/ecopackai_feature_engineered.csv
