In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the cleaned base dataset; the path is relative to the notebook's
# working directory.
clean_base_path = "../data/processed/ecopackai_clean_base.csv"
df = pd.read_csv(clean_base_path)


In [3]:
# Report the (rows, columns) of the freshly loaded frame, then display its
# first rows as the cell's rich output.
loaded_shape = df.shape
print("Loaded shape:", loaded_shape)
df.head()



Loaded shape: (2600, 10)


Unnamed: 0,material_name,product_category,strength_rating,weight_capacity_kg,unit_cost_,biodegradability_score,co2_emission_kg,recyclability_percentage,fragility_level,shipping_mode
0,paper,electronics,34.0,8.4,0.12,0.88,0.85,78.7,3.0,air
1,jute,home,44.0,8.5,0.28,0.87,1.04,69.9,1.0,ground
2,bagasse,electronics,39.0,5.3,0.24,0.94,0.85,85.7,3.0,air
3,plastic,electronics,72.0,11.4,0.23,0.11,9.41,34.8,3.0,air
4,glass,electronics,70.0,12.6,0.54,0.02,20.04,87.7,3.0,air


In [4]:
# Normalise the column headers: trim surrounding whitespace, lower-case,
# turn spaces into underscores, drop parentheses and "$", and spell "%" out
# as "percentage".
#
# regex=False is passed explicitly on every replace. "(", ")" and "$" are
# regular-expression metacharacters, and the default value of `regex` has
# changed across pandas releases; stating it makes every pattern a plain
# literal regardless of the installed version.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("%", "percentage", regex=False)
    .str.replace("$", "", regex=False)
)


In [5]:
# Map the normalised source headers onto the column names used by the rest of
# the notebook. Entries whose key and value are the same leave that column's
# name unchanged; they are kept so the mapping lists every column.
column_renames = {
    "material_name": "material_type",
    "product_category": "product_category",
    "strength_rating": "strength",
    "weight_capacity_kg": "weight_capacity",
    "unit_cost_": "cost_per_unit",
    "biodegradability_score": "biodegradability_score",
    "co2_emission_kg": "co2_emission_score",
    "recyclability_percentage": "recyclability_percentage",
    "fragility_level": "fragility_level",
    "shipping_mode": "shipping_type",
}
df = df.rename(columns=column_renames)


In [6]:
# Columns that must be present after renaming.
expected_cols = [
    "material_type", "product_category",
    "strength", "weight_capacity", "cost_per_unit",
    "biodegradability_score", "co2_emission_score",
    "recyclability_percentage", "fragility_level",
    "shipping_type",
]

# Fail fast, naming the absent columns, if any expected column is missing.
missing_cols = set(expected_cols).difference(df.columns)
assert not missing_cols, f"Missing columns: {missing_cols}"


In [7]:
# Canonicalise the three text columns: lower-case, then strip surrounding
# whitespace, so the membership checks on them are case/space-insensitive.
for text_col in ("material_type", "product_category", "shipping_type"):
    df[text_col] = df[text_col].str.lower().str.strip()


In [8]:
# Accepted values for the two categorical columns filtered below.
valid_materials = [
    "paper", "plastic", "glass",
    "bamboo", "jute", "bagasse", "metal"
]

valid_categories = [
    "electronics", "food",
    "cosmetics", "pharmaceuticals", "home"
]

# Keep only rows whose material AND category are both in the accepted lists;
# all other rows are dropped.
# .copy() makes the filtered result an independent frame, so later column
# assignments on `df` do not operate on a slice of the previous frame (which
# can raise SettingWithCopyWarning).
has_valid_values = (
    df["material_type"].isin(valid_materials)
    & df["product_category"].isin(valid_categories)
)
df = df[has_valid_values].copy()


In [9]:
# Columns that are expected to hold numbers.
numeric_cols = [
    "strength",
    "weight_capacity",
    "cost_per_unit",
    "biodegradability_score",
    "co2_emission_score",
    "recyclability_percentage",
    "fragility_level",
]

# Convert each column to a numeric dtype; errors="coerce" turns any value
# that cannot be parsed into NaN instead of raising.
for num_col in numeric_cols:
    df[num_col] = pd.to_numeric(df[num_col], errors="coerce")


In [10]:
# Fill missing numeric values with the median of their own column.
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)


In [11]:

# Keep only rows whose values lie inside the accepted (inclusive) ranges:
# fragility_level in [1, 3], recyclability_percentage in [0, 100] and
# biodegradability_score in [0, 1]. Rows failing any check are dropped.
in_range = (
    df["fragility_level"].between(1, 3)
    & df["recyclability_percentage"].between(0, 100)
    & df["biodegradability_score"].between(0, 1)
)

# .copy() makes the filtered result an independent frame, so later
# assignments into `df` do not operate on a slice of the previous frame
# (which can raise SettingWithCopyWarning).
df = df[in_range].copy()


In [13]:
# Rows with no shipping type get one inferred from fragility:
# fragility_level == 3 -> "air", fragility_level < 3 -> "ground".
shipping_missing = df["shipping_type"].isna()
most_fragile = df["fragility_level"] == 3
less_fragile = df["fragility_level"] < 3

df.loc[shipping_missing & most_fragile, "shipping_type"] = "air"
df.loc[shipping_missing & less_fragile, "shipping_type"] = "ground"

# Every row must have a shipping type by now.
assert not df["shipping_type"].isnull().any()


In [14]:
# Report the (rows, columns) of the processed frame, then display summary
# statistics for every column (numeric and non-numeric) as the cell output.
final_shape = df.shape
print("Final processed shape:", final_shape)
df.describe(include="all")


Final processed shape: (2600, 10)


Unnamed: 0,material_type,product_category,strength,weight_capacity,cost_per_unit,biodegradability_score,co2_emission_score,recyclability_percentage,fragility_level,shipping_type
count,2600,2600,2600.0,2600.0,2600.0,2600.0,2600.0,2600.0,2600.0,2600
unique,7,5,,,,,,,,2
top,plastic,home,,,,,,,,ground
freq,568,545,,,,,,,,2086
mean,,,55.962692,9.051962,0.294381,0.5753,6.231327,73.031077,1.802308,
std,,,17.213207,2.944545,0.130476,0.409362,6.945922,18.985437,0.742899,
min,,,30.0,4.2,0.12,0.0,0.6,25.2,1.0,
25%,,,43.0,6.8,0.2,0.11,1.08,66.0,1.0,
50%,,,52.0,8.4,0.26,0.84,1.48,78.7,2.0,
75%,,,68.0,10.9,0.35,0.93,12.47,87.125,2.0,


In [15]:
# Write the processed frame to CSV, omitting the index so it does not become
# an extra column in the file.
output_path = "../data/processed/ecopackai_final_base.csv"
df.to_csv(path_or_buf=output_path, index=False)

# Confirm where the file was written.
print("Saved:", output_path)


Saved: ../data/processed/ecopackai_final_base.csv
