In [1]:
import pandas as pd
import numpy as np

In [7]:
# Display configuration: show every column of wide frames and allow 120-character-wide output.
pd.options.display.max_columns = None
pd.options.display.width = 120


In [11]:
# Load the raw dataset from the project's raw-data folder (path is relative to the notebook).
raw_df = pd.read_csv(filepath_or_buffer="../data/raw/ecopackai_raw_dataset.csv")

# Report (rows, columns) and preview the first five records.
print("Raw dataset shape:", raw_df.shape)
raw_df.head(5)


Raw dataset shape: (2600, 10)


Unnamed: 0,Material Name,Product Category,Strength Rating,Weight Capacity (kg),Unit Cost ($),Biodegradability Score,CO2 Emission (kg),Recyclability (%),Fragility Level,Shipping Mode
0,paper,electronics,34.0,,0.12,0.88,0.85,,3.0,Air
1,jute,home,44.0,8.5,0.28,0.87,1.04,69.9,1.0,Ground
2,bagasse,electronics,39.0,5.3,0.24,0.94,0.85,85.7,3.0,Air
3,plastic,electronics,72.0,11.4,0.23,0.11,9.41,34.8,3.0,Air
4,glass,electronics,70.0,12.6,0.54,0.02,20.04,87.7,3.0,Air


In [12]:
# Print each column's dtype and non-null count for the raw frame, to see where values are missing.
raw_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600 entries, 0 to 2599
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Material Name           2600 non-null   object 
 1   Product Category        2600 non-null   object 
 2   Strength Rating         2548 non-null   float64
 3   Weight Capacity (kg)    2496 non-null   float64
 4   Unit Cost ($)           2522 non-null   float64
 5   Biodegradability Score  2548 non-null   float64
 6   CO2 Emission (kg)       2470 non-null   float64
 7   Recyclability (%)       2444 non-null   float64
 8   Fragility Level         2574 non-null   float64
 9   Shipping Mode           2561 non-null   object 
dtypes: float64(7), object(3)
memory usage: 203.2+ KB


In [13]:
# Number of missing (NaN) cells per column in the raw data.
raw_df.isna().sum()


Material Name               0
Product Category            0
Strength Rating            52
Weight Capacity (kg)      104
Unit Cost ($)              78
Biodegradability Score     52
CO2 Emission (kg)         130
Recyclability (%)         156
Fragility Level            26
Shipping Mode              39
dtype: int64

In [14]:
def clean_columns(df):
    """Return a copy of ``df`` whose column labels are normalised to snake_case.

    Each label is stripped of surrounding whitespace and lower-cased; spaces
    and hyphens become underscores, "%" becomes "percentage", and the
    characters "(", ")" and "$" are removed.

    All replacements are performed as *literal* text (``regex=False``).
    Without that flag the result depends on the pandas version's default:
    when patterns are treated as regular expressions, "(" is an invalid
    pattern and "$" matches end-of-string instead of the dollar sign.

    Note: underscores are not collapsed or trimmed, so a label such as
    "Unit Cost ($)" becomes "unit_cost_" (with a trailing underscore).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the normalised column labels.
    """
    df = df.copy()

    # (old, new) pairs applied in order, each as a plain substring replacement.
    literal_replacements = (
        (" ", "_"),
        ("-", "_"),
        ("%", "percentage"),
        ("(", ""),
        (")", ""),
        ("$", ""),
    )

    labels = df.columns.str.strip().str.lower()
    for old, new in literal_replacements:
        labels = labels.str.replace(old, new, regex=False)

    df.columns = labels
    return df

# Work on a copy with normalised column labels; raw_df itself stays untouched.
df = raw_df.pipe(clean_columns)
df.columns


Index(['material_name', 'product_category', 'strength_rating', 'weight_capacity_kg', 'unit_cost_',
       'biodegradability_score', 'co2_emission_kg', 'recyclability_percentage', 'fragility_level', 'shipping_mode'],
      dtype='object')

In [15]:
# Mapping from (expected) cleaned column labels to the project's preferred names.
column_renames = {
    "material_name_": "material_type",
    "productcategory": "product_category",
    "strength_value": "strength",
    "wtcapacity": "weight_capacity",
    "unit_cost": "cost_per_unit",
    "bio_score": "biodegradability_score",
    "co2emission": "co2_emission_score",
    "recycle_percentage": "recyclability_percentage",
    "fragilelevel": "fragility_level",
    "ship_type": "shipping_type"
}

# DataFrame.rename silently skips keys that are not existing column labels, so a
# stale mapping turns into an unnoticed no-op. Report such keys explicitly.
# NOTE(review): the mapping itself is left as-is because the intended final
# column names (and what downstream consumers expect) cannot be confirmed here.
unmatched_keys = [old for old in column_renames if old not in df.columns]
if unmatched_keys:
    print(
        f"rename: {len(unmatched_keys)} of {len(column_renames)} mapping keys "
        f"match no column and were ignored: {unmatched_keys}"
    )

df = df.rename(columns=column_renames)

df.head()


Unnamed: 0,material_name,product_category,strength_rating,weight_capacity_kg,unit_cost_,biodegradability_score,co2_emission_kg,recyclability_percentage,fragility_level,shipping_mode
0,paper,electronics,34.0,,0.12,0.88,0.85,,3.0,Air
1,jute,home,44.0,8.5,0.28,0.87,1.04,69.9,1.0,Ground
2,bagasse,electronics,39.0,5.3,0.24,0.94,0.85,85.7,3.0,Air
3,plastic,electronics,72.0,11.4,0.23,0.11,9.41,34.8,3.0,Air
4,glass,electronics,70.0,12.6,0.54,0.02,20.04,87.7,3.0,Air


In [16]:
# Drop fully identical rows and report how many were removed.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]

print(f"Removed {before - after} duplicate rows")


Removed 0 duplicate rows


In [17]:
# Normalise text columns: trim surrounding whitespace and lower-case every value.
# Missing values must stay missing: a plain astype(str) would turn NaN into the
# literal string "nan", which fillna() no longer recognises as missing and which
# would then be exported as a bogus category.
for col in df.select_dtypes(include="object").columns:
    normalised = df[col].astype(str).str.strip().str.lower()
    # where() keeps the normalised text for present values and NaN elsewhere.
    df[col] = normalised.where(df[col].notna())


In [18]:

# Numeric columns: replace missing values with the column median.
num_cols = df.select_dtypes(include=np.number).columns

for col in num_cols:
    column_median = df[col].median()
    df[col] = df[col].fillna(column_median)


# Non-numeric columns: replace missing values with the most frequent value
# (the first entry returned by mode()).
cat_cols = df.select_dtypes(exclude=np.number).columns

for col in cat_cols:
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)


In [19]:
# Print each column's dtype and non-null count for the cleaned frame, to compare against the raw summary.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600 entries, 0 to 2599
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   material_name             2600 non-null   object 
 1   product_category          2600 non-null   object 
 2   strength_rating           2600 non-null   float64
 3   weight_capacity_kg        2600 non-null   float64
 4   unit_cost_                2600 non-null   float64
 5   biodegradability_score    2600 non-null   float64
 6   co2_emission_kg           2600 non-null   float64
 7   recyclability_percentage  2600 non-null   float64
 8   fragility_level           2600 non-null   float64
 9   shipping_mode             2600 non-null   object 
dtypes: float64(7), object(3)
memory usage: 203.2+ KB


In [20]:
# Number of missing (NaN) cells per column after imputation.
# NOTE(review): only real NaN values are counted; placeholder text such as the
# string "nan" is not reported as missing here.
df.isna().sum()


material_name               0
product_category            0
strength_rating             0
weight_capacity_kg          0
unit_cost_                  0
biodegradability_score      0
co2_emission_kg             0
recyclability_percentage    0
fragility_level             0
shipping_mode               0
dtype: int64

In [21]:
# Report the (rows, columns) of the cleaned frame and preview its first five records.
print("Final cleaned shape:", df.shape)
df.head(5)


Final cleaned shape: (2600, 10)


Unnamed: 0,material_name,product_category,strength_rating,weight_capacity_kg,unit_cost_,biodegradability_score,co2_emission_kg,recyclability_percentage,fragility_level,shipping_mode
0,paper,electronics,34.0,8.4,0.12,0.88,0.85,78.7,3.0,air
1,jute,home,44.0,8.5,0.28,0.87,1.04,69.9,1.0,ground
2,bagasse,electronics,39.0,5.3,0.24,0.94,0.85,85.7,3.0,air
3,plastic,electronics,72.0,11.4,0.23,0.11,9.41,34.8,3.0,air
4,glass,electronics,70.0,12.6,0.54,0.02,20.04,87.7,3.0,air


In [22]:
# Persist the cleaned frame for later stages; the row index is not written to the file.
df.to_csv(path_or_buf="../data/processed/ecopackai_clean_base.csv", index=False)
