In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv("carbonEmission.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'carbonEmission.csv'

In [None]:
df.columns = df.columns.str.replace("_", " ")


In [None]:
# Converting list like strings into actual lists (if present)
def parse_list(x):
    if isinstance(x, str) and x.startswith("["):
        x = x.strip("[]").replace("'", "").split(",")
        return [i.strip() for i in x if i.strip()]
    return []

In [None]:
df.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743


In [None]:
df["Recycling"] = df["Recycling"].apply(parse_list)
df["Cooking With"] = df["Cooking With"].apply(parse_list)

In [None]:
# Expand Recycling into binary flags
unique_recycle_items = set(item for sublist in df["Recycling"] for item in sublist)
for item in unique_recycle_items:
    df[f"Recycle_{item}"] = df["Recycling"].apply(lambda x: 1 if item in x else 0)

In [None]:
# Expand Cooking_With into binary flags
unique_cooking_items = set(item for sublist in df["Cooking With"] for item in sublist)
for item in unique_cooking_items:
    df[f"Cook_{item}"] = df["Cooking With"].apply(lambda x: 1 if item in x else 0)

In [None]:
# Drop the original list columns
df.drop(["Recycling", "Cooking With"], axis=1, inplace=True)

In [None]:
# Separate target variable
y = df["CarbonEmission"]
X = df.drop("CarbonEmission", axis=1)

In [None]:
# Identify categorical & numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_cat = encoder.fit_transform(X[categorical_cols])

In [None]:

# Ensure it's 2D
if X_cat.ndim == 1:
    X_cat = X_cat.reshape(-1, 1)

# Convert numerical to 2D array
X_num = X[numerical_cols].to_numpy()

# Stack both arrays
X_final = np.hstack([X_cat, X_num])


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)


In [None]:
# === Train Model ===
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [None]:
with open("carbon_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("encoder.pkl", "wb") as f:
    pickle.dump({"encoder": encoder, "categorical_cols": categorical_cols, "numerical_cols": numerical_cols}, f)

print("✅ Model & encoder saved successfully!")

✅ Model & encoder saved successfully!


In [None]:
import numpy as np
import pandas as pd

print(type(X))  # Check if it's DataFrame, ndarray, etc.
print(X.dtypes if isinstance(X, pd.DataFrame) else X.dtype)


<class 'pandas.core.frame.DataFrame'>
Body Type                        object
Sex                              object
Diet                             object
How Often Shower                 object
Heating Energy Source            object
Transport                        object
Vehicle Type                     object
Social Activity                  object
Monthly Grocery Bill              int64
Frequency of Traveling by Air    object
Vehicle Monthly Distance Km       int64
Waste Bag Size                   object
Waste Bag Weekly Count            int64
How Long TV PC Daily Hour         int64
How Many New Clothes Monthly      int64
How Long Internet Daily Hour      int64
Energy efficiency                object
Recycle_Plastic                   int64
Recycle_Glass                     int64
Recycle_Paper                     int64
Recycle_Metal                     int64
Cook_Oven                         int64
Cook_Airfryer                     int64
Cook_Grill                        int64
Co

In [None]:
import pickle
import pandas as pd
import numpy as np

# Load model
with open("carbon_model.pkl", "rb") as f:
    model = pickle.load(f)

# Load encoder and column info
with open("encoder.pkl", "rb") as f:
    encoder_data = pickle.load(f)

encoder = encoder_data["encoder"]
categorical_cols = encoder_data["categorical_cols"]
numerical_cols = encoder_data["numerical_cols"]

# Your input row
input_df = pd.DataFrame([{
    "Body Type": "overweight",
    "Sex": "female",
    "Diet": "pescatarian",
    "How Often Shower": "daily",
    "Heating Energy Source": "coal",
    "Transport": "public",
    "Vehicle Type": np.nan,
    "Social Activity": "often",
    "Monthly Grocery Bill": 230,
    "Frequency of Traveling by Air": "frequently",
    "Vehicle Monthly Distance Km": 210,
    "Waste Bag Size": "large",
    "Waste Bag Weekly Count": 4,
    "How Long TV PC Daily Hour": 7,
    "How Many New Clothes Monthly": 1,
    "How Long Internet Daily Hour": 26,
    "Energy efficiency": "No",
    "Recycle_Plastic": 0,
    "Recycle_Glass": 0,
    "Recycle_Paper": 0,
    "Recycle_Metal": 1,
    "Cook_Oven": 1,
    "Cook_Airfryer": 0,
    "Cook_Grill": 0,
    "Cook_Microwave": 0,
    "Cook_Stove": 1
}])

# Fill NaN for missing categorical values
input_df[categorical_cols] = input_df[categorical_cols].fillna("missing")

# Encode categorical
X_cat = encoder.transform(input_df[categorical_cols])

# Combine with numeric features
X_final = np.hstack([X_cat, input_df[numerical_cols].values])

# Predict
prediction = model.predict(X_final)
print("🔮 Predicted Carbon Footprint:", prediction[0])


🔮 Predicted Carbon Footprint: 1971.365
