In [None]:
# ==========================================
# 1. IMPORT LIBRARIES
# ==========================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

# ==========================================
# 2. LOAD DATASET (ORIGINAL, NOT ENCODED)
# ==========================================

# Read the raw packaging data; the path is resolved relative to the current
# working directory.
DATA_FILE = "electronics_packaging_data.csv"
df = pd.read_csv(filepath_or_buffer=DATA_FILE)

# Report (rows, columns) as loaded, before any feature engineering.
raw_shape = df.shape
print("Dataset shape:", raw_shape)

# ==========================================
# 3. FEATURE ENGINEERING
# ==========================================

# --- Split the "Dimensions_cm" string into three numeric columns ---
# The value is split on "x" after lower-casing, so an upper-case "X"
# separator parses as well. Malformed numbers still raise in astype(float)
# rather than being silently coerced.
dims = (
    df["Dimensions_cm"]
    .astype(str)
    .str.lower()
    .str.split("x", expand=True)
    # Guarantee that parts 0-2 exist even when no row has three parts, so a
    # short value yields NaN instead of a KeyError on the missing column.
    .reindex(columns=range(3))
    .astype(float)
)

dimension_columns = ("Length_cm", "Width_cm", "Height_cm")
for part_index, column_name in enumerate(dimension_columns):
    df[column_name] = dims[part_index]

# --- Drop the now-redundant raw string and the non-informative ID columns ---
# A single non-inplace drop replaces the two inplace calls.
df = df.drop(columns=["Dimensions_cm", "ProductID", "ProductName"])

# ==========================================
# 4. ONE-HOT ENCODING (CORRECT WAY)
# ==========================================

# Indicator-encode the categorical columns of df. drop_first=True leaves out
# the first level of each encoded column.
dummy_options = {"drop_first": True}
df_encoded = pd.get_dummies(df, **dummy_options)

# Shape after encoding, for comparison with the raw shape printed earlier.
encoded_shape = df_encoded.shape
print("Encoded dataset shape:", encoded_shape)

# ==========================================
# 5. FEATURE & TARGET SELECTION
# ==========================================

# Both regression targets are columns of the encoded frame; every other
# column is a predictor shared by the two tasks.
target_names = ["CostPerPackage", "CO2_Emission_Score"]
y_cost, y_co2 = (df_encoded[target] for target in target_names)
X = df_encoded.drop(columns=target_names)

# ==========================================
# 6. TRAIN–TEST SPLIT (80% / 20%)
# ==========================================

# Split the features and BOTH targets in one call so the cost and CO2 labels
# are guaranteed to share the row partition of X_train / X_test, rather than
# relying on two separate calls that happen to use the same seed.
(
    X_train,
    X_test,
    y_cost_train,
    y_cost_test,
    y_co2_train,
    y_co2_test,
) = train_test_split(X, y_cost, y_co2, test_size=0.2, random_state=42)

# ==========================================
# 7. RANDOM FOREST MODELS
# ==========================================

# Both targets use the same forest configuration, so declare it once.
rf_settings = {"n_estimators": 300, "max_depth": 12, "random_state": 42}

# One forest per target; fit() returns the estimator itself, so construction
# and training are chained.
rf_cost = RandomForestRegressor(**rf_settings).fit(X_train, y_cost_train)
rf_co2 = RandomForestRegressor(**rf_settings).fit(X_train, y_co2_train)

# Held-out predictions, scored further down.
rf_cost_pred, rf_co2_pred = (
    forest.predict(X_test) for forest in (rf_cost, rf_co2)
)

# ==========================================
# 8. XGBOOST MODELS (BEST ON COST; RANDOM FOREST SLIGHTLY BETTER ON CO2)
# ==========================================

# Shared boosting configuration for the two targets.
xgb_settings = dict(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0,
)

xgb_cost = XGBRegressor(**xgb_settings)
xgb_co2 = XGBRegressor(**xgb_settings)

# Train each regressor on its own target using the common feature matrix.
for booster, train_target in ((xgb_cost, y_cost_train), (xgb_co2, y_co2_train)):
    booster.fit(X_train, train_target)

# Held-out predictions, scored further down.
xgb_cost_pred, xgb_co2_pred = (
    booster.predict(X_test) for booster in (xgb_cost, xgb_co2)
)

# ==========================================
# 9. EVALUATION FUNCTION (NO ERRORS)
# ==========================================

def evaluate_model(name, y_test, y_pred):
    """Print and return the R2 score and RMSE of one model's predictions.

    Args:
        name: Label shown at the start of the printed line.
        y_test: True target values.
        y_pred: Predicted values, in the same order as ``y_test``.

    Returns:
        A ``(r2, rmse)`` tuple, so callers can reuse the metrics instead of
        recomputing them. Callers that only want the printout may ignore it.
    """
    r2 = r2_score(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{name} -> R2 Score: {r2:.4f}, RMSE: {rmse:.4f}")
    return r2, rmse

# Score both model families on each target, one titled section per target.
evaluation_sections = [
    (
        "\n====== COST PER PACKAGE PREDICTION ======",
        y_cost_test,
        (("Random Forest", rf_cost_pred), ("XGBoost", xgb_cost_pred)),
    ),
    (
        "\n====== CO2 EMISSION PREDICTION ======",
        y_co2_test,
        (("Random Forest", rf_co2_pred), ("XGBoost", xgb_co2_pred)),
    ),
]

for section_title, y_true, labelled_predictions in evaluation_sections:
    print(section_title)
    for model_label, predictions in labelled_predictions:
        evaluate_model(model_label, y_true, predictions)


# ==========================================
# 10. FINAL COMPARISON TABLE
# ==========================================


# One row per model. The "*_Accuracy_%" columns hold the R2 score scaled by
# 100, and the "*_RMSE" columns hold the square root of the mean squared error.
model_predictions = [
    ("Random Forest", rf_cost_pred, rf_co2_pred),
    ("XGBoost", xgb_cost_pred, xgb_co2_pred),
]

comparison_rows = [
    {
        "Model": model_label,
        "Cost_Accuracy_%": r2_score(y_cost_test, cost_pred) * 100,
        "Cost_RMSE": np.sqrt(mean_squared_error(y_cost_test, cost_pred)),
        "CO2_Accuracy_%": r2_score(y_co2_test, co2_pred) * 100,
        "CO2_RMSE": np.sqrt(mean_squared_error(y_co2_test, co2_pred)),
    }
    for model_label, cost_pred, co2_pred in model_predictions
]

# Key order of the row dicts fixes the column order of the table.
results = pd.DataFrame(comparison_rows)

print("\nFINAL MODEL COMPARISON:")
print(results)



Dataset shape: (4000, 16)
Encoded dataset shape: (4000, 23)

Random Forest -> R2 Score: 0.9972, RMSE: 2.7028
XGBoost -> R2 Score: 0.9979, RMSE: 2.2996

Random Forest -> R2 Score: 0.9924, RMSE: 2.0100
XGBoost -> R2 Score: 0.9917, RMSE: 2.0993

FINAL MODEL COMPARISON:
           Model  Cost_Accuracy_%  Cost_RMSE  CO2_Accuracy_%  CO2_RMSE
0  Random Forest        99.715980   2.702778       99.243049  2.010010
1        XGBoost        99.794396   2.299595       99.174320  2.099279
