In [9]:
import pandas as pd

# Load dataset
df = pd.read_csv("../Data/cleaned_dataset_final.csv")

# Preview structure
print(f"Shape: {df.shape}")
print(df.dtypes)
df.head()

Shape: (159533, 18)
brand               object
model               object
cnit                object
power_hp           float64
power_admin          int64
gearbox             object
cons_urban         float64
cons_extra         float64
cons_mixed         float64
co2                float64
fuel                object
hybrid              object
mass_min             int64
mass_max             int64
Year                 int64
avg_mass           float64
power_to_weight    float64
log_co2            float64
dtype: object


Unnamed: 0,brand,model,cnit,power_hp,power_admin,gearbox,cons_urban,cons_extra,cons_mixed,co2,fuel,hybrid,mass_min,mass_max,Year,avg_mass,power_to_weight,log_co2
0,Alfa Romeo,GIULIETTA,M10ALFVP0000324,85.0,7,M 6,8.4,5.3,6.4,149.0,ES,NON,1355,1355,2012,1355.0,0.062731,5.010635
1,Alfa Romeo,MITO,M10ALFVP0000360,62.0,4,M 6,4.9,3.8,4.2,98.0,ES,NON,1205,1205,2012,1205.0,0.051452,4.59512
2,Alfa Romeo,GIULIETTA,M10ALFVP0001325,85.0,7,M 6,8.4,5.3,6.4,149.0,ES,NON,1355,1355,2012,1355.0,0.062731,5.010635
3,Alfa Romeo,MITO,M10ALFVP0002035,77.0,6,M 6,7.6,4.8,5.8,136.0,ES,NON,1185,1185,2012,1185.0,0.064979,4.919981
4,Alfa Romeo,GIULIETTA,M10ALFVP0002326,120.0,9,M 6,7.8,4.6,5.8,134.0,ES,NON,1365,1365,2012,1365.0,0.087912,4.905275


In [10]:
# List of columns to keep

df['hybrid'] = df['hybrid'].map({'OUI': 1, 'NON': 0})

selected_features = ["power_to_weight","hybrid","brand","fuel","gearbox"]

# Prepare final X and y
X = df[selected_features]
y = df["co2"]


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [12]:
from sklearn.preprocessing import LabelEncoder, StandardScaler


from sklearn.preprocessing import OrdinalEncoder

# Apply to all 3 categorical features at once
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train[['brand', 'gearbox', 'fuel']] = ordinal_encoder.fit_transform(X_train[['brand', 'gearbox', 'fuel']])
X_test[['brand', 'gearbox', 'fuel']] = ordinal_encoder.transform(X_test[['brand', 'gearbox', 'fuel']])


In [13]:
# Numerical columns to scale
numerical_cols = ['power_to_weight']

scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR


from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Define models
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "ExtraTrees": ExtraTreesRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "HistGradientBoosting": HistGradientBoostingRegressor(),
    "XGBoost": XGBRegressor(objective="reg:squarederror", verbosity=0),
    "KNeighbors": KNeighborsRegressor(),
    "SVR": SVR()
}

results = []

# Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    rmse = mean_squared_error(y_test, preds, squared=False)
    
    results.append({
        "Model": name,
        "R²": r2,
        "RMSE": rmse
    })

results_df = pd.DataFrame(results).sort_values(by="R²", ascending=False)
display(results_df)




In [8]:
from sklearn.model_selection import cross_val_score

top_models = {
    "ExtraTrees": ExtraTreesRegressor(),
    "RandomForest": RandomForestRegressor(),
    "DecisionTree": DecisionTreeRegressor(),
    "XGBoost": XGBRegressor()
}

cv_results = []

for name, model in top_models.items():
    r2_scores = cross_val_score(model, X, y, scoring='r2', cv=5)
    rmse_scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=5)

    cv_results.append({
        "Model": name,
        "CV R² Mean": r2_scores.mean(),
        "CV R² Std": r2_scores.std(),
        "CV RMSE Mean": -rmse_scores.mean(),
        "CV RMSE Std": rmse_scores.std()
    })

pd.DataFrame(cv_results).sort_values(by="CV R² Mean", ascending=False)


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\qr02245\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\qr02245\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\qr02245\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\ensemble\_forest.py", line 363, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\qr02245\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\qr02245\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "C:\Users\qr02245\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\qr02245\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\utils\_array_api.py", line 745, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\qr02245\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'NON'
