# Main Notebook - CO2Emissions 


**Importing Libraries**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from sklearn.impute import SimpleImputer

**Loading the data**

In [2]:
# Load the raw dataset from the CSV file under ./data.
df = pd.read_csv("./data/data.csv")

In [3]:
# (rows, columns) of the raw frame.
df.shape

(526170, 40)

**Wrapping the cleaning process in a function**

In [4]:
def cleaner(data):
    """Select and normalise the columns used for modelling.

    Column labels are lower-cased and their spaces replaced by underscores;
    the same normalisation is applied to the string values of the selected
    categorical features. The input frame is never modified: all work is done
    on a copy of the selected columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. After label normalisation it must contain the target
        column ``'ewltp_(g/km)'``.

    Returns
    -------
    select_data : pandas.DataFrame
        Independent copy holding the available feature columns followed by
        the target column.
    numerical_columns : list of str
        Feature columns that are not object-dtype, in ``select_data`` order.
    categorical_columns : list of str
        Object-dtype feature columns, in the order they appear in ``data``.
    target : str
        Name of the target column.

    Raises
    ------
    KeyError
        If the target column is not present after label normalisation.
    """
    target = 'ewltp_(g/km)'

    # Columns that are excluded up front, even when they are also listed as
    # candidate features below.
    excluded = {'mms', 'ernedc_(g/km)', 'electric_range_(km)', 'vf',
                'at2_(mm)', 'at1_(mm)', 'w_(mm)', 'rlfi', 'de', 'z_(wh/km)',
                'it', 'erwltp_(g/km)', 'enedc_(g/km)'}

    candidate_features = ['mk', 'm_(kg)', 'mt', 'w_(mm)', 'at1_(mm)', 'at2_(mm)',
                          'ft', 'fm', 'ec_(cm3)', 'ep_(kw)', 'fuel_consumption_', 'ech']

    # Compute the normalised labels without assigning them back onto `data`,
    # so the caller's frame keeps its original column names.
    labels = data.columns.str.lower().str.replace(' ', '_', regex=False)

    if target not in labels:
        raise KeyError(
            f"target column {target!r} not found after normalising column names"
        )

    features = [f for f in candidate_features
                if f not in excluded and f in labels]

    # Copy only the columns that are needed; everything below works on this
    # copy and leaves `data` untouched.
    keep = labels.isin(features + [target])
    select_data = data.loc[:, keep].copy()
    select_data.columns = labels[keep]

    # Object-dtype features, listed in the order they appear in `data`.
    object_labels = list(select_data.dtypes[select_data.dtypes == 'object'].index)
    categorical_columns = [c for c in object_labels if c in features]

    for c in categorical_columns:
        select_data[c] = select_data[c].str.lower().str.replace(' ', '_', regex=False)

    # Features first, target last; the explicit copy keeps the result
    # independent of the intermediate frame.
    select_data = select_data[features + [target]].copy()

    numerical_columns = [f for f in features if f not in categorical_columns]

    return select_data, numerical_columns, categorical_columns, target

In [5]:
# cleaner() returns the modelling frame, the numeric and categorical feature
# name lists, and the name of the target column.
select_df, numerical_cols, categorical_cols, target = cleaner(df)

select_df.head()

Unnamed: 0,mk,m_(kg),mt,ft,fm,ec_(cm3),ep_(kw),fuel_consumption_,ech,ewltp_(g/km)
0,lexus,1388,1426.0,petrol,h,1490.0,67.0,4.5,euro_6_ea,100
1,toyota,1539,1586.0,petrol,m,1798.0,72.0,3.6,euro_6_ea,102
2,volkswagen,1403,1539.0,petrol,h,1498.0,85.0,5.4,euro_6_ea,123
3,renault,1537,1616.0,petrol,h,1598.0,69.0,7.0,euro_6_ea,107
4,renault,1328,1334.0,petrol,m,999.0,67.0,5.5,euro_6_ap,126


**Checking the missing data**

In [6]:
# Missing-value count per numeric feature, largest first.
select_df[numerical_cols].isnull().sum().sort_values(ascending=False)

mt                   751
ec_(cm3)              27
ep_(kw)               20
m_(kg)                 0
fuel_consumption_      0
dtype: int64

In [7]:
# Missing-value count per categorical feature, largest first.
select_df[categorical_cols].isnull().sum().sort_values(ascending=False)

ech    11
mk      0
ft      0
fm      0
dtype: int64

**Checking for misspellings**

In [8]:
# List the distinct `mk` values so misspellings and variant labels can be spotted.
select_df['mk'].unique()

array(['lexus', 'toyota', 'volkswagen', 'renault', 'volvo', 'opel',
       'audi', 'skoda', 'kia', 'bmw', 'ford', 'mazda', 'mercedes-benz',
       'suzuki', 'hyundai', 'dacia', 'jeep', 'mg', 'seat',
       'allied_vehicles_ltd', 'citroen', 'porsche', 'omoda', 'ssangyong',
       'fiat', 'nissan', 'honda', 'cupra', 'peugeot', 'subaru',
       'land_rover', 'alfa_romeo', 'toyota/carpol', 'mitsubishi',
       'renault/carpol', 'mini', 'ds', 'toyota/steeler', 'jaguar', 'byd',
       'opel/carpol', 'jaecoo', 'mercedes-amg', 'peugeot/carpol',
       'volkswageb', 'genesis', 'ford/fc_auto_system', 'citroen/carpol',
       'maserati', 'allied_vehicles_ltd.', 'volkswagen/v-van',
       'cms_auto/mercedes-benz', 'ford_transit/frank-cars', 'lamborghini',
       'lotus', 'ssangyong_kg_mobility', 'bentley', 'mercedes/v-van',
       'ford/germaz', 'mclaren', 'volkswagen_amz_kutno',
       'ford/frank-cars', 'ford_transit/its_system', 'volkswagen/carpol',
       'nisssan', 'alpine', 'fiat/carpol', 'v

In [9]:
# Variant / misspelled `mk` labels grouped under the canonical label they
# should be replaced with.
canonical_to_variants = {
    'toyota': ['toyota/carpol', 'toyota/steeler'],
    'renault': ['renault/carpol', 'renault_/_multitel'],
    'opel': ['opel/carpol', 'opek'],
    'mercedes-benz': [
        'mercedes-amg', 'cms_auto/mercedes-benz', 'mercedes/v-van',
        'mercedes-benz/mrc', 'mercede-benz', 'mercedes',
        'mercedes-benz/cms-auto',
    ],
    'peugeot': ['peugeot/carpol'],
    'volkswagen': [
        'volkswageb', 'volkswagen/v-van', 'volkswagen_amz_kutno',
        'volkswagen/carpol', 'volkswagen/zimny', 'volkswage._vw',
        'volkswagen/mobilcar', 'volkswagen/mrc',
    ],
    'ford': [
        'ford/fc_auto_system', 'ford/germaz', 'ford/frank-cars',
        'ford/auto_galeria',
    ],
    'citroen': ['citroen/carpol'],
    'allied_vehicles_ltd': ['allied_vehicles_ltd.'],
    'ford_transit': [
        'ford_transit/frank-cars', 'ford_transit/its_system',
        'ford_transit/auto_galeria',
    ],
    'ssangyong': [
        'ssangyong_kg_mobility', 'ssang_yong', 'ssangyong_kg_mobitity',
        'ssang-young',
    ],
    'nissan': ['nisssan', 'nissa'],
    'fiat': ['fiat/carpol'],
    'bmw': ['alpina'],
    'suzuki': ['suzki'],
    'man': ['man/carpol'],
    'lexus': ['lexsus'],
    'jaecoo': ['jaeccoo'],
    'porsche': ['porche'],
    'omoda': ['omoda5'],
    'mg': ['mag'],
    'caterham_cars_ltd': ['caterham'],
}

# Invert the grouping into the flat {variant: canonical} lookup.
spelling_mappings = {
    variant: canonical
    for canonical, variants in canonical_to_variants.items()
    for variant in variants
}

In [10]:
# Replace every label that has an entry in spelling_mappings with its
# canonical form, then re-list the distinct values to check the result.
select_df['mk'] = select_df['mk'].replace(spelling_mappings)
select_df['mk'].unique()

array(['lexus', 'toyota', 'volkswagen', 'renault', 'volvo', 'opel',
       'audi', 'skoda', 'kia', 'bmw', 'ford', 'mazda', 'mercedes-benz',
       'suzuki', 'hyundai', 'dacia', 'jeep', 'mg', 'seat',
       'allied_vehicles_ltd', 'citroen', 'porsche', 'omoda', 'ssangyong',
       'fiat', 'nissan', 'honda', 'cupra', 'peugeot', 'subaru',
       'land_rover', 'alfa_romeo', 'mitsubishi', 'mini', 'ds', 'jaguar',
       'byd', 'jaecoo', 'genesis', 'maserati', 'ford_transit',
       'lamborghini', 'lotus', 'bentley', 'mclaren', 'alpine', 'morgan',
       'carpol', 'mercus', 'man', 'caterham_cars_ltd'], dtype=object)

**Splitting the data**

In [11]:
# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
df_train, df_test = train_test_split(select_df, test_size=0.2, random_state=1)

# pop() hands back the target column and removes it from the feature frame
# in a single step.
y_train = df_train.pop('ewltp_(g/km)')
y_test = df_test.pop('ewltp_(g/km)')

**Creating the Pipeline**

In [12]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Target-encoding branch: fill missing values with the most frequent value,
# then pass the columns through category_encoders' TargetEncoder.
target_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder(handle_unknown='value'))
])

In [13]:
# Route each column group to its own transformer:
#   - 'mk' and 'ech' go through the target-encoding branch,
#   - 'ft' and 'fm' are one-hot encoded (categories unseen during fit are ignored),
#   - the numeric columns go through the impute + scale branch.
# Any column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('target', target_transformer, ['mk', 'ech']),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['ft', 'fm']),
        ('num', numeric_transformer, numerical_cols)
    ],
    remainder='drop'
)

# Preprocessing and model live in one pipeline, so the transformers are
# refitted on each training fold during cross-validation.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(random_state=42))
])

**Setting up the parameter grid for GridSearchCV and fitting the model using the best hyperparameters**

In [14]:
# Search space for the 'model' step of the pipeline (hence the `model__` prefix).
# 3 * 4 * 4 * 4 = 192 parameter combinations.
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__max_depth': [3, 5, 6, 7],
    'model__min_child_samples': [2, 5, 10, 20]
}

In [15]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored by
# negative RMSE (scikit-learn maximises scores, so the error is negated).
# n_jobs=-1 runs the fits in parallel on all available cores.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2
)

In [16]:
# Run the search on the training split; with refit left at its default (True),
# the best configuration is refitted on the whole of df_train afterwards.
grid.fit(df_train, y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006960 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 903
[LightGBM] [Info] Number of data points in the train set: 420936, number of used features: 16
[LightGBM] [Info] Start training from score 134.073805


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__learning_rate': [0.01, 0.05, ...], 'model__max_depth': [3, 5, ...], 'model__min_child_samples': [2, 5, ...], 'model__n_estimators': [100, 300, ...]}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('target', ...), ('onehot', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,7
,learning_rate,0.2
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [42]:
print("Best params:", grid.best_params_)
# Keep the refitted winning pipeline (preprocessing + model) rather than the
# hyper-parameter dict, so `best_pipeline` can predict and be exported.
best_pipeline = grid.best_estimator_

Best params: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__min_child_samples': 5, 'model__n_estimators': 500}


**Predictions and Scoring**

In [33]:
# Predict on the held-out split using the refitted best pipeline.
preds = grid.predict(df_test)



In [34]:
# Held-out metrics: R2 and RMSE (square root of the mean squared error, so it
# is expressed in the same unit as the target).
r2 = r2_score(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"R2: {r2}, RMSE: {rmse}")

R2: 0.970574746910062, RMSE: 4.7225940956938315


**Observation from the model:**

* The first model trained and evaluated on the data was an XGBoost regressor (`XGBRegressor`). Because of the size of the data, grid searching took a long time, and the scores were near perfect (similar to the LightGBM model).
* For a more efficient hyperparameter tuning process, and given the types of features we are dealing with here, LightGBM (`LGBMRegressor`) was chosen as the final model.
* Because the R2 score is so high, I was suspicious of leakage (there were a couple of minor leaks that I fixed, but they did not affect model performance). No further trace of leakage was found in the process, and since the target is a calculated variable, it is plausible that the model can learn to reproduce that calculation accurately from the available features.
* To find out whether the model is genuinely good at its job or something is wrong with the data, a simple `LinearRegression` model is trained on the same data in the next section.


**Implementing the LinearRegression model**

In [29]:
from sklearn.linear_model import LinearRegression

# Baseline preprocessing: every categorical column is one-hot encoded
# (categories unseen during fit are ignored) and the numeric columns reuse
# the impute + scale branch. Any other column is dropped.
lr_preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', numeric_transformer, numerical_cols)
    ],
    remainder='drop'
)

# Plain linear regression on top of the baseline preprocessing.
lr_pipe = Pipeline([
    ('preprocessor', lr_preprocessor),
    ('model', LinearRegression())
])

In [30]:
# Fit the baseline on the same training split as the LGBM pipeline.
lr_pipe.fit(df_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [35]:
# Baseline predictions on the same held-out split.
lr_preds = lr_pipe.predict(df_test)

In [37]:
# Same metrics as for the LGBM pipeline, so the two models are comparable.
lr_r2 = r2_score(y_test, lr_preds)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))
print(f"R2: {lr_r2}, RMSE: {lr_rmse}")

R2: 0.7608278334983214, RMSE: 13.46405369987879


**Checking the difference**

In [39]:
# Differences are computed as LGBM minus LR: a positive R2 delta and a
# negative RMSE delta both favour LGBM.
print("When using LGBM instead of LR we see these changes:")
print(f"R2: {round(r2 - lr_r2, 3)}, RMSE: {round(rmse - lr_rmse, 2)}")

When using LGBM instead of LR we see these changes:
R2: 0.21, RMSE: -8.74


* The comparison between the LightGBM (LGBM) and Linear Regression (LR) models reveals a significant improvement in predictive performance when using LGBM.
* Specifically, the LGBM model achieves a 0.21 increase in R2, i.e. it explains 21 percentage points more of the variance in the target variable than the LR model. Additionally, the RMSE drops by 8.74 units ($gCO_2/km$), meaning that LGBM's typical prediction error is 8.74 $gCO_2/km$ smaller than that of the LR model.
* These results strongly suggest that the gap comes from LGBM's ability to capture non-linear relationships and interactions in the data, which the purely linear LR model cannot, rather than from a problem in the data. The performance gap shows the value of using more flexible, ensemble-based models like LGBM for complex datasets.

## Exporting the model

In [43]:
# joblib is used to dump the model because it is simple and intuitive to use.
import joblib

filename = 'model.bin'
# Dump the refitted best pipeline (preprocessing + model) itself, not the
# hyper-parameter dict, so the saved file can be loaded and used to predict.
joblib.dump(grid.best_estimator_, filename)

print(f"Model Pipeline saved to: {filename}")

Model Pipeline saved to: model.bin
