In [None]:
#installations
!pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
import joblib



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer #link categorical and numerical data
from sklearn.preprocessing import OneHotEncoder #categorical data
from sklearn.preprocessing import StandardScaler #standardizer
from sklearn.impute import SimpleImputer #missing values
from sklearn.ensemble import RandomForestRegressor #multiple decision trees
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score




In [None]:
# Load the dataset from a CSV file in the current working directory.
data = pd.read_csv("vehicle_emissions.csv")


In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter use its rich table display instead of the plain-text repr that
# print() produces, which wraps wide frames across several chunks.
data.head()

   Model_Year   Make              Model Vehicle_Class  Engine_Size  Cylinders  \
0        2021  Acura                ILX       Compact          2.4          4   
1        2021  Acura                NSX    Two-seater          3.5          6   
2        2021  Acura         RDX SH-AWD    SUV: Small          2.0          4   
3        2021  Acura  RDX SH-AWD A-SPEC    SUV: Small          2.0          4   
4        2021  Acura         TLX SH-AWD       Compact          2.0          4   

  Transmission  Fuel_Consumption_in_City(L/100 km)  \
0          AM8                                 9.9   
1          AM9                                11.1   
2         AS10                                11.0   
3         AS10                                11.3   
4         AS10                                11.2   

   Fuel_Consumption_in_City_Hwy(L/100 km)  Fuel_Consumption_comb(L/100km)  \
0                                     7.0                             8.6   
1                                 

In [None]:

# Print a summary of the frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 12 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Model_Year                              935 non-null    int64  
 1   Make                                    935 non-null    object 
 2   Model                                   935 non-null    object 
 3   Vehicle_Class                           935 non-null    object 
 4   Engine_Size                             935 non-null    float64
 5   Cylinders                               935 non-null    int64  
 6   Transmission                            935 non-null    object 
 7   Fuel_Consumption_in_City(L/100 km)      935 non-null    float64
 8   Fuel_Consumption_in_City_Hwy(L/100 km)  935 non-null    float64
 9   Fuel_Consumption_comb(L/100km)          935 non-null    float64
 10  CO2_Emissions                           935 non-null    int64 

In [None]:
# Separate the predictors from the regression target.
X = data.drop(columns=["CO2_Emissions"])
Y = data["CO2_Emissions"]

# Columns routed to the numeric branch of the preprocessor.
numerical_cols = [
    "Model_Year",
    "Engine_Size",
    "Cylinders",
    "Fuel_Consumption_in_City(L/100 km)",
    "Fuel_Consumption_in_City_Hwy(L/100 km)",
    "Fuel_Consumption_comb(L/100km)",
    "Smog_Level",
]

# Columns routed to the categorical branch of the preprocessor.
categorical_cols = [
    "Make",
    "Model",
    "Vehicle_Class",
    "Transmission",
]

In [None]:
# Numeric branch: fill missing values with the column mean, then standardize.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Apply each branch to its own set of columns and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

In [None]:
# Join pipeline: preprocessing followed by the regressor, so that fit and
# predict apply the same transformations end to end.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    # Fixed seed so the forest (bootstrap samples and feature sub-sampling) is
    # built identically on every run; it matches the seed used for the
    # train/test split, making the metrics and exported model reproducible.
    ("model", RandomForestRegressor(random_state=42)),
])

In [None]:
# Split into training and testing sets: 20% held out, seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the full pipeline on the training split ...
pipeline.fit(X_train, Y_train)

# ... then predict the target for the held-out rows.
prediction = pipeline.predict(X_test)

In [None]:
#view encoding
encoder_cols=pipeline.named_steps["preprocessor"].named_transformers_["cat"]["encoder"].get_feature_names_out(categorical_cols)
print(encoder_cols)


['Make_Acura' 'Make_Alfa Romeo' 'Make_Aston Martin' 'Make_Audi' 'Make_BMW'
 'Make_Bentley' 'Make_Bugatti' 'Make_Buick' 'Make_Cadillac'
 'Make_Chevrolet' 'Make_Chrysler' 'Make_Dodge' 'Make_FIAT' 'Make_Ford'
 'Make_GMC' 'Make_Genesis' 'Make_Honda' 'Make_Hyundai' 'Make_Infiniti'
 'Make_Jaguar' 'Make_Jeep' 'Make_Kia' 'Make_Lamborghini' 'Make_Lexus'
 'Make_Lincoln' 'Make_MINI' 'Make_Maserati' 'Make_Mazda'
 'Make_Mercedes-Benz' 'Make_Mitsubishi' 'Make_Nissan' 'Make_Porsche'
 'Make_Ram' 'Make_Rolls-Royce' 'Make_Subaru' 'Make_Toyota'
 'Make_Volkswagen' 'Make_Volvo' 'Model_1500' 'Model_1500 4X4 EcoDiesel'
 'Model_1500 4X4 TRX' 'Model_1500 4X4 eTorque' 'Model_1500 Classic'
 'Model_1500 Classic 4X4' 'Model_1500 EcoDiesel'
 'Model_1500 HFE EcoDiesel' 'Model_1500 HFE eTorque' 'Model_1500 eTorque'
 'Model_228i xDrive Gran Coupe' 'Model_230i xDrive Coupe' 'Model_300'
 'Model_300 AWD' 'Model_430i xDrive Coupe' 'Model_4Runner 4WD'
 'Model_4Runner 4WD (Part-Time 4WD)' 'Model_500X AWD'
 'Model_530i xDriv

In [None]:
# Evaluate accuracy on the held-out test split.
mse = mean_squared_error(Y_test, prediction)
mae = mean_absolute_error(Y_test, prediction)
r2 = r2_score(Y_test, prediction)

# Emit one metric per line.
report_lines = (
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"r2_score:{r2}",
)
print("\n".join(report_lines))

Mean Squared Error: 108.37286791443852
Mean Absolute Error: 3.2895721925133703
r2_score:0.9730966058585209


In [None]:
# Export the fitted pipeline (preprocessing + model) for future use.
# Kept as the cell's last expression so the list returned by joblib.dump is displayed.
joblib.dump(pipeline, "vehicle_emission_pipeline")

['vehicle_emission_pipeline']