In [6]:
#import packages
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split   #this is split your data into training and testing part for model performance
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer #its allow the pipelines to link together
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer  #its will handle the missing values in data set
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error


In [23]:
# Train and evaluate a CO2-emission regressor, then persist the fitted pipeline.
# The cell reads "Sample.csv", fits preprocessing + a random forest on an 80/20
# split, prints R2 / RMSE / MAE for the held-out rows, and saves the pipeline.

# One seed shared by the train/test split and the random forest. Without a
# seed on the forest, every re-run produced different metrics and a different
# saved model even though the split itself was fixed.
RANDOM_STATE = 42

# Load the data and separate the features from the target.
data = pd.read_csv("Sample.csv")
x = data.drop(["CO2_EMISSIONS"], axis=1)  # the target must not be among the model inputs
y = data["CO2_EMISSIONS"]

# Feature groups, split by dtype so each group gets its own preprocessing.
# Numeric (int / float) columns:
numerical_cols = [
    "MODEL",
    "ENGINE_SIZE",
    "CYLINDERS",
    "FUEL_CONSUMPTION*",
    "Unnamed: 9",
    "Unnamed: 10",
    "Unnamed: 11",
]
# String (object) columns:
categorial_cols = ["MAKE", "MODEL.1", "VEHICLE CLASS", "TRANSMISSION", "FUEL"]

# Numeric branch: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps predict() from failing on categories
# that were not present in the training rows.
categorial_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorial_pipeline, categorial_cols),
])

# Full estimator: preprocessing followed by the (now seeded) regressor.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=RANDOM_STATE)),
])

# Hold out 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit on the training rows and predict the held-out rows.
pipeline.fit(x_train, y_train)
prediction = pipeline.predict(x_test)

# Evaluate on the held-out rows.
mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
print("MODEL PERFORMANCE")
print(f"R2_score:{r2}")
print(f"Root Mean Square Error:{rmse}")
print(f"mean absolute error:{mae}")

# Persist preprocessing and model together so inference can be run on raw rows.
# Kept as the last expression so the list of written file names is displayed.
joblib.dump(pipeline, "vehicle_emission_pipeline.joblib")

MODEL PERFORMANCE
R2_score:0.990894094841656
Root Mean Square Error:5.579294969164223
mean absolute error:0.8970588235294092


['vehicle_emission_pipeline.joblib']

In [18]:
# Show the column labels of the loaded DataFrame (features plus the target).
data.columns

Index(['MODEL', 'MAKE', 'MODEL.1', 'VEHICLE CLASS', 'ENGINE_SIZE', 'CYLINDERS',
       'TRANSMISSION', 'FUEL', 'FUEL_CONSUMPTION*', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'CO2_EMISSIONS'],
      dtype='object')

In [24]:
# Print per-column non-null counts and dtypes to check for missing values
# and to confirm which columns are numeric versus object (string).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 679 entries, 0 to 678
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MODEL              679 non-null    int64  
 1   MAKE               679 non-null    object 
 2   MODEL.1            679 non-null    object 
 3   VEHICLE CLASS      679 non-null    object 
 4   ENGINE_SIZE        679 non-null    float64
 5   CYLINDERS          679 non-null    int64  
 6   TRANSMISSION       679 non-null    object 
 7   FUEL               679 non-null    object 
 8   FUEL_CONSUMPTION*  679 non-null    float64
 9   Unnamed: 9         679 non-null    float64
 10  Unnamed: 10        679 non-null    float64
 11  Unnamed: 11        679 non-null    int64  
 12  CO2_EMISSIONS      679 non-null    int64  
dtypes: float64(4), int64(4), object(5)
memory usage: 69.1+ KB
