In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset from a CSV in the working directory.
# NOTE(review): the file name suggests this is the output of an earlier EDA /
# preprocessing notebook — confirm where it is produced.
df = pd.read_csv('co2_emissions_EDA_Completed.csv')

In [3]:
# Peek at the first two rows to sanity-check the loaded columns and values.
df.head(2)

Unnamed: 0,engine_size,fuel_consumption_city,fuel_consumption_hwy,fuel_consumption_comb(l/100km),fuel_consumption_comb(mpg),cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,...,transmission_M,fuel_type_D,fuel_type_E,fuel_type_N,fuel_type_X,fuel_type_Z,make_freq,model_freq,vehicle_class_freq,co2_emissions
0,-0.851899,-0.764893,-1.042022,-0.856464,0.775786,False,True,False,False,False,...,False,False,False,False,False,True,0.00813,0.001116,0.143791,196
1,-0.558899,-0.398726,-0.602985,-0.482924,0.222145,False,True,False,False,False,...,True,False,False,False,False,True,0.00813,0.001116,0.143791,221


In [4]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6273 entries, 0 to 6272
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   engine_size                     6273 non-null   float64
 1   fuel_consumption_city           6273 non-null   float64
 2   fuel_consumption_hwy            6273 non-null   float64
 3   fuel_consumption_comb(l/100km)  6273 non-null   float64
 4   fuel_consumption_comb(mpg)      6273 non-null   float64
 5   cylinders_3                     6273 non-null   bool   
 6   cylinders_4                     6273 non-null   bool   
 7   cylinders_5                     6273 non-null   bool   
 8   cylinders_6                     6273 non-null   bool   
 9   cylinders_8                     6273 non-null   bool   
 10  cylinders_10                    6273 non-null   bool   
 11  cylinders_12                    6273 non-null   bool   
 12  cylinders_16                    62

#### split data into train and test

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Separate predictors from the target by column name rather than by position,
# so the selection stays correct if columns are added or reordered upstream.
X = df.drop(columns=["co2_emissions"])
y = df["co2_emissions"]

In [7]:
# Check the column list and dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6273 entries, 0 to 6272
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   engine_size                     6273 non-null   float64
 1   fuel_consumption_city           6273 non-null   float64
 2   fuel_consumption_hwy            6273 non-null   float64
 3   fuel_consumption_comb(l/100km)  6273 non-null   float64
 4   fuel_consumption_comb(mpg)      6273 non-null   float64
 5   cylinders_3                     6273 non-null   bool   
 6   cylinders_4                     6273 non-null   bool   
 7   cylinders_5                     6273 non-null   bool   
 8   cylinders_6                     6273 non-null   bool   
 9   cylinders_8                     6273 non-null   bool   
 10  cylinders_10                    6273 non-null   bool   
 11  cylinders_12                    6273 non-null   bool   
 12  cylinders_16                    62

In [8]:
# Preview the first two feature rows.
X.head(2)

Unnamed: 0,engine_size,fuel_consumption_city,fuel_consumption_hwy,fuel_consumption_comb(l/100km),fuel_consumption_comb(mpg),cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,...,transmission_AV,transmission_M,fuel_type_D,fuel_type_E,fuel_type_N,fuel_type_X,fuel_type_Z,make_freq,model_freq,vehicle_class_freq
0,-0.851899,-0.764893,-1.042022,-0.856464,0.775786,False,True,False,False,False,...,False,False,False,False,False,False,True,0.00813,0.001116,0.143791
1,-0.558899,-0.398726,-0.602985,-0.482924,0.222145,False,True,False,False,False,...,False,True,False,False,False,False,True,0.00813,0.001116,0.143791


In [9]:
# Check the name, non-null count and dtype of the target series.
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6273 entries, 0 to 6272
Series name: co2_emissions
Non-Null Count  Dtype
--------------  -----
6273 non-null   int64
dtypes: int64(1)
memory usage: 49.1 KB


In [10]:
# Preview the first two target values.
y.head(2)

0    196
1    221
Name: co2_emissions, dtype: int64

In [11]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel
# restarts; without it each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5018, 26), (1255, 26), (5018,), (1255,))

#### Create the model

In [12]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
# Bagging ensemble of 100 decision trees, each fitted on a bootstrap sample
# (drawn with replacement) sized at 80% of the training rows.
# random_state pins the bootstrap sampling and the per-tree randomness so that
# refitting reproduces the same model and the same metrics.
model = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=0.8,
    bootstrap=True,
    random_state=42,
)

In [16]:
# Fit the bagging ensemble on the training split only.
model.fit(X_train,y_train)

#### Predict on the training data and compute the metrics

In [17]:
# Predictions on the same rows the model was fitted on (in-sample).
yhat_train = model.predict(X_train)

In [18]:
# In-sample fit quality: R² together with MSE and its square root (RMSE).
train_r2 = r2_score(y_true=y_train, y_pred=yhat_train)
train_mse = mean_squared_error(y_true=y_train, y_pred=yhat_train)
train_rmse = np.sqrt(train_mse)
(train_mse, train_rmse, train_r2)

(2.3941172197198792, 1.5472935144050335, 0.9993189471529126)

In [19]:
# Mean of the training target.
# NOTE(review): presumably shown as a yardstick for the size of the RMSE — confirm.
y_train.mean()

251.84854523714628

#### Predict on the testing data and compute the metrics

In [20]:
# Predictions on the held-out test rows.
yhat_test = model.predict(X_test)

In [21]:
# Held-out fit quality: R² together with MSE and its square root (RMSE).
test_r2 = r2_score(y_true=y_test, y_pred=yhat_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=yhat_test)
test_rmse = np.sqrt(test_mse)
(test_mse, test_rmse, test_r2)

(11.212003269656877, 3.3484329573185243, 0.9967715529339335)

### The bagging regressor model is performing well on both the training and testing data (R² > 0.99 on each)

In [22]:
import pickle

In [23]:
# Serialise the fitted model to model.pkl in the working directory.
# Pickle files can execute arbitrary code when loaded, so only unpickle
# files that come from a trusted source.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)