In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("CO2_emissions.csv")

In [3]:
df.columns

Index(['Make', 'Model', 'Vehicle Class', 'Engine Size(L)', 'Cylinders',
       'Transmission', 'Fuel Type', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
       'Fuel Consumption Comb (mpg)', 'CO2 Emissions(g/km)'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7385 non-null   object 
 1   Model                             7385 non-null   object 
 2   Vehicle Class                     7385 non-null   object 
 3   Engine Size(L)                    7385 non-null   float64
 4   Cylinders                         7385 non-null   int64  
 5   Transmission                      7385 non-null   object 
 6   Fuel Type                         7385 non-null   object 
 7   Fuel Consumption City (L/100 km)  7385 non-null   float64
 8   Fuel Consumption Hwy (L/100 km)   7385 non-null   float64
 9   Fuel Consumption Comb (L/100 km)  7385 non-null   float64
 10  Fuel Consumption Comb (mpg)       7385 non-null   int64  
 11  CO2 Emissions(g/km)               7385 non-null   int64  
dtypes: flo

In [5]:
#one hot encode 'Fuel type' to train the model well

temp=df.pop('Fuel Type')
df['Ftype_X']=(temp=='X')*1.0
df['Ftype_Z']=(temp=='Z')*1.0
df['Ftype_E']=(temp=='E')*1.0
df['Ftype_D']=(temp=='D')*1.0
df['Ftype_N']=(temp=='N')*1.0


In [6]:
df.dtypes

Make                                 object
Model                                object
Vehicle Class                        object
Engine Size(L)                      float64
Cylinders                             int64
Transmission                         object
Fuel Consumption City (L/100 km)    float64
Fuel Consumption Hwy (L/100 km)     float64
Fuel Consumption Comb (L/100 km)    float64
Fuel Consumption Comb (mpg)           int64
CO2 Emissions(g/km)                   int64
Ftype_X                             float64
Ftype_Z                             float64
Ftype_E                             float64
Ftype_D                             float64
Ftype_N                             float64
dtype: object

In [7]:
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km),Ftype_X,Ftype_Z,Ftype_E,Ftype_D,Ftype_N
0,ACURA,ILX,COMPACT,2.0,4,AS5,9.9,6.7,8.5,33,196,0.0,1.0,0.0,0.0,0.0
1,ACURA,ILX,COMPACT,2.4,4,M6,11.2,7.7,9.6,29,221,0.0,1.0,0.0,0.0,0.0
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,6.0,5.8,5.9,48,136,0.0,1.0,0.0,0.0,0.0
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,12.7,9.1,11.1,25,255,0.0,1.0,0.0,0.0,0.0
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,12.1,8.7,10.6,27,244,0.0,1.0,0.0,0.0,0.0


In [8]:
df.columns

Index(['Make', 'Model', 'Vehicle Class', 'Engine Size(L)', 'Cylinders',
       'Transmission', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
       'Fuel Consumption Comb (mpg)', 'CO2 Emissions(g/km)', 'Ftype_X',
       'Ftype_Z', 'Ftype_E', 'Ftype_D', 'Ftype_N'],
      dtype='object')

In [9]:
# min max normalization such that all training features stay between 0 to 1

def norm(X):
    X=(X-min(X))/(max(X)-min(X))
    return X

In [10]:
col=['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
       'Fuel Consumption Comb (mpg)', 'Ftype_X',
       'Ftype_Z', 'Ftype_E', 'Ftype_D', 'Ftype_N']

for i in range(len(col)):
    df[col[i]]=norm(df[col[i]])

df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km),Ftype_X,Ftype_Z,Ftype_E,Ftype_D,Ftype_N
0,ACURA,ILX,COMPACT,0.146667,0.076923,AS5,0.215909,0.162651,0.2,0.37931,196,0.0,1.0,0.0,0.0,0.0
1,ACURA,ILX,COMPACT,0.2,0.076923,M6,0.265152,0.222892,0.25,0.310345,221,0.0,1.0,0.0,0.0,0.0
2,ACURA,ILX HYBRID,COMPACT,0.08,0.076923,AV7,0.068182,0.108434,0.081818,0.637931,136,0.0,1.0,0.0,0.0,0.0
3,ACURA,MDX 4WD,SUV - SMALL,0.346667,0.230769,AS6,0.32197,0.307229,0.318182,0.241379,255,0.0,1.0,0.0,0.0,0.0
4,ACURA,RDX AWD,SUV - SMALL,0.346667,0.230769,AS6,0.299242,0.283133,0.295455,0.275862,244,0.0,1.0,0.0,0.0,0.0


In [17]:
#train test split

from sklearn.model_selection import train_test_split

train_features=['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
       'Fuel Consumption Comb (mpg)', 'Ftype_X',
       'Ftype_Z', 'Ftype_E', 'Ftype_D', 'Ftype_N']
target=[ 'CO2 Emissions(g/km)']
X_train,X_test,Y_train,Y_test= train_test_split(df[train_features],df[target],test_size=0.125,random_state=66)

In [31]:
#model definition

from sklearn.ensemble import GradientBoostingRegressor
    
model=GradientBoostingRegressor(n_estimators=100,criterion='mse',max_depth=20,min_samples_leaf=4,min_impurity_split=3,max_leaf_nodes=1000)

In [32]:
#model training

model.fit(X_train,Y_train)

GradientBoostingRegressor(criterion='mse', max_depth=20, max_leaf_nodes=1000,
                          min_impurity_split=3, min_samples_leaf=4)

In [33]:
#predict 

Y_pred=model.predict(X_test)

In [34]:
#metrics

from sklearn.metrics import r2_score,mean_squared_error

print("R squared score=",r2_score(Y_test,Y_pred))
print("MSE=",mean_squared_error(Y_test,Y_pred))

R squared score= 0.9983130988558956
MSE= 5.5675031348096224


In [35]:
from sklearn.model_selection import cross_val_score
mod = DecisionTreeRegressor(criterion='mse',max_depth=10,min_samples_leaf=4,min_impurity_split=3,max_leaf_nodes=1000)
scores = cross_val_score(mod, X_train, Y_train, cv=100)
print("Average cross validation score", np.mean(scores))

Average cross validation score 0.9967490877136594
