In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
# load the CO2 Emission dataset 

CO2Emission = pd.read_csv(r"C:\Users\Raju\IIIT-Kurnool\MY2021_Fuel_Consumption_Ratings.csv", header=0) 
CO2Emission_BK = CO2Emission.copy()
CO2Emission.head()

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating
0,2021,Acura,ILX,Compact,2.4,4,AM8,Z,9.9,7.0,8.6,33,199,6,3
1,2021,Acura,NSX,Two-seater,3.5,6,AM9,Z,11.1,10.8,11.0,26,256,4,3
2,2021,Acura,RDX SH-AWD,SUV: Small,2.0,4,AS10,Z,11.0,8.6,9.9,29,232,5,6
3,2021,Acura,RDX SH-AWD A-SPEC,SUV: Small,2.0,4,AS10,Z,11.3,9.1,10.3,27,242,5,6
4,2021,Acura,TLX SH-AWD,Compact,2.0,4,AS10,Z,11.2,8.0,9.8,29,230,5,7


In [3]:
# Dataset information

CO2Emission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       883 non-null    int64  
 1   Make                       883 non-null    object 
 2   Model                      883 non-null    object 
 3   Vehicle_Class              883 non-null    object 
 4   Engine_Size                883 non-null    float64
 5   Cylinders                  883 non-null    int64  
 6   Transmission               883 non-null    object 
 7   Fuel_Type                  883 non-null    object 
 8   Fuel_Consumption_city      883 non-null    float64
 9   Fuel_Consumption_Hwy       883 non-null    float64
 10  Fuel_Consumption_Comb      883 non-null    float64
 11  Fuel_Consumption_Comb_MPG  883 non-null    int64  
 12  CO2_Emissions              883 non-null    int64  
 13  CO2_Rating                 883 non-null    int64  

In [4]:
# Delecting the 8 columns

CO2Emission = CO2Emission.drop(['Year', 'Make', 'Model', 'Vehicle_Class', 'Transmission',
                                 'Fuel_Type', 'CO2_Rating', 'Smog_Rating'], axis = 1)
CO2Emission.head()

Unnamed: 0,Engine_Size,Cylinders,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions
0,2.4,4,9.9,7.0,8.6,33,199
1,3.5,6,11.1,10.8,11.0,26,256
2,2.0,4,11.0,8.6,9.9,29,232
3,2.0,4,11.3,9.1,10.3,27,242
4,2.0,4,11.2,8.0,9.8,29,230


In [5]:
# Identify the Independent and Target variables

IndepVar = []
for col in CO2Emission.columns:
    if col != 'CO2_Emissions':
        IndepVar.append(col)

TargetVar = 'CO2_Emissions'

x = CO2Emission[IndepVar]
y = CO2Emission[TargetVar]

In [6]:
# Split the data into train and test

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [7]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

# Build the Regression model

In [8]:
# Train the algorithm and build the model with train dataset

from sklearn.linear_model import LinearRegression

MulRGR = LinearRegression()

# Train the model with training dataset

MulRGR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = MulRGR.predict(x_test)

In [9]:
# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 28.879
Mean Squared Error (MSE): 1445.051
Root Mean Squared Error (RMSE): 38.014
R2_score: 0.586325
Root Mean Squared Log Error (RMSLE): 3.638
Mean Absolute Percentage Error (MAPE): 10.726 %
Adj R Square:  0.583492


In [10]:
Results = pd.DataFrame({'CO2_Emissions_A':y_test, 'CO2_Emissions_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = CO2Emission_BK.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(10)

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating,CO2_Emissions_A,CO2_Emissions_P
620,2021,Mercedes-Benz,AMG GT 63 S 4MATIC+ Coupe,Compact,4.0,8,A9,Z,15.1,11.1,13.3,21,312,3,5,312,356.478939
211,2021,Chevrolet,Suburban,SUV: Standard,5.3,8,A10,X,14.3,11.8,13.2,21,309,3,5,309,353.841477
54,2021,Audi,SQ5 Sportback quattro,SUV: Small,3.0,6,AS8,Z,13.1,9.9,11.6,24,274,4,5,274,302.001766
720,2021,Porsche,718 Spyder,Two-seater,4.0,6,M6,Z,14.0,10.1,12.2,23,286,3,5,286,319.189238
10,2021,Alfa Romeo,Stelvio AWD,SUV: Small,2.0,4,A8,Z,10.8,8.3,9.6,29,226,5,3,226,240.304389
536,2021,Lexus,UX 250h AWD,Compact,2.0,4,AV6,X,5.7,6.2,6.0,47,140,8,7,140,139.7293
755,2021,Porsche,Panamera 4 ST,Full-size,2.9,6,AM8,Z,12.8,10.2,11.7,24,274,4,5,274,307.451049
528,2021,Lexus,RC 350 AWD,Subcompact,3.5,6,AS6,Z,12.2,9.0,10.8,26,253,4,5,253,286.184667
746,2021,Porsche,Cayenne Turbo,SUV: Standard,4.0,8,AS8,Z,15.6,12.4,14.1,20,329,3,3,329,371.536438
576,2021,Mazda,Mazda3 5-Door,Mid-size,2.5,4,AS6,X,9.2,6.9,8.1,35,190,6,7,190,200.908806


In [11]:
Results = pd.DataFrame({'CO2_Emissions_A':y_test, 'CO2_Emissions_P':y_pred})
Results.sample(10)

Unnamed: 0,CO2_Emissions_A,CO2_Emissions_P
139,239,258.899479
732,281,314.2513
275,277,309.855617
526,175,180.389408
54,274,302.001766
97,281,323.309872
687,223,234.978532
165,296,342.60615
309,287,302.426869
110,235,264.442103


# Compare all regressors

In [12]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

modelmlg = LinearRegression()
modeldcr = DecisionTreeRegressor()
modelrfr = RandomForestRegressor()
modelABR = AdaBoostRegressor(n_estimators=50, base_estimator=None ,learning_rate=1)
modelSVR = SVR()
modelGBR = GradientBoostingRegressor()
modelXGR = xgb.XGBRegressor()
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelETR = ExtraTreesRegressor()
modelMLPR = MLPRegressor()

# Evalution matrix for all the algorithms

MM = [modelmlg, modeldcr, modelrfr, modelABR, modelSVR, modelGBR, modelXGR, modelKNN, modelETR, modelMLPR]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 28.879
Mean Squared Error (MSE): 1445.051
Root Mean Squared Error (RMSE): 38.014
R2_score: 0.586325
Root Mean Squared Log Error (RMSLE): 3.638
Mean Absolute Percentage Error (MAPE): 10.73 %
Adj R Square:  0.583492
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 26.275
Mean Squared Error (MSE): 1290.766
Root Mean Squared Error (RMSE): 35.927
R2_score: 0.630492
Root Mean Squared Log Error (RMSLE): 3.581
Mean Absolute Percentage Error (MAPE): 9.62 %
Adj R Square:  0.627961
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 22.334
Mean Squared Error (MSE): 1006.01
Root Mean Squared Error (RMSE): 31.718
R2_score: 0.71201
Root Mean Squared Log Error (RMSLE): 3.457
Mean Absolute Percentage Er