In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
# load the CO2 Emission dataset 

CO2Emission = pd.read_csv(r"D:\R3SPAnalytics\01-SDP\Datasets\MY2021_Fuel_Consumption_Ratings.csv", header=0)

# Copy the file to back-up file

CO2Emission_bk = CO2Emission.copy()

# Display first 5 records

CO2Emission.head()

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating
0,2021,Acura,ILX,Compact,2.4,4,AM8,Z,9.9,7.0,8.6,33,199,6,3
1,2021,Acura,NSX,Two-seater,3.5,6,AM9,Z,11.1,10.8,11.0,26,256,4,3
2,2021,Acura,RDX SH-AWD,SUV: Small,2.0,4,AS10,Z,11.0,8.6,9.9,29,232,5,6
3,2021,Acura,RDX SH-AWD A-SPEC,SUV: Small,2.0,4,AS10,Z,11.3,9.1,10.3,27,242,5,6
4,2021,Acura,TLX SH-AWD,Compact,2.0,4,AS10,Z,11.2,8.0,9.8,29,230,5,7


In [3]:
# Display dataset information

CO2Emission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       883 non-null    int64  
 1   Make                       883 non-null    object 
 2   Model                      883 non-null    object 
 3   Vehicle_Class              883 non-null    object 
 4   Engine_Size                883 non-null    float64
 5   Cylinders                  883 non-null    int64  
 6   Transmission               883 non-null    object 
 7   Fuel_Type                  883 non-null    object 
 8   Fuel_Consumption_city      883 non-null    float64
 9   Fuel_Consumption_Hwy       883 non-null    float64
 10  Fuel_Consumption_Comb      883 non-null    float64
 11  Fuel_Consumption_Comb_MPG  883 non-null    int64  
 12  CO2_Emissions              883 non-null    int64  
 13  CO2_Rating                 883 non-null    int64  

In [4]:
# Delecting the 8 columns

CO2Emission = CO2Emission.drop(['Year', 'Make', 'Model', 'Vehicle_Class', 'Transmission',
                                 'Fuel_Type', 'CO2_Rating', 'Smog_Rating'], axis = 1)
CO2Emission.head()

Unnamed: 0,Engine_Size,Cylinders,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions
0,2.4,4,9.9,7.0,8.6,33,199
1,3.5,6,11.1,10.8,11.0,26,256
2,2.0,4,11.0,8.6,9.9,29,232
3,2.0,4,11.3,9.1,10.3,27,242
4,2.0,4,11.2,8.0,9.8,29,230


In [5]:
# Identify the Independent and Target variables

IndepVar = []
for col in CO2Emission.columns:
    if col != 'CO2_Emissions':
        IndepVar.append(col)

TargetVar = 'CO2_Emissions'

x = CO2Emission[IndepVar]
y = CO2Emission[TargetVar]

In [6]:
# Split the data into train and test

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=143)

# Display the shape of the train_data and test_data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((618, 6), (265, 6), (618,), (265,))

In [7]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

# Multi Regression Algorithm

In [8]:
# Build the model with Gradient Boosting Regressor

from sklearn.linear_model import LinearRegression  

# Create object for the model

ModelMLR = LinearRegression()

# Train the model with training data

ModelMLR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = ModelMLR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 22.98
Mean Squared Error (MSE): 881.506
Root Mean Squared Error (RMSE): 29.69
R2_score: 0.760523
Root Mean Squared Log Error (RMSLE): 3.391
Mean Absolute Percentage Error (MAPE): 8.392 %
Adj R Square:  0.758883


In [9]:
Results = pd.DataFrame({'CO2_Emissions_A':y_test, 'CO2_Emissions_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = CO2Emission_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating,CO2_Emissions_A,CO2_Emissions_P
487,2021,Kia,Rio,Compact,1.6,4,AV1,X,7.2,6.0,6.7,42,159,7,3,159,160.97102
455,2021,Jeep,Gladiator 4X4 EcoDiesel,Pickup truck: Standard,3.0,6,A8,D,10.8,8.5,9.8,29,263,4,1,263,256.048404
357,2021,GMC,Sierra 4WD,Pickup truck: Standard,5.3,8,A6,X,16.0,11.8,14.1,20,331,3,5,331,366.87271
508,2021,Lamborghini,Huracan Spyder AWD,Two-seater,5.2,10,AM7,Z,18.0,12.9,15.7,18,371,2,1,371,415.573926
55,2021,Audi,SQ7 quattro,SUV: Standard,4.0,8,AS8,Z,16.0,11.4,13.9,20,324,3,3,324,363.469639


# Compare with all Regression / Regressors

In [10]:
# Load the result dataset

RGRResults = pd.read_csv(r"D:\R3SPAnalytics\01-SDP\Datasets\RGRResults.csv", header=0)

RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score


In [11]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.svm import SVR
#import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
#from sklearn.linear_model import BayesianRidge
#from sklearn.ensemble import BaggingRegressor
#from sklearn.ensemble import GradientBoostingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

ModelMLR= LinearRegression()
ModelDCR = DecisionTreeRegressor()
ModelRFR = RandomForestRegressor()
ModelETR = ExtraTreesRegressor()
#modelSVR = SVR()
#modelXGR = xgb.XGBRegressor()
ModelKNN = KNeighborsRegressor(n_neighbors=5)
#modelBRR = BayesianRidge()
#modelBGR = BaggingRegressor()
#modelGBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
#                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
#                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
#                                     min_impurity_split=None, init=None, random_state=None, max_features=None,
#                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
#                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

# Evalution matrix for all the algorithms

#MM = [modelmlg, modeldcr, modelrfr, modelSVR, modelXGR, modelKNN, modelETR, modelBRR, modelBGR, modelGBR]
MM = [ModelMLR, ModelDCR, ModelRFR, ModelETR, ModelKNN]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    RGRResults = RGRResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------------

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 22.98
Mean Squared Error (MSE): 881.506
Root Mean Squared Error (RMSE): 29.69
R2_score: 0.760523
Root Mean Squared Log Error (RMSLE): 3.391
Mean Absolute Percentage Error (MAPE): 8.39 %
Adj R Square:  0.758883
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 22.268
Mean Squared Error (MSE): 917.362
Root Mean Squared Error (RMSE): 30.288
R2_score: 0.750781
Root Mean Squared Log Error (RMSLE): 3.411
Mean Absolute Percentage Error (MAPE): 7.96 %
Adj R Square:  0.749074
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 19.56
Mean Squared Error (MSE): 639.365
Root Mean Squared Error (RMSE): 25.286
R2_score: 0.826305
Root Mean Squared Log Error (RMSLE): 3.23
Mean Absolute Percentage Error (M

In [12]:
# Results with comparing the all the algorithms 

#RGRResults.to_csv("D://00 Henotic//SRKR//Datasets//REsults//RGRResults.csv")

RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
0,LinearRegression(),22.979788,0.758883,29.690159,8.39178,881.505528,3.390816,0.760523
1,DecisionTreeRegressor(),22.267925,0.749074,30.287989,7.962595,917.362264,3.410751,0.750781
2,"(DecisionTreeRegressor(max_features='auto', ra...",19.559654,0.825115,25.285659,7.303426,639.364547,3.230237,0.826305
3,"(ExtraTreeRegressor(random_state=889351928), E...",16.540953,0.869934,21.806206,6.000314,475.510616,3.082195,0.870819
4,KNeighborsRegressor(),22.269434,0.788435,27.811237,8.008636,773.464906,3.32544,0.789874


In [13]:
# Predict the values with ET algorithm

y_predF = ModelETR.predict(x_test)

In [14]:
Results = pd.DataFrame({'CO2_Emissions_A':y_test, 'CO2_Emissions_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = CO2Emission_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating,CO2_Emissions_A,CO2_Emissions_P
509,2021,Lamborghini,Urus,SUV: Standard,4.0,8,AS8,Z,19.2,14.1,16.9,17,384,1,3,384,335.6
49,2021,Audi,S5 Sportback quattro,Mid-size,3.0,6,AS8,Z,11.5,8.5,10.1,28,237,5,5,237,255.4
613,2021,Mercedes-Benz,AMG GLE 53 4MATIC+ SUV,SUV: Standard,3.0,6,A9,Z,13.2,10.8,12.1,23,285,3,6,285,318.4
19,2021,Audi,A4 Sedan 40 TFSI quattro,Compact,2.0,4,AM7,Z,9.3,7.0,8.3,34,194,6,5,194,199.4
678,2021,Mitsubishi,RVR 4WD,SUV: Small,2.4,4,AV6,X,10.3,8.3,9.4,30,218,5,5,218,237.6


In [15]:
# Calculate the %of Error

ResultsFinal['%Error'] = round(((ResultsFinal['CO2_Emissions_A']-ResultsFinal['CO2_Emissions_P'])/ResultsFinal['CO2_Emissions_A'])*100,3)

In [16]:
# Display the results

ResultsFinal.sample(5)

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating,CO2_Emissions_A,CO2_Emissions_P,%Error
52,2021,Audi,S8 Sedan quattro,Full-size,4.0,8,AS8,Z,17.6,10.7,14.5,19,339,2,3,339,366.6,-8.142
151,2021,Cadillac,XT5 AWD,SUV: Small,2.0,4,AS9,Z,11.2,8.7,10.1,28,237,5,7,237,255.4,-7.764
692,2021,Nissan,Rogue AWD,SUV: Small,2.5,4,AV8,X,9.2,7.2,8.3,34,195,6,7,195,200.8,-2.974
858,2021,Volkswagen,Atlas Cross Sport 4MOTION,SUV: Small,3.6,6,AS8,X,13.8,10.4,12.3,23,289,3,5,289,328.4,-13.633
236,2021,Dodge,Challenger (MDS),Mid-size,5.7,8,A8,X,14.7,9.4,12.3,23,289,3,3,289,311.8,-7.889
