## Model Training

### Import data & required packages

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
#from catboost import CatBoostRegressor
#from xgboost import XGBRegressor
import warnings

In [2]:
# Load the preprocessed dataset. A forward slash keeps the relative path valid on
# every OS and avoids the invalid "\p" escape sequence that a backslash produces
# inside a normal string literal.
# NOTE(review): the file name is spelled 'preprocessig' — presumably it matches
# the file on disk; confirm before renaming either side.
df = pd.read_csv('data/preprocessig_data.csv')

In [3]:
# Preview the first rows to sanity-check that the CSV parsed into the expected columns.
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4.0,AS5,Z,9.9,6.7,8.5,33.0,196.0
1,ACURA,ILX,COMPACT,2.0,4.0,AS5,Z,9.9,6.7,8.5,33.0,196.0
2,ACURA,ILX,COMPACT,2.4,4.0,M6,Z,11.2,7.7,9.6,29.0,221.0
3,ACURA,ILX HYBRID,COMPACT,1.5,4.0,AV7,Z,6.0,5.8,5.9,48.0,136.0
4,ACURA,MDX 4WD,SUV - SMALL,3.5,6.0,AS6,Z,12.7,9.1,11.1,25.0,255.0


In [4]:
# List the column labels; the feature/target selection below refers to these names.
df.columns

Index(['Make', 'Model', 'Vehicle Class', 'Engine Size(L)', 'Cylinders',
       'Transmission', 'Fuel Type', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
       'Fuel Consumption Comb (mpg)', 'CO2 Emissions(g/km)'],
      dtype='object')

### Prepare x & y variables

In [5]:
# Feature matrix: every column of df except the CO2 emissions target.
x = df.drop(columns='CO2 Emissions(g/km)')

In [6]:
# Preview the feature frame to confirm the target column is no longer present.
x.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg)
0,ACURA,ILX,COMPACT,2.0,4.0,AS5,Z,9.9,6.7,8.5,33.0
1,ACURA,ILX,COMPACT,2.0,4.0,AS5,Z,9.9,6.7,8.5,33.0
2,ACURA,ILX,COMPACT,2.4,4.0,M6,Z,11.2,7.7,9.6,29.0
3,ACURA,ILX HYBRID,COMPACT,1.5,4.0,AV7,Z,6.0,5.8,5.9,48.0
4,ACURA,MDX 4WD,SUV - SMALL,3.5,6.0,AS6,Z,12.7,9.1,11.1,25.0


In [7]:
# Target vector: the CO2 emissions column that the models below are fitted against.
y = df['CO2 Emissions(g/km)']

In [8]:
# Display the target series (the repr is abbreviated for a series of this length).
y

0       196.0
1       196.0
2       221.0
3       136.0
4       255.0
        ...  
6300    219.0
6301    232.0
6302    240.0
6303    232.0
6304    248.0
Name: CO2 Emissions(g/km), Length: 6305, dtype: float64

In [9]:
# (label, column) pairs: each label is printed verbatim, followed by the unique
# values of the matching categorical column.
category_reports = (
    ("Categories in 'Make' variable:     ", 'Make'),
    ("Categories in 'Model' variable:  ", 'Model'),
    ("Categories in'Vehicle Class' variable:", 'Vehicle Class'),
    ("Categories in 'Transmission' variable:     ", 'Transmission'),
    ("Categories in 'Fuel Type' variable:     ", 'Fuel Type'),
)

for label, column_name in category_reports:
    # A single print with a one-space separator emits the same text as printing
    # the label with end=" " and then the array on the same line.
    print(label, df[column_name].unique(), sep=" ")

Categories in 'Make' variable:      ['ACURA' 'ALFA ROMEO' 'ASTON MARTIN' 'AUDI' 'BENTLEY' 'BMW' 'BUICK'
 'CADILLAC' 'CHEVROLET' 'CHRYSLER' 'DODGE' 'FIAT' 'FORD' 'GENESIS' 'GMC'
 'HONDA' 'HYUNDAI' 'INFINITI' 'JAGUAR' 'JEEP' 'KIA' 'LAMBORGHINI'
 'LAMBORGHINI Aventador Coupe' 'LAND ROVER' 'LEXUS' 'LINCOLN' 'MASERATI'
 'MAZDA' 'MERCEDES-BENZ' 'MINI' 'MITSUBISHI' 'NISSAN' 'PORSCHE' 'RAM'
 'ROLLS-ROYCE' 'SCION' 'SMART' 'SRT' 'SUBARU' 'TOYOTA' 'VOLKSWAGEN'
 'VOLVO']
Categories in 'Model' variable:   ['ILX' 'ILX HYBRID' 'MDX 4WD' ... 'V90 CC T5 AWD' 'XC40 T5 AWD'
 'XC40 T4 AWD']
Categories in'Vehicle Class' variable: ['COMPACT' 'SUV - SMALL' 'MID-SIZE' 'TWO-SEATER' 'MINICOMPACT'
 'SUBCOMPACT' 'FULL-SIZE' 'STATION WAGON - SMALL' 'SUV - STANDARD'
 'VAN - CARGO' 'VAN - PASSENGER' 'PICKUP TRUCK - STANDARD'
 'SPECIAL PURPOSE VEHICLE' 'PICKUP TRUCK - SMALL' 'MINIVAN'
 'STATION WAGON - MID-SIZE']
Categories in 'Transmission' variable:      ['AS5' 'M6' 'AV7' 'AS6' 'AM7' 'AM8' 'AS9' 'AM9' 'AS10' 'AM6' 

### Create a column transformer

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

# Split the feature columns by dtype: object columns are treated as categorical,
# every other column as numeric.
cat_features = x.select_dtypes(include="object").columns
num_features = x.select_dtypes(exclude="object").columns

oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# One-hot encode the categorical columns and standardise the numeric ones.
# The encoded categorical columns come first in the transformed output because
# their transformer is listed first.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)


In [11]:
# Fit the preprocessor and encode/scale the features. The input is rebuilt from
# df rather than read from x, so this cell can be re-run safely: feeding x back
# into fit_transform fails once x has been replaced by the transformed matrix.
# NOTE(review): the preprocessor is fitted on every row before the train/test
# split further down, so the scaler statistics and encoder categories include
# rows that end up in the test set; consider fitting on the training rows only.
x = preprocessor.fit_transform(df.drop(columns=['CO2 Emissions(g/km)']))

In [12]:
# Check the size of the transformed matrix (rows, encoded feature columns).
x.shape

(6305, 1683)

In [13]:
# Show the repr of the transformed features (a sparse matrix, per the output below).
x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 69355 stored elements and shape (6305, 1683)>

### Train & test split

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Display both shapes to confirm the split sizes.
(x_train.shape, x_test.shape)

((5044, 1683), (1261, 1683))

### Create an evaluation function that returns all metrics after model training

In [15]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for one set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions aligned with ``true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as the input to the RMSE, so compute it once here.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [16]:
# Candidate regressors, keyed by the display name used in the printed report and
# in the results table. Estimators with internal randomness get a fixed seed so
# that re-running the notebook reproduces the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decison Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    #"XGBRegressor":XGBRegressor(),
    #"CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

# Parallel lists consumed by the results table: model names and their test-set R2.
model_list = []
r2_list = []

# Iterate over name/estimator pairs directly instead of indexing into freshly
# built key/value lists on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)  # train model

    # make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # evaluate train and test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Only the test-set R2 is kept for the ranking table.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')



### Result

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 4.2125
- Mean Absolute Error: 2.3566
- R2 Score: 0.9949
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 43.2954
- Mean Absolute Error: 4.6465
- R2 Score: 0.4671


Lasso
Model performance for Training set
- Root Mean Squared Error: 10.2014
- Mean Absolute Error: 6.5689
- R2 Score: 0.9702
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 9.3397
- Mean Absolute Error: 6.2884
- R2 Score: 0.9752


Ridge
Model performance for Training set
- Root Mean Squared Error: 4.5770
- Mean Absolute Error: 2.5878
- R2 Score: 0.9940
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 38.0591
- Mean Absolute Error: 4.1288
- R2 Score: 0.5882


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 8.0018
- Mean Absolute Error: 5.2084
- R2 Score: 0.9817
--------------------

# Result

In [17]:
# Rank the models by test-set R2, best first.
(
    pd.DataFrame({'Model Name': model_list, 'R2_Score': r2_list})
    .sort_values(by='R2_Score', ascending=False)
)

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.997171
4,Decison Tree,0.996137
1,Lasso,0.975202
3,K-Neighbors Regressor,0.967258
6,AdaBoost Regressor,0.934224
2,Ridge,0.588214
0,Linear Regression,0.467111


In [21]:
# Print the distinct 'Model' values; the printed array is abbreviated with "...".
print(df['Model'].unique())

['ILX' 'ILX HYBRID' 'MDX 4WD' ... 'V90 CC T5 AWD' 'XC40 T5 AWD'
 'XC40 T4 AWD']


In [22]:
# Count the distinct 'Model' values.
# NOTE(review): with this many categories, most of the one-hot encoded columns
# presumably come from 'Model' — worth confirming when interpreting the
# train/test gap of the linear models.
unique_count = df['Model'].nunique()
print(unique_count)


1588


In [23]:
# Same values as the earlier `print(df['Model'].unique())` cell, stored in a
# variable before printing.
unique_values = df['Model'].unique()
print(unique_values)


['ILX' 'ILX HYBRID' 'MDX 4WD' ... 'V90 CC T5 AWD' 'XC40 T5 AWD'
 'XC40 T4 AWD']
