# Predicting Fuel Efficiency of Vehicles - Part3

## Selecting and Training Models
1. Select and Train a few Algorithm (Linear Regression, Decision Tree, Random Forest)
2. Evaluate using Mean Squared Error
3. Model Evaluation using Cross Validation
4. Hyperparameter Tuning using GridSearchCV
5. Evaluate the Final System on test data
6. Saving the Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 
        'Model Year', 'Origin']

df = pd.read_csv('auto-mpg.data', names=cols, na_values='?',
                comment='\t', sep=" ", skipinitialspace=True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data.Cylinders):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [3]:
# segregate the feature and target variable
data = strat_train_set.drop('MPG', axis=1)
data_labels = strat_train_set['MPG'].copy()

In [4]:
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [5]:
# preprocess custom original column in data
def preprocess_origin_cols(df):
    df['Origin'] = df['Origin'].map({1:'India', 2:'USA', 3:'Germany'})
    return df

In [6]:
# Creating custom attribute adder class

acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power # no *args or **kwargs
    def fit(self, x, y=None):
        return self # nothing else to do
    def transform(self, x):
        acc_on_cyl = x[:, acc_ix]/x[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = x[:, acc_ix] / x[:, hpower_ix]
            return np.c_[x, acc_on_power, acc_on_cyl] # np.c_ will concate numpy array
        return np.c_[x, acc_on_cyl]

In [7]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
         data: original dataframe
    Returns:
         num_attrs: numerical dataframe
         num_pipeline: numerical pipeline object
    '''
    
    numerics = ['float64', 'int64']
    
    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler())
    ])
    data.head()
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    numerical and categorical data.
    
    Argument:
        data: original dataframe
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attr = ['Origin']
    num_attrs, num_pipeline = num_pipeline_transformer(data)

    full_pipeline =ColumnTransformer([
        ('num', num_pipeline, num_attrs.columns),
        ('cat', OneHotEncoder(), cat_attr)
    ])
    
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

## From raw data to preprocessed data in 2 steps

In [8]:
# from raw data to preprocess data in 2steps
preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)

In [12]:
pd.DataFrame(prepared_data[3:, :]).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.322607,0.567467,-0.423404,0.194851,-0.411887,-1.362195,-0.129273,-0.662693,0.0,1.0,0.0
1,-0.856578,-0.930244,-0.369439,-0.856914,-0.050164,-0.816974,-0.035899,0.561274,0.0,0.0,1.0
2,-0.856578,-0.999218,-0.990035,-1.217932,-0.231025,1.63652,0.611597,0.456363,1.0,0.0,0.0


## <font color='red'>Selecting and Training Models</font>
1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM regressor

## Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression()

In [11]:
# testing the predictions with the
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)
print('Prediction of samples:', lin_reg.predict(sample_data_prepared))

Prediction of samples: [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [12]:
print('Actual labels of sample: ', list(sample_labels))

Actual labels of sample:  [32.0, 31.0, 26.0, 18.0, 26.0]


**Mean Squared Error**

In [13]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.959040222576087

## Decision Tree

In [14]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [15]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

But no model is perfect, this means that our model has overfit the data to a greate extent. We won't be touching our test data until we finalize our model. So how do we check for what's happening?



### Model Evaluation using Cross Validation
Scikit-Learn's K-fond cross-validation feature randomly splits the training set into `K` distinct subsets called folds, then it trains and evaluate the model K times, picking a different fold for evaluation every time and training on the other K-1 folds.

The result is an array containing the K evaluation scores:

In [16]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, prepared_data, data_labels, scoring='neg_mean_squared_error',
                        cv=10)

# As neg_mean.. error is -ve so we'll use -scores to make it +ve
tree_reg_rmse_scores = np.sqrt(-scores)

In [17]:
tree_reg_rmse_scores

array([3.22058807, 3.36781643, 3.00083322, 3.40812448, 2.75249886,
       3.09636884, 3.49852648, 5.03490938, 4.17237458, 2.62617101])

In [18]:
tree_reg_rmse_scores.mean()

3.417821136187876

In [19]:
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring='neg_mean_squared_error',
                        cv=10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [20]:
lin_reg_rmse_scores.mean()

3.075708179370932

# Random Forest model

In [21]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores  = cross_val_score(forest_reg, prepared_data, data_labels,
                                       scoring='neg_mean_squared_error', cv=10)

forest_reg_rmse_score = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_score.mean()

2.6078962257799203

# Support Vector Machine Regressor

In [22]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels,
                                 scoring='neg_mean_squared_error', cv=10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.08659162080283

# <font color='red'>Hyperparameter Tuning using GridSearchCV</font>

In [23]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap':[False], 'n_estimators':[3, 10], 'max_features':[2, 3, 4]}
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, scoring='neg_mean_squared_error',
                          return_train_score=True, cv=10)

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [24]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [25]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30)

In [26]:
cv_scores = grid_search.cv_results_

# printing all the parameters along with their scores
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):
    print(np.sqrt(-mean_score), params)

3.4285931202244133 {'max_features': 2, 'n_estimators': 3}
3.155183620557682 {'max_features': 2, 'n_estimators': 10}
2.8374639599103846 {'max_features': 2, 'n_estimators': 30}
3.3307387415581142 {'max_features': 4, 'n_estimators': 3}
2.79833153594792 {'max_features': 4, 'n_estimators': 10}
2.740721425773104 {'max_features': 4, 'n_estimators': 30}
3.109848700205855 {'max_features': 6, 'n_estimators': 3}
2.8019386991492734 {'max_features': 6, 'n_estimators': 10}
2.631030594509937 {'max_features': 6, 'n_estimators': 30}
3.0665544809131635 {'max_features': 8, 'n_estimators': 3}
2.722600252901034 {'max_features': 8, 'n_estimators': 10}
2.6588090340208113 {'max_features': 8, 'n_estimators': 30}
3.27685357885276 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
3.0582957992261823 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.1330621629685367 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.7918134245648734 {'bootstrap': False, 'max_features': 3, 'n_estimato

# <font color='red'>Checking Feature Importance</font>

In [27]:
# feature importance

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.11750137, 0.31724836, 0.11685504, 0.23454865, 0.01882553,
       0.11486985, 0.03069041, 0.04317678, 0.00208425, 0.0024476 ,
       0.00175217])

In [28]:
extra_attrs = ['acc_on_power', 'acc_on_cyl']
numeric = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numeric))

attrs = num_attrs + extra_attrs

sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.030690414334150977),
 ('acc_on_cyl', 0.04317677727684325),
 ('Weight', 0.23454864818504179),
 ('Model Year', 0.11486984864399326),
 ('Horsepower', 0.11685504220347269),
 ('Displacement', 0.31724836047853794),
 ('Cylinders', 0.11750136644762359),
 ('Acceleration', 0.0188255252983397)]

# <font color='red'>Evaluate the entire system on Test Data</font>

In [29]:
final_model = grid_search.best_estimator_

x_test = strat_test_set.drop('MPG', axis=1)
y_test = strat_test_set['MPG'].copy()

x_test_preprocessed = preprocess_origin_cols(x_test)
x_test_prepared = pipeline_transformer(x_test_preprocessed)

final_predictions = final_model.predict(x_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [30]:
final_rmse

2.9753037193238314

# <font color='red'>Creating a function to cover this entire flow</font>

In [31]:
def predict_mpg(config, model):
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
#     print(prepared_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [32]:
data.columns

Index(['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration',
       'Model Year', 'Origin'],
      dtype='object')

In [33]:
# checking it on a random sample

vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([32.24666667, 17.64      , 21.46666667])

# <font color='red'>Save the model</font>

In [34]:
import pickle

In [35]:
# saving the model
with open('model.bin', 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [37]:
# loading the model from the saved file
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)
    
predict_mpg(vehicle_config, model)

array([32.24666667, 17.64      , 21.46666667])