## Selecting and Training Models

1. Select and Train a few Algorithms ( Linear Regression, Decision Tree, Random Forest)
2. Evaluation using Mean Squared Error
3. Model Evaluation using Cross Validation
4. Hyperparameter Tuning using GridSearchCV
5. Check Feature Importance
6. Evaluate the Final System on test data
7. Saving the Model

In [32]:
## importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings 
warnings.filterwarnings('ignore')

In [33]:
# reading the .data file using pandas

cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 
        'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names=cols, na_values = '?',
                comment = '\t',
                sep = " ",
                skipinitialspace=True)

data = df.copy()


split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]


In [34]:
# segregate the feature and target variable
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set["MPG"].copy()
data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2


In [35]:
## preprocess the Origin column in data
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

In [36]:
## creating custom attribute adder class
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, hpower_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]
    

In [37]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
    '''
    numerics = ['float64', 'int64']
    
    num_attrs = data.select_dtypes(include=numerics)
    
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler())
    ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    numerical and categorical data.
    
    Argument:
        data: original dataframe
    Returns:
        prepared_data: transformed data, ready to use 
    '''
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
    ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

## From raw data to processed data in 2 steps

In [38]:
## from raw data to processes data in 2 steps
preprocessed_df = preprocess_origin_cols(data)
# preprocessed_df
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.70952741,  1.        ,  0.        ,
        0.        ])

## Select and Train Models

1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM regression

## Linear Regression

In [39]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [40]:
# testing the prediction with the test data

sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)
print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

Prediction of samples:  [29.13771344 27.8060451  26.01346093 12.74611562 22.18711329]


In [41]:
print("Actual Labels of samples: ", list(sample_labels))

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


### Mean Squared Error

In [42]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.959295581945071

## Decision Tree

In [43]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [44]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

But no model is perfect, this means that our model has overfit the data to a great extent

We won't be touching our test data until we finalize our model. So, how do we check for what's happening?

## Model Evaluation using Cross Validation

Scikit-Learn's K-fold cross-validation feature randomly splits the training set into k distinct subsets called folds, then it trains and evaluates the model K times, picking a different fold for evaluation every time and training on the other K-1 folds.

The result is an array containing the K evaluation scores:

In [45]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, 
                        prepared_data,
                        data_labels,
                        scoring="neg_mean_squared_error",
                        cv = 10)
tree_reg_rmse_scores = np.sqrt(-scores)

In [46]:
tree_reg_rmse_scores

array([2.76942007, 2.65918221, 3.02820079, 3.51670122, 2.77207053,
       3.37273072, 3.22790567, 3.5383612 , 4.3807865 , 3.38440446])

In [47]:
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring="neg_mean_squared_error", cv=10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.35499199, 3.41593033, 3.6628555 , 2.53876217, 2.47924695,
       2.72779158, 3.3262419 , 2.42296059, 3.73278446, 2.85886034])

In [48]:
lin_reg_rmse_scores.mean()

3.052042580702212

## Random Forest model

In [49]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                      prepared_data,
                                      data_labels,
                                      scoring='neg_mean_squared_error',
                                      cv=10)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

2.559703643807294

## Support Vector Machine Regressor


In [50]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                          scoring='neg_mean_squared_error',
                          return_train_score=True,
                          cv=10,
                          )

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_j

In [51]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [52]:
cv_scores = grid_search.cv_results_

##printing all the parameters along with their scores
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)

3.6482467280966757 {'max_features': 2, 'n_estimators': 3}
3.092797020054511 {'max_features': 2, 'n_estimators': 10}
2.9170691411804963 {'max_features': 2, 'n_estimators': 30}
3.5210699322424617 {'max_features': 4, 'n_estimators': 3}
2.7392695948567116 {'max_features': 4, 'n_estimators': 10}
2.7923250417286445 {'max_features': 4, 'n_estimators': 30}
3.04299315603408 {'max_features': 6, 'n_estimators': 3}
2.95944958987309 {'max_features': 6, 'n_estimators': 10}
2.633374816988139 {'max_features': 6, 'n_estimators': 30}
3.1072383229356118 {'max_features': 8, 'n_estimators': 3}
2.814220461235311 {'max_features': 8, 'n_estimators': 10}
2.668732600062472 {'max_features': 8, 'n_estimators': 30}
3.2692086575218426 {'max_features': 2, 'n_estimators': 3, 'bootstrap': False}
2.917301062439612 {'max_features': 2, 'n_estimators': 10, 'bootstrap': False}
3.1323050639171246 {'max_features': 3, 'n_estimators': 3, 'bootstrap': False}
2.970734358883067 {'max_features': 3, 'n_estimators': 10, 'bootstrap':

## Checking Feature importance

In [53]:
# feature importance

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.15952927, 0.30370733, 0.12760556, 0.20641487, 0.02244683,
       0.10955506, 0.03795247, 0.02636223, 0.0022453 , 0.0013905 ,
       0.00279059])

In [54]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.037952467778910745),
 ('acc_on_cyl', 0.026362231079710988),
 ('Weight', 0.20641486900993777),
 ('Model Year', 0.10955505750613402),
 ('Horsepower', 0.12760556359951133),
 ('Displacement', 0.30370732708137826),
 ('Cylinders', 0.1595292713382288),
 ('Acceleration', 0.02244682896961372)]

## Evaluating the entire system on Test Data

In [55]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("MPG", axis=1)
y_test = strat_test_set["MPG"].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [56]:
final_rmse

3.1021740808306975

## Create a function to cover this entire flow

In [57]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [58]:
##checking it on a random sample
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([28.56333333, 22.88666667, 17.76666667])

## Save the Model

In [59]:
import pickle

In [60]:
## saving the model
with open("model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [61]:
## Loading the model from the saved file
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)
    
predict_mpg(vehicle_config, model)

array([28.56333333, 22.88666667, 17.76666667])

In [63]:
import requests

url = "https://predict-fuel-efficiency.herokuapp.com/"
r = requests.post(url, json=vehicle_config)
r.text.strip()

'{"mpg_predictions":[26.139999999999997,26.550000000000004,19.129999999999995]}'