# Selecting and Training Models

1. Select and Train a few Algorithms(Linear Regression, Decision Tree, RandomForest)
2. Evaluation using Mean Squared Error
3. Model Evaluation using Cross Validation
4. Hyperparameter Tuning using GridSearchCV
5. Check Feature Importance
6. Evaluate the Final System on test data
7. Saving the Model


In [11]:
##importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


import warnings
warnings.filterwarnings('ignore')

In [12]:
# reading the .data file using pandas

cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [13]:
##segregate the feature and target variable
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set["MPG"].copy()
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [14]:
##preprocess the Origin column in data
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

In [15]:
##creating custom attribute adder class
acc_ix, hpower_ix, cyl_ix = 4,2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]

In [16]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

## From raw data to processed data in 2 steps

In [17]:
## from raw data to processed data in two 2 steps 
preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [18]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

## Selecting and Training models
1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM regressor

### 1. Linear Regression

In [19]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression()

In [20]:
## testing the predictions with the 
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction on Sample data : ", lin_reg.predict(sample_data_prepared))

Prediction on Sample data :  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [21]:
print("Actual Labels of samples: ", list(sample_labels))

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


#### Mean Squared Error

In [26]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.9590402225760863

### 2. Decision Tree

In [47]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [48]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_mse

0.0

But no model is perfect, this means that our model has overfit the data to a great extent.

We won't be touching out test data until we finalize our model. So, how do we check for what's happening?

#### Model Evaluation using Cross Validation

Scikit-Learn’s K-fold cross-validation feature randomly splits the training set into K distinct subsets called folds, then it trains and evaluates the model K times, picking a different fold for evaluation every time and training on the other K-1 folds.

The result is an array containing the K evaluation scores:

In [49]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg,
                        prepared_data,
                        data_labels,
                        scoring="neg_mean_squared_error",
                        cv=10)
tree_reg_rmse_scores = np.sqrt(-scores)

In [50]:
tree_reg_rmse_scores

array([3.16652412, 3.22567822, 2.91161467, 3.2069261 , 2.25963216,
       2.97331885, 3.55184811, 4.68484925, 4.17577499, 2.66119061])

In [51]:
tree_reg_rmse_scores.mean()

3.2817357092348765

In [52]:
## Cross-validation evaluation for Linear Model
scores = cross_val_score(lin_reg, prepared_data, 
                         data_labels, 
                         scoring="neg_mean_squared_error", 
                         cv = 10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [53]:
lin_reg_rmse_scores.mean()

3.075708179370932

### 3. Random Forrest

In [54]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                      prepared_data,
                                      data_labels,
                                      scoring="neg_mean_squared_error",
                                      cv=10)

forrest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forrest_reg_rmse_scores.mean()

2.57609323277891

### 4. Support Vector Machine Regressor

In [55]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_score = cross_val_score(svm_reg, 
                               prepared_data, 
                               data_labels,
                               scoring="neg_mean_squared_error",
                               cv=10)
svm_rmse_scores = np.sqrt(-svm_cv_score)
svm_rmse_scores.mean()

3.086591620802831

## Hyperparameter Tuning using GridSearchCV

In [56]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forrest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                          scoring='neg_mean_squared_error',
                          return_train_score=True,
                          cv=10,
                          )

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [57]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 10}

In [58]:
cv_scores = grid_search.cv_results_

## printing all the parameters along with their scores

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)

3.3344840755623966 {'max_features': 2, 'n_estimators': 3}
3.152297923353794 {'max_features': 2, 'n_estimators': 10}
2.8831076649763667 {'max_features': 2, 'n_estimators': 30}
3.0912244814640792 {'max_features': 4, 'n_estimators': 3}
2.93081663004556 {'max_features': 4, 'n_estimators': 10}
2.7344913635762533 {'max_features': 4, 'n_estimators': 30}
3.1944381584011707 {'max_features': 6, 'n_estimators': 3}
2.7402437880862283 {'max_features': 6, 'n_estimators': 10}
2.7024456464466047 {'max_features': 6, 'n_estimators': 30}
3.08906603442752 {'max_features': 8, 'n_estimators': 3}
2.671255088904839 {'max_features': 8, 'n_estimators': 10}
2.705762051956904 {'max_features': 8, 'n_estimators': 30}
3.367991727954493 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
3.073729554137092 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.192791973636485 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.851355008248625 {'bootstrap': False, 'max_features': 3, 'n_estimators

### Checking Feature importance 

In [59]:
feature_importance = grid_search.best_estimator_.feature_importances_
feature_importance

array([0.12622476, 0.295405  , 0.15929424, 0.23948901, 0.01700968,
       0.11265794, 0.02626374, 0.01778286, 0.0018543 , 0.00143576,
       0.00258272])

In [60]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importance), reverse=True)

[('acc_on_power', 0.026263743002759153),
 ('acc_on_cyl', 0.01778285612926325),
 ('Weight', 0.23948900585904026),
 ('Model Year', 0.11265794228292969),
 ('Horsepower', 0.1592942361838274),
 ('Displacement', 0.2954050015000301),
 ('Cylinders', 0.12622475506325048),
 ('Acceleration', 0.017009683003898244)]

## Evaluating the entire system on Test Data

In [61]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("MPG", axis = 1)
y_test = strat_test_set["MPG"].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [62]:
final_rmse

3.239769281908821

## Creating a function to cover this entire flow

In [63]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [64]:
## checking it on a random example 
vehicle_config = {
    'Cylinders' : [4, 6, 8],
    'Displacement' : [155.0, 160.0, 165.5],
    'Horsepower' : [93.0, 130.0, 98.0],
    'Weight' : [2500.0, 3150.0, 2600.0],
    'Accelaration' : [15.0, 14.0, 16.0],
    'Model Year' : [81, 80, 78],
    'Origin' : [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([36.3 , 16.39, 20.04])

## Save the Model

In [65]:
import pickle

In [66]:
## Saving the model
with open("model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [67]:
## loading the model from the saved file
with open("model.bin", 'rb') as f_in:
    model = pickle.load(f_in)
    
predict_mpg(vehicle_config, model)

array([36.3 , 16.39, 20.04])