# Productionisation of Machine Learning Models (Model Serialization)

### Loading the Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides anything the model-fitting cells further down
# might emit (e.g. solver or deprecation warnings) — consider narrowing the
# filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw Toyota Corolla listing; the file is decoded as latin1.
cardata = pd.read_csv(
    filepath_or_buffer='data/ToyotaCorolla.csv',
    encoding='latin1',
)

In [3]:
# Report the size of the raw table as a (rows, columns) tuple.
row_count, column_count = cardata.shape
print((row_count, column_count))

(1436, 38)


In [4]:
# Keep only the target (Price) and the eight numeric columns used as features.
selected_columns = [
    "Price",
    "Age_08_04",
    "KM",
    "HP",
    "cc",
    "Doors",
    "Gears",
    "Quarterly_Tax",
    "Weight",
]
cardata = cardata[selected_columns]
cardata

Unnamed: 0,Price,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight
0,13500,23,46986,90,2000,3,5,210,1165
1,13750,23,72937,90,2000,3,5,210,1165
2,13950,24,41711,90,2000,3,5,210,1165
3,14950,26,48000,90,2000,3,5,210,1165
4,13750,30,38500,90,2000,3,5,210,1170
...,...,...,...,...,...,...,...,...,...
1431,7500,69,20544,86,1300,3,5,69,1025
1432,10845,72,19000,86,1300,3,5,69,1015
1433,8500,71,17016,86,1300,3,5,69,1015
1434,7250,70,16916,86,1300,3,5,69,1015


In [5]:
# Feature matrix: every selected column except the target.
X = cardata.drop(columns='Price')
# Target vector: the Price column on its own.
y = cardata['Price']

### Train-Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

### Data Preprocessing on Training Data

In [7]:
# Standardize the numeric features.
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split here; the same fitted object is
# reused later for the test split and is what gets serialized.
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)

# Wrap the scaled array back into a DataFrame so the original row labels and
# column names are preserved.
X_train_rescaled = pd.DataFrame(
    data=train_scaled_values,
    index=X_train.index,
    columns=X_train.columns,
)

X_train_rescaled.head()

Unnamed: 0,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight
403,-0.383727,1.022845,-0.294503,-0.37416,1.024196,-0.126261,-0.047964,-0.23006
454,-0.329753,0.081026,0.560251,0.044554,-1.070883,-0.126261,-0.047964,-0.034191
1316,1.019579,-0.045662,-1.017756,-0.583516,-0.023343,-0.126261,-0.43801,-1.405276
384,-0.11386,2.119434,-1.93826,0.881981,1.024196,-0.126261,2.389826,1.23896
80,-1.679086,-1.299904,0.560251,30.191928,1.024196,-0.126261,0.317705,2.120371


In [8]:
# Inspect what the scaler learned from the training split: the per-feature
# variance and the per-feature scale (its square root) used as the divisor.
(scaler.var_, scaler.scale_)

(array([3.43274904e+02, 1.43014564e+09, 2.31315487e+02, 2.28152908e+05,
        9.11295433e-01, 3.65574100e-02, 1.68270368e+03, 2.60655737e+03]),
 array([1.85276794e+01, 3.78172664e+04, 1.52090594e+01, 4.77653544e+02,
        9.54617951e-01, 1.91199921e-01, 4.10207713e+01, 5.10544550e+01]))

### Preparing Test Data

In [9]:
# Apply the already-fitted scaler to the held-out rows (transform only, no
# refit), then restore the row labels and column names.
test_scaled_values = scaler.transform(X_test)

X_test_rescaled = pd.DataFrame(
    data=test_scaled_values,
    index=X_test.index,
    columns=X_test.columns,
)

X_test_rescaled.head()

Unnamed: 0,Age_08_04,KM,HP,cc,Doors,Gears,Quarterly_Tax,Weight
995,0.641766,-0.698829,0.560251,0.044554,-1.070883,-0.126261,-0.43801,-0.425929
514,-0.005914,-0.442385,-0.294503,-0.37416,-1.070883,-0.126261,-0.43801,-0.915603
612,0.156006,2.263654,-1.93826,0.881981,-1.070883,-0.126261,2.389826,0.945156
307,-0.76154,-0.799788,0.560251,0.044554,-1.070883,-0.126261,-0.43801,-0.621799
981,0.04806,-0.639332,0.560251,0.044554,-1.070883,-0.126261,-0.43801,-0.425929


### Training KNN Classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with library-default settings, fitted on the
# standardized training features.
# NOTE(review): y_train is the Price column, whose previewed values (13500,
# 13750, 13950, ...) look like continuous amounts rather than class labels.
# Each distinct price is presumably treated as its own class here; a regressor
# may be the better fit — confirm the intended task.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X=X_train_rescaled, y=y_train)

KNeighborsClassifier()

In [11]:
# Predict on the standardized hold-out rows with the fitted KNN model.
# y_test_pred is reassigned by each model's prediction cell in turn.
y_test_pred = knn_classifier.predict(X=X_test_rescaled)

In [12]:
# Fraction of hold-out rows whose predicted label matches the true label
# exactly, using whatever the preceding prediction cell stored in y_test_pred.
# NOTE(review): the target is Price; if it is continuous, exact-match accuracy
# will stay very low regardless of model quality — a regression metric may be
# more informative. Confirm the intended task.
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.05013927576601671

### Training Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings, fitted on the
# standardized training features.
# NOTE(review): all warnings are globally ignored at the top of the notebook,
# so a solver convergence warning (if any) would not be visible here — verify.
lr_classifier = LogisticRegression()
lr_classifier.fit(X=X_train_rescaled, y=y_train)

LogisticRegression()

In [14]:
# Predict on the standardized hold-out rows with the fitted logistic
# regression model (overwrites the previous model's predictions).
y_test_pred = lr_classifier.predict(X=X_test_rescaled)

In [15]:
# Exact-match accuracy of the predictions currently held in y_test_pred
# against the true hold-out labels.
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.08356545961002786

### Training Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library-default settings, fitted on the
# standardized training features.
nb_classifier = GaussianNB()
nb_classifier.fit(X=X_train_rescaled, y=y_train)

GaussianNB()

In [17]:
# Predict on the standardized hold-out rows with the fitted naive Bayes model
# (overwrites the previous model's predictions).
y_test_pred = nb_classifier.predict(X=X_test_rescaled)

In [18]:
# Exact-match accuracy of the predictions currently held in y_test_pred
# against the true hold-out labels.
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.022284122562674095

### Training Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to three levels, fitted on the standardized training
# features.
# NOTE(review): no random_state is passed, so the fitted tree (and the score
# below) may differ between runs when candidate splits tie — consider fixing
# a seed for reproducibility.
dt_classifier = DecisionTreeClassifier(max_depth=3)
dt_classifier.fit(X=X_train_rescaled, y=y_train)

DecisionTreeClassifier(max_depth=3)

In [20]:
# Predict on the standardized hold-out rows with the fitted decision tree
# (overwrites the previous model's predictions).
y_test_pred = dt_classifier.predict(X=X_test_rescaled)

In [21]:
# Exact-match accuracy of the predictions currently held in y_test_pred
# against the true hold-out labels.
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.12534818941504178

### Training Support Vector Classifier

In [22]:
from sklearn.svm import SVC

# Support vector classifier with library-default settings, fitted on the
# standardized training features.
sv_classifier = SVC()
sv_classifier.fit(X=X_train_rescaled, y=y_train)

SVC()

In [23]:
# Predict on the standardized hold-out rows with the fitted support vector
# classifier (overwrites the previous model's predictions).
y_test_pred = sv_classifier.predict(X=X_test_rescaled)

In [24]:
# Exact-match accuracy of the predictions currently held in y_test_pred
# against the true hold-out labels.
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.10027855153203342

## Saving the Model (Serialization)

In [25]:
from pathlib import Path
from pickle import dump

# Create the output folder if it is missing so the cell also works on a fresh
# checkout (open(..., 'wb') does not create parent directories).
model_dir = Path('models')
model_dir.mkdir(parents=True, exist_ok=True)

# File name -> fitted object. The scaler is saved alongside the models because
# every model was trained on features transformed by it.
artifacts = {
    'standard_scaler.pkl': scaler,
    'knn_model.pkl': knn_classifier,
    'lr_model.pkl': lr_classifier,
    'nb_model.pkl': nb_classifier,
    'dt_model.pkl': dt_classifier,
    'sv_model.pkl': sv_classifier,
}

for file_name, fitted_object in artifacts.items():
    # The context manager closes (and therefore flushes) each file as soon as
    # it is written, instead of leaving the handle open until garbage
    # collection.
    with open(model_dir / file_name, 'wb') as model_file:
        dump(fitted_object, model_file)

# THE END !!!