# Saving and Loading ML Models

In [1]:
import pickle

import pandas
from sklearn.linear_model import LogisticRegression

# scikit-learn 0.20 removed `sklearn.cross_validation`; its replacement,
# `sklearn.model_selection`, exposes the same `train_test_split` function.
try:
    from sklearn import cross_validation
except ImportError:
    from sklearn import model_selection as cross_validation

# scikit-learn 0.23 removed the vendored `sklearn.externals.joblib`;
# fall back to the standalone joblib package that scikit-learn depends on.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib



In [2]:
# Load the Pima Indians diabetes data used throughout these examples;
# the column names are supplied explicitly via `names`
url = "pima-indians-diabetes.csv"
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values

In [3]:
# Columns 0-7 hold the features (X); column 8 holds the label (y)
X, y = array[:, :8], array[:, 8]

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
test_size = 0.3
seed = 8
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

# Train the logistic regression model on the training portion
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## pickle
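Pickle is the standard way of serializing objects in Python. This example saves the logistic regression model trained above to a file, then loads it back and evaluates its accuracy on the held-out test data. Only unpickle files you trust, since loading a pickle can execute arbitrary code.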

In [5]:
# Save the model to disk
# (the `with` block closes the file, so the bytes are flushed before it is read back)
filename = 'model_pickle.dat'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Some time later...

# Load the model from disk
# NOTE: unpickling can execute arbitrary code -- only load files you trust
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Score the restored model on the held-out test set
result = loaded_model.score(X_test, y_test)
print(result)

0.7792207792207793


## joblib
Joblib is part of the SciPy ecosystem and provides utilities for efficiently saving and loading Python objects that use NumPy data structures. This example saves the logistic regression model trained above to a file, then loads it back and evaluates its accuracy on the held-out test data.

In [6]:
# Save the model to disk with joblib (dump takes the target path directly)
filename = 'model_joblib.dat'
joblib.dump(model, filename)

# Some time later...

# Load the model back from the same file and score it on the held-out test set
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, y_test)
print(result)

0.7792207792207793


## Tips for saving and loading models
* Make a note of the Python version used to save the model. The same Python version may be required when later loading the model.
* Make a note of the versions of the imported libraries (e.g. scikit-learn, NumPy) used to build the model; a saved model may not load correctly under different versions.
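* Consider also recording the model's parameters manually (e.g. its hyperparameters and learned coefficients), so the model can be reconstructed if the saved file can no longer be loaded.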