# 6. Save and Load trained model

Two ways to Save andLoad trained model.

* with python `pickle` module
* with `joblib` module

[model_persistence documentation](https://scikit-learn.org/stable/modules/model_persistence.html)

In [1]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_predictions_classification(y_true, y_pred):
    """
    performs evaluation comparion on y_true and y_pred
    """
    accuracy = accuracy_score(y_true, y_pred) 
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    metric_dict = {
        'accuracy': round(accuracy, 2),
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1': round(f1, 2)
    }

    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1: {f1:.2f}')
    
    return metric_dict

In [3]:
import numpy as np
import pandas as pd

heart_disease = pd.read_csv('../00.datasets/heart-diseases.csv')

In [4]:
# shuffle data
heart_disease = heart_disease.sample(frac=1)

In [5]:
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state = 42)

In [6]:
# using RandomziedSearchCV
from sklearn.model_selection import RandomizedSearchCV

grid_params = {
    'n_estimators': [10, 100, 200, 500, 1000],
    'max_depth': [None, 5, 10, 20, 30],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 4, 6]
}

In [8]:
# create model
model = RandomForestClassifier(n_jobs=1) # n_jobs means how much of your PC resources you want to dedicated to ML

# set up Randomized Search CV
rs_model = RandomizedSearchCV(estimator = model, 
                                                      param_distributions = grid_params,
                                                      n_iter = 10, # number of models to try
                                                      refit= True,
                                                      cv=5, 
                                                      verbose = 2)

# model training
rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, total=   0.4s
[CV] n_estimators=200, min_samples_split=6, min_samp

[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10, total=   0.9s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10, total=   0.9s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10, total=   0.9s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10, total=   0.9s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   0.0s
[CV] n_estimators=10, min_samples_split=2, min_samples

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   30.3s finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=1),
                   param_distributions={'max_depth': [None, 5, 10, 20, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 4, 6],
                                        'n_estimators': [10, 100, 200, 500,
                                                         1000]},
                   verbose=2)

In [9]:
# get the best params to see which combinations give the best Result
rs_model.best_params_

{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 10}

In [10]:
# make predictions
rs_y_pred = rs_model.predict(X_val)

# evaluate the performance
rs_metrics = evaluate_predictions_classification(y_val, rs_y_pred)

Accuracy: 82.22%
Precision: 0.83
Recall: 0.89
F1: 0.86


### Final Evaluation with test data

In [11]:
y_pred_test = rs_model.predict(X_test)

evaluate_predictions_classification(y_test, y_pred_test)

Accuracy: 86.96%
Precision: 0.86
Recall: 0.86
F1: 0.86


{'accuracy': 0.87, 'precision': 0.86, 'recall': 0.86, 'f1': 0.86}

---------

# Using `Pickle` module

In [13]:
# save the model 
import pickle

pickle.dump(rs_model, open('models/rs_random_forest_model.pkl', 'wb'))

In [18]:
# load the model
loaded_model = pickle.load(open('models/rs_random_forest_model.pkl','rb'))

In [19]:
pred = loaded_model.predict(X_test)

evaluate_predictions_classification(y_test, pred)

Accuracy: 86.96%
Precision: 0.86
Recall: 0.86
F1: 0.86


{'accuracy': 0.87, 'precision': 0.86, 'recall': 0.86, 'f1': 0.86}

-------

# Using `joblib` module

In [21]:
from joblib import dump, load

In [22]:
# save model
dump(rs_model, 'models/rs_random_forest_model2.joblib')

['models/rs_random_forest_model2.joblib']

In [23]:
# load model
loaded_model_2 = load( 'models/rs_random_forest_model2.joblib')

In [24]:
pred2 = loaded_model_2.predict(X_test)

evaluate_predictions_classification(y_test, pred2)

Accuracy: 86.96%
Precision: 0.86
Recall: 0.86
F1: 0.86


{'accuracy': 0.87, 'precision': 0.86, 'recall': 0.86, 'f1': 0.86}