# Topics

## 1. Saving your trained classifier
## 2. GridSearchCV (for facial recognition or Alak)

In [1]:
%matplotlib inline
# All imports

import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import label_binarize


# -------------> The following three are new imports <------------------
import time
from sklearn.metrics import classification_report, confusion_matrix

# Print every float inside NumPy arrays with exactly five decimal places.
five_decimals = '{:.5f}'.format
np.set_printoptions(formatter={'float': five_decimals})


## 1. Saving your classifier

In [2]:
'''
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_lfw_people.html

'''

from sklearn.datasets import fetch_lfw_people

# Loader options gathered in one place so they are easy to spot and tweak.
lfw_options = {
    'min_faces_per_person': 70,
    'resize': 0.4,
}
lfw_people = fetch_lfw_people(**lfw_options)

In [3]:
from sklearn.model_selection import GridSearchCV

In [5]:
"""
Breakout Solution

*******************************
For me, for the >= 70 faces set: the following takes 30-40 sec to train and the results are pretty good (alpha has to be 1e-5, and not 1e-10, not 1e-7):

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), activation='tanh', \
                    random_state=42, max_iter=100000000, learning_rate_init=0.001)
                    
Fitting the classifier to the training set
Training done in 38.525s
Training score: 0.8674948240165632
Predicting people's names on the test set
Validation done in 0.005s
Test score: 0.7422360248447205

+++++++++++++++++++++++++++++

            
but alpha=1e-10 takes much longer to train: 200 sec

===> Consider adding progress bar


In 2016: for iMac's in Comp Phys Lab, it takes anywhere between 10 - 50 min.

"""
# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)

X = lfw_people.data

n_samples, n_features = X.shape

# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("targets: {}".format(y))
print("n_samples: {:d}".format(n_samples))
print("n_features: {:d}".format(n_features))
print("n_classes: {:d}".format(n_classes))


# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)


# Train a NN classification model

print("Fitting the classifier to the training set")
t0 = time.time()

# (250,) works well, too.  (80, ) too.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), activation='tanh', \
                    random_state=42, max_iter=100000000, learning_rate_init=0.001)
      
clf = clf.fit(X_train, y_train)

print("Training done in {:0.3f}s".format(time.time() - t0))
print("Training score: {:.4f}".format(clf.score(X_train, y_train)))

print()
# Quantitative evaluation of the model quality on the test set

print("Predicting people's names on the test set")
t0 = time.time()
y_pred = clf.predict(X_test)
print("Validation done in {:0.3f}s".format(time.time() - t0))
print("Test score: {:.4f}".format(clf.score(X_test, y_test)))

Total dataset size:
targets: [5 6 3 ... 5 3 5]
n_samples: 1288
n_features: 1850
n_classes: 7
Fitting the classifier to the training set


KeyboardInterrupt: 

In [8]:
'''Saving the classifier using pickle'''

import pickle

# Binary write mode is required: pickle produces bytes, not text.
with open('clf-70min-lfw-faces.p', mode='wb') as pickle_file:
    pickle.dump(clf, pickle_file)



In [12]:
'''
Retrieving the saved classifier
'''

# Imported in this cell so that loading also works in a fresh kernel, where
# no earlier cell has imported pickle yet.
import pickle

# NOTE: unpickling can execute arbitrary code. Only load pickle files that
# you created yourself or that come from a source you trust.
with open('clf-70min-lfw-faces.p', 'rb') as f:
    clf_saved = pickle.load(f)

In [14]:
'''
Show the retrieved classifier works the same as before

'''

print("Making predictions on the test set USING THE SAVED CLASSIFIER:")
t0 = time.perf_counter()  # monotonic clock, intended for measuring intervals

# Predict once and derive the accuracy from those predictions. Calling
# clf_saved.score() as well would run a second prediction pass inside the
# timed region and inflate the reported validation time.
y_pred = clf_saved.predict(X_test)
print("Test score: {:.4f}".format(np.mean(y_pred == y_test)))

print("Validation done in {:0.3f}s".format(time.perf_counter() - t0))

Making predictions on the test set USING THE SAVED CLASSIFIER
Test score: 0.7422
Validation done in 0.010s


## 2. GridSearchCV

In [15]:
'''
Example given in sklearn is for a different ML algorithm

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

'''

from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()

# Every combination of these values is tried: 2 kernels x 2 values of C.
svc_parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svc = svm.SVC()

# Dedicated names are used for this demo. Binding the search to `clf` would
# overwrite the MLP face classifier stored under that name, so re-running the
# pickle cell afterwards would save the wrong object.
svc_grid = GridSearchCV(svc, svc_parameters)
svc_grid.fit(iris.data, iris.target)

# List the result fields that the search recorded.
sorted(svc_grid.cv_results_.keys())



['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [None]:
'''How to apply grid search in our case'''


# Base estimator; GridSearchCV overrides the swept parameters per candidate.
mlp_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), activation='tanh',
                        random_state=42, max_iter=100000000, learning_rate_init=0.001)

hidden_layer_options = [(80,), (100,), (100, 20), (100, 20, 4)]

# One sub-grid per solver. learning_rate_init is only used by the 'sgd' and
# 'adam' solvers, so crossing it with 'lbfgs' would refit the same model six
# times for every layer layout (48 candidates instead of the 28 needed).
parameters = [
    {'solver': ['lbfgs'],
     'hidden_layer_sizes': hidden_layer_options},
    {'solver': ['adam'],
     'hidden_layer_sizes': hidden_layer_options,
     'learning_rate_init': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]},
]

# The search is only constructed here. Call grid_clf.fit(X_train, y_train) to
# run it; every candidate is refit once per cross-validation fold.
grid_clf = GridSearchCV(mlp_clf, parameters)


# End of Week14-1