"""
Faces recognition example using eigenfaces and SVMs

The dataset used in this example is a preprocessed excerpt of the
"Labeled Faces in the Wild", aka LFW_:

  http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB)

  .. _LFW: http://vis-www.cs.umass.edu/lfw/

  original source: http://scikit-learn.org/stable/auto_examples/applications/face_recognition.html

"""

In [1]:
print __doc__

Automatically created module for IPython interactive environment


In [2]:
from time import time
import logging
import pylab as pl
import numpy as np

In [3]:
from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_lfw_people
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import RandomizedPCA
from sklearn.svm import SVC

# Display progress logs on stdout

In [4]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

# Download the data, if not already on disk and load it as numpy arrays

In [6]:
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)



In [7]:
n_samples, h, w = lfw_people.images.shape
np.random.seed(42)

# for machine learning we use the data directly (as relative pixel position info is ignored by this model)

In [10]:
X = lfw_people.data
n_features = X.shape[1]

# the label to predict is the id of the person

In [11]:
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

In [12]:
print "Total dataset size:"
print "n_samples: %d" % n_samples
print "n_features: %d" % n_features
print "n_classes: %d" % n_classes

Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7


# Split into a training and testing set

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled dataset): unsupervised feature extraction / dimensionality reduction

In [14]:
n_components = 150

In [15]:
print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])

Extracting the top 150 eigenfaces from 966 faces


In [17]:
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print "done in %0.3fs" % (time() - t0)

done in 0.308s


In [18]:
eigenfaces = pca.components_.reshape((n_components, h, w))

In [19]:
print "Projecting the input data on the eigenfaces orthonormal basis"
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

Projecting the input data on the eigenfaces orthonormal basis
done in 0.039s


# Train a SVM classification model

In [20]:
print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
         'C': [1e3, 5e3, 1e4, 5e4, 1e5],
          'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
          }
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
## For the 'class_weight' parameter, the string "auto" is a valid value for sklearn version 0.16 and prior, but it is deprecated and will be removed by 0.19. If you are running sklearn version 0.17 or later, pass "balanced" instead.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator_

Fitting the classifier to the training set
done in 18.414s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


# Quantitative evaluation of the model quality on the test set

In [21]:
print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

Predicting the people names on the testing set
done in 0.067s


In [22]:
print classification_report(y_test, y_pred, target_names=target_names)
print confusion_matrix(y_test, y_pred, labels=range(n_classes))

                   precision    recall  f1-score   support

     Ariel Sharon       0.50      0.62      0.55        13
     Colin Powell       0.76      0.88      0.82        60
  Donald Rumsfeld       0.73      0.70      0.72        27
    George W Bush       0.92      0.87      0.89       146
Gerhard Schroeder       0.77      0.80      0.78        25
      Hugo Chavez       0.75      0.60      0.67        15
       Tony Blair       0.88      0.83      0.86        36

      avg / total       0.83      0.83      0.83       322

[[  8   0   3   2   0   0   0]
 [  2  53   1   3   0   1   0]
 [  4   1  19   2   0   0   1]
 [  1  11   2 127   3   1   1]
 [  0   2   0   1  20   1   1]
 [  0   2   0   1   2   9   1]
 [  1   1   1   2   1   0  30]]


# Qualitative evaluation of the predictions using matplotlib

In [23]:
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits.

    Draws at most ``n_row * n_col`` images on a single figure, one per
    subplot, each reshaped to ``(h, w)`` and shown with a gray colormap
    and no axis ticks.

    Parameters
    ----------
    images : sequence
        ``images[i]`` must support ``reshape((h, w))``.
    titles : sequence
        ``titles[i]`` is the caption drawn above ``images[i]``.
    h, w : int
        Height and width each image is reshaped to before display.
    n_row, n_col : int
        Number of rows and columns in the subplot grid.
    """
    pl.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Only fill as many grid cells as there is data for; indexing a full
    # grid would raise IndexError when fewer images or titles are passed
    # (e.g. a gallery of 10 eigenfaces on the default 3x4 grid).
    n_plots = min(n_row * n_col, len(images), len(titles))
    for i in range(n_plots):
        pl.subplot(n_row, n_col, i + 1)
        pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray)
        pl.title(titles[i], size=12)
        pl.xticks(())
        pl.yticks(())

# plot the result of the prediction on a portion of the test set

In [24]:
def title(y_pred, y_test, target_names, i):
    """Build the two-line caption for test sample ``i``.

    The predicted and the true label at position ``i`` are each looked up
    in ``target_names``; only the text after the last space of each name
    is kept (the whole name when it contains no space).
    """
    short_names = tuple(
        target_names[labels[i]].rsplit(' ', 1)[-1]
        for labels in (y_pred, y_test)
    )
    return 'predicted: %s\ntrue:      %s' % short_names

In [25]:
prediction_titles = [title(y_pred, y_test, target_names, i)
                         for i in range(y_pred.shape[0])]

In [26]:
plot_gallery(X_test, prediction_titles, h, w)

# plot the gallery of the most significant eigenfaces

In [27]:
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)

In [28]:
pl.show()

#  How much of the variance is explained by the first principal component? The second?

In [29]:
print pca.explained_variance_ratio_ 

[ 0.19346474  0.15116931  0.07083688  0.05952028  0.05157574  0.02887213
  0.02514474  0.02176463  0.0201937   0.01902118  0.01682174  0.01580626
  0.01223351  0.01087937  0.01064428  0.00979671  0.00892415  0.00854861
  0.00835728  0.00722645  0.0069658   0.00653871  0.00639547  0.0056132
  0.00531102  0.00520167  0.00507469  0.00484211  0.00443586  0.0041782
  0.00393684  0.00381711  0.00356077  0.00351197  0.00334554  0.00329936
  0.00314637  0.00296207  0.00290131  0.00284712  0.00279984  0.00267544
  0.00259903  0.00258378  0.00240921  0.00238992  0.0023542   0.00222581
  0.00217505  0.00216559  0.00209063  0.00205427  0.00200421  0.00197374
  0.00193836  0.00188752  0.00180161  0.00178887  0.00174822  0.00173047
  0.00165645  0.00162943  0.00157416  0.00153416  0.00149965  0.00147248
  0.00143907  0.00141871  0.00139683  0.00138136  0.00133992  0.0013316
  0.00128791  0.00125579  0.00124233  0.00121852  0.00120941  0.00118278
  0.00115082  0.00113637  0.00112584  0.00111595  0.00

# As you add more principal components as features for training your classifier, do you expect it to get better or worse performance?

In [31]:
# Refit the PCA -> SVM chain for several numbers of principal components
# and print one classification report per setting, in the order listed.
for i in [10, 15, 25, 50, 100, 250]:
    n_components = i
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    eigenfaces = pca.components_.reshape((n_components, h, w))
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    # One grid-search fit per setting: fitting a second, identical search
    # on the same data only repeats the most expensive step of the loop.
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)
    # Single-argument print call: same output on Python 2 and Python 3.
    print(classification_report(y_test, y_pred, target_names=target_names))

                   precision    recall  f1-score   support

     Ariel Sharon       0.14      0.23      0.17        13
     Colin Powell       0.45      0.58      0.51        60
  Donald Rumsfeld       0.19      0.26      0.22        27
    George W Bush       0.69      0.54      0.61       146
Gerhard Schroeder       0.12      0.12      0.12        25
      Hugo Chavez       0.24      0.27      0.25        15
       Tony Blair       0.45      0.36      0.40        36

      avg / total       0.49      0.45      0.46       322

                   precision    recall  f1-score   support

     Ariel Sharon       0.56      0.38      0.45        13
     Colin Powell       0.69      0.73      0.71        60
  Donald Rumsfeld       0.45      0.56      0.50        27
    George W Bush       0.73      0.75      0.74       146
Gerhard Schroeder       0.60      0.36      0.45        25
      Hugo Chavez       0.57      0.53      0.55        15
       Tony Blair       0.59      0.61      0.60    

## Remember:
## we can use GridSearchCV to tune parameters such as
## C, kernel and gamma for VALIDATION
## but not the
## n_components for PCA

# Grid of SVC hyper-parameters explored by the cross-validated search.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
# For the 'class_weight' parameter, the string "auto" is a valid value for sklearn version 0.16 and prior, but it is deprecated and will be removed by 0.19. If you are running sklearn version 0.17 or later, pass "balanced" instead.
# Restart the timer so the "done in" line measures this fit only, not the
# time elapsed since an earlier cell last set t0.
t0 = time()
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
# Single-argument print calls: same output on Python 2 and Python 3.
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

#### More components mean a higher F1 score, and a higher F1 score means better performance

# Do you see any evidence of overfitting when using a large number of PCs?

#### from 100 to 250 components, the f1-score decreases