In [49]:
from time import time
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import os
import pandas as pd

In [76]:
# Load the Labeled Faces in the Wild dataset from the local "temp" directory.
# download_if_missing=False means no download is attempted, so the files must
# already be present there. min_faces_per_person=100 filters the people kept
# (per the scikit-learn docs: only people with at least that many pictures).
lfw_dataset = fetch_lfw_people(
    data_home="temp",
    min_faces_per_person=100,
    download_if_missing=False,
)

In [77]:
# `images` unpacks to (n_samples, h, w); h and w are kept so that feature
# vectors can be reshaped back into images later.
n_samples, h, w = lfw_dataset.images.shape

# Feature matrix and labels. The model is fed `data` directly, so relative
# pixel position information is ignored by it. The label to predict is the
# id of the person.
X = lfw_dataset.data
y = lfw_dataset.target
n_features = X.shape[1]

# One entry in `target_names` per class.
target_names = lfw_dataset.target_names
n_classes = len(target_names)

In [78]:
# Summary statistics (count, mean, std, quartiles, min/max) for every feature column of X.
pd.DataFrame(data=X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2904,2905,2906,2907,2908,2909,2910,2911,2912,2913
count,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,...,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0
mean,0.351984,0.363995,0.386272,0.420429,0.45985,0.495814,0.52639,0.550349,0.568752,0.584434,...,0.423837,0.45121,0.478944,0.499107,0.504997,0.493591,0.475365,0.454717,0.425807,0.397944
std,0.169499,0.166324,0.163815,0.162791,0.161108,0.160037,0.152762,0.145584,0.141472,0.138356,...,0.235938,0.253246,0.270739,0.284656,0.29256,0.299549,0.306241,0.309115,0.306493,0.30556
min,0.0,0.001307,0.0,0.002614,0.002614,0.005229,0.007843,0.005229,0.011765,0.024837,...,0.003922,0.003922,0.003922,0.002614,0.001307,0.0,0.0,0.0,0.0,0.0
25%,0.231373,0.249673,0.273203,0.312418,0.361765,0.4,0.43366,0.45719,0.478431,0.49902,...,0.244444,0.25719,0.267647,0.259804,0.25098,0.219281,0.180392,0.163399,0.142157,0.121569
50%,0.338562,0.355556,0.386928,0.422222,0.460131,0.498693,0.530719,0.552288,0.573856,0.589543,...,0.398693,0.422222,0.452288,0.484967,0.491503,0.485621,0.460131,0.420261,0.365359,0.312418
75%,0.465359,0.471895,0.490196,0.526797,0.56732,0.603922,0.624837,0.643137,0.656209,0.677124,...,0.569281,0.626144,0.686275,0.735948,0.75817,0.758824,0.754248,0.73366,0.695752,0.664052
max,0.951634,0.964706,0.96732,0.94902,0.922876,0.952941,0.959477,0.96732,0.979085,0.992157,...,0.993464,0.997386,0.998693,0.998693,1.0,1.0,1.0,1.0,0.998693,1.0


In [79]:
# Report the dataset dimensions, one value per line.
print(
    "Total dataset size:",
    "n_samples: %d" % n_samples,
    "n_features: %d" % n_features,
    "n_classes: %d" % n_classes,
    sep="\n",
)

Total dataset size:
n_samples: 1140
n_features: 2914
n_classes: 5


In [80]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [82]:
# Number of principal components to keep.
n_components = 150

# Fit PCA on the training split only. The 'randomized' SVD solver is
# stochastic, so it is seeded explicitly: without random_state the fitted
# components (and every downstream score) change from run to run, which
# breaks Restart & Run All reproducibility.
pca = PCA(
    n_components=n_components,
    svd_solver='randomized',
    whiten=True,
    random_state=42,
).fit(X_train)

In [83]:
# Project both splits with the PCA that was fitted on the training data.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# View each fitted component as an (h, w) array so it can be shown as an image.
eigenfaces = pca.components_.reshape(n_components, h, w)

In [84]:
print("Fitting the classifier to the training set")

# Hyper-parameter candidates for the RBF-kernel SVM: every (C, gamma)
# combination is evaluated by the grid search.
param_grid = dict(
    C=[1e3, 5e3, 1e4, 5e4, 1e5],
    gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
)

# class_weight='balanced' is passed to the SVC; verbose=2 makes the search log
# each individual fit.
clf = GridSearchCV(
    estimator=SVC(kernel='rbf', class_weight='balanced'),
    param_grid=param_grid,
    verbose=2,
)
# fit() returns the search object itself, so rebinding clf is unnecessary.
clf.fit(X_train_pca, y_train)

print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END .............................C=1000.0, gamma=0.0001; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0001; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0001; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0001; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0001; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0005; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0005; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0005; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0005; total time=   0.0s
[CV] END .............................C=1000.0, gamma=0.0005; total time=   0.0s
[CV] END ..............................C=1000.0, gamma=0.001; total time=   0.0s
[CV]

In [86]:
print("Predicting people's names on the test set")
y_pred = clf.predict(X_test_pca)

# Pass the same explicit label set to both reports. Without `labels`,
# classification_report infers the classes from y_test/y_pred and raises a
# ValueError when that count differs from len(target_names) (e.g. a class is
# absent from the test split). Pinning the labels keeps every row aligned
# with target_names and consistent with the confusion matrix below.
print(
    classification_report(
        y_test,
        y_pred,
        labels=range(n_classes),
        target_names=target_names,
    )
)
# Rows are true classes, columns are predicted classes, in label order.
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

Predicting people's names on the test set
                   precision    recall  f1-score   support

     Colin Powell       0.90      0.94      0.92        50
  Donald Rumsfeld       0.92      0.92      0.92        25
    George W Bush       0.94      0.97      0.95       106
Gerhard Schroeder       1.00      0.82      0.90        22
       Tony Blair       0.91      0.84      0.88        25

         accuracy                           0.93       228
        macro avg       0.93      0.90      0.91       228
     weighted avg       0.93      0.93      0.93       228

[[ 47   1   2   0   0]
 [  0  23   1   0   1]
 [  3   0 103   0   0]
 [  2   0   1  18   1]
 [  0   1   3   0  21]]
