PCA using Olivetti (Sklearn)

In [None]:
# Import matplotlib library
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import time
import numpy as np

In [None]:
from sklearn.datasets import fetch_olivetti_faces

# Fetch the Olivetti faces dataset (it is downloaded to the local
# scikit-learn data directory the first time this runs).
# NOTE: the variable is called `lfw_people`, but the data is Olivetti, not LFW.
lfw_people = fetch_olivetti_faces()

# The image stack is 3-D: number of images, image height, image width.
# h and w are kept so flattened vectors can be reshaped back into images.
n_samples, h, w = lfw_people.images.shape

# `data` already holds each image as a flat vector (one row per image),
# which is the layout the decomposition steps below expect; `target` is the
# id of the person shown in each image.
X, y = lfw_people.data, lfw_people.target
n_features = X.shape[1]

 


downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to /root/scikit_learn_data


In [None]:
# Hold out 25% of the images for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Report how many samples ended up on each side of the split.
print(
    "size of training Data is % d and Testing Data is % d"
    % (len(y_train), len(y_test))
)

size of training Data is  300 and Testing Data is  100


In [None]:
n_components = 150

t0 = time.time()
# Learn the principal axes from the training split only, so that no test
# information leaks into the projection. whiten=True rescales the projected
# components to unit variance.
# For data of this shape svd_solver='auto' may select the randomized SVD
# solver, so random_state is pinned (same seed as the train/test split) to
# make the components reproducible across kernel restarts.
pca = PCA(n_components=n_components, whiten=True, random_state=42).fit(X_train)

# Each principal axis reshaped back to the image layout (h x w).
eigenfaces = pca.components_.reshape((n_components, h, w))

# Project both splits onto the axes learned from the training data.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in % 0.3fs" % (time.time() - t0))

done in  0.431s


In [None]:
# Show the first PCA-projected training vector, then the shapes of the raw
# (un-projected) train and test matrices.
first_projected = X_train_pca[0]
raw_shapes = (X_train.shape, X_test.shape)

print("Sample Data point after applying PCA\n", first_projected)
print("-----------------------------------------------------")
print("Dimensions of training set = % s and Test Set = % s" % raw_shapes)

Sample Data point after applying PCA
 [-6.1811554e-01  1.0630195e+00  3.1684369e-01  2.2064978e-01
 -9.9376613e-01  8.9177823e-01 -8.2789475e-01 -6.5192527e-01
 -6.8825018e-01 -1.0344398e-01  2.0083746e-02  9.6160740e-01
  9.8682661e-04 -1.9289309e+00  9.4358718e-01 -1.2143946e+00
  5.5862308e-01 -4.2292315e-01 -1.5368174e+00 -1.0801630e+00
 -2.8341243e-01 -2.2951812e-01 -1.2562701e+00 -5.6461722e-01
  1.0349580e+00 -1.3543636e+00  9.1815937e-01  3.1847030e-01
  1.3232473e+00 -7.2048110e-01  1.3002092e+00 -1.8173767e+00
 -1.2921642e+00 -1.2713994e+00 -1.3630527e-01  1.0455195e+00
 -2.4110581e-01  4.2490548e-01 -7.5799042e-01  4.2938665e-01
  1.7469458e-01 -9.7508377e-01 -1.1866050e+00  6.3068718e-01
  4.9410972e-01  2.5728539e-01  6.3709676e-01 -1.8806926e+00
 -2.1672519e-01 -1.7180933e+00 -1.1125642e+00  1.0265244e+00
 -1.4158516e+00  7.5793666e-01  1.2013366e+00  6.1488098e-01
 -2.0598148e-01 -2.3721318e-01  1.5910673e+00 -5.8445132e-01
 -1.6475677e+00  1.2587591e+00  7.4554485e-01  

In [None]:
print("Fitting the classifier to the training set")

# Candidate values for the RBF SVM's penalty (C) and kernel coefficient (gamma).
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

# Exhaustive search over the grid on the PCA-projected training data, using
# GridSearchCV's default cross-validation; fit() returns the fitted search.
clf = GridSearchCV(
    estimator=SVC(kernel='rbf', class_weight='balanced'),
    param_grid=param_grid,
).fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator_)

print("Predicting people's names on the test set")
t0 = time.time()
y_pred = clf.predict(X_test_pca)
print("done in % 0.3fs" % (time.time() - t0))

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))
# NOTE: no confusion matrix is printed by this cell.

Fitting the classifier to the training set




Best estimator found by grid search:
SVC(C=1000.0, class_weight='balanced', gamma=0.001)
Predicting people's names on the test set
done in  0.012s
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       1.00      1.00      1.00         2
           2       0.50      0.50      0.50         2
           3       1.00      1.00      1.00         4
           4       0.75      1.00      0.86         3
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         1
           7       1.00      0.86      0.92         7
           8       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00         4
          12       1.00      1.00      1.00         2
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.0

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validate the whole PCA -> SVC pipeline so the score is a real
# classification accuracy. Scoring the PCA object alone would use PCA.score,
# which is an average log-likelihood that ignores y and cannot be turned
# into an accuracy by rescaling.
# PCA is re-fitted inside every fold, so the held-out fold never influences
# the projection. The SVC reuses the hyper-parameters selected by the grid
# search on the PCA features (`clf`); because those were tuned on this same
# training data, the estimate is slightly optimistic.
pca_svc = make_pipeline(
    PCA(n_components=n_components, whiten=True, random_state=42),
    SVC(kernel='rbf', class_weight='balanced', **clf.best_params_),
)
cv = cross_val_score(pca_svc, X_train, y_train, cv=10)

# Mean accuracy across the 10 folds (already in [0, 1]; no rescaling needed).
print(cv.mean())

0.836335791015625


In [None]:
# PCA.score returns the average log-likelihood of the samples and ignores the
# labels, so it is not a classification metric. Report the accuracy of the
# grid-searched SVC on the PCA-projected test set instead.
clf.score(X_test_pca, y_test)

4304.467

KPCA using sklearn

In [None]:
from sklearn.decomposition import KernelPCA

n_components = 150
t0 = time.time()

# KernelPCA is left at its default kernel (linear, per the scikit-learn
# documentation); the projection is learned from the training split only.
kpca = KernelPCA(n_components=n_components)
kpca.fit(X_train)

# NOTE: these names are shared with the PCA section, so running this cell
# replaces the PCA features with the KernelPCA features.
X_train_pca, X_test_pca = (
    kpca.transform(part) for part in (X_train, X_test)
)
print("done in % 0.3fs" % (time.time() - t0))

done in  0.142s


In [None]:
# Show the first KernelPCA-projected training vector. At this point
# X_train_pca holds the KernelPCA features, so the label says KPCA.
print("Sample Data point after applying KPCA\n", X_train_pca[0])
print("-----------------------------------------------------")
# Shapes of the raw (un-projected) train and test matrices.
print("Dimensions of training set = % s and Test Set = % s"%(
        X_train.shape, X_test.shape))

Sample Data point after applying PCA
 [-2.6352129e+00  3.3890424e+00  8.0212569e-01  4.3721646e-01
 -1.7056330e+00  1.3636025e+00 -1.1178060e+00 -8.4976685e-01
 -8.4997994e-01 -1.2231505e-01  2.2066392e-02  1.0194203e+00
  8.1996340e-04 -1.8851430e+00  9.0681440e-01 -1.0988235e+00
  4.9031019e-01 -3.4120208e-01 -1.2101213e+00 -8.2905948e-01
 -2.0989722e-01 -1.6556382e-01 -8.6321509e-01 -3.8449988e-01
  6.9700193e-01 -8.8418829e-01  5.9209818e-01  2.0149793e-01
  8.1835759e-01 -4.2404586e-01  7.4491715e-01 -1.0209610e+00
 -7.0889342e-01 -6.6649836e-01 -6.9871016e-02  5.3332871e-01
 -1.2099931e-01  2.1002671e-01 -3.6578706e-01  2.0461288e-01
  8.2105190e-02 -4.5015496e-01 -5.3585982e-01  2.7805984e-01
  2.1422662e-01  1.1113755e-01  2.7278545e-01 -7.9104471e-01
 -9.0406477e-02 -7.0317805e-01 -4.5087945e-01  4.0744719e-01
 -5.5057919e-01  2.9007375e-01  4.5099914e-01  2.2924396e-01
 -7.6495849e-02 -8.6446196e-02  5.7794791e-01 -2.0988300e-01
 -5.9511942e-01  4.4757304e-01  2.6262122e-01  

In [None]:
print("Fitting the classifier to the training set")

# Same C / gamma candidates as used for the PCA features.
param_grid = dict(
    C=[1e3, 5e3, 1e4, 5e4, 1e5],
    gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
)

# Grid-search an RBF SVM on the KernelPCA features (which this section
# stores under the X_train_pca / X_test_pca names).
rbf_svc = SVC(kernel='rbf', class_weight='balanced')
clf = GridSearchCV(rbf_svc, param_grid)
clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator_)

print("Predicting people's names on the test set")
t0 = time.time()
y_pred = clf.predict(X_test_pca)
print("done in % 0.3fs" % (time.time() - t0))

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))


Fitting the classifier to the training set




Best estimator found by grid search:
SVC(C=1000.0, class_weight='balanced', gamma=0.0001)
Predicting people's names on the test set
done in  0.010s
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         4
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         1
           7       1.00      0.86      0.92         7
           8       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00         4
          12       1.00      1.00      1.00         2
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.

LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

n_components = 150  # kept from the earlier sections; not passed to LDA below
t0 = time.time()

# LDA is supervised: it is fitted on the training pixels together with the
# person ids, using the estimator's default settings.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Project both splits with the transform learned from the training data.
X_train_lda, X_test_lda = lda.transform(X_train), lda.transform(X_test)
print("done in % 0.3fs" % (time.time() - t0))

done in  0.534s


In [None]:
# Show the first LDA-projected training vector; the label names LDA because
# the vector printed is X_train_lda.
print("Sample Data point after applying LDA\n", X_train_lda[0])
print("-----------------------------------------------------")
# Shapes of the raw (un-projected) train and test matrices.
print("Dimensions of training set = % s and Test Set = % s"%(
        X_train.shape, X_test.shape))

Sample Data point after applying PCA
 [ 6.27391968  0.35939903 -3.17492692  2.42980618 -0.36221616  0.45760319
 -0.96499823  1.6948721  -0.36765641 -3.11354212  0.93546147  0.29667266
  6.23040073 -0.27387318  0.51508694 -0.82362963  2.52132724  0.81617149
  0.92812117  7.12426924  2.33407947 -3.2896281   0.32180777 -0.17750356
  0.59911011 -1.21534004  2.98662163 -4.19007495 -0.49803416 -1.52232799
 -2.0583664  -0.68788364  0.22993702  2.73760462 -0.45790701  1.52190489
  0.26188413 -0.84734004  0.54325775]
-----------------------------------------------------
Dimensions of training set = (300, 4096) and Test Set = (100, 4096)


In [None]:
print("Fitting the classifier to the training set")

# Candidate penalty (C) and kernel-coefficient (gamma) values for the RBF SVM.
C_values = [1e3, 5e3, 1e4, 5e4, 1e5]
gamma_values = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]
param_grid = {'C': C_values, 'gamma': gamma_values}

# Grid-search the SVM on the LDA-projected training data; fit() returns the
# fitted search object.
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced'), param_grid
).fit(X_train_lda, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator_)

print("Predicting people's names on the test set")
t0 = time.time()
y_pred = clf.predict(X_test_lda)
print("done in % 0.3fs" % (time.time() - t0))

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))

Fitting the classifier to the training set




Best estimator found by grid search:
SVC(C=1000.0, class_weight='balanced', gamma=0.0001)
Predicting people's names on the test set
done in  0.005s
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         4
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         1
           7       1.00      0.86      0.92         7
           8       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00         4
          12       1.00      1.00      1.00         2
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.