In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_olivetti_faces

X,y = fetch_olivetti_faces(return_X_y=True)

In [2]:
from sklearn.model_selection import train_test_split
# Given the scarcity of the dataset, an 80-20 ratio for train/test will be considered.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True,random_state=42,stratify=y)

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# With 8 samples available for each person for training, We set the n_splits=8 , so for each split, we'll
# have 7 records per person to train and 1 record to validate with.
skf = StratifiedKFold(n_splits=8,shuffle=True,random_state=42)
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("nn", MLPClassifier(solver='adam', activation='tanh',hidden_layer_sizes=(80,60,40,40), random_state=42,max_iter=2000))
])

scores = []
for train_index, test_index in skf.split(X_train,y_train):
    X_train_batch, X_test_batch = X_train[train_index], X_train[test_index]
    y_train_batch, y_test_batch = y_train[train_index], y_train[test_index]
    clf.fit(X_train_batch,y_train_batch)
    pred = clf.predict(X_test_batch)
    score = accuracy_score(y_test_batch, pred)
    scores.append(score)
    print(score)
    
print(f'Average score: {np.mean(scores)}')

0.925
0.975
0.95
0.925
0.925
0.975
0.95
0.875
Average score: 0.9375


In [4]:
pred = clf.predict(X_test)
score = accuracy_score(y_test, pred)

In [5]:
score

0.9375

In [6]:
X.shape

(400, 4096)

In [7]:
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.cluster.hierarchy import linkage,fcluster
from sklearn.cluster import FeatureAgglomeration

range_n_clusters = range(2,400,5)

def clusterize(metric,method):
    silhouette_scores = {}
    for n_clusters in range_n_clusters:
        matrix = linkage(X,metric=metric, method=method)
        cluster_labels = fcluster(matrix,t=n_clusters,criterion="maxclust")
        silhouette_scores[n_clusters] = silhouette_score(X, cluster_labels)
    best_num_clusters = max(silhouette_scores, key=silhouette_scores.get)
    print(f'optimal # of clusters for linkage with metric={metric} and method={method}: {best_num_clusters}')
    model = FeatureAgglomeration(
        n_clusters=best_num_clusters,
        affinity=metric,
        linkage="average"
    )
    return model

In [8]:
euclidean = clusterize(metric="euclidean", method="centroid")

optimal # of clusters for linkage with metric=euclidean and method=centroid: 237


In [9]:
minkowski = clusterize(metric="minkowski", method="average")

optimal # of clusters for linkage with metric=minkowski and method=average: 2


In [10]:
cosine = clusterize(metric="cosine", method="average")

optimal # of clusters for linkage with metric=cosine and method=average: 197


In [17]:
for (name, clusterer) in [("euclidean",euclidean), ("minkowski", minkowski), ("cosine",cosine)]:
    
    X_train_new = clusterer.fit_transform(X_train)
    X_test_new = clusterer.fit_transform(X_test)
    
    clf_new = Pipeline([
        ("scaler", StandardScaler()),
        ("nn", MLPClassifier(solver='adam', activation='tanh',hidden_layer_sizes=(80,60,40,40), random_state=42,max_iter=1000))
    ])
    
    print(f'evaluating scores or metric = {name}')
    scores = []
    for train_index, test_index in skf.split(X_train_new,y_train):
        X_train_batch, X_test_batch = X_train_new[train_index], X_train_new[test_index]
        y_train_batch, y_test_batch = y_train[train_index], y_train[test_index]
        clf_new.fit(X_train_batch,y_train_batch)
        pred = clf_new.predict(X_test_batch)
        score = accuracy_score(y_test_batch, pred)
        scores.append(score)
        print(score)

    print(f'Average score: {np.mean(scores)}')

evaluating scores or metric = euclidean
0.875
0.95
0.95
0.825
0.95
1.0
0.925
0.925
Average score: 0.9249999999999999
evaluating scores or metric = minkowski




0.275




0.3




0.375




0.225




0.425




0.45




0.4




0.2
Average score: 0.33125
evaluating scores or metric = cosine
0.8
0.95
0.875
0.9
0.85
0.95
0.9
0.875
Average score: 0.8875
