# Perform clustering on Olivetti faces dataset

## Import the dataset and visualise

In [None]:
from sklearn.datasets import fetch_olivetti_faces

# Load the Olivetti faces dataset; later cells read its 'images', 'data'
# and 'target' entries.
faces = fetch_olivetti_faces()

In [None]:
import matplotlib.pyplot as plt

# Index of the sample used to illustrate the dataset.
example_image_num = 20
example_image = faces['images'][example_image_num]

print('Image size', example_image.shape)

# Render the selected face in greyscale.
plt.imshow(example_image, cmap='gray')
plt.show()

In [None]:
# 'data' holds the same image as a flat feature vector; 'target' holds the
# identifier attached to every image in the dataset.
print('Image flattened shape:\n', faces['data'][example_image_num].shape)
print('\nImage identifiers:\n', faces['target'])

## Split the dataset into train, validation & test
### The split must shuffle, as images are ordered by person (`train_test_split` shuffles by default)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the images, stratified on the target so class proportions
# are preserved. train_test_split shuffles by default, which matters because
# the dataset is ordered by person. A fixed random_state makes the split
# (and therefore every score computed from it) reproducible between runs.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    faces['data'],
    faces['target'],
    test_size=0.2,
    stratify=faces['target'],
    random_state=42,
)
# Halve the held-out part into validation and test sets, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=0.5,
    stratify=y_val_test,
    random_state=42,
)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

## Cluster the dataset
### Choose the best number of clusters

In [None]:
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import silhouette_score

# Candidate cluster counts: 10, 40, ..., 280.
clusters = np.arange(10, 300, 30)
inertias = []
silhouettes = []

for cluster in clusters:
    # K-means starts from random centroids; seeding it keeps the inertia and
    # silhouette curves (and the cluster count chosen from them) repeatable.
    kmc = KMeans(n_clusters=cluster, random_state=42)
    kmc.fit(X_train)
    inertias.append(kmc.inertia_)
    silhouettes.append(silhouette_score(X_train, kmc.labels_))

In [None]:
# Inertia for each candidate cluster count.
plt.scatter(clusters, inertias)
plt.xlabel('# clusters')
plt.ylabel('Inertia')
plt.grid(which='both')
plt.title('# clusters choice from inertia')
# Explicit show(), as in the other plotting cells, so the cell renders the
# figure instead of echoing the Text object returned by plt.title().
plt.show()

In [None]:
# Silhouette score for each candidate cluster count (higher is better).
plt.scatter(clusters, silhouettes)
plt.xlabel('# clusters')
plt.ylabel('Silhouette score')
plt.grid(which='both')
plt.title('# clusters choice from silhouettes')
# Explicit show(), as in the other plotting cells, so the cell renders the
# figure instead of echoing the Text object returned by plt.title().
plt.show()

### Retrain the model using the best number of clusters

In [None]:
# Refit with the candidate count that achieved the highest silhouette score.
# n_clusters is passed by keyword for clarity, and the seed is fixed so the
# cluster assignments inspected below are repeatable between runs.
kmc = KMeans(n_clusters=clusters[np.argmax(silhouettes)], random_state=42)
kmc.fit(X_train)

In [None]:
# Cluster to inspect. The fitted model can have fewer than 31 clusters (the
# candidate counts start at 10), so clamp the index to the last valid cluster
# instead of silently displaying nothing.
a_cluster_index = min(30, kmc.n_clusters - 1)

# Take the 2-D image shape from the dataset rather than hard-coding 64x64.
image_shape = faces['images'][0].shape

# A boolean mask selects the training images assigned to the chosen cluster.
for flat_image in X_train[kmc.labels_ == a_cluster_index]:
    plt.imshow(flat_image.reshape(image_shape), cmap='gray')
    plt.show()

# Train a classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression fitted directly on the flattened pixel
# features, without any clustering step.
lrc = LogisticRegression()
lrc.fit(X_train, y_train)

In [None]:
# Score of the baseline classifier on the validation set.
lrc.score(X_val, y_val)

## Use k-means for dimensionality reduction

### Use k-means to compute each image's distance to every cluster center, then fit logistic regression on those distances

In [None]:
from sklearn.pipeline import Pipeline

# Two-stage model: KMeans (with the cluster count that maximised the
# silhouette score) feeds its transformed output into logistic regression.
pipeline = Pipeline(steps=[
    ('clustering', KMeans(n_clusters=clusters[np.argmax(silhouettes)])),
    ('log_reg', LogisticRegression()),
])

pipeline.fit(X_train, y_train)

In [None]:
pipeline.score(X_val, y_val)

## Perform a search over number of clusters to use

In [None]:
from sklearn.model_selection import GridSearchCV

# Cluster counts tried for the pipeline's 'clustering' step:
# 10, 40, 70, 100 and 130, each evaluated with 3-fold cross-validation.
param_grid = {'clustering__n_clusters': np.arange(10, 150, 30)}
grid_clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3)
grid_clf.fit(X_train, y_train)

In [None]:
# Report the cluster count selected by the grid search, then score the
# search object on the validation set.
print(grid_clf.best_params_)
grid_clf.score(X_val, y_val)

## Add the cluster distances to the feature set & retrain

## STILL TODO

In [None]:
# Shape of the training matrix before any cluster-derived columns are added.
X_train.shape

In [None]:
# Second KMeans + logistic-regression pipeline, built with the cluster count
# that maximised the silhouette score. It is only constructed here, not fitted.
pipeline_2 = Pipeline(steps=[
    ('predict_cluster', KMeans(n_clusters=clusters[np.argmax(silhouettes)])),
    ('log_reg', LogisticRegression()),
])

In [None]:
X_train_with_cluster_dist = np.c_[X_train, pipeline['clustering'].labels_]
X_train_with_cluster_dist.shape

In [None]:
lrc = LogisticRegression()
lrc.fit(X_train_with_cluster_dist, y_train)

In [None]:
lrc.score(X_val, y_val)