In [None]:
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pickle
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, classification_report

import sys
sys.path.insert(0,'../')
from helper import (classification_tools as ct,
                   visualize as vis)
%matplotlib inline

In [None]:
# Load the pickled fc1 feature bundle (indexed by key further down).
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
fc1_path = Path('..') / 'data' / 'features' / 'VGG16_fc1_features_std.pickle'
with fc1_path.open('rb') as fh:
    data = pickle.load(fh)

# Load the pickled label encoder used to turn string labels into integers.
le_path = Path('..') / 'models' / 'label_encoder.pickle'
with le_path.open('rb') as fh:
    le = pickle.load(fh)

In [None]:
# Pull the file names, feature matrix and string labels out of the bundle.
files, fc1, labels = (data[key] for key in ('filename', 'features', 'labels'))

# Ground-truth labels in the encoder's integer representation.
y_gt = le.transform(labels)

In [None]:
# 50-component projection of the fc1 features, with whitening ...
pca = PCA(whiten=True, n_components=50, svd_solver='full')
x = pca.fit_transform(fc1)

# ... and the same projection without whitening.
pca_nw = PCA(whiten=False, n_components=50, svd_solver='full')
x_nw = pca_nw.fit_transform(fc1)

In [None]:
# 2-D t-SNE embedding of the non-whitened PCA features (fixed seed).
tsne = TSNE(random_state=12214, n_components=2)
x_nw_tsne = tsne.fit_transform(x_nw)

# 2-D t-SNE embedding of the whitened PCA features (separate fixed seed).
tsne_w = TSNE(random_state=654753, n_components=2)
x_w_tsne = tsne_w.fit_transform(x)

# Without whitening, cluster accuracy is consistently around 96%


In [None]:
# Repeat k-means on the non-whitened PCA features with five reproducible seeds
# and report inertia, confusion matrix and accuracy for each run.
rs = np.random.RandomState(seed=1115068143)
kmeans_seeds = rs.randint(2**32, size=5)
for seed in kmeans_seeds:
    kmeans = KMeans(n_clusters=7, init='k-means++', n_init=50, random_state=seed).fit(x_nw)
    labels_unmatched = kmeans.labels_
    # Raw cluster ids are arbitrary; ct.label_matcher is used to line them up
    # with the ground-truth classes before scoring.
    y_pred = ct.label_matcher(labels_unmatched, y_gt)
    print(f'inertia: {kmeans.inertia_:.2f}')
    CM = confusion_matrix(y_gt, y_pred)
    print(CM)
    # Accuracy = correctly assigned samples (diagonal) over all samples.
    print(np.trace(CM) / np.sum(CM))

# Varying the number of PCA components


In [None]:
# Fit PCA with the default number of components (n_components not set),
# once without and once with whitening.
pca_nw = PCA(svd_solver='full', whiten=False)
pca_nw.fit(fc1)

pca_w = PCA(svd_solver='full', whiten=True)
pca_w.fit(fc1)

In [None]:
# Cumulative explained-variance ratio; entry i covers the first i+1 components.
# The ratios are cast to float64 before the running sum is taken.
explained_ratio = pca_w.explained_variance_ratio_.astype(np.float64)
var = np.cumsum(explained_ratio)
plt.plot(var)

In [None]:
# Numbers of PCA components to evaluate.
nc = [1, 5, 10, 20, 50, 100, 250, 500, 1000, 1800]
# Cumulative explained variance at each of those counts
# (counts are 1-based, `var` is 0-based, hence the shift by one).
var[[n - 1 for n in nc]]

In [None]:
# Preview the seed pairs drawn from this generator: the first batch of draws
# fills the first column, the second batch the second column.
rs = np.random.RandomState(seed=3731806785)
list(map(list, zip(rs.randint(2**32, size=len(nc)),
                   rs.randint(2**32, size=len(nc)))))

In [None]:
use_cache = True  # set to False to force re-computation of the results
pca_nc_cache_path = Path('..','.neu_cache','sensitivity_pca_num_components.pickle')


def _pca_kmeans_accuracy(features, y_true, n_components, whiten, seed):
    """Reduce `features` with PCA, cluster with k-means and score against `y_true`.

    All intermediates stay local to this function, so running the sweep does
    not overwrite notebook-level variables such as `pca_nw`, `pca_w` or `x_nw`
    (which would otherwise differ depending on whether the cache was used).

    Parameters
    ----------
    features : array-like
        Feature matrix passed to ``PCA.fit_transform``.
    y_true : array-like
        Encoded ground-truth labels.
    n_components : int
        Number of principal components to keep.
    whiten : bool
        Forwarded to ``PCA(whiten=...)``.
    seed : int
        ``random_state`` for ``KMeans``.

    Returns
    -------
    float
        Trace of the confusion matrix divided by its sum, i.e. the fraction
        of samples whose matched cluster label equals the true label.
    """
    reduced = PCA(whiten=whiten, svd_solver='full',
                  n_components=n_components).fit_transform(features)
    kmeans = KMeans(n_clusters=7, init='k-means++', n_init=500, random_state=seed)
    kmeans.fit(reduced)
    # Cluster ids are arbitrary; ct.label_matcher is used to align them with
    # the ground-truth classes before building the confusion matrix.
    y_pred = ct.label_matcher(kmeans.labels_, y_true)
    cm = confusion_matrix(y_true, y_pred)
    return cm.trace() / cm.sum()


# Recompute when the user asks for it (use_cache=False) or when no cached
# results are found on disk.
if not use_cache or not pca_nc_cache_path.is_file():
    # Create the cache directory first so a missing folder is reported now
    # rather than after the long sweep, when the results are written.
    pca_nc_cache_path.parent.mkdir(parents=True, exist_ok=True)

    accs_w = []
    accs_nw = []
    rs = np.random.RandomState(seed=3731806785)
    # Both randint calls are evaluated before the loop starts: the first batch
    # seeds the non-whitened runs, the second batch the whitened runs.
    for c, seed1, seed2 in zip(nc,
                               rs.randint(2**32, size=len(nc)),
                               rs.randint(2**32, size=len(nc))):
        print('number of components: {:>4}'.format(c))
        accs_nw.append(_pca_kmeans_accuracy(fc1, y_gt, c, whiten=False, seed=seed1))
        accs_w.append(_pca_kmeans_accuracy(fc1, y_gt, c, whiten=True, seed=seed2))

    with open(pca_nc_cache_path, 'wb') as f:
        pickle.dump({'nc':nc,
                     'accs_nw':accs_nw,
                     'accs_w':accs_w},
                    f)
else:
    # NOTE: pickle.load can execute arbitrary code - only load a cache file
    # that was written by this notebook.
    with open(pca_nc_cache_path, 'rb') as f:
        results_ = pickle.load(f)
    # The cached component grid replaces `nc` so it matches the cached accuracies.
    nc = results_['nc']
    accs_nw = results_['accs_nw']
    accs_w = results_['accs_w']
        
        

In [None]:
# Clustering accuracy versus number of PCA components, with and without whitening.
fig_path = Path('..','Figures','pca_n_components.png')

fig, ax = plt.subplots(dpi=300, figsize=(3,2.5))
# The fmt strings carry only line style and marker. The original also had a
# 'k' colour code that conflicted with the explicit `color=` keyword
# (matplotlib warns and the keyword wins), so the figure is unchanged.
ax.plot(nc, accs_w, '-.s', label='whitening', color='deeppink')
ax.plot(nc, accs_nw, ':o', label='no whitening', color='slateblue')
leg = ax.legend()
ax.set_xscale('log')  # component counts span several orders of magnitude
ax.set_xlabel('Number of PCA components')
ax.set_ylabel('Clustering accuracy')
fig.tight_layout()

# Make sure the output folder exists so savefig does not fail on a fresh checkout.
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, bbox_inches='tight')

In [None]:
# Tabulate the accuracies for every component count (header typo
# "accuaries" corrected to "accuracies").
print('number of components\taccuracies (no whitening)\taccuracies (whitening)')
for n, nw, w in zip(nc, accs_nw, accs_w):
    print('\t{:>4}\t\t\t{:.3f}\t\t\t\t{:.3f}'.format(n,nw,w))