In [None]:
from os import listdir
from os.path import join, splitext

import matplotlib._color_data as mcd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits import mplot3d
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Directory holding one embedding file per person; entries are processed in sorted name order.
emb_dir = './embeddings/radius_2.0'
emb_list = sorted(listdir(emb_dir))

In [None]:
# Collect every embedding vector together with the label of the file it came from.
vectors, labels = [], []
for label in emb_list:
    # NOTE(review): allow_pickle=True lets np.load execute arbitrary pickled code;
    # only point emb_dir at embedding files from a trusted source.
    loaded = np.load(join(emb_dir, label), allow_pickle=True)

    # Use the file name without its extension as the label. splitext copes with
    # any extension length, unlike slicing off a fixed four characters.
    file_label = splitext(label)[0]

    for e in loaded:
        # Each entry exposes .detach().numpy() (presumably a torch tensor - confirm);
        # only the first row of the resulting array is kept.
        vectors.append(e.detach().numpy()[0])
        labels.append(file_label)

# One row per embedding, aligned index-for-index with `labels`.
x = np.asarray(vectors)

In [None]:
# Standardise every feature column with scikit-learn's StandardScaler.
scaler = StandardScaler()
x_norm = scaler.fit_transform(x)

# Overall mean and standard deviation before and after scaling, as a sanity check.
for tag, data in (('Untouched:\t ', x), ('Normalized:\t ', x_norm)):
    print(tag, np.mean(data), '\t', np.std(data))
x_norm.shape

In [None]:

# Project the standardised embeddings onto their first three principal components.
# random_state is pinned so the projection is reproducible across kernel restarts
# whenever scikit-learn selects its randomized SVD solver for this data size.
pca = PCA(n_components=3, random_state=0)
components_ear = pca.fit_transform(x_norm)

# One row per embedding, indexed by its person label, one column per component.
ear_df = pd.DataFrame(
    data=components_ear,
    index=labels,
    columns=['Component 1', 'Component 2', 'Component 3'],
)

ear_df.head()

In [None]:
# Share of the total variance captured by each component, followed by their sum.
variance_ratios = pca.explained_variance_ratio_
print('Explained variation per principal component: {}'.format(variance_ratios))
print('Total variance explained: {}'.format(sum(variance_ratios)))


In [None]:
# Attach the person label as a categorical column; the plotting cell filters on
# it to pick out each person's points.
ear_df = ear_df.assign(label=pd.Categorical(labels))

In [None]:
# Distinct person labels in sorted order, so the numbering below is stable.
unique = sorted(set(labels))

# Map every real label to an anonymised display name: Proband_1, Proband_2, ...
pseudonyms = {proband: "Proband_" + str(i + 1) for i, proband in enumerate(unique)}

# Cluster center of a person = mean of that person's points on each component.
component_columns = ['Component 1', 'Component 2', 'Component 3']
cluster_center = dict()
for person in unique:
    # Selecting with a list ([person]) always yields a DataFrame, even when the
    # person has a single sample, and .mean() divides by the actual sample count
    # instead of assuming a fixed number of samples per person.
    person_mean = ear_df.loc[[person], component_columns].mean()
    cluster_center[person] = tuple(person_mean)

In [None]:
# Print which pseudonym stands for which real label.
for real_label, alias in pseudonyms.items():
    print(real_label, '\t--\t', alias)

In [None]:
## DEFINE X- and Y- AXIS
x_component = 1
y_component = 2
assert 1 <= x_component <= 3 and 1 <= y_component <= 3

# Columns of ear_df that hold the two chosen components.
x_column = 'Component ' + str(x_component)
y_column = 'Component ' + str(y_component)

# Plot preparations
fig = plt.figure(figsize = (30,30))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component ' + str(x_component), fontsize = 15)
ax.set_ylabel('Principal Component ' + str(y_component), fontsize = 15)
ax.set_title('PCA of ear embeddings - 2 of three dimensions ', fontsize = 20)

targets = unique
colors = list(mcd.XKCD_COLORS.values())
# One scatter series per person; zip pairs each person with its own color.
for target, color in zip(targets, colors):
    # SWITCH the two lines below if real labels are wanted instead of pseudonyms
    display_name = pseudonyms[target]
    # display_name = target

    indicesToKeep = ear_df['label'] == target
    # The label travels with the series, so the legend cannot drift out of
    # step with the plotting order.
    ax.scatter(ear_df.loc[indicesToKeep, x_column]
               , ear_df.loc[indicesToKeep, y_column]
               , c = color
               , s = 15
               , alpha=0.6
               , label = display_name)

    # Write the name at the person's cluster center. The text is passed
    # positionally: the former `s=` keyword was renamed to `text` in
    # matplotlib 3.3 and is rejected by newer releases.
    center = cluster_center[target]
    ax.annotate(display_name, xy=(center[x_component-1], center[y_component-1]), textcoords='data')

ax.legend()

ax.grid()

In [None]:
def sort_dict_by_component(dictionary, component, reverse=False):
    '''Order the entries of a cluster center dictionary along one component.

    Arguments
    ----------
    dictionary: mapping of label -> indexable holding the component values
    component: 1, 2 or 3 selecting which value of each entry is used
    reverse: False (default) gives ascending order, True gives descending

    Returns
    ----------
    A list of (label, value) tuples sorted by value, where value is the
    selected component of that label's entry.
    '''
    assert 1 <= component <= 3
    # component is 1-based, positions inside each entry are 0-based
    position = component - 1

    pairs = [(label, values[position]) for label, values in dictionary.items()]
    return sorted(pairs, key=lambda pair: pair[1], reverse=reverse)

In [None]:
## Set component 1, 2 or 3
c = 1

# Cluster centers ordered along the chosen component (ascending).
sort_values = sort_dict_by_component(cluster_center, c)
print('These are the values of the component ', c, '\n')

row_format = '{:>20} : {:<}'
for key, center_value in sort_values:
    # SWITCH: pass `key` instead of `pseudonyms[key]` if real labels are wanted
    print(row_format.format(pseudonyms[key], center_value))


In [None]:
# Absolute gaps between neighbouring cluster centers along the sorted
# component, listed in ascending order.
val = [center for _label, center in sort_values]
steps = sorted(abs(left - right) for left, right in zip(val, val[1:]))

print('Average step size\t: ', np.mean(steps))
print('Median of step size\t: ', np.median(steps))
# Same gaps converted with int(), which drops the fractional part.
steps_int = [int(step) for step in steps]
print(steps)
print(steps_int)