In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from os import listdir
from os.path import join
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import matplotlib._color_data as mcd

In [None]:
# Directory that holds the saved embedding files; each file name doubles as
# the label of the embeddings stored inside it.
emb_dir = './embeddings/radius_2.0'
# listdir() returns entries in arbitrary order, so sort for a stable sequence.
emb_list = sorted(listdir(emb_dir))

In [None]:
# Gather every embedding vector together with the label of the file it came from.
vectors, labels = [], []
for label in emb_list:
    # NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files.
    loaded = np.load(join(emb_dir, label), allow_pickle=True)

    # Each stored item exposes .detach().numpy() (presumably a torch tensor -
    # TODO confirm); [0] keeps the first entry along its leading axis.
    file_vectors = [item.detach().numpy()[0] for item in loaded]
    vectors.extend(file_vectors)
    # The label is the file name without its last four characters (the extension).
    labels.extend([label[:-4]] * len(file_vectors))

# Stack the collected vectors into a single array (one row per embedding).
x = np.asarray(vectors)

In [None]:
# Standardize every feature column with scikit-learn's StandardScaler.
scaler = StandardScaler()
x_norm = scaler.fit_transform(x)

# Sanity check: print the global mean and standard deviation before and after scaling.
for caption, matrix in (('Untouched:\t ', x), ('Normalized:\t ', x_norm)):
    print(caption, np.mean(matrix), '\t', np.std(matrix))

In [None]:
# Show the dimensions of the standardized matrix as this cell's output.
x_norm.shape

In [None]:

# Project the standardized embeddings onto their first three principal components.
# random_state is pinned so the projection is repeatable on every
# Restart & Run All, even when scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=3, random_state=0)
components_ear = pca.fit_transform(x_norm)

# One row per embedding, indexed by its label; index values repeat whenever
# several embeddings share the same label.
ear_df = pd.DataFrame(
    data=components_ear,
    index=labels,
    columns=['component 1', 'component 2', 'component 3'],
)


# print('Covariance: ', pca.get_covariance())

In [None]:
# Preview the first rows of the principal-component table.
ear_df.head()

In [None]:
# Report the fraction of total variance captured by each fitted component.
variance_template = 'Explained variation per principal component: {}'
print(variance_template.format(pca.explained_variance_ratio_))

In [None]:
# Add the labels as a categorical column. The plotting cell filters on this
# column to select, and colour, the points of one label at a time.
ear_df['label'] = pd.Categorical(labels)
# my_color = ear_df['label'].cat.codes
# ear_df = ear_df.drop('persons', 1)

In [None]:
# Distinct labels in sorted order (one entry per label).
unique = sorted(set(labels))

In [None]:
# Centroid of every person's points in PCA space, keyed by label.
# The mean is taken over however many rows a person actually has instead of
# dividing by a fixed sample count, so labels with a different number of
# embeddings still get a correct centre. Selecting with a list (`[person]`)
# keeps the selection a DataFrame even when a person has only one row.
component_cols = ['component 1', 'component 2', 'component 3']
cluster_center = {
    person: tuple(ear_df.loc[[person], component_cols].mean())
    for person in unique
}



In [None]:
# Scatter plot of the first two principal components, one colour per label,
# with each label's name written at its cluster centre.
fig, ax = plt.subplots(figsize=(30, 30))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = unique
colors = list(mcd.XKCD_COLORS.values())
# zip() pairs each label with one colour and stops at the shorter sequence.
for target, color in zip(targets, colors):
    indicesToKeep = ear_df['label'] == target
    ax.scatter(
        ear_df.loc[indicesToKeep, 'component 1'],
        ear_df.loc[indicesToKeep, 'component 2'],
        c=color,
        s=10,
    )
    # The annotation text is passed positionally: its keyword was renamed from
    # `s` to `text` in Matplotlib 3.3, so the positional form works on both
    # older and current releases.
    center_x, center_y = cluster_center[target][0], cluster_center[target][1]
    ax.annotate(target, xy=(center_x, center_y), textcoords='data')

ax.legend(targets)
ax.grid()

In [None]:
# Display the per-label cluster centres computed above.
cluster_center