# Qualitative differences

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold, metrics, preprocessing, cluster

# Load the pre-computed word vectors and their category labels.
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open,
# so pull every array out inside a `with` block; the handle is closed on exit.
# NOTE: after this cell `data` is a closed archive -- use the arrays below.
with np.load('data/data_w_labels.npz') as data:
    Bdata = data['vec']     # Binder word vectors
    Gdata = data['gVec']    # Google word vectors
    L1 = data['L1']         # Super-category labels
    L2 = data['L2']         # Category labels



In [None]:
# Use t-SNE to project both embeddings down to 2 dimensions for plotting.
# t-SNE is stochastic: pin random_state so that Restart & Run All reproduces
# the same layout (and therefore the same figures) every time.
TSNE_SEED = 0
B_red = manifold.TSNE(n_components=2, random_state=TSNE_SEED).fit_transform(Bdata)
G_red = manifold.TSNE(n_components=2, random_state=TSNE_SEED).fit_transform(Gdata)

In [None]:
# Side-by-side t-SNE maps of the two embeddings; points are coloured by L2.
plt.figure(1, figsize=(16, 6))
panels = (
    (1, B_red, 'Binder tSNE Reduced'),
    (2, G_red, 'Google tSNE Reduced'),
)
for position, embedding, panel_title in panels:
    # 1 row x 2 columns grid; `position` selects the left/right panel.
    plt.subplot(1, 2, position)
    xs, ys = embedding[:, 0], embedding[:, 1]
    plt.scatter(xs, ys, s=5, c=L2)
    plt.title(panel_title)
plt.show()

In [None]:
# For general analysis, let's L2-normalize the vectors.
B_normed = preprocessing.normalize(Bdata, norm='l2')   # Binder word vectors
G_normed = preprocessing.normalize(Gdata, norm='l2')   # Google word vectors

# Look at different measures of separation between the L2 categories.
# NOTE: the scores below are computed on the raw (un-normalized) vectors;
# swap in B_normed / G_normed to compare.
#
# scikit-learn renamed the misspelled `calinski_harabaz_score` to
# `calinski_harabasz_score` and later removed the old name. Prefer the
# correct spelling and fall back only for old releases that lack it.
try:
    _calinski_harabasz = metrics.calinski_harabasz_score
except AttributeError:
    _calinski_harabasz = metrics.calinski_harabaz_score

scB  = metrics.silhouette_score(Bdata, L2)
scG  = metrics.silhouette_score(Gdata, L2)
chsB = _calinski_harabasz(Bdata, L2)
chsG = _calinski_harabasz(Gdata, L2)
print( 'B score: ', scB, '      G score: ', scG)     # silhouette scores
print( 'B score: ', chsB, '      G score: ', chsG)   # Calinski-Harabasz scores

Note: for both metrics, a higher score indicates better-separated clusters. I also tried this with the normalized vectors and the results were slightly better; however, the motivation for normalizing is not completely clear.

In [None]:
# Let's run K-means and just look at general traits. Normalization is not used.
#
# Ask for one cluster per distinct L2 category. Counting the unique labels is
# correct whether the labels start at 0 or at 1, whereas max(L2) under-counts
# by one when they start at 0.
n_categories = int(np.unique(L2).size)

# K-means initialization is random; pin the seed so the scores are repeatable.
KMEANS_SEED = 0
estB = cluster.KMeans(n_clusters=n_categories, random_state=KMEANS_SEED).fit(Bdata)
estG = cluster.KMeans(n_clusters=n_categories, random_state=KMEANS_SEED).fit(Gdata)

# Adjusted Rand index between the true categories and the K-means assignment.
Bapp = metrics.adjusted_rand_score(L2, estB.labels_)
Gapp = metrics.adjusted_rand_score(L2, estG.labels_)
print(Bapp, '     ', Gapp)


## Let's look at which natural clusters differ the most

In [None]:
# Calculate per-category centroids and total member-to-centroid distances (L2 norm)
def _centroid_spread(vectors, labels):
    """Return per-label centroids and summed Euclidean distances to them.

    Parameters
    ----------
    vectors : 2-D array, one row per sample.
    labels : 1-D array of non-negative integer-valued labels, one per row.

    Returns
    -------
    centroids : array of shape (max_label + 1, n_features); row k is the
        mean vector of the samples labelled k (zeros if k has no samples).
    totals : array of shape (max_label + 1,); entry k is the sum of the
        Euclidean distances from each sample labelled k to centroid k.
    """
    vectors = np.asarray(vectors)
    labels = np.asarray(labels).astype(int).ravel()
    # Size the outputs from the data instead of hard-coding 47 / 65 / 300.
    n_labels = int(labels.max()) + 1
    centroids = np.zeros((n_labels, vectors.shape[1]))
    totals = np.zeros((n_labels,))
    for label in range(n_labels):
        members = vectors[labels == label]
        if len(members) == 0:
            # Unused label id: leave zeros rather than producing NaNs.
            continue
        # axis=0 averages over samples and keeps one value per feature;
        # without it np.mean collapses the whole category to a single scalar.
        centroids[label] = np.mean(members, axis=0)
        totals[label] = np.linalg.norm(members - centroids[label], axis=1).sum()
    return centroids, totals


B_centroids, B_cDist = _centroid_spread(Bdata, L2)
G_centroids, G_cDist = _centroid_spread(Gdata, L2)

In [None]:
# Scatter the per-category cumulative centroid distances: Binder on x, Google on y.
axes = plt.gca()
axes.scatter(B_cDist, G_cDist)
axes.set_title('Centroid Cumalitive Distance', fontweight='bold', fontsize=16)
axes.set_xlabel('Binder Centroid Distances')
axes.set_ylabel('Google Centroid Distances')
plt.show()