In [None]:
from scipy.spatial import distance
import pandas as pd
import numpy as np

## Exploring Wikipedia Distance

In [None]:
def compare(df, val1, val2):
    """Return the Euclidean distance between two rows of an embedding table.

    Args:
        df: DataFrame whose index labels identify the rows to look up.
        val1: Index label of the first row.
        val2: Index label of the second row.

    Returns:
        The Euclidean distance between the two selected row vectors.
    """
    first_vec = df.loc[val1]
    second_vec = df.loc[val2]
    return distance.euclidean(first_vec, second_vec)

### 3D embedding distances

In [None]:
# Read the low-dimensional Wikipedia embedding for the run named by this directory.
# The first CSV column becomes the index, so rows can be looked up by label.
path = '../demo_embeddings/wikipedia_3000/iterations_250/perplexity_3/pca_25/learning_rate_10'
embedding_df = pd.read_csv(f'{path}/data.csv', index_col=0)
embedding_df.head()

In [None]:
# Pairwise distances between a few very common words in the 3D embedding.
for label, first, second in (("the vs of:", 'the', 'of'), ("of vs to:", 'to', 'of')):
    print(label, compare(embedding_df, first, second))

### Original Embeddings distances

In [None]:
# Read the original embedding table; the first CSV column becomes the index.
glove_df = pd.read_csv(filepath_or_buffer="../data/wikipedia_3000.csv", index_col=0)
glove_df.head(n=5)

In [None]:
# Same word pairs as for the 3D embedding, now measured on the original vectors.
print("Euclidean distances for Original 300D Glove embeddings:")
for label, first, second in (("the, of:", 'the', 'of'), ("of, to:", 'to', 'of')):
    print(label, compare(glove_df, first, second))

### Speed comparison, 3D vs original

In [None]:
# Time one pairwise distance lookup ('the' vs 'of') on each embedding table,
# so the cost of the 3D and the original vectors can be compared side by side.
print("Speed for 3D Embedding:")
%timeit compare(embedding_df, 'the', 'of')
print("\nSpeed for original Embedding (300D):")
%timeit compare(glove_df, 'the', 'of')

In [None]:
# Repeat the timing with a second word pair ('to' vs 'of') on both embedding tables.
print("Speed for 3D Embedding:")
%timeit compare(embedding_df, 'to', 'of')
print("\nSpeed for original Embedding (300D):")
%timeit compare(glove_df, 'to', 'of')

We can see that the two speeds are very similar to each other, and that a single distance computation is very efficient (less than a millisecond).

In [None]:
# One-column frame holding the index labels of the original embedding table.
word_dict = pd.DataFrame(data=glove_df.index)

# Reference word and its row in the original embedding; compare_pd measures against it.
selected_word = 'united'
selected_vec = glove_df.loc[selected_word]

def compare_pd(vector, reference=None):
    """Return the Euclidean distance between ``vector`` and a reference vector.

    Args:
        vector: Vector to measure, e.g. a row handed in by
            ``DataFrame.apply(compare_pd, axis=1)``.
        reference: Optional vector to measure against. When omitted, the
            notebook-level ``selected_vec`` is looked up at call time, which is
            how the one-argument form has always behaved.

    Returns:
        The Euclidean distance between the two vectors.
    """
    # Passing the reference explicitly avoids depending on whichever cell last
    # rebound the global selected_vec.
    if reference is None:
        reference = selected_vec
    return distance.euclidean(vector, reference)

# Benchmark the row-wise distance computation over the whole table, then run it
# once more outside the timer so the result is kept in distance_map.
%timeit glove_df.apply(compare_pd, axis=1)
distance_map = glove_df.apply(compare_pd, axis=1)
print(distance_map.shape)
distance_map.head()

This returns exactly what we want: the distance from a chosen word (in our case 'united') to every word in the table, and it does so in a reasonable amount of time.

In [None]:
# Use the same reference word, but take its row from the 3D embedding table.
selected_word = 'united'
selected_vec = embedding_df.loc[selected_word, :]

def compare_pd(vector, reference=None):
    """Return the Euclidean distance between ``vector`` and a reference vector.

    Args:
        vector: Vector to measure, e.g. a row handed in by
            ``DataFrame.apply(compare_pd, axis=1)``.
        reference: Optional vector to measure against. When omitted, the
            notebook-level ``selected_vec`` is looked up at call time, which is
            how the one-argument form has always behaved.

    Returns:
        The Euclidean distance between the two vectors.
    """
    # Passing the reference explicitly avoids depending on whichever cell last
    # rebound the global selected_vec.
    if reference is None:
        reference = selected_vec
    return distance.euclidean(vector, reference)

# Benchmark the row-wise distance computation on the 3D table, then run it once
# more outside the timer so the result is kept in distance_map_3d.
%timeit embedding_df.apply(compare_pd, axis=1)
distance_map_3d = embedding_df.apply(compare_pd, axis=1)
distance_map_3d.head()

We again notice similar performance for the 3D embedding. There is therefore no good reason to prefer the 3D embedding for distance computations, since the original embedding captures the distribution better in any case.

### Testing preprocessing

In [None]:
# Order the distances computed on the original embedding from smallest to largest.
sorted_map = distance_map.sort_values(ascending=True)
sorted_map

In [None]:
# Order the distances computed on the 3D embedding from smallest to largest.
sorted_3d_map = distance_map_3d.sort_values(ascending=True)
sorted_3d_map