In [1]:
import pandas as pd
import csv
from zipfile import ZipFile

# Dimensionality of the GloVe vectors to load; selects the archive member below
N_DIMS = 300

# Context managers close both the archive and the member stream as soon as
# parsing is finished, instead of leaving the handles open in the kernel.
with ZipFile("../embeddings/glove6b/glove.6B.zip") as z:
    with z.open(f'glove.6B.{N_DIMS}d.txt') as f:
        word_matrix = pd.read_table(
            f, sep=" ", index_col=0,
            header=None, quoting=csv.QUOTE_NONE,
            # Tokens such as "nan", "null" or "n/a" are vocabulary entries here,
            # not missing values; without this they become NaN index labels.
            keep_default_na=False,
        )
word_matrix.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,0.04656,0.21318,-0.007436,-0.45854,-0.035639,0.23643,-0.28836,0.21521,-0.13486,-1.6413,...,-0.013064,-0.29686,-0.079913,0.195,0.031549,0.28506,-0.087461,0.009061,-0.20989,0.053913
",",-0.25539,-0.25723,0.13169,-0.042688,0.21817,-0.022702,-0.17854,0.10756,0.058936,-1.3854,...,0.075968,-0.014359,-0.073794,0.22176,0.14652,0.56686,0.053307,-0.2329,-0.12226,0.35499
.,-0.12559,0.01363,0.10306,-0.10123,0.098128,0.13627,-0.10721,0.23697,0.3287,-1.6785,...,0.060148,-0.15619,-0.11949,0.23445,0.081367,0.24618,-0.15242,-0.34224,-0.022394,0.13684
of,-0.076947,-0.021211,0.21271,-0.72232,-0.13988,-0.12234,-0.17521,0.12137,-0.070866,-1.5721,...,-0.36673,-0.38603,0.3029,0.015747,0.34036,0.47841,0.068617,0.18351,-0.29183,-0.046533
to,-0.25756,-0.057132,-0.6719,-0.38082,-0.36421,-0.082155,-0.010955,-0.082047,0.46056,-1.8477,...,-0.012806,-0.59707,0.31734,-0.25267,0.54384,0.063007,-0.049795,-0.16043,0.046744,-0.070621


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarities for a handful of words, labelled on both axes
word_list = ["dog", "cat", "carbon"]
words = word_matrix.loc[word_list]
sims = pd.DataFrame(
    cosine_similarity(words),
    index=word_list,
    columns=word_list,
)
sims

Unnamed: 0,dog,cat,carbon
dog,1.0,0.681675,0.019671
cat,0.681675,1.0,0.01033
carbon,0.019671,0.01033,1.0


In [3]:
# Cosine similarity between 2 words (1 - cosine distance)
from scipy.spatial.distance import cosine
# Fetch both row vectors in one statement; scipy's `cosine` is a distance,
# so subtracting it from 1 gives the similarity.
vec_a, vec_b = word_matrix.loc["paris"], word_matrix.loc["france"]
1 - cosine(vec_a, vec_b)

0.6580672325437252

In [4]:
# Rank the whole vocabulary by cosine similarity to one query word
vec_a = word_matrix.loc["cat"]
# Row-wise cosine distance to the query vector, then flipped into a similarity
distances = word_matrix.apply(lambda row: cosine(row, vec_a), axis=1)
sims = 1 - distances
sims.sort_values(ascending=False).head(6)

0
cat       1.000000
dog       0.681675
cats      0.681584
pet       0.587037
dogs      0.540767
feline    0.489797
dtype: float64

In [5]:
# function for similar words to x
def similar_words(word, word_matrix):
    """Rank every row of ``word_matrix`` by cosine similarity to ``word``.

    Parameters
    ----------
    word : label
        Index label of the query row in ``word_matrix``.
    word_matrix : pandas.DataFrame
        One numeric embedding vector per row, indexed by word.

    Returns
    -------
    pandas.Series
        Cosine similarity of each row to the query row, indexed like
        ``word_matrix`` and sorted from most to least similar. Rows with a
        zero norm yield NaN and sort last.

    Raises
    ------
    KeyError
        If ``word`` is not a label in ``word_matrix.index``.
    """
    vectors = word_matrix.to_numpy(dtype=float)
    query = word_matrix.loc[word].to_numpy(dtype=float)

    # One matrix-vector product and one pass for the norms replace a
    # Python-level function call per vocabulary row.
    dots = vectors @ query
    norms = ((vectors * vectors).sum(axis=1) * (query @ query)) ** 0.5

    sims = pd.Series(dots / norms, index=word_matrix.index)
    # Rounding can push a value marginally outside the valid cosine range
    sims = sims.clip(-1.0, 1.0)
    return sims.sort_values(ascending=False)

# Six closest words to "carbon"; the query word itself ranks first
carbon_neighbours = similar_words("carbon", word_matrix)
carbon_neighbours.head(6)

0
carbon        1.000000
dioxide       0.847373
emissions     0.768768
co2           0.753625
greenhouse    0.701156
gases         0.675826
dtype: float64

In [6]:
# Word analogy by vector arithmetic: take the paris-minus-france offset away
# from berlin, then look for the vocabulary rows closest to the result.
diff = word_matrix.loc["paris"].sub(word_matrix.loc["france"])
vec_d = word_matrix.loc["berlin"].sub(diff)
distances = word_matrix.apply(lambda row: cosine(row, vec_d), axis=1)
sims = 1 - distances
sims.sort_values(ascending=False).head(6)

0
germany    0.809877
berlin     0.657476
german     0.619871
france     0.616306
austria    0.564680
poland     0.561546
dtype: float64

In [7]:
## Document embeddings

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
# Two short documents to compare
docs = [
  "The acclaimed author penned novels based on her life",
  "Nobel prize-winning writer writes autobiographical fiction"
]
# Learn the vocabulary first, then count tokens per document (one row each)
vec = CountVectorizer().fit(docs)
dfmat = vec.transform(docs).todense()
dfmat

matrix([[1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1]])

In [9]:
import numpy as np
# Words that have both a GloVe vector and a column in the document-term matrix.
# Sorting pins the order: iterating a set of strings can differ between kernel
# restarts, which would reshuffle the summation order of the dot product below
# and make the last digits of the result non-reproducible.
common_features = sorted(set(word_matrix.index) & set(vec.get_feature_names_out()))
# Column positions of those words in the count matrix
vocab_ids = [vec.vocabulary_[x] for x in common_features]
glove_dfmat = dfmat[:,vocab_ids]
# GloVe rows in the same order as the selected count columns
corpus_word_matrix = word_matrix.loc[common_features]
# Count-weighted sum of word vectors: one row per document
doc_matrix = glove_dfmat.dot(corpus_word_matrix)
# Cosine similarity between the two document vectors
1 - cosine(doc_matrix[0,].A1, doc_matrix[1,].A1)

0.7185793601183077