# Loading word embeddings

In [4]:
import io
import numpy as np

In [None]:
def load_vec(emb_path, nmax=50000):
    """Read up to `nmax` word vectors from a text embedding file.

    The file is opened as UTF-8 with undecodable bytes ignored. Its first
    line is skipped (presumably a header line -- confirm against the file
    format). Every following line is split on its first space into a word
    and a space-separated list of numbers.

    Args:
        emb_path: path of the embedding file to read.
        nmax: reading stops as soon as this many words have been stored.

    Returns:
        A tuple ``(embeddings, id2word, word2id)``: the vectors stacked
        row-wise in file order, the id -> word mapping and the word -> id
        mapping. Ids are assigned in file order starting at 0.

    Raises:
        AssertionError: if the same word appears on two lines.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line of the file
        for raw_line in handle:
            token, numbers = raw_line.rstrip().split(' ', 1)
            row = np.fromstring(numbers, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(row)
            word2id[token] = len(word2id)

            n_loaded = len(word2id)
            # Progress message every 500 stored words.
            if n_loaded % 500 == 0:
                print("loaded : %d" % n_loaded)
            if n_loaded == nmax:
                break
    id2word = dict((index, token) for token, index in word2id.items())
    embeddings = np.vstack(rows)
    return embeddings, id2word, word2id

In [6]:
# Cap on the number of word embeddings read from each file.
nmax = 50000  # maximum number of word embeddings to load

# Embedding files for the source and target sides.
src_path = 'wiki.multi.ar.vec'
tgt_path = 'wiki.multi.he.vec'

# Each call yields the embedding matrix plus the id -> word and word -> id maps.
src_embeddings, src_id2word, src_word2id = load_vec(emb_path=src_path, nmax=nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(emb_path=tgt_path, nmax=nmax)

loaded : 500
loaded : 1000
loaded : 1500
loaded : 2000
loaded : 2500
loaded : 3000
loaded : 3500
loaded : 4000
loaded : 4500
loaded : 5000
loaded : 5500
loaded : 6000
loaded : 6500
loaded : 7000
loaded : 7500
loaded : 8000
loaded : 8500
loaded : 9000
loaded : 9500
loaded : 10000
loaded : 10500
loaded : 11000
loaded : 11500
loaded : 12000
loaded : 12500
loaded : 13000
loaded : 13500
loaded : 14000
loaded : 14500
loaded : 15000
loaded : 15500
loaded : 16000
loaded : 16500
loaded : 17000
loaded : 17500
loaded : 18000
loaded : 18500
loaded : 19000
loaded : 19500
loaded : 20000
loaded : 20500
loaded : 21000
loaded : 21500
loaded : 22000
loaded : 22500
loaded : 23000
loaded : 23500
loaded : 24000
loaded : 24500
loaded : 25000
loaded : 25500
loaded : 26000
loaded : 26500
loaded : 27000
loaded : 27500
loaded : 28000
loaded : 28500
loaded : 29000
loaded : 29500
loaded : 30000
loaded : 30500
loaded : 31000
loaded : 31500
loaded : 32000
loaded : 32500
loaded : 33000
loaded : 33500
loaded : 34000


# Get nearest neighbors

In [7]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    """Print the K target words most cosine-similar to `word`.

    The embedding of `word` is looked up in `src_emb`, every row of
    `tgt_emb` and the query vector are L2-normalised, and their dot
    products are used as similarity scores. Results are printed, best
    first, one ``score - word`` line each; nothing is returned.

    Args:
        word: query word; must be a value of `src_id2word`.
        src_emb: array indexed by source word id.
        src_id2word: mapping from source id to source word.
        tgt_emb: 2-D array with one row per target word id.
        tgt_id2word: mapping from target id (row index of `tgt_emb`) to word.
        K: number of neighbours to print; ``K=0`` prints none.

    Raises:
        KeyError: if `word` is not among the values of `src_id2word`.
    """
    print("Nearest neighbors of \"%s\":" % word)
    # Invert the id -> word map to find the query word's row.
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    # Reverse first, then truncate. For K >= 1 this matches [-K:][::-1],
    # but K == 0 now selects nothing, whereas [-0:] selected every row.
    k_best = scores.argsort()[::-1][:K]
    for idx in k_best:
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

In [8]:
# Nearest neighbours within the source space: the source embeddings serve
# both as the lookup table for the query and as the candidate set.
src_word = u'ماء'
get_nn(
    src_word,
    src_emb=src_embeddings,
    src_id2word=src_id2word,
    tgt_emb=src_embeddings,
    tgt_id2word=src_id2word,
    K=5,
)

Nearest neighbors of "ماء":
1.0000 - ماء
0.6451 - الماء
0.6000 - وماء
0.5760 - مياه
0.5574 - بالماء


In [9]:
# Nearest neighbours in the target space: the query vector comes from the
# source embeddings, the candidates from the target embeddings.
src_word = u'ماء'
get_nn(
    src_word,
    src_emb=src_embeddings,
    src_id2word=src_id2word,
    tgt_emb=tgt_embeddings,
    tgt_id2word=tgt_id2word,
    K=5,
)

Nearest neighbors of "ماء":
0.2247 - מגלן
0.2203 - מקוצר
0.2173 - vehicle
0.2064 - שורר
0.2056 - טלר


# Visualize multilingual embeddings

In [10]:
from sklearn.decomposition import PCA

# Fit one whitened 2-component PCA on the source and target embeddings
# stacked together, so both sides are projected with the same transform.
# (TSNE(n_components=2, n_iter=3000, verbose=2) was noted as an alternative.)
pca = PCA(whiten=True, n_components=2)
pca.fit(np.vstack((src_embeddings, tgt_embeddings)))
print('Variance explained: %.2f' % pca.explained_variance_ratio_.sum())

Variance explained: 0.10


In [11]:
import matplotlib.pyplot as plt


def plot_similar_word(src_words, src_word2id, src_emb, tgt_words, tgt_word2id, tgt_emb, pca):
    """Scatter-plot selected source and target words in the 2-D space of `pca`.

    The embedding of each word is looked up through its word -> id map,
    projected with ``pca.transform`` and drawn as an 'x' marker annotated
    with the word itself. Source words are labelled in blue, target words
    in red. The figure is shown with ``plt.show()``; nothing is returned.

    Args:
        src_words: source words to plot; each must be a key of `src_word2id`.
        src_word2id: mapping from source word to row index in `src_emb`.
        src_emb: source embeddings, indexed by word id.
        tgt_words: target words to plot; each must be a key of `tgt_word2id`.
        tgt_word2id: mapping from target word to row index in `tgt_emb`.
        tgt_emb: target embeddings, indexed by word id.
        pca: fitted projection object exposing ``transform``.
    """
    # Source words first, then target words; labels and vectors stay aligned.
    labelled = [(word, src_emb[src_word2id[word]]) for word in src_words]
    labelled += [(word, tgt_emb[tgt_word2id[word]]) for word in tgt_words]
    labels = [word for word, _ in labelled]
    vectors = [vector for _, vector in labelled]

    # Project the selected vectors with the supplied (already fitted) transform.
    projected = pca.transform(vectors)
    xs, ys = projected[:, 0], projected[:, 1]

    # Draw the points, then write each word at its own point.
    plt.figure(figsize=(10, 8), dpi=80)
    plt.scatter(xs, ys, marker='x')

    for position, (text, px, py) in enumerate(zip(labels, xs, ys)):
        # The first len(src_words) entries are source words.
        color = 'red' if position >= len(src_words) else 'blue'
        plt.annotate(text, xy=(px, py), xytext=(0, 0), textcoords='offset points',
                     fontsize=19, color=color, weight='bold')

    # Leave a small margin around the extreme points.
    plt.xlim(xs.min() - 0.2, xs.max() + 0.2)
    plt.ylim(ys.min() - 0.2, ys.max() + 0.2)
    plt.title('Visualization of the multilingual word embedding space')

    plt.show()

In [None]:
# Six fixed words per side to visualise (source list first, target list second).
src_words = [u'ماء', u'جامعة', u'بيت', u'طعام', u'سيارة', u'كلب']
tgt_words = [u'מים', u'אוניברסיטה', u'בית', u'אוכל', u'אוטו', u'כלב']

# Fail early, naming the offending word, if any word is missing from the
# loaded vocabularies (plot_similar_word would otherwise raise a bare KeyError).
for sw in src_words:
    assert sw in src_word2id, '"%s" not in source dictionary' % sw
for tw in tgt_words:
    # Report the target word itself; the message previously interpolated `sw`.
    assert tw in tgt_word2id, '"%s" not in target dictionary' % tw

plot_similar_word(src_words, src_word2id, src_embeddings, tgt_words, tgt_word2id, tgt_embeddings, pca)