In [1]:
import json

import numpy as np
import torch
from tqdm.auto import tqdm


def load_vectors(path: str):
    """Load word embeddings from a fastText-style text ``.vec`` file.

    The first line is read as ``"<n_lines> <n_dim>"``. Every following line
    is expected to hold a token followed by ``n_dim`` space-separated numbers.

    Fields are split on the ASCII space only. A bare ``str.split()`` also
    splits on every other Unicode whitespace character (e.g. U+00A0), which
    breaks tokens containing such characters into extra fields and makes
    their lines fail the dimension check, silently dropping them.

    Args:
        path: Path to the UTF-8 encoded vector file.

    Returns:
        A ``(tokens, vectors)`` tuple. ``tokens`` is a list of str and
        ``vectors`` is a float32 array of shape ``(len(tokens), n_dim)``
        whose i-th row belongs to ``tokens[i]``.
    """
    tokens, vectors = [], []
    with open(path, 'r', encoding="utf-8") as f:
        # n_lines is only used to size the progress bar.
        n_lines, n_dim = map(int, f.readline().split())
        for line in tqdm(f, total=n_lines, desc='Loading vectors'):
            # rstrip() removes the newline (and any trailing space) so the
            # last component is clean; blank lines become [''] and are
            # rejected by the length check instead of breaking the unpacking.
            word, *vector = line.rstrip().split(" ")
            if len(vector) != n_dim:
                # Wrong number of components: skip the malformed line.
                continue
            tokens.append(word)
            vectors.append(np.array(vector, dtype=np.float32))

    print("loaded: ", len(tokens), len(vectors))
    if not vectors:
        # np.stack raises on an empty list; return a well-shaped empty matrix.
        return tokens, np.empty((0, n_dim), dtype=np.float32)
    return tokens, np.stack(vectors)

In [2]:
# Bilingual dictionary: source (Urdu) token -> English translation. The
# encoding is pinned so that reading the non-ASCII keys does not depend on
# the platform's default text encoding.
with open("ur_to_en.json", "r", encoding="utf-8") as f:
    ur_to_en = json.load(f)
    src_toks, tgt_toks = list(ur_to_en.keys()), [tok.lower() for tok in ur_to_en.values()]


def first_occurrence_index(tokens):
    """Map every token to the position of its first occurrence in `tokens`.

    Returns the same position as `tokens.index(tok)`, but one dict lookup per
    query replaces a full scan of the vocabulary list.
    """
    index = {}
    for position, tok in enumerate(tokens):
        # setdefault keeps the earliest position when a token repeats, which
        # matters for the lower-cased target vocabulary below.
        index.setdefault(tok, position)
    return index


# Source side: pick the embedding row of every dictionary key.
# A dictionary token missing from the vocabulary raises KeyError here.
src_tokens, src_vectors = load_vectors("temp/align/cc.ur.300.vec")
src_lookup = first_occurrence_index(src_tokens)
src_idxs = np.array([src_lookup[tok] for tok in src_toks])
src_vecs = src_vectors[src_idxs]

# Target side: the vocabulary is lower-cased to match the lower-cased
# dictionary values before the lookup.
tgt_tokens, tgt_vectors = load_vectors("temp/align/cc.en.300.vec")
tgt_tokens = [t.lower() for t in tgt_tokens]
tgt_lookup = first_occurrence_index(tgt_tokens)
tgt_idxs = np.array([tgt_lookup[tok] for tok in tgt_toks])
tgt_vecs = tgt_vectors[tgt_idxs]

src_vecs.shape, tgt_vecs.shape

Loading vectors:   0%|          | 0/200123 [00:00<?, ?it/s]

loaded:  200117 200117


Loading vectors:   0%|          | 0/200604 [00:00<?, ?it/s]

loaded:  200604 200604


((9416, 300), (9416, 300))

In [3]:
# 80/20 train/validation split over the dictionary pairs.
# A seeded local generator makes the split identical after every
# Restart & Run All, so validation numbers are comparable between runs.
split_rng = np.random.default_rng(42)
idxs = split_rng.permutation(len(src_vecs))
n_train = int(0.8 * len(src_vecs))
train_idxs = idxs[:n_train]
valid_idxs = idxs[n_train:]

n_train

7532

In [4]:
def normalized(a, axis=-1, order=2):
    """Scale `a` so that its vectors along `axis` have norm 1 under `order`.

    Vectors whose norm is zero are divided by 1, i.e. returned unchanged.
    """
    norms = np.linalg.norm(a, ord=order, axis=axis)
    # A 1-D input yields a scalar norm; promote it so it can be masked below.
    norms = np.atleast_1d(norms)
    zero_mask = norms == 0
    norms[zero_mask] = 1
    # Re-insert the reduced axis so the division broadcasts against `a`.
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Fit the orthogonal matrix that aligns source vectors to target vectors.

    `source_matrix` and `target_matrix` are numpy arrays of shape
    (dictionary_length, embedding_dimension); row i of each holds the two
    word vectors of the i-th pair of the bilingual dictionary. When
    `normalize_vectors` is true, both are passed through `normalized` first.

    Returns a square (embedding_dimension, embedding_dimension) matrix.
    """
    src, tgt = source_matrix, target_matrix
    if normalize_vectors:
        src, tgt = normalized(src), normalized(tgt)

    # np.linalg.svd returns the right factor already transposed, so the
    # product of the two outer factors is the orthogonal transformation.
    left, _singular_values, right_h = np.linalg.svd(src.T @ tgt)
    return left @ right_h

In [130]:
# Fit the alignment matrix on the training pairs only; the pairs selected by
# valid_idxs stay held out for the checks in the cells below.
R = learn_transformation(src_vecs[train_idxs], tgt_vecs[train_idxs])
# Disabled alternative: fit on the full dictionary and save R to disk.
# R = learn_transformation(src_vecs, tgt_vecs)
# torch.save(torch.tensor(R), "temp/align/ur_to_en_300x300.pt")

In [6]:
# Draw one held-out source word and fetch its embedding.
valid_words = np.array(src_toks)[valid_idxs]
src_word = np.random.choice(valid_words, 1)[0]
src_vec = src_vecs[src_toks.index(src_word)]

# Its dictionary translation (lower-cased) and that word's embedding.
tgt_word = ur_to_en[src_word].lower()
tgt_vec = tgt_vecs[tgt_toks.index(tgt_word)]

# Both similarities below are divided by the same product of the two
# original vector norms.
norm_product = np.linalg.norm(src_vec) * np.linalg.norm(tgt_vec)

print("source: ", src_word, "target: ", tgt_word)
print("original similarity: ", (src_vec @ tgt_vec) / norm_product)
print("aligned similarity: ", ((src_vec @ R) @ tgt_vec) / norm_product)

source:  لائی target:  brought
original similarity:  0.09436258
aligned similarity:  0.4547023


In [84]:
# Row-normalize the aligned dictionary source vectors and the whole target
# vocabulary, so that their dot products are cosine similarities.
aligned_src_unit = normalized(np.matmul(src_vecs, R))
tgt_vocab_unit = normalized(tgt_vectors)

# One row per dictionary source word, one column per target-vocabulary token.
all_sims = np.matmul(aligned_src_unit, tgt_vocab_unit.T)
val_sims = all_sims[valid_idxs]
val_sims.shape

(1884, 200604)

In [129]:
# Query word; use np.random.choice(valid_idxs) instead to inspect a random
# held-out word.
i = src_toks.index("ٹین")
# Target-vocabulary positions sorted by ascending similarity to the query.
ranked = np.argsort(all_sims[i])
# Keep the five largest and flip them so the best match comes first.
idx = ranked[-5:][::-1]
sims = all_sims[i][idx]
np.array(src_toks)[i], list(zip(np.array(tgt_tokens)[idx], sims))

('ٹین',
 [('pancake', 0.32896447),
  ('k-', 0.31401414),
  ('loft', 0.30195698),
  ('boys', 0.29707932),
  ('caulking', 0.2964992)])