In [1]:
import json
import re

import fasttext
import numpy as np
import torch
from tqdm.auto import tqdm

# Language pair for the alignment: source-language codes are mapped onto the target language.
src_lang, tgt_lang = "ur", "en"

In [3]:
# fastText model files, one per language (file names follow cc.<lang>.300.bin).
src_model = fasttext.load_model(f"../../../../datasets/cc.{src_lang}.300.bin")
tgt_model = fasttext.load_model(f"../../../../datasets/cc.{tgt_lang}.300.bin")

# Bilingual dictionary {source word: target word}. The keys are non-ASCII text, so the
# encoding is pinned to UTF-8 instead of relying on the platform's locale default.
with open(f"temp/{src_lang}_to_{tgt_lang}.json", "r", encoding="utf-8") as f:
    src_to_tgt_words = json.load(f)

# Parallel lists: src_toks[i] is paired with tgt_toks[i].
src_toks, tgt_toks = list(src_to_tgt_words.keys()), list(src_to_tgt_words.values())

# Row i of each matrix is the embedding of the i-th dictionary pair.
src_vecs = np.array([src_model.get_word_vector(tok) for tok in src_toks])
tgt_vecs = np.array([tgt_model.get_word_vector(tok) for tok in tgt_toks])

src_vecs.shape, tgt_vecs.shape

((9313, 300), (9313, 300))

In [3]:
# Search spaces for nearest-neighbour lookups: the first 200,000 entries returned by
# each model's get_words(), with one embedding row per token.
all_tgt_tokens = np.array(tgt_model.get_words())[:200_000]
all_tgt_vectors = np.array(
    [tgt_model.get_word_vector(token) for token in tqdm(all_tgt_tokens)]
)
all_src_tokens = np.array(src_model.get_words())[:200_000]
all_src_vectors = np.array(
    [src_model.get_word_vector(token) for token in tqdm(all_src_tokens)]
)

(
    all_tgt_tokens.shape,
    all_tgt_vectors.shape,
    all_src_tokens.shape,
    all_src_vectors.shape,
)

  0%|          | 0/200000 [00:00<?, ?it/s]

  0%|          | 0/200000 [00:00<?, ?it/s]

((200000,), (200000, 300), (200000,), (200000, 300))

In [4]:
# 80/20 train/validation split over the dictionary pairs.
# The permutation is drawn from a seeded generator so the held-out set stays the same
# across kernel restarts; an unseeded shuffle made every validation number below
# depend on the run.
SPLIT_SEED = 42
idxs = np.random.default_rng(SPLIT_SEED).permutation(len(src_vecs))

n_train = int(0.8 * len(src_vecs))
train_idxs = idxs[:n_train]
valid_idxs = idxs[n_train:]

n_train

7450

In [8]:
# https://github.com/babylonhealth/fastText_multilingual
def normalized(a, axis=-1, order=2):
    """
    Scale the vectors of ``a`` that lie along ``axis`` to unit ``order``-norm.

    Vectors whose norm is exactly zero are divided by 1 and therefore returned
    unchanged instead of producing NaNs.
    """
    norms = np.linalg.norm(a, ord=order, axis=axis)
    # a 1-D input yields a scalar norm; promote it so the mask assignment below works
    norms = np.atleast_1d(norms)
    norms[norms == 0] = 1
    # put the reduced axis back so the division broadcasts against ``a``
    return a / np.expand_dims(norms, axis)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Fit the orthogonal matrix that aligns source embeddings to target embeddings.

    Both inputs are numpy arrays of shape (dictionary_length, embedding_dimension)
    whose rows are paired word vectors from the bilingual dictionary. The rows
    are unit-normalized first unless ``normalize_vectors`` is False.

    Returns ``U @ V`` from the SVD of ``source_matrix.T @ target_matrix``, a
    square matrix of size embedding_dimension.
    """
    if normalize_vectors:
        source_matrix, target_matrix = normalized(source_matrix), normalized(target_matrix)

    # SVD of the product of the paired matrices
    left, _, right_t = np.linalg.svd(source_matrix.T @ target_matrix)

    # discarding the singular values leaves the orthogonal transformation
    return left @ right_t

In [5]:
# Fit the alignment on the training pairs only; the validation pairs stay held out.
R = learn_transformation(
    source_matrix=src_vecs[train_idxs],
    target_matrix=tgt_vecs[train_idxs],
)
# R = learn_transformation(src_vecs, tgt_vecs)
# torch.save(torch.from_numpy(R), "temp/align/ur_to_en_300x300.pt")

In [6]:
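# ALTERNATE METHOD
# Inspired by cross-domain similarity local scaling (CSLS):
# https://arxiv.org/abs/1804.07745
# NOTE(review): the functions below add neighbour-averaged vectors to the training
# pairs and then run the same SVD fit; they do not compute the CSLS score itself.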

def avg_vectors(vecs, all_vectors, n=10):
    """
    Replace each query vector by a weighted mean of its ``n`` nearest neighbours.

    Neighbours are the rows of ``all_vectors`` with the highest cosine similarity
    to each row of ``vecs``. The weights are the squared similarities, rescaled to
    sum to 1 per query, and they are applied to the un-normalized neighbour rows.
    Returns one averaged vector per row of ``vecs``.
    """
    cosine = normalized(vecs) @ normalized(all_vectors).T

    # column indices of the n largest similarities in each row (unordered within the n)
    nearest = np.argpartition(cosine, -n, axis=1)[:, -n:]

    # squared similarities as weights -- the author's customization
    weights = np.square(np.take_along_axis(cosine, nearest, axis=1))
    weights = weights / weights.sum(axis=1, keepdims=True)

    return (all_vectors[nearest] * weights[:, :, None]).sum(axis=1)

def csls(src_vectrs, tgt_vectrs, all_src_vectors, all_tgt_vectors, n=10):
    """
    Fit the alignment on the dictionary pairs augmented with neighbour averages.

    Each side is smoothed with ``avg_vectors`` over its own vocabulary, then
    ``learn_transformation`` is run on four stacked copies of the pairs:
    (src, tgt) twice, (smoothed src, tgt) and (src, smoothed tgt).
    Returns the matrix produced by ``learn_transformation``.
    """
    src_smoothed = avg_vectors(src_vectrs, all_src_vectors, n=n)
    tgt_smoothed = avg_vectors(tgt_vectrs, all_tgt_vectors, n=n)

    # row k of the stacked source is paired with row k of the stacked target
    stacked_src = np.concatenate([src_vectrs, src_vectrs, src_smoothed, src_vectrs])
    stacked_tgt = np.concatenate([tgt_vectrs, tgt_vectrs, tgt_vectrs, tgt_smoothed])

    return learn_transformation(stacked_src, stacked_tgt)

# Rebinds R: whichever fitting cell ran last decides what the evaluation cells use.
R = csls(
    src_vectrs=src_vecs[train_idxs],
    tgt_vectrs=tgt_vecs[train_idxs],
    all_src_vectors=all_src_vectors,
    all_tgt_vectors=all_tgt_vectors,
    n=10,
)
# R = csls(src_vecs, tgt_vecs, all_src_vectors, all_tgt_vectors, n=10)
# torch.save(torch.from_numpy(R), "temp/align/ur_to_en_300x300_csls.pt")

In [17]:
# Spot-check one random validation pair: cosine similarity before and after alignment.
# src_toks / tgt_toks / src_vecs / tgt_vecs are index-aligned, so one index selects the
# whole pair. Looking the target up through `.lower()` raised ValueError for capitalised
# dictionary values, because tgt_toks keeps the original casing.
pair_idx = np.random.choice(valid_idxs)
src_word, tgt_word = src_toks[pair_idx], tgt_toks[pair_idx]
src_vec, tgt_vec = src_vecs[pair_idx], tgt_vecs[pair_idx]

# shared denominator of both cosine similarities
norm_product = np.linalg.norm(src_vec) * np.linalg.norm(tgt_vec)

print("source: ", src_word, "target: ", tgt_word)
print("original similarity: ", (src_vec @ tgt_vec) / norm_product)

# the source vector is mapped into the target space with R before comparing
print("aligned similarity 1: ", ((src_vec @ R) @ tgt_vec) / norm_product)

source:  سراغ target:  clue
original similarity:  -0.11701719
aligned similarity 1:  0.29586565


In [56]:
# Cosine similarity of every aligned dictionary source vector against the target vocabulary.
all_sims = np.matmul(
    normalized(np.matmul(src_vecs, R)),
    normalized(all_tgt_vectors).T,
)
# rows belonging to the held-out pairs
val_sims = all_sims[valid_idxs]
val_sims.shape

(1863, 200000)

In [66]:
# Show the five most similar target-vocabulary tokens for one random dictionary entry.
i = np.random.choice(idxs) # src_toks.index("صبر")
# ascending argsort reversed, then the first five = the five highest similarities
idx = np.argsort(all_sims[i])[::-1][:5]
sims = all_sims[i, idx]
(
    src_toks[i],
    src_to_tgt_words[src_toks[i]],
    list(zip(all_tgt_tokens[idx], sims)),
)

('جسٹس',
 'Justice',
 [('Judge', 0.5101216),
  ('Justice', 0.45613417),
  ('Court', 0.4475362),
  ('Commissioner', 0.4444608),
  ('Magistrate', 0.43128565)])

In [None]:
# accuracy


## Alignment dataset

In [None]:
from deep_translator import GoogleTranslator
# Translator from 'ur' to 'en', used to build the bilingual dictionary.
gt = GoogleTranslator(source='ur', target='en')
# with open(f"drive/Othercomputers/mac19/personal_repos/slt/notebooks/text/temp/ur_words.json", "r") as f:
# Word list to translate. The encoding is pinned to UTF-8 so the non-ASCII words are
# decoded the same way on every platform.
with open("temp/ur_words.json", "r", encoding="utf-8") as f:
    ur_words = json.load(f)

# Cache of finished translations: {word: translation}, with "" marking a failed request.
translations = {}
def translate(w):
    """
    Translate ``w`` with the module-level ``gt`` translator and cache the result.

    Words that already have a non-empty entry in ``translations`` are skipped.
    A failed request stores "" so the word is attempted again on a later call.
    Returns None; read the result from ``translations[w]``.
    """
    if translations.get(w, ""):
        return
    try:
        trans = gt.translate(w)
    except Exception:
        # Best-effort: one failed lookup must not stop the batch. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit propagate.
        trans = ""
    translations[w] = trans

In [None]:
# SAVE
# with open("drive/Othercomputers/mac19/personal_repos/slt/notebooks/text/translations.json", "w") as f:
# Persist only the successful (non-empty) translations.
# - UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII characters.
# - Nothing is written when there is nothing to save, so running this cell on a fresh
#   kernel (empty `translations`) cannot wipe the cache that the LOAD cell reads.
non_empty_translations = {k: v for k, v in translations.items() if v}
if non_empty_translations:
    with open("translations.json", "w", encoding="utf-8") as f:
        json.dump(non_empty_translations, f, ensure_ascii=False, indent=0)
else:
    print("no translations in memory; translations.json left untouched")

In [None]:
# LOAD
# with open("drive/Othercomputers/mac19/personal_repos/slt/notebooks/text/translations.json", "r") as f:
# Restore the translation cache from disk. The file is written with ensure_ascii=False,
# so it is read back as UTF-8 rather than with the platform's locale default.
with open("translations.json", "r", encoding="utf-8") as f:
    translations = json.load(f)
len(translations)

10124

In [None]:
import sign_language_translator as slt
# Translate every word that has no non-empty cached translation yet; each argument
# tuple holds the single word passed to translate().
slt.utils.threaded_map(
    translate,
    [(word,) for word in ur_words if not translations.get(word, "")],
    time_delay=5e-3,
    timeout=30,
)

In [None]:
# Build the single-word dictionary from the raw translations.
# The pairs are assembled before the file is opened, so a failure while filtering can
# no longer leave a truncated ur_to_en.json behind. `re` comes from the import cell.
ur_to_en_pairs = []
for ur_word, translation in translations.items():
    # skip multi-word source entries and entries made only of non-word characters
    if " " in ur_word or re.match(r"^\W+$", ur_word):
        continue
    # drop one leading "the "/"The "/"A "/"to " (tried in that order), then trim
    en_word = (
        translation.removeprefix("the ")
        .removeprefix("The ")
        .removeprefix("A ")
        .removeprefix("to ")
        .strip()
    )
    # keep only non-empty, single-word, ASCII targets
    if en_word and " " not in en_word and en_word.isascii():
        ur_to_en_pairs.append((ur_word, en_word))

# order: more fastText tokens in the target first, then shorter source words, then alphabetical
ur_to_en_pairs.sort(key=lambda pair: (-len(fasttext.tokenize(pair[1])), len(pair[0]), pair[0]))

# keys are non-ASCII and ensure_ascii=False writes them verbatim, so pin the encoding
with open("ur_to_en.json", "w", encoding="utf-8") as f:
    json.dump(dict(ur_to_en_pairs), f, ensure_ascii=False, indent=0)

In [None]:
# Shell escape: runs alignment/align.py from the local fastText checkout on the ur/en
# .vec embedding files and the train/test dictionaries under temp/align/, with the
# output name, --maxload, --lr and --niter given below.
# NOTE(review): presumably this is fastText's supervised alignment script; confirm
# the meaning of each flag against that checkout before changing the values.
!python3 ../../fastText/alignment/align.py \
    --src_emb temp/align/cc.ur.300.vec --tgt_emb temp/align/cc.en.300.vec \
    --dico_train temp/align/dico_train.txt --dico_test temp/align/dico_test.txt \
    --maxload 201000 \
    --output ur_to_en_align \
    --lr 25 --niter 10