# Text Embeddings

In [None]:
!pip install sign-language-translator==0.7.2

In [11]:
import json
import math
import re
from collections import Counter

import numpy as np
import torch
from tqdm.auto import tqdm

import sign_language_translator as slt

## Word List

`from our json datasets`

In [2]:
lang = slt.languages.text.Urdu()
lang.vocab.misspelled_to_correct={}
voc = lang.vocab
symbols = "".join(lang.SYMBOLS)

senses = {w.strip("()").replace("-", " ") for t in voc.supported_words_with_word_sense for w in  re.findall(r'\([^\(\)]+\)', t) if "handed" not in w}
used = set(voc.supported_words) | set(voc.person_names) | set(voc.words_to_numbers.keys()) | senses
used |= {(w.upper() if w.isascii() else w) for w in used}
used = {w for u in used for w in [u]+u.split()}
used = used.union(lang.allowed_characters())
used = used.union({"بسم اللہ الرحمن الرحیم"})
used = used.union(set(":;"))
len(used)

2803

`from text corpora`

In [None]:
word_counts = {}
bar = tqdm(total=751_000_000)
drive_path = "/Users/mudassar.iqbal/Library/CloudStorage/GoogleDrive-mdsriqb@gmail.com/My Drive/sign-language-translator/sign-language-datasets"
path = "../../../../datasets/news.txt" # 751_000_000 words  # 5M unique words
# path = drive_path + "/temp/temp/raw_corpora/wikipedia.txt"  # 2_648_777 lines  # 1.8M unique words
# path = "r.txt"  # 895_790 lines  #
# path = "glosbe.txt"  # 100_939 lines  # 76K unique words
# path = drive_path+"/temp/temp/raw_corpora/ur_wikipedia_lines.json"
# path = drive_path+"/temp/temp/raw_corpora/glosbe.json"
with open(path, "r") as f:
    for line in f:
    # data = json.load(f)
    # for line in tqdm(data.keys()):
        for word in lang.wikipedia_preprocessor(line).split():  # slt.text.utils.make_ngrams(line.split(), 2)
            word_counts[word] = word_counts.get(word, 0) + 1
            bar.update(1)
bar.close()
len(word_counts), sum(word_counts.values())

  0%|          | 0/100939 [00:00<?, ?it/s]

(76370, 2303747)

In [None]:
# normalize
processed_counts = {}
for w, c in tqdm(word_counts.items()):
    processed_word = lang.preprocess(w).strip(symbols)
    if re.sub(r"[^\w]", "", processed_word) in ("", " "):
        continue
    processed_counts[processed_word] = processed_counts.get(processed_word, 0) + c
len(processed_counts)

  0%|          | 0/76370 [00:00<?, ?it/s]

53181

In [None]:
# drop rare
processed_counts = {w: c for w, c in processed_counts.items() if (c > 4) and not re.match(r"^[\d -]+$", w)}
len(processed_counts)

13921

In [None]:
sorted_counts = sorted(processed_counts.items(), key=lambda x: x[1], reverse=True)
print(len(sorted_counts))
sorted_counts[-100:]

In [None]:
with open("temp/glosbe_unigrams.json", "w", encoding="utf-8") as f:
    json.dump(dict(sorted_counts), f, ensure_ascii=False, indent=0)

`compile`

In [None]:
counts = []
for path in [
    "temp/rekhta_unigrams.json",
    "temp/glosbe_unigrams.json",
    "temp/news_unigrams.json",
    "temp/wikisearch_unigrams.json",
    "temp/wikipedia_unigrams.json"
]:
    with open(path, "r", encoding="utf-8") as f:
        counts.append(sorted(json.load(f).items(), key=lambda x: x[1], reverse=True))

In [None]:
([len(c) for c in counts]), sum(len(c) for c in counts)

([24868, 13917, 81860, 45199, 32924], 198768)

In [None]:
words = {u:0 for u in used}
for data in counts:
    for w, c in data[:len(data)//10]:
        # if w not in ur_words:
        words[w] = words.get(w, 0) + c
len(words)

10764

In [None]:
# save
# with open("words.json", 'r', encoding="utf-8") as f:
#     words = json.load(f)
with open("words.json", 'w', encoding="utf-8") as f:
    json.dump(dict(sorted(words.items(), key=lambda x: (-x[1], x[0]), reverse=False)), f, ensure_ascii=False, indent=0)
len(words)

10330

## FastText CC

In [2]:
from fasttext.util import download_model, reduce_model
import fasttext
import torch
import numpy as np
# download_model("hi", if_exists="ignore")

In [3]:
model = fasttext.load_model("../../../../datasets/cc.ur.300.bin")
# model = reduce_model(model, 300)



In [4]:
def ft_embed(word, model):
    if " " not in word:
        return model[word]
    else:
        return (
        model[word]
        + model.get_sentence_vector(word)
        + (
            model[word.replace(" ", "-")]
            + model[word.replace(" ", "_")]
            + model[word.replace(" ", "") *4]
        )/6/2
    )/2.5

In [None]:
# similar words from fasttext
# model.get_subwords("عبدالرزاق")[0]
vec = np.mean([ft_embed(lang.preprocess(w), model) for w in ["عبدالرزاق"]], axis=0)
similarities = model.get_input_matrix() @ vec
idxs = (similarities).argsort()[::-1]
werds=np.array(model.get_labels())[idxs[idxs < len(model.get_labels())][:50]]
werds

In [5]:
with open("temp/ur_words.json", "r") as f:
    ur_words = list(json.load(f))

embeddings = np.array([ft_embed(w, model) for w in ur_words])
embeddings.shape

(10324, 300)

In [11]:
torch.save({
    "tokens": (ur_words),
    "vectors": torch.from_numpy(embeddings),
    "alignment": torch.load("temp/align/ur_to_en_300x300.pt"),
    # "alignment": torch.eye(300),
    "description": "FastText[1] embeddings for Urdu words with an orthogonal alignment[2] matrix that maps the vectors to synonymous English embeddings.\n(1. https://fasttext.cc/docs/en/crawl-vectors.html, 2. https://github.com/babylonhealth/fastText_multilingual)",
    }, "temp/lookup-ur-fasttext-cc.pt")
vlm = slt.models.VectorLookupModel.load("temp/lookup-ur-fasttext-cc.pt")

matrix = vlm.vectors.numpy()
matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)

In [25]:
vlm.description

'FastText[1] embeddings for Urdu words with an orthogonal alignment[2] matrix that maps the vectors to synonymous English embeddings.\n(1. https://fasttext.cc/docs/en/crawl-vectors.html, 2. https://github.com/babylonhealth/fastText_multilingual)'

In [14]:
en_model = fasttext.load_model("../../../../datasets/cc.en.300.bin")
en_tokens = np.array(en_model.get_words())[:100_000]
en_embeddings = np.array([en_model[w] for w in en_tokens])
en_matrix = en_embeddings / np.linalg.norm(en_embeddings, axis=1, keepdims=True)



In [24]:
vector = vlm.embed("مصنف", post_normalize=True, align=True).numpy()
idxs = (en_matrix @ vector).argsort()[::-1]
en_tokens[idxs[:10]]

array(['author', 'writer', 'novelist', 'essayist', 'historian',
       'scriptwriter', 'screenwriter', 'biographer', 'book', 'scholar'],
      dtype='<U1003')

In [None]:
# words not in my vocab
old = set(ur_words)
t = (model.get_words(include_freq=True))
new = [(lang.character_normalize(item[0]), item[1]) for item in sorted(list(zip(t[0], t[1].tolist())), key=lambda x: x[::-1], reverse=True)
       if (lang.character_normalize(item[0]) not in old and not item[0].isascii())]

In [19]:
avg_words = ["آمد و رفت", ]
vector = torch.stack([vlm.embed(w, post_normalize=True) for w in
                      avg_words]).mean(0).numpy()
werd = "عبدالرزاق"
# model.get_subwords("عبدالرزاق")[0]
# vector = model.get_sentence_vector(werd)
# vector = vlm.embed(werd).numpy()
# vector = vlm.embed(werd, post_normalize=True, align=True).numpy()
vector = model.get_word_vector(werd)
vector = vector / np.linalg.norm(vector)
idxs = (matrix @ vector).argsort()[::-1]
[w for w in np.array(vlm.index_to_token)[idxs] if w not in avg_words][:20]

['عبدالرزاق',
 'محمد',
 'حضرت محمد',
 'عبدالخالق',
 'رزاق',
 'ڈاکٹر علامہ محمد اقبال',
 'محمد علی جناح',
 'عبدالرحمن',
 'قادری',
 'علامہ محمد اقبال',
 'حضرت علی',
 'حضرت ابوبکر صدیق',
 'عبدالرحمان',
 'مصنف',
 'عبدالرشید',
 'عبدالحفیظ',
 'سپنر',
 'قائد اعظم محمد علی جناح',
 'عبدالمجید',
 'حضرت عمر فاروق']

## OpenAI

In [None]:
!pip install "openai==1.10.0"

In [None]:
from openai import OpenAI
import numpy as np

with open("temp/key.txt", "r") as f:
    client = OpenAI(api_key = f.read().strip())

In [None]:
with open("temp/ur_words.json", "r") as f:
    ur_words = list(json.load(f))
ur_embeddings = []

In [None]:
words=ur_words[2_000:4_000]
response = client.embeddings.create(model="text-embedding-3-large", input=["hello"])
embeddings = np.array([item.embedding for item in response.data])
embeddings.shape

(1, 3072)

In [None]:
torch.Tensor(response.data[0].embedding).type(torch.float32)

tensor([-0.0246, -0.0075,  0.0040,  ...,  0.0104,  0.0033,  0.0076],
       dtype=torch.float16)

In [None]:
ur_embeddings.append(embeddings)
len(ur_embeddings)

6

In [None]:
torch.save({"tokens": (ur_words), "vectors": torch.from_numpy(np.concatenate(ur_embeddings))}, "temp/lookup-ur-openai-te3large.pt", _use_new_zipfile_serialization=True)

In [None]:
vlm = slt.models.VectorLookupModel.load("temp/lookup-ur-openai-te3large.zip")
matrix = vlm.vectors.numpy()

In [None]:
avg_words = ["بچوں"]
vector = torch.stack([vlm.embed(w, post_normalize=True) for w in
                      avg_words]).mean(0).numpy()
similarities = matrix @ vector
idxs = (similarities).argsort()[::-1][:10]
synonyms=np.array(vlm.index_to_token)[idxs]
list(zip(synonyms, similarities[idxs]))

[('بچوں', 0.9999999679244003),
 ('بچیوں', 0.9279826872676715),
 ('بچیاں', 0.8419694746428193),
 ('بچے', 0.8337293989675666),
 ('بچ', 0.800367358381217),
 ('بچہ', 0.7936287975694107),
 ('بچنے', 0.7759354631662607),
 ('بچن', 0.7722504460708091),
 ('بچی', 0.7644357494630329),
 ('بچاؤ', 0.7576362949474518)]

In [None]:
vlm = slt.models.VectorLookupModel.load("temp/lookup-ur-fasttext-cc.pt")
matrix = vlm.vectors.numpy()
matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)

In [None]:
avg_words = ["بچوں"]
vector = torch.stack([vlm.embed(w, post_normalize=True) for w in
                      avg_words]).mean(0).numpy()
similarities = matrix @ vector
idxs = (similarities).argsort()[::-1][:10]
synonyms=np.array(vlm.index_to_token)[idxs]
list(zip(synonyms, similarities[idxs]))

[('بچوں', 1.0),
 ('بچے', 0.7393133),
 ('والدین', 0.6573115),
 ('بچیوں', 0.6395565),
 ('بچیاں', 0.5350236),
 ('اسکول', 0.5298384),
 ('خواتین', 0.52769685),
 ('اسکولوں', 0.5139956),
 ('ماں', 0.5126909),
 ('عورتوں', 0.50144684)]