In [None]:
mount_dir = '/content/drive'

from google.colab import drive
drive.mount(mount_dir)

# install the package
# %pip install -q sign-language-translator
# !pip install -q -e "/content/drive/Othercomputers/mac19/personal_repos/slt/sign-language-translator"
# remember to restart runtime after install
# !pip install -q -U sentence-transformers
# !pip install -q fasttext
%cd "/content/drive/Othercomputers/mac19/personal_repos/slt/notebooks/model_training"

In [None]:
import torch
import json

# Load the token -> id vocabulary. JSON text is UTF-8, so the encoding is
# stated explicitly instead of relying on the platform default (the symbol
# filter further down this notebook compares tokens against non-ASCII
# characters, so a wrong decoding would silently change the results).
with open("./models/token_to_id.json", 'r', encoding="utf-8") as f:
    token_to_id = json.load(f)

# Reverse lookup covering every id in range(2560); ids that have no token are
# labelled with the placeholder "__unk__".
id_to_token = {v:k for k,v in token_to_id.items()}
id_to_token = {i:id_to_token.get(i, "__unk__") for i in range(2560)}

# Tokens ordered by ascending id.
# NOTE(review): later cells use positions in `tokens` as embedding row indices,
# which equals the token id only if ids are contiguous starting at 0 - confirm.
tokens = sorted(token_to_id, key=token_to_id.get)

### `sBERT`

In [None]:
from sentence_transformers import SentenceTransformer

# Author's observation: this checkpoint appeared to embed the tokens by their
# characters rather than by their meaning.
sbert_checkpoint = 'all-mpnet-base-v2'
model = SentenceTransformer(sbert_checkpoint)

In [None]:
# Encode every vocabulary token with the loaded model and get the result back
# as a single tensor; the shape is displayed as the cell output.
embeddings = model.encode(
    tokens,
    convert_to_tensor=True,
)
embeddings.shape

### FastText

In [None]:
import fasttext
import fasttext.util

# Pretrained Urdu vectors. To fetch the file first, run:
# fasttext.util.download_model('ur', if_exists='ignore')  # Urdu
fasttext_file = 'cc.ur.300.bin'
model = fasttext.load_model(fasttext_file)

# Reduce the model to the target vector width (the unreduced width is 300).
embed_size = 256
fasttext.util.reduce_model(model, embed_size)
model.get_dimension()

In [None]:
# One FastText vector per token, stacked in the same order as `tokens`.
embeddings = torch.stack(
    [torch.Tensor(model.get_word_vector(word)) for word in tokens]
)

### Save

In [None]:
# Replace the vectors of the special tokens with Gaussian noise whose standard
# deviation equals that of the whole embedding matrix.
bad_ids = [tokens.index("__pad__"), tokens.index("__unknown__")]

# The width is read from the matrix itself rather than from the `embed_size`
# global, which is only defined by the FastText cell; this keeps the cell
# usable when `embeddings` was produced by the sBERT cell instead.
noise_std = embeddings.std().item()
noise = torch.normal(0.0, noise_std, (len(bad_ids), embeddings.shape[1]))
# `.to(embeddings)` matches the dtype and device of the target tensor.
embeddings[bad_ids] = noise.to(embeddings)
# embeddings[bad_ids] = 0

embeddings[:len(tokens)-1]

In [None]:
# make vector for " "
# Both vectors start as Gaussian noise with the same standard deviation as
# `embeddings`; `extra_vector` is drawn first, then `space_vector`.
noise_std = embeddings.std().item()
extra_vector = torch.normal(0.0, noise_std, (embed_size,))
space_vector = torch.normal(0.0, noise_std, (embed_size,))

# Single-character, non-alphanumeric tokens, minus the characters listed in
# the exclusion string.
sym = [t for t in tokens if len(t)==1 and not t.isalnum() and t not in "<>‘’ ()'”\"“"+'ؑ']
# A distinct loop variable is used so the comprehension does not rebind the
# name of the list it iterates over.
sym_ids = [token_to_id[s] for s in sym]

sym_embd = torch.stack([torch.Tensor(model.get_word_vector(s)) for s in sym])
# Scale every symbol vector to unit length. The clamp only affects all-zero
# vectors, which stay zero instead of becoming NaN and poisoning the mean.
sym_embd /= sym_embd.norm(dim=1, keepdim=True).clamp_min(torch.finfo(sym_embd.dtype).tiny)
# Shift the space vector by the mean of the normalised symbol vectors.
space_vector += sym_embd.mean(dim=0)
extra_vector.norm(), space_vector.norm(), list(zip(sym,sym_ids))

In [None]:
# make matrix
# Table size written as a binary literal: 0b101000000000 == 2560 rows.
vocab_size = 0b101000000000
# Every row starts as Gaussian noise (mean 0, std 0.07).
_embeddings = torch.normal(0.0, 0.07, (vocab_size, embed_size))

# The leading rows are overwritten with the prepared token vectors, then
# row 0 is overwritten with the space vector.
n_prepared = len(embeddings)
_embeddings[:n_prepared] = embeddings
# alternative for the remaining rows: _embeddings[n_prepared:] = extra_vector
_embeddings[0] = space_vector

# Display the shape together with the number of rows that are entirely zero.
all_zero_rows = ~(_embeddings != 0).any(dim=1)
_embeddings.shape, all_zero_rows.sum().item()

In [None]:
# L2-normalise every row while keeping the row count unchanged, so that row i
# of the saved matrix still corresponds to row i of `_embeddings`. Selecting
# rows with a boolean mask would drop every row that contains a zero and shift
# all following rows to smaller indices. The clamp only matters for all-zero
# rows, which stay zero instead of becoming NaN.
_row_norms = _embeddings.norm(dim=1, keepdim=True)
__embeddings = _embeddings / _row_norms.clamp_min(torch.finfo(_embeddings.dtype).tiny)

In [None]:
# Write the normalised matrix to disk. Switch to the commented line when
# saving the sBERT variant.
# torch.save(__embeddings, "models/token_embeddings_sbert.pt")
embeddings_file = "models/token_embeddings_fasttext.pt"
torch.save(__embeddings, embeddings_file)

## Evaluate Embeddings

In [None]:
# Change into the notebook's folder so the relative "models/..." paths used below resolve.
%cd "/content/drive/Othercomputers/mac19/personal_repos/slt/notebooks/model_training"

In [None]:
import torch

# Turn on autograd anomaly detection for the cells that follow.
torch.autograd.set_detect_anomaly(True)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Saved embedding matrix; switch to the commented line for the sBERT variant.
# embeddings = torch.load("models/token_embeddings_sbert_2560_768.pt")
embeddings = torch.load("models/token_embeddings_fasttext.pt")
vocab_size, embed_size = embeddings.shape

# Embedding layer on `device`, filled with the saved matrix and frozen.
embedding_layer = torch.nn.Embedding(vocab_size, embed_size)
embedding_layer = embedding_layer.to(device)
embedding_layer.load_state_dict({"weight": embeddings})
embedding_layer.weight.requires_grad = False

In [None]:
from copy import deepcopy

# Targets are the ids 0..vocab_size-1; inputs are their embedding rows.
y = torch.arange(vocab_size, dtype=torch.long, device=device)
x = embedding_layer(y)

# Ids whose embedding row is entirely zero are retargeted to "__unknown__".
zero_rows = (x == 0).all(dim=1)
y[zero_rows] = token_to_id["__unknown__"]

class LM_Head(torch.nn.Module):
    """Two-layer classifier that maps an embedding vector to vocabulary logits.

    Pipeline: Linear(embed_size -> 2*embed_size) -> LayerNorm -> GELU ->
    Dropout -> Linear(2*embed_size -> vocab_size).

    Args:
        embed_size: width of the input vectors.
        vocab_size: number of output logits.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, embed_size, vocab_size, dropout=0.5):
        super().__init__()
        hidden_size = 2 * embed_size
        self.fc1 = torch.nn.Linear(embed_size, hidden_size)
        self.activation = torch.nn.GELU()
        self.layer_norm = torch.nn.LayerNorm(hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.fc2 = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return unnormalised logits for `x`, whose last dimension is `embed_size`."""
        hidden = self.activation(self.layer_norm(self.fc1(x)))
        return self.fc2(self.dropout(hidden))

# Train a small head to map every embedding row back to its own id, as a check
# of how separable the embedding rows are.
head = LM_Head(embed_size, vocab_size).to(device)
optimizer = torch.optim.AdamW(head.parameters(), lr=3e-3)
# sch = slt.models.utils.FullyLambdaLR(optimizer, lr_lambda= lambda step, base, prev: base/((1+step/10)**0.5))
sch = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda= lambda epoch: 1/((1+epoch/10)**0.5))

# Seed `best` with the untrained head (in eval mode, like every later snapshot)
# so that `head = best` below cannot hit an undefined name when no epoch
# improves the accuracy or training is interrupted during the first epoch.
best = deepcopy(head).eval()
best_acc = 0
try:
    # A named loop variable is used because assigning to `_` would shadow
    # IPython's last-output variable.
    for epoch in range(40):
        head.train()
        optimizer.zero_grad()
        y_hat = head(x)

        # Train on the rows that are currently misclassified, widened by two
        # random permutations of that mask. Once a perfect accuracy has been
        # seen, or when nothing is misclassified (an empty selection would make
        # the loss NaN), every row is used instead.
        mask = (y_hat.argmax(dim=1) != y)
        if best_acc < 1 and mask.any():
            mask = mask | mask[torch.randperm(len(mask))] | mask[torch.randperm(len(mask))]
        else:
            mask = torch.ones_like(mask)

        loss = torch.nn.functional.cross_entropy(y_hat[mask], y[mask])
        loss.backward()
        optimizer.step()
        sch.step()

        # Evaluate once per epoch, without building an autograd graph, and
        # reuse the result for both the accuracy and the miss count.
        head.eval()
        with torch.no_grad():
            n_miss = (head(x).argmax(dim=1) != y).sum().item()
        acc = (len(y) - n_miss) / len(y)
        if acc > best_acc:
            best = deepcopy(head)
            best_acc = acc
        print(f"\repoch:{epoch:>3} | {loss.item()= :.8f} | acc: {acc:.2%} | n_miss: {n_miss}| lr: {sch.get_last_lr()[0]:.8f} | ",
            end="")
except KeyboardInterrupt:
    # Interrupting the cell stops training early; the best snapshot so far is kept.
    pass
head = best
best_acc

In [None]:
# not mapped
# Ids for which the head's prediction on the id's own embedding differs from
# the target. torch.nonzero produces the indices on the same device as the
# comparison, whereas indexing a CPU `arange` with a mask that lives on the GPU
# fails with a device-mismatch error.
with torch.no_grad():
    a = torch.nonzero(head(x).argmax(dim=1) != y).flatten().tolist()
a, [id_to_token[aa] for aa in a]

In [None]:
# see similar tokens from LM_Head
# A random id is drawn and then overridden by a fixed one; remove the second
# assignment to inspect a random token instead.
i = torch.randint(0, len(y), (1,)).item()
i = 1201
top_n = 15

# Class probabilities for the embedding of token i, sorted from most to least likely.
probabilities = torch.nn.functional.softmax(head(x[i]), dim=0)
a, b = torch.sort(probabilities, descending=True)
print(id_to_token[i])
top_tokens = [id_to_token[aa] for aa in b.tolist()][:top_n]
top_scores = a[:top_n].tolist()
list(zip(top_tokens, top_scores))

In [None]:
# see similar tokens directly from embedding layer
# A random id is drawn and then overridden by a fixed one; remove the second
# assignment (or use the commented line) to inspect a different token.
i=torch.randint(0, vocab_size, (1,)).item()
i=1234
# i=token_to_id[" "]
top_n = 15

# The index tensor is created as an integer tensor on the layer's own device,
# and the similarity is taken against the layer's weight matrix (the same
# values that were loaded from `embeddings`). A CPU index or CPU matrix fails
# with a device mismatch once the layer lives on the GPU.
weights = embedding_layer.weight
query = embedding_layer(torch.tensor([i], dtype=torch.long, device=weights.device))
a,b = torch.sort(torch.nn.functional.softmax(query @ weights.T, dim=1), descending=True)
a,b = a.squeeze(), b.squeeze()
print(id_to_token[i])
list(zip([id_to_token[aa] for aa in b.tolist()][:top_n], a[:top_n].tolist()))