In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Make every top-level expression in a cell display its value, not only the
# last one; several cells below rely on this to show more than one result.
InteractiveShell.ast_node_interactivity = 'all'


In [None]:
import os 
import src.globals as g
import src.utils as utils

import src.data_handler as handling

In [None]:
# Locate the training set inside the project's data folder and load it through
# the project's dataset wrapper.
dataset_path = os.path.join(g.DATA_FOLDER,'training_set.json')
squad_dataset = handling.RawSquadDataset(dataset_path)

# Work on a copy so edits made while exploring do not touch the frame held by
# the dataset object.
df = squad_dataset.train_df.copy()

# Preview the first rows.
df.head()

In [None]:
# Fetch the embedding model and its vocabulary through the project helper
# (presumably pre-trained GloVe vectors, judging by the helper's name).
# `model` is indexed by word below and `vocab` is used for membership tests.
model, vocab = utils.get_Glove_model_and_vocab()

In [None]:
from tokenizers import  Tokenizer, Encoding
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Strip, StripAccents, NFD, BertNormalizer
from tokenizers.normalizers import Sequence as NormSequence
from tokenizers.pre_tokenizers import Punctuation, Whitespace
from tokenizers.pre_tokenizers import Sequence as PreSequence
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer


# Word-level tokenizer: BERT-style normalization, then a split on whitespace
# and on individual punctuation characters.
tokenizer = Tokenizer(WordLevel(unk_token=g.UNK_TOKEN))
tokenizer.normalizer = BertNormalizer(handle_chinese_chars=False) #NormSequence([NFD(), StripAccents(), Lowercase(), Strip()])
tokenizer.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])

# Special tokens are listed first so the trainer registers them as part of the
# vocabulary before any corpus word.
trainer = WordLevelTrainer(special_tokens=[g.PAD_TOKEN,g.UNK_TOKEN,g.SOS_TOKEN,g.EOS_TOKEN],vocab_size=65000)   #min_frequency

# Training corpus: contexts plus answers (switch to questions for the other vocabulary).
l = df.context.to_list() + df.answer.to_list()
#l = df.question.to_list()
tokenizer.train_from_iterator(l,trainer=trainer)

# Build the post-processing template only after training, using the project's
# own SOS/EOS token strings and the ids the trained vocabulary really assigned,
# instead of hard-coding "[SOS]"/"[EOS]" and the ids 2/3.
sos_id = tokenizer.token_to_id(g.SOS_TOKEN)
eos_id = tokenizer.token_to_id(g.EOS_TOKEN)
tokenizer.post_processor = TemplateProcessing(
    single=f"{g.SOS_TOKEN} $A {g.EOS_TOKEN}",
    pair=f"{g.SOS_TOKEN} $A {g.EOS_TOKEN} {g.SOS_TOKEN}:1 $B:1 {g.EOS_TOKEN}:1",
    special_tokens=[
        (g.SOS_TOKEN, sos_id),
        (g.EOS_TOKEN, eos_id),
    ],
)

# Right-pad batches with the PAD token.
# NOTE(review): pad_type_id=1 is kept from the original — confirm it is intended.
tokenizer.enable_padding(direction="right", pad_id=tokenizer.token_to_id(g.PAD_TOKEN), pad_type_id=1, pad_token=g.PAD_TOKEN)


In [None]:
# Sanity check: show the vocabulary size reported by the tokenizer next to the
# number of entries in the vocabulary mapping it returns.
tokenizer.get_vocab_size()
len(tokenizer.get_vocab())

In [None]:
# NOTE(review): both tokens were already given to the trainer as special tokens
# when the tokenizer was trained, so this call is probably redundant — confirm.
tokenizer.add_special_tokens([g.PAD_TOKEN,g.UNK_TOKEN]) #,g.SOS_TOKEN,g.EOS_TOKEN

In [None]:
# l = df.context.to_list() + df.answer.to_list()
# Collect every distinct whitespace-delimited word of the training corpus, as a
# naive baseline to compare against the tokenizer's vocabulary size.
s = {word for text in l for word in text.split()}

len(s)

#tokenizer.encode('To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?').tokens



In [None]:
# Persist the trained tokenizer to disk.
# NOTE(review): the path is hard-coded and relative to the working directory,
# while other cells build paths from g.DATA_FOLDER — confirm they agree.
tokenizer.save('data/tokenizer.json')

In [None]:
# Number of tokenizer vocabulary entries that are absent from `vocab`.
n = sum(1 for token in tokenizer.get_vocab().keys() if token not in vocab)
n

In [None]:
# Inspect one odd-looking vocabulary entry: look up its id (a plain dict lookup,
# so it raises KeyError if the token is absent) ...
tokenizer.get_vocab()["tδ"]

# ... show the dataset rows whose context contains it ...
df[df['context'].str.contains('tδ')]

# ... and print every training text that contains it.
for e in l :
    if 'tδ' in e:
        print(e)

In [None]:
# Spot-check the mapping in both directions: which token sits at id 2 (the id
# the post-processor template assigns to "[SOS]"), and which id a given word has.
tokenizer.id_to_token(2)
tokenizer.token_to_id('plda')

In [None]:
# Show the context of the row(s) with this specific question id.
df[df['question_id']=='5726d73d708984140094d310']['context']

In [None]:
# Pick two rows by index label to use as a small worked example, and display
# their contexts.
r1 = df.loc[49591]
r2 = df.loc[49593]
s1 = r1['context']
s2 = r2['context']
s1
s2

In [None]:
# Split each sample row's `label_char` pair into its first and second component
# (used below as the character start / end of the answer span).
sample_rows = (r1, r2)
starts = [row['label_char'][0] for row in sample_rows]
ends = [row['label_char'][1] for row in sample_rows]

starts
ends

In [None]:
# Show the answer text of the two sample rows.
r1['answer']
r2['answer']

In [None]:
# Encode the two sample contexts as one padded batch and dump the fields of
# interest, one list per field with one entry per encoding.
encodings: list[Encoding] = tokenizer.encode_batch([s1,s2])

print([enc.ids for enc in encodings])
print([enc.attention_mask for enc in encodings])
# print([enc.offsets for enc in encodings])
# Token index holding the first character of each answer span ...
print([enc.char_to_token(first_char) for enc, first_char in zip(encodings, starts)])
# ... and the token index holding the character just before each `ends` value.
print([enc.char_to_token(end_char - 1) for enc, end_char in zip(encodings, ends)])
print([enc.type_ids for enc in encodings])
print([enc.tokens for enc in encodings])
print([enc.special_tokens_mask for enc in encodings])

# Slice of the first encoding's tokens, for eyeballing a specific region.
print(encodings[0].tokens[94:100])

In [None]:
# Print the id of one specific token (raises KeyError if it is not in the vocabulary).
print(tokenizer.get_vocab()['hokkien'])

In [None]:
import gensim.downloader as gloader
from gensim.models import KeyedVectors
import time 
import pandas as pd 
import numpy as np

import logging 

# Logger used by the embedding helpers below; its name comes from the project globals.
logger = logging.getLogger(g.LOG_NAME)

In [None]:
# Shape of a single word vector from the embedding model.
model['hello'].shape
# NOTE(review): this evaluates to `type` (the class of the np.float32 class
# itself); if the goal was the vector's dtype, `model['hello'].dtype` shows it.
type(np.float32)

In [None]:
def load_glove_embedding():
    """
    Return the pre-trained GloVe vectors as a gensim KeyedVectors object.

    The vectors are read from a word2vec-format binary file in the data folder
    when that file exists; otherwise they are downloaded through gensim's
    downloader and written to that file for next time. The elapsed time is logged.
    """
    started = time.perf_counter()

    glove_model_path = os.path.join(g.DATA_FOLDER, f"glove_vectors_{g.EMBEDDING_DIMENSION}.txt")
    cached = os.path.exists(glove_model_path)

    if not cached:
        # first use: download by name, then store a local copy
        model_name = "glove-wiki-gigaword-{}".format(g.EMBEDDING_DIMENSION)
        logger.info('downloading glove model (dim = %s)...',g.EMBEDDING_DIMENSION)
        vectors : KeyedVectors = gloader.load(model_name)
        logger.info('glove loaded')

        vectors.save_word2vec_format(glove_model_path, binary=True)
        logger.info('glove model saved to file in data directory')
    else:
        # already stored in the data folder: read it back
        logger.info('loading embedding vectors (dim = %s) from file',g.EMBEDDING_DIMENSION)
        vectors = KeyedVectors.load_word2vec_format(glove_model_path, binary=True)

    elapsed = time.perf_counter() - started
    logger.info('loading time: %f',elapsed)

    return vectors

def build_qg_embedding_matrix(type : str, vocab : dict) -> np.ndarray:
    """
    Build the embedding matrix for the given vocabulary, or load it from the cache.

    Row `i` holds the embedding of the token whose id is `i`, followed by three
    extra flag columns that one-hot mark the PAD, SOS and EOS tokens. Regular
    tokens use their GloVe vector, or a small random vector when GloVe does not
    know them. UNK uses the mean of all GloVe vectors (a random vector if that
    mean coincides with an existing vector). The embedding part of the PAD, SOS
    and EOS rows stays zero.

    Args:
        type: 'encoder' or 'decoder'; only used to name the cache file.
        vocab: mapping token -> row index; must contain the four special tokens.

    Returns:
        float32 array of shape (len(vocab), vector_size + 3).

    Raises:
        AssertionError: if `type` is not 'encoder' or 'decoder'.
        KeyError: if one of the special tokens is missing from `vocab`.
    """
    assert type in ['encoder','decoder']

    # np.save appends ".npy" when the name lacks it, so the cache path must carry
    # the extension explicitly or the existence check below can never succeed.
    emb_matrix_path = os.path.join(g.DATA_FOLDER, f"{type}_emb_matrix.npy")

    if os.path.exists(emb_matrix_path):
        logger.info('loading embedding matrix from file')
        # plain float array: no pickled objects are needed to read it back
        embedding_matrix = np.load(emb_matrix_path, allow_pickle=False)
        if embedding_matrix.shape[0] == len(vocab):
            return embedding_matrix
        # a cache built for a different vocabulary would silently misalign rows
        logger.warning('cached embedding matrix has %d rows but vocab has %d entries, rebuilding',
                       embedding_matrix.shape[0], len(vocab))

    logger.info('Building embedding matrix...')

    emb_model = load_glove_embedding()
    assert emb_model is not None, 'WARNING: empty embeddings model'

    embedding_dimension = emb_model.vector_size      #how many numbers each emb vector is composed of
    n_flags = 3                                      #one extra column each for PAD, SOS, EOS
    embedding_matrix = np.zeros((len(vocab), embedding_dimension + n_flags), dtype=np.float32)

    special_tokens = {g.PAD_TOKEN, g.UNK_TOKEN, g.SOS_TOKEN, g.EOS_TOKEN}

    for word, idx in vocab.items():
        if word in special_tokens:      #special tokens are filled in separately below
            continue
        try:
            embedding_vector = emb_model[word]
        except (KeyError, TypeError):
            #word unknown to GloVe: small random vector
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        #only the embedding part is written; the flag columns stay zero
        embedding_matrix[idx, :embedding_dimension] = embedding_vector

    #UNK: mean of all vectors, unless that mean is exactly one of the existing rows
    unk = np.mean(emb_model.vectors, axis=0)
    if np.any(np.all(emb_model.vectors == unk, axis=1)):
        unk = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)
    embedding_matrix[vocab[g.UNK_TOKEN], :embedding_dimension] = unk

    #flag columns sit right after the embedding part, whatever its dimension is
    embedding_matrix[vocab[g.PAD_TOKEN], embedding_dimension] = 1.0
    embedding_matrix[vocab[g.SOS_TOKEN], embedding_dimension + 1] = 1.0
    embedding_matrix[vocab[g.EOS_TOKEN], embedding_dimension + 2] = 1.0

    logger.info(f"Built embedding matrix with shape: {embedding_matrix.shape}")

    np.save(emb_matrix_path, embedding_matrix, allow_pickle=False)
    logger.info('embedding matrix saved to file in data directory')

    return embedding_matrix

# Load the GloVe vectors for interactive inspection in the cells below.
glove_embeddings = load_glove_embedding()

# The first argument selects which matrix to build ('encoder' or 'decoder'),
# not the embedding model; passing the model object fails the function's assert.
# 'encoder' is used because the tokenizer above was trained on contexts and answers.
embedding_matrix = build_qg_embedding_matrix('encoder', tokenizer.get_vocab())

    
    

In [None]:
# Check that membership tests work directly on the embedding model.
'hello' in model

In [None]:
# Confirm the dtype of a row of the embedding matrix.
embedding_matrix[0].dtype

In [None]:
# Scratch check of the idiom used to append three zero columns to a vector.
a = np.array([1,2,3,4,5])
np.concatenate([a,[0,0,0]])


In [None]:
import torch
import numpy as np

In [None]:
# Toy batch of shape (3, 3, 3): three sequences of three identical feature rows.
a = torch.tensor([[[1,2,3]]*3, [[4,5,6]]*3, [[7,8,9]]*3])

# Expected result: `a` with one extra feature that is 1 inside the span and 0 outside.
b = torch.tensor([[[1,2,3,1],[1,2,3,1],[1,2,3,0]],[[4,5,6,0],[4,5,6,1],[4,5,6,0]],[[7,8,9,1],[7,8,9,1],[7,8,9,1]]])

b

# Inclusive span boundaries, one pair per sequence.
start = np.array([0,1,0])
end = np.array([1,1,2])

# Compare every position index against each sequence's boundaries; the boolean
# results are reinterpreted as int8 so they can be multiplied together.
positions = np.arange(a.shape[1])
c = (positions >= start[:,None]).view('i1')
d = (positions <= end[:,None]).view('i1')

# 1 where start <= position <= end, as a (batch, seq, 1) tensor.
f = torch.from_numpy(c*d).unsqueeze(-1)

# Append the span indicator as an extra last-dimension feature.
torch.cat((a,f),dim=2)

# f