In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
import os 
import gensim
import gensim.downloader as gloader
from gensim.models import KeyedVectors

import numpy as np


In [None]:
# The training set is expected in a "data" directory under the current working directory.
dataset_path = os.path.join(os.getcwd(), "data", "training_set.json")
data_folder = os.path.dirname(dataset_path)  # directory containing the data

In [None]:
import lib.data_handling as handling

# Build the project's RawSquadDataset from the training-set JSON path.
squad_dataset = handling.RawSquadDataset(dataset_path)

In [None]:
# Preview the first six rows of the raw dataframe.
squad_dataset.raw_df.head(6)

In [None]:
def _contains_row(matrix, vector):
    """Return True when ``vector`` equals at least one complete row of ``matrix``."""
    # ``vector in matrix`` is True as soon as ANY single element matches after
    # broadcasting, so whole rows have to be compared explicitly.
    return bool(np.any(np.all(matrix == vector, axis=1)))


def load_embedding_model(embedding_dimension=50, unk_token="[UNK]", pad_token="[PAD]"
):
    """
    Loads a pre-trained GloVe word embedding model via the gensim downloader
    and extends it with an unknown-token and a padding-token vector.

    Parameters
    ----------
    embedding_dimension : int
        Dimension used to build the model name
        ``glove-wiki-gigaword-<embedding_dimension>``.
    unk_token : str
        Key under which the unknown-word vector is added.
    pad_token : str
        Key under which the padding vector is added.

    Returns
    -------
    tuple
        ``(embedding_model, vocab)`` where ``vocab`` maps every key of the
        extended model to its index.

    Raises
    ------
    ValueError
        Re-raised from the gensim downloader (after printing a hint) when the
        model name is not recognised.
    AssertionError
        If either special token, or its vector, is already present.
    """
    model_name = "glove-wiki-gigaword-{}".format(embedding_dimension)

    # Only the download step can fail because of a bad name; keeping the try
    # this narrow stops unrelated errors from being reported as a naming issue.
    try:
        embedding_model: KeyedVectors = gloader.load(model_name)
    except ValueError:
        print("Invalid embedding model name.")
        raise

    vectors = embedding_model.vectors
    vector_size = vectors.shape[1]

    # Build the unknown vector as the mean of all vectors
    # (if the mean is already present, use a random vector within the
    # per-dimension value range of the existing vectors).
    assert unk_token not in embedding_model, f"{unk_token} key already present"
    unk = np.mean(vectors, axis=0)
    if _contains_row(vectors, unk):
        mins = np.min(vectors, axis=0)
        maxs = np.max(vectors, axis=0)
        unk = (maxs - mins) * np.random.rand(vector_size) + mins
    assert not _contains_row(vectors, unk), f"{unk_token} value already present"

    # Build the pad vector as a zero vector
    assert pad_token not in embedding_model, f"{pad_token} key already present"
    pad = np.zeros((vector_size,))
    assert not _contains_row(vectors, pad), f"{pad_token} value already present"

    # Both special tokens are appended after the pre-trained keys.
    embedding_model.add_vectors([unk_token, pad_token], [unk, pad])

    # Extract a mapping from keys to indexes
    vocab = {key: index for index, key in enumerate(embedding_model.index_to_key)}

    return embedding_model, vocab

# Load the embedding model with the default arguments, together with its key-to-index vocabulary.
model, vocab = load_embedding_model()

In [None]:
# Check that the extracted vocab matches the model's own key_to_index mapping.
model.key_to_index == vocab

In [None]:
# Index assigned to the padding token in the embedding model.
model.get_index('[PAD]')

In [None]:
# Reverse lookup: print every token whose vocabulary index is 400001.
target_index = 400001
for token in (key for key, index in vocab.items() if index == target_index):
    print(token)

In [None]:
from tokenizers import  Tokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Sequence, Strip, StripAccents
from tokenizers.pre_tokenizers import Punctuation
from tokenizers.pre_tokenizers import Sequence as PreSequence
from tokenizers.pre_tokenizers import Whitespace

In [None]:
from datasets import Dataset 

# Wrap the raw pandas dataframe in a Hugging Face Dataset.
hf_dataset = Dataset.from_pandas(squad_dataset.raw_df)

In [None]:
# Show the dataset summary.
hf_dataset

In [None]:
# Word-level tokenizer over the embedding vocabulary; tokens missing from vocab map to "[UNK]".
tokenizer = Tokenizer(WordLevel(vocab,unk_token="[UNK]"))
# Normalize text by stripping accents, lowercasing and trimming surrounding whitespace.
tokenizer.normalizer = Sequence([StripAccents(), Lowercase(), Strip()])
# Split on whitespace first, then isolate punctuation.
tokenizer.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])
# Padding is intentionally not enabled here; pad_batch enables it before encoding a batch.

def convert_to_features(batch):
    """
    Tokenize the 'context' entries of a batch with the module-level tokenizer.

    Returns a dict of per-example lists: token ids ('input_ids'), attention
    masks ('attention_mask') and character offsets ('offset').
    """
    encoded_contexts = tokenizer.encode_batch(batch['context'])

    features = {'input_ids': [], 'attention_mask': [], 'offset': []}
    for encoding in encoded_contexts:
        features['input_ids'].append(encoding.ids)
        features['attention_mask'].append(encoding.attention_mask)
        features['offset'].append(encoding.offsets)

    return features



# Tokenize every context in batches; map() returns a new dataset, hf_dataset itself is left unchanged.
hf_dataset2 = hf_dataset.map(convert_to_features,batched=True)



# hf_dataset.set_transform(pad_batch,output_all_columns=False)
# 'label_token' : [list(e.char_to_token(e.label_char[0],3),e.char_to_token(e.label_char[0],3)) for e in encodings]


In [None]:
# Inspect the first tokenized example.
hf_dataset2[0]

In [None]:
# Inspect tokenized rows 4 and 5.
print(hf_dataset2[4:6])

In [None]:
def pad_batch(batch):
    """
    Re-encode the 'context' entries of a batch with right-side padding enabled.

    Padding is switched on for the module-level tokenizer using the '[PAD]'
    entry of vocab. The result holds the padded token ids ('padded_ids'), the
    attention masks ('attention_mask') and the untouched 'context' values.
    """
    padding_options = dict(
        direction="right", pad_id=vocab['[PAD]'], pad_type_id=0, pad_token='[PAD]'
    )
    tokenizer.enable_padding(**padding_options)

    padded_ids, attention_masks = [], []
    for encoding in tokenizer.encode_batch(batch['context']):
        padded_ids.append(encoding.ids)
        attention_masks.append(encoding.attention_mask)

    return {
        'padded_ids': padded_ids,
        'attention_mask': attention_masks,
        'context': batch['context'],
    }
# Apply pad_batch on the fly whenever rows are accessed; only the keys it returns are exposed.
hf_dataset2.set_transform(pad_batch,output_all_columns=False)

In [None]:
# Show the dataset summary after registering the transform.
hf_dataset2

In [None]:
# Rows 4 and 5 again, now produced through the pad_batch transform.
print(hf_dataset2[4:6])




In [None]:
# 
# Difference between the 'padded_ids' lengths of rows 6 and 0.
# Reads from hf_dataset2: 'padded_ids' is only produced by the pad_batch transform
# registered on hf_dataset2; the raw hf_dataset has no such key.
len(hf_dataset2[6]['padded_ids']) - len(hf_dataset2[0]['padded_ids'])

In [None]:
# Show the tokenizer's current padding configuration.
print(tokenizer.padding)

In [None]:
# 'input_ids' was added by map() to hf_dataset2 (the raw hf_dataset has no such column),
# and the pad_batch transform on hf_dataset2 hides it, so read it through a copy that
# uses the default output format.
print(len(hf_dataset2.with_format()[56459]['input_ids']))

In [None]:
# Sanity checks on the tokenizer vocabulary: the token stored at id 1962 and the id of '[PAD]'.
print(tokenizer.id_to_token(1962))
print(tokenizer.token_to_id('[PAD]'))

In [None]:
# NOTE(review): indexes the raw dataset with the tuple (0, 1) — presumably meant to fetch
# rows 0 and 1; confirm the installed `datasets` version accepts a tuple key (a list of
# indices, hf_dataset[[0, 1]], is the commonly documented form).
hf_dataset[0,1]