In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; several cells below rely on this to show multiple results at once.
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
import os 
import torch 
from torch import nn

import numpy as np 

import src.utils as utils 
import src.globals as globals
import src.data_handler as handling

In [None]:
# Path of the training-set JSON inside the project's configured data folder.
dataset_path = os.path.join(globals.DATA_FOLDER,'training_set.json')

In [None]:
# `vocab` backs the WordLevel tokenizer built further down and is indexed by
# the PAD token there; `model` is not used by any cell visible in this notebook.
model, vocab = utils.load_embedding_model()

In [None]:
# Build the project's raw SQuAD dataset object from the JSON path; its rows are
# exposed as a pandas DataFrame in `.df`.
squad_dataset = handling.RawSquadDataset(dataset_path)

# Explore on a copy so nothing below can modify the frame held by `squad_dataset`.
df = squad_dataset.df.copy()

In [None]:
# Show the row(s) belonging to one specific question id.
df.loc[df['question_id'] == '5727b1c13acd2414000de9eb']

In [None]:
# Split the (start, end) character labels into two aligned arrays.
starts = np.array([span[0] for span in df['label_char']])
ends = np.array([span[1] for span in df['label_char']])

# Difference between start and end of every span; zero means start == end.
s = starts - ends

# Rows whose labelled span is empty.
df.loc[s == 0]

In [None]:
from datasets import Dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFD, Lowercase, Sequence, Strip, StripAccents
from tokenizers.pre_tokenizers import Punctuation
from tokenizers.pre_tokenizers import Sequence as PreSequence
from tokenizers.pre_tokenizers import Whitespace


In [None]:

# Wrap the dataset's own DataFrame (not the `df` copy used for exploration)
# as a Hugging Face `Dataset`.
hf_dataset = Dataset.from_pandas(squad_dataset.df)

In [None]:
# Word-level tokenizer over the embedding vocabulary; words missing from
# `vocab` are mapped to the UNK token.
# NOTE(review): assumes `vocab` is a token -> id mapping — confirm against
# utils.load_embedding_model.
tokenizer = Tokenizer(WordLevel(vocab, unk_token=globals.UNK_TOKEN))

# StripAccents only removes combining marks, so the text has to be decomposed
# with NFD first; without it precomposed characters such as "é" pass through
# unchanged and accent stripping silently does nothing.
tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase(), Strip()])

# Split on whitespace first, then isolate punctuation into its own tokens.
tokenizer.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])

# Right-pad every batch to its longest sequence with the PAD token.
tokenizer.enable_padding(
    direction="right",
    pad_id=vocab[globals.PAD_TOKEN],
    pad_type_id=1,
    pad_token=globals.PAD_TOKEN,
)

In [None]:
from tokenizers import Encoding

def transform(batch):
    """Tokenize a batch of rows and attach token-level answer labels.

    Written to be installed with ``Dataset.set_transform``, so it runs every
    time rows are read from the dataset.

    Args:
        batch: mapping with the columns ``context``, ``question`` and
            ``answer`` (lists of strings) and ``label_char`` (list of
            ``(start, end)`` character spans into the context, with ``end``
            treated as exclusive).

    Returns:
        dict of lists, one entry per row:
        ``offsets`` and ``context_tokens`` from the context encoding,
        ``answer_tokens`` from the answer encoding, the untouched
        ``context_text`` / ``question_text`` / ``answer_text``, the character
        labels ``label_char_start`` / ``label_char_end`` and the token labels
        ``label_token_start`` / ``label_token_end``. The token labels are
        whatever ``Encoding.char_to_token`` returns, i.e. ``None`` when the
        character does not fall inside any token.
    """
    # Only the context and the answer are tokenized. The question is returned
    # as raw text, so encoding it would be wasted work on every access.
    # Token ids and attention masks are available as `e.ids` /
    # `e.attention_mask` on these encodings if a consumer needs them.
    context_encodings: list[Encoding] = tokenizer.encode_batch(batch['context'])
    answer_encodings: list[Encoding] = tokenizer.encode_batch(batch['answer'])

    starts = [span[0] for span in batch['label_char']]
    ends = [span[1] for span in batch['label_char']]

    return {
        'offsets': [enc.offsets for enc in context_encodings],
        'context_text': batch['context'],
        'question_text': batch['question'],
        'answer_text': batch['answer'],
        'context_tokens': [enc.tokens for enc in context_encodings],
        'label_token_start': [
            enc.char_to_token(start) for enc, start in zip(context_encodings, starts)
        ],
        # `end` is exclusive, so the last answer character sits at end - 1.
        'label_token_end': [
            enc.char_to_token(end - 1) for enc, end in zip(context_encodings, ends)
        ],
        'label_char_start': starts,
        'label_char_end': ends,
        'answer_tokens': [enc.tokens for enc in answer_encodings],
    }

# Apply `transform` on the fly whenever rows are read; with
# output_all_columns=False only the keys returned by `transform` are exposed.
hf_dataset.set_transform(transform,output_all_columns=False)

In [None]:
# Inspect one transformed example (row 57912, the same row examined below).
print(hf_dataset[57912])

In [None]:
ex = hf_dataset[57912]
start_token = ex['label_token_start']
end_token = ex['label_token_end']

# The token labels are None when the labelled character is not inside any
# token; fail with a readable message instead of a TypeError on the lookups below.
assert start_token is not None and end_token is not None, (
    f"answer span does not align with tokens: start={start_token}, end={end_token}"
)

# Map the token labels back to character positions through the offsets.
start_char = ex['offsets'][start_token][0]
end_char = ex['offsets'][end_token][1]

print(start_char)
print(end_char)

# Answer recovered via token offsets, via the character labels, and the stored
# answer text, shown one after the other for comparison.
ex['context_text'][start_char:end_char]
ex['context_text'][ex['label_char_start']:ex['label_char_end']]
ex['answer_text']

# The transform emits no 'context_ids' field (reading it raised KeyError), so
# compare the two per-token fields it does emit.
len(ex['offsets']) == len(ex['context_tokens'])


In [None]:
start_c = ex['label_char_start']
end_c = ex['label_char_end']

# NOTE: this rebinds the notebook-level `starts` / `ends` (NumPy arrays built
# from `df` earlier) to tuples of per-token start / end offsets.
starts, ends = zip(*ex['offsets'])

# tuple.index raises ValueError when no token starts / ends exactly on the
# label. Catch only that, and leave the index as None so the comparisons below
# cannot hit an unbound name.
try:
    start_idx = starts.index(start_c)
except ValueError:
    start_idx = None
    print('errore start')

try:
    end_idx = ends.index(end_c)
except ValueError:
    end_idx = None
    print('errore end')

# Do the boundary tokens of the context span match the first / last answer
# token? Shows False when the corresponding index was not found.
start_idx is not None and ex['context_tokens'][start_idx] == ex['answer_tokens'][0]
end_idx is not None and ex['context_tokens'][end_idx] == ex['answer_tokens'][-1]

In [None]:
# Small fixed 4x4 matrix used to check argmax along each row.
a = torch.tensor(
    [
        [1.3398, 0.2663, -0.2686, 0.2450],
        [-0.7401, -0.8805, -0.3402, -1.1936],
        [0.4907, -1.3948, -1.0691, -0.3132],
        [-1.6092, 0.5419, -0.2993, 0.3195],
    ]
)

# Column index of the largest value in every row.
a.argmax(dim=1)

In [None]:
from collections import namedtuple, OrderedDict, defaultdict
# Toy column-oriented batch: every value holds one entry per example (4 examples).
d = OrderedDict({
    'a': torch.tensor([1, 0, 1, 1]),
    'b': ['ab c', 'Dfg H', 'hil', 'mnohgh'],
    'c': [5, 6, 7, 8],
    'offsets': torch.tensor([
        [[0, 0], [0, 0]],
        [[1, 1], [1, 1]],
        [[2, 2], [2, 2]],
        [[3, 3], [3, 3]],
    ]),
})

# Transpose the columns into one named record per example.
Record = namedtuple('Record', d.keys())
b = [Record(*row) for row in zip(*d.values())]

# The loop variable is deliberately not called `ex`: that name holds the
# dataset example loaded earlier in the notebook and must not be overwritten.
for record in b:
    # `a` picks one of the two offset pairs; its first entry is the slice start.
    offset_pair = record.offsets[record.a]
    print(record.b[offset_pair[0]:])
    

In [None]:
# Two float vectors for comparing ways of computing the mean absolute error.
a = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.float32)
b = torch.tensor([1, 4, 7, 4, 5, 8], dtype=torch.float32)

abs_err = (a - b).abs()

# Same quantity computed once through NumPy and once directly in torch.
np.mean(abs_err.numpy())
abs_err.mean().item()

In [None]:
# Two equal but independent dicts, plus an empty accumulator that creates a
# fresh list for every new key.
d1 = dict(a=1, b=2, c=3, d=4)
d2 = dict(d1)

m = defaultdict(list)

In [None]:
# Plain Python list of 1..5; NumPy averages it directly.
a = list(range(1, 6))

np.mean(a)

In [None]:


# Append every value of `d2` to the list stored under the same key in `m`.
# Each run of this cell appends again, because `m` lives at notebook level.
for key, value in d2.items():
    m[key].append(value)


In [None]:
# Show the accumulated key -> list-of-values mapping.
m