In [None]:
# Display the value of every top-level expression statement in a cell,
# not only the last one. Later cells rely on this to show several values.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


In [None]:
import os 
import src.globals as g
import src.utils as utils

import torch 

import src.data_handler as handling

In [None]:
# Build the path to the training JSON inside the project's data folder and
# load it through the project's RawSquadDataset handler.
dataset_path = os.path.join(g.DATA_FOLDER,'training_set.json')
squad_dataset = handling.RawSquadDataset(dataset_path)

# Work on a copy so the handler's own train_df is not modified by later cells.
df = squad_dataset.train_df.copy()

df.head()

In [None]:
from spacy.lang.en import English

# Blank English pipeline with only the sentence splitter attached.
nlp = English()
nlp.add_pipe('sentencizer')

# Replace each context by the list of its sentences, then emit one row per
# sentence (fresh 0..n-1 index thanks to ignore_index=True).
df["context"] = df["context"].apply(lambda text: [span.text for span in nlp(text).sents])
df = df.explode("context", ignore_index=True)

# Keep only the sentences that literally contain the answer string.
contains_answer = [answer in sentence for answer, sentence in zip(df['answer'], df['context'])]
df = df[contains_answer]

# Collapse back to one row per (context, question, answer); when several
# sentences contain the answer they are joined with a comma.
# NOTE(review): label_char is carried over unchanged although the context text
# has been reduced - confirm the offsets are still meaningful downstream.
key_columns = ['context_id', 'question_id', 'title', 'question', 'answer', 'label_char']
df = (
    df.groupby(key_columns, sort=False)
    .agg({'context': lambda sentences: ",".join(sentences)})
    .reset_index()
)
df = df[['context_id', 'question_id', 'title', 'context', 'question', 'answer', 'label_char']]

df.to_csv('data/new_df.csv')

In [None]:
# Load the GloVe model and its vocabulary through the project helper.
# Later cells index `model` by word and test token membership in `vocab`.
model, vocab = utils.get_Glove_model_and_vocab()

In [None]:
from tokenizers import  Tokenizer, Encoding
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Strip, StripAccents, NFD, BertNormalizer
from tokenizers.normalizers import Sequence as NormSequence
from tokenizers.pre_tokenizers import Punctuation, Whitespace
from tokenizers.pre_tokenizers import Sequence as PreSequence
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer
from datasets import Dataset 


In [None]:
# Wrap the processed DataFrame in a Hugging Face Dataset.
hf_dataset = Dataset.from_pandas(df)

In [None]:
# Word-level tokenizer: unknown words map to the project's UNK token.
tokenizer = Tokenizer(WordLevel(unk_token=g.UNK_TOKEN))
# BERT-style normalization with Chinese-character handling switched off.
tokenizer.normalizer = BertNormalizer(handle_chinese_chars=False) #NormSequence([NFD(), StripAccents(), Lowercase(), Strip()])
# Split on whitespace first, then isolate punctuation marks.
tokenizer.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])

# Wrap every encoded sequence in [SOS] ... [EOS].
# NOTE(review): the ids 2 and 3 are hard-coded here, before training. They
# presumably match the ids the trainer assigns because the special tokens are
# listed below in the order PAD, UNK, SOS, EOS, and this also assumes
# g.SOS_TOKEN == "[SOS]" and g.EOS_TOKEN == "[EOS]" - confirm against src.globals.
tokenizer.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    pair="[SOS] $A [EOS] [SOS]:1 $B:1 [EOS]:1",
    special_tokens=[
        ("[SOS]", 2),
        ("[EOS]", 3),
    ],
)


# Cap the vocabulary at 40000 entries; special tokens are registered first.
trainer = WordLevelTrainer(special_tokens=[g.PAD_TOKEN,g.UNK_TOKEN,g.SOS_TOKEN,g.EOS_TOKEN],vocab_size=40000)   #min_frequency

# The vocabulary is trained on the questions only (contexts/answers are commented out).
#l = df.context.to_list() + df.answer.to_list()
l = df.question.to_list()
tokenizer.train_from_iterator(l,trainer=trainer) 
# Right-pad batches with the PAD token so encodings in a batch share one length.
tokenizer.enable_padding(direction="right", pad_id=tokenizer.token_to_id(g.PAD_TOKEN), pad_type_id=1, pad_token=g.PAD_TOKEN)
tokenizer.get_vocab_size()


In [None]:
def transform(batch):
    """Encode a batch of questions into id and attention-mask tensors.

    Args:
        batch: mapping with a 'question' entry holding a list of strings.

    Returns:
        dict with 'question_ids' and 'question_mask', each a tensor built from
        the per-question encodings produced by the module-level `tokenizer`.
    """
    encoded = tokenizer.encode_batch(batch['question'])

    # torch.tensor needs rectangular input, so every encoding in the batch
    # must have the same length (the tokenizer's padding provides that).
    ids = [enc.ids for enc in encoded]
    masks = [enc.attention_mask for enc in encoded]

    return {
        'question_ids': torch.tensor(ids),
        'question_mask': torch.tensor(masks),
    }

# Apply `transform` on the fly whenever rows of the dataset are accessed.
hf_dataset.set_transform(transform)

In [None]:
# Fetch three rows by position. The result is indexed below with
# 'question_ids' / 'question_mask', i.e. the output of `transform`.
ex = hf_dataset[49591,49592,49593]

print(ex)

In [None]:
# df.loc[49592,'question']
from src.evaluation import get_tokens
from datasets import load_metric

metric = load_metric('bleu')


# Decode the sampled question ids back to text.
# NOTE(review): `a` and `b` are built from the same ids, so they are identical
# lists - presumably one of them was meant to hold model predictions.
a = tokenizer.decode_batch(ex['question_ids'].tolist())
b = tokenizer.decode_batch(ex['question_ids'].tolist())

a
b
# BLEU of the first decoded question against the second one as single reference.
c = metric.compute(predictions=[get_tokens(a[0])],references=[[get_tokens(b[1])]])
print(c['bleu'])


# for t,p in zip(a,b) :
#     c = metric.compute(predictions=[get_tokens(p)],references=[[get_tokens(t)]])
#     print(c['bleu'])



In [None]:
# exp(mean([3, 4, 5])) = e**4; presumably a perplexity-style sanity check - TODO confirm.
torch.exp(torch.mean(torch.tensor([3.0,4,5]))).item()

In [None]:
# View the attention mask of the sampled rows as booleans.
ex['question_mask'].bool()

In [None]:
from datasets import load_metric

metric = load_metric('sacrebleu')

# sacrebleu takes, for each prediction, a *list* of reference strings, and all
# predictions must have the same number of references. `b` is a flat list of
# strings, so each reference is wrapped in its own single-element list.
metric.compute(predictions=a,references=[[ref] for ref in b])

In [None]:
# Register PAD and UNK as special tokens on the tokenizer (SOS/EOS left out, see trailing comment).
tokenizer.add_special_tokens([g.PAD_TOKEN,g.UNK_TOKEN]) #,g.SOS_TOKEN,g.EOS_TOKEN

In [None]:
# l = df.context.to_list() + df.answer.to_list() 
# Distinct whitespace-separated tokens over the training texts in `l`;
# gives a raw vocabulary size to compare with the tokenizer's.
s = {word for sentence in l for word in sentence.split()}

len(s)
        

#tokenizer.encode('To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?').tokens



In [None]:
# Persist the trained tokenizer as JSON (path is relative to the working directory).
tokenizer.save('data/tokenizer.json')

In [None]:
# Number of tokenizer vocabulary entries that are missing from the GloVe vocab.
n = sum(1 for token in tokenizer.get_vocab() if token not in vocab)
n

In [None]:
# Spot-check the id <-> token mapping in both directions.
tokenizer.id_to_token(1)
tokenizer.token_to_id('plda')

In [None]:
# Pull two rows by index label and display their question text.
r1 = df.loc[49591]
r2 = df.loc[49593]
s1 = r1['question']
s2 = r2['question']
s1
s2

In [None]:
# Print the id of one specific token; raises KeyError if it is not in the vocabulary.
print(tokenizer.get_vocab()['hokkien'])

In [None]:
import gensim.downloader as gloader
from gensim.models import KeyedVectors
import time 
import pandas as pd 
import numpy as np

import logging 

# Reuse the project-wide logger identified by the name stored in src.globals.
logger = logging.getLogger(g.LOG_NAME)

In [None]:
# Dimensionality of a single GloVe vector.
model['hello'].shape
# NOTE(review): this evaluates to `type` (the metaclass of np.float32), not a dtype;
# presumably the vector's dtype was meant - confirm.
type(np.float32)

In [None]:
# Append three zeros to a 1-D array (manual right-padding experiment).
a = np.array([1, 2, 3, 4, 5])
np.append(a, [0, 0, 0])


In [None]:
import torch
from torch import nn
import numpy as np

In [None]:
# Toy batch: 3 sequences x 3 positions x 3 features.
a = torch.tensor([[[1, 2, 3]] * 3, [[4, 5, 6]] * 3, [[7, 8, 9]] * 3])

# Expected result: `a` with a 4th feature that is 1 inside [start, end] and 0 outside.
b = torch.tensor([
    [[1, 2, 3, 1], [1, 2, 3, 1], [1, 2, 3, 0]],
    [[4, 5, 6, 0], [4, 5, 6, 1], [4, 5, 6, 0]],
    [[7, 8, 9, 1], [7, 8, 9, 1], [7, 8, 9, 1]],
])

# Inclusive span boundaries, one pair per sequence.
start = np.array([0, 1, 0])
end = np.array([1, 1, 2])

# c[i, p] = 1 where p >= start[i]; d[i, p] = 1 where p <= end[i] (int8).
c = np.less_equal.outer(start, np.arange(a.shape[1])).astype(np.int8)
d = np.greater_equal.outer(end, np.arange(a.shape[1])).astype(np.int8)
c
d
# Positions satisfying both conditions lie inside the span.
f = np.multiply(c, d)

f

# Turn the mask into a (batch, positions, 1) tensor and append it as an extra feature.
f = torch.from_numpy(f).unsqueeze(-1)

r = torch.cat([a, f], dim=2)


In [None]:
# Random 20 x 5 embedding matrix; row 0 is zeroed because index 0 is padding.
enc_m = torch.rand(20, 5)
enc_m[0].zero_()

enc_emb = nn.Embedding.from_pretrained(enc_m, padding_idx=0)

h_dim = 3

# Input is the 5-d embedding plus one extra feature (the span mask built later);
# bidirectional, so the output feature size is 2 * h_dim.
rnn = nn.LSTM(input_size=5 + 1, hidden_size=h_dim, batch_first=True, bidirectional=True)

In [None]:
# Toy batch of 3 context id sequences of length 6; 0 is the padding index of enc_emb.
ctx_ids = torch.tensor([[1,2,3,0,0,0],[3,7,8,12,17,19],[3,15,4,1,0,0]])

In [None]:
# Look up the 5-d embedding of every context id (padding positions map to the zero row).
ctx_embeds = enc_emb(ctx_ids)

ctx_embeds

In [None]:
# Inclusive answer span boundaries, one pair per context.
start = torch.tensor([0, 2, 3])
end = torch.tensor([1, 4, 3])

# t1 marks positions >= start, t2 marks positions <= end (as floats).
positions = torch.arange(ctx_embeds.shape[1])
t1 = (start.unsqueeze(-1) <= positions).float()
t2 = (end.unsqueeze(-1) >= positions).float()

# Their product is 1 exactly inside the span; add a trailing feature axis.
m = (t1 * t2).unsqueeze(-1)

# Append the span indicator to every embedding as an extra feature.
r = torch.cat([ctx_embeds, m], dim=-1)

r




In [None]:
# Run the bidirectional LSTM over the mask-augmented embeddings; only the
# per-position outputs are kept, the final (hidden, cell) state is discarded.
out, _ = rnn(r)

print(out)

In [None]:
out.shape
# Average the LSTM outputs over the sequence axis.
v = out.mean(dim=1)
v.shape
# Broadcast the per-sequence mean back over every position.
v = v.unsqueeze(1).expand_as(out)

v + out

In [None]:
# Same sum as in the previous cell with the operands swapped.
torch.add(out,v)

In [None]:
# Zero buffer (3 x 5 x 10) and a random 3 x 10 slab to write into it later.
b = torch.zeros(3, 5, 10)

c = torch.rand((3, 10))

b.shape
c.shape

In [None]:
# Display both tensors before the slice assignment in the next cell.
b
c

In [None]:
# Write `c` into position 1 of the middle axis for every batch element.
b[:, 1] = c

b

In [None]:
# Toy batch of 3 answer id sequences padded with 0 to length 3.
answ_ids = torch.tensor([[1,2,0],[8,12,17],[1,0,0]]) 

In [None]:
# Inclusive answer span boundaries per context (same values as defined for the span mask).
start = torch.tensor([0,2,3])
end = torch.tensor([1,4,3])

In [None]:
# Embed the answer ids with the same embedding table used for the contexts.
answ_embeds = enc_emb(answ_ids)

answ_embeds

In [None]:
# Display the LSTM outputs again for reference while building the gather below.
out

In [None]:
# Row i holds the consecutive context positions start[i], start[i]+1, ...
# with as many entries as there are answer token slots.
index = start.unsqueeze(1) + torch.arange(answ_embeds.shape[1])
index

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


In [None]:
# Advanced-indexing gather: c[b, t, :] = out[b, index[b, t], :], i.e. the
# encoder output at each answer position of each batch element.
i = torch.arange(answ_embeds.shape[0]).reshape(answ_embeds.shape[0],1,1)
j = index.unsqueeze(-1)
# Feature axis taken from `out` itself instead of a literal 6 (= 2 * h_dim),
# so the cell keeps working if h_dim changes.
k = torch.arange(out.shape[-1])

c = out[i,j,k]
# Concatenate the gathered encoder outputs with the answer embeddings per token.
d = torch.cat((c,answ_embeds),dim=2)

# True answer lengths derived from the inclusive spans; for the current toy
# data this equals the previously hard-coded [2, 3, 1].
lengths = (end - start + 1).tolist()
l = pack_padded_sequence(d,lengths,batch_first=True,enforce_sorted=False)

# Round-trip to inspect what the packed representation pads back to.
pad_packed_sequence(l,batch_first=True)

In [None]:
# Re-declare the inclusive span boundaries (unchanged values) for the loop-based variant below.
start = torch.tensor([0,2,3])
end = torch.tensor([1,4,3])

In [None]:
# Zero buffer with one 2*h_dim vector per answer token slot.
z = torch.zeros(*answ_embeds.shape[:2], 2 * h_dim)
z

In [None]:

# Copy each span's encoder outputs to the front of the matching row of `z`;
# slots beyond the span length stay zero.
for i, (lo, hi) in enumerate(zip(start.tolist(), end.tolist())):
    z[i, :hi - lo + 1] = out[i, lo:hi + 1]

z


In [None]:
# Concatenate the copied encoder outputs with the answer embeddings along the feature axis.
torch.cat((z,answ_embeds),dim=2)

In [None]:
# Display the answer embeddings for comparison with the concatenated tensor above.
answ_embeds

In [None]:
# Two toy state tensors whose leading axis has 2 entries.
a = torch.tensor([
    [[1, 2, 3], [11, 2, 3], [1, 2, 3]],
    [[4, 5, 6], [43, 5, 6], [4, 5, 6]],
])
b = torch.tensor([
    [[2, 3, 4], [2, 3, 4], [2, 3, 4]],
    [[5, 6, 7], [5, 6, 7], [5, 6, 7]],
])

c = (a, b)

# For each tensor, pair the even-indexed entries of axis 0 with the odd-indexed
# ones and concatenate them along the feature axis.
tuple(torch.cat((h[0::2], h[1::2]), dim=2) for h in c)


In [None]:
# a: 2 x 3 x 3 values, b: 2 x 3 integer ids.
a = torch.tensor([
    [[1, 2, 3], [11, 2, 3], [1, 2, 3]],
    [[4, 5, 6], [43, 5, 6], [4, 5, 6]],
])
b = torch.tensor([[1, 2, 2], [0, 1, 2]])


# Flatten the first two axes of `a`, and `b` into a single column.
a.reshape(-1, a.size(-1))
b.reshape(-1, 1)

# Same flattening after dropping the first position of axis 1.
a[:, 1:].reshape(-1, a.size(-1))
b[:, 1:].reshape(-1, 1)

In [None]:
# Random scores of shape 5 x 7 x 4, used as stand-in model output below.
pred_raw = torch.rand(5, 7, 4)
pred_raw

In [None]:
# Random integer targets in [1, 5], shape 5 x 7.
true = torch.randint(low=1, high=6, size=(5, 7))
true
# 1 for the first 5/4/6/7/3 positions of each row, 0 afterwards.
pad_mask = (torch.arange(7) < torch.tensor([5, 4, 6, 7, 3]).unsqueeze(1)).long()
pad_mask

# Zero out the targets wherever the mask is 0.
true.masked_fill(pad_mask == 0, 0)

In [None]:
import torch.nn.functional as F

# Log-probabilities over the last axis, then the index of the best class per position.
pred_logits = pred_raw.log_softmax(dim=-1)
pred = pred_logits.argmax(dim=-1)
pred


In [None]:
# Element-wise comparison of predicted classes against the targets.
pred.eq(true)

In [None]:
import numpy as np

In [None]:
# Accuracy over the positions where pad_mask is non-zero only.
a = (pred == true)[pad_mask.bool()].float().mean().item()

a
# b = a.sum().item()
# c = a.size(0)

# b/c

