In [1]:
from datasets import load_dataset
from utils import sample_example_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# Load SQuAD (the recorded output shows the local cache being reused) and bind
# its two splits to notebook-level names.
data = load_dataset("squad")
train, validation = (data[split] for split in ("train", "validation"))

Reusing dataset squad (/home/rocabrera/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def load_models(model_name: str):
    """Load a T5 checkpoint together with its matching tokenizer.

    Args:
        model_name: Checkpoint identifier forwarded unchanged to both
            `from_pretrained` calls (the notebook passes "t5-base",
            "t5-large" and "t5-3b").

    Returns:
        A `(model, tokenizer)` tuple, in that order.
    """
    # T5ForConditionalGeneration (or its TensorFlow/Flax variant) is the class
    # that includes the language modeling head on top of the decoder.
    seq2seq_model = T5ForConditionalGeneration.from_pretrained(model_name)
    text_tokenizer = T5Tokenizer.from_pretrained(model_name)

    return seq2seq_model, text_tokenizer

def tokenize_input(sample: dict, max_length: int = 396, tok=None):
    """Encode one question/context sample as a fixed-length sequence pair.

    Args:
        sample: Mapping that provides the string fields "question" and
            "context".
        max_length: Length the encoding is padded and truncated to. Defaults
            to 396, the value this notebook previously hard-coded.
        tok: Tokenizer callable to use. When omitted, the notebook-level
            `tokenizer` global is used, so existing `tokenize_input(sample)`
            calls keep working; passing it explicitly avoids depending on
            whichever checkpoint was loaded last.

    Returns:
        The object returned by the tokenizer call. Downstream code reads its
        "input_ids" and "attention_mask" entries.
    """
    if tok is None:
        # Fall back to the global rebound by each `load_models(...)` cell.
        tok = tokenizer

    # NOTE(review): the T5 paper formats SQuAD inputs as
    # "question: ... context: ..."; this prefix / sequence-pair layout differs.
    # Confirm it is intentional before evaluating at scale.
    return tok(
        "question answering: " + sample["question"],
        sample["context"],
        max_length=max_length,
        padding="max_length",
        # Presumably truncates only the second sequence (the context), which
        # is problematic when the answer lies in the part that gets cut off.
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

def model_answer(model, inputs, tok=None) -> str:
    """Generate an answer with `model` and decode it to plain text.

    Args:
        model: Object exposing `generate(input_ids=..., attention_mask=..., ...)`.
        inputs: Mapping that provides "input_ids" and "attention_mask", such
            as the value returned by `tokenize_input`.
        tok: Tokenizer whose `decode` turns generated ids back into text.
            When omitted, the notebook-level `tokenizer` global is used, so
            existing `model_answer(model, inputs)` calls keep working;
            passing it explicitly guarantees the decoder matches `model`.

    Returns:
        The decoded text of every generated sequence, concatenated with no
        separator. With a single encoded sample this is just that sample's
        answer.
    """
    if tok is None:
        # Fall back to the global rebound by each `load_models(...)` cell.
        tok = tokenizer

    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=1,
        repetition_penalty=2.5,
        # NOTE(review): length_penalty is documented for beam-based
        # generation; with num_beams=1 it is presumably a no-op.
        length_penalty=1.0,
    )

    return "".join(
        tok.decode(
            sequence_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for sequence_ids in generated_ids
    )

# **Testing t5-base**

In [4]:
# Rebinds the notebook-level `model` / `tokenizer` pair used by the cells below.
model, tokenizer = load_models(model_name="t5-base")

In [6]:
idx = 10
# Presumably prints the coloured question / context / true label shown in this
# cell's output. The return value is deliberately not bound to `_`: assigning
# `_` stops IPython from updating its last-output variable.
sample_example_dataset(train, idx)
sample = train[idx]
inputs = tokenize_input(sample)

answer = model_answer(model, inputs)
print(f"\nResposta do Modelo:\n{answer}")

[37mWhere is the headquarters of the Congregation of the Holy Cross?[0m[32mThe university is the major seat of the
Congregation of Holy Cross (albeit not its official headquarters, which are in [0m[34mRome[0m[32m). Its main
seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the
oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests
and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the
Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic,
Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.[0m
[31mTRUE LABEL: Rome[0m

Respsota do Modelo:
Rome


# **Testing t5-large**

In [None]:
# Rebinds the notebook-level `model` / `tokenizer` pair used by the cells below.
model, tokenizer = load_models(model_name="t5-large")

In [None]:
idx = 0
# Presumably prints the selected training example for inspection. The return
# value is deliberately not bound to `_`: assigning `_` stops IPython from
# updating its last-output variable.
sample_example_dataset(train, idx)
sample = train[idx]
inputs = tokenize_input(sample)

answer = model_answer(model, inputs)
print(f"\nResposta do Modelo:\n{answer}")

# **Testing t5-3b**

In [None]:
# Rebinds the notebook-level `model` / `tokenizer` pair used by the cells below.
model, tokenizer = load_models(model_name="t5-3b")

In [None]:
idx = 0
# Presumably prints the selected training example for inspection. The return
# value is deliberately not bound to `_`: assigning `_` stops IPython from
# updating its last-output variable.
sample_example_dataset(train, idx)
sample = train[idx]
inputs = tokenize_input(sample)

answer = model_answer(model, inputs)
print(f"\nResposta do Modelo:\n{answer}")

# **Conclusões preliminares**

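O t5-base respondeu corretamente ao único exemplo do SQuAD testado acima (resposta "Rome"). As células do t5-large e do t5-3b ainda não foram executadas, portanto esta conclusão vale apenas para essa amostra e precisa ser confirmada com uma avaliação sobre mais exemplos.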