In [None]:
!pip install transformers==4.28.0 -q
!pip install datasets -q
!pip install evaluate -q
!pip install rouge_score -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from huggingface_hub import HfFolder, notebook_login
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from datasets import load_dataset, concatenate_datasets
import numpy as np
import evaluate
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the "punkt" sentence-tokenizer data; `sent_tokenize` (used in postprocess_text)
# needs it at call time.
nltk.download("punkt")

In [None]:
# Interactive Hugging Face Hub login. The training-argument cells later read the token
# back with HfFolder.get_token() because they set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Id written into label sequences in place of padding; preprocess_function and
# compute_metrics use the same literal value (-100).
label_pad_token_id = -100

# Batch collator shared by both training sections.
# NOTE(review): `tokenizer` and `model` are first assigned further down (in the
# "flan-t5-large" section), so in top-to-bottom order this cell raises NameError on a
# fresh kernel. The collator also keeps whichever tokenizer/model objects exist when
# this cell runs, even after the "t5-large" section rebinds those names - confirm the
# intended cell order.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
# ROUGE scorer loaded through the `evaluate` library; compute_metrics calls it.
metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Normalise decoded texts before they are scored.

    Every string is stripped of surrounding whitespace, split into sentences with
    ``sent_tokenize`` and re-joined with one sentence per line.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of two new lists of strings.
    """

    def one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text.strip()))

    cleaned_preds = [one_sentence_per_line(text) for text in preds]
    cleaned_labels = [one_sentence_per_line(text) for text in labels]
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair as passed in by the trainer.
            ``predictions`` may itself be a tuple, in which case its first element
            is used.

    Returns:
        dict with every ROUGE score scaled to 0-100 and rounded to 4 decimals, plus
        ``"gen_len"``: the mean number of non-padding tokens per prediction.
    """
    ignore_index = -100  # placeholder id that marks padded positions

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a vocabulary id and cannot be decoded, so map it to the pad token
    # first. Do this for predictions as well as labels: gathered predictions can be
    # padded with -100 too (the upstream summarization example guards both arrays),
    # and an unreplaced -100 would also be counted as a generated token below.
    preds = np.where(preds != ignore_index, preds, tokenizer.pad_token_id)
    labels = np.where(labels != ignore_index, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Length of each generation = number of positions that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of examples into model inputs carrying a ``labels`` field.

    Reads the module-level ``tokenizer``, ``max_source_length`` and
    ``max_target_length``.

    Args:
        sample: batch mapping with ``"instruction"`` (source texts) and ``"answer"``
            (target texts) columns.
        padding: padding strategy forwarded to the tokenizer. When it is
            ``"max_length"``, pad-token ids in the targets are rewritten to -100.

    Returns:
        The tokenizer output for the instructions, with the target token ids stored
        under ``"labels"``.
    """
    encoded = tokenizer(
        list(sample["instruction"]),
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )
    target_ids = tokenizer(
        text_target=sample["answer"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )["input_ids"]

    if padding == "max_length":
        # Mark padded target positions with -100 instead of the pad token id.
        pad_id = tokenizer.pad_token_id
        target_ids = [[-100 if tok == pad_id else tok for tok in seq] for seq in target_ids]

    encoded["labels"] = target_ids
    return encoded

In [None]:
# Load the QA dataset from the Hugging Face Hub. Later cells use its "train" and "test"
# splits and its "instruction", "answer" and "id" columns.
dataset = load_dataset('ra4wv2/qa')

Downloading and preparing dataset csv/ra4wv2--qa to /root/.cache/huggingface/datasets/ra4wv2___csv/ra4wv2--qa-703fa30e467874d5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/526k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/133k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/ra4wv2___csv/ra4wv2--qa-703fa30e467874d5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Measure the longest tokenized instruction and answer over train + test; these two
# lengths are the padding/truncation targets read by preprocess_function.
# NOTE(review): `tokenizer` is only assigned in the "flan-t5-large" section further
# down, so this cell depends on that one having been run first.
all_examples = concatenate_datasets([dataset["train"], dataset["test"]])

tokenized_inputs = all_examples.map(
    lambda batch: tokenizer(batch["instruction"], truncation=True),
    batched=True,
    remove_columns=["instruction", "answer"],
)
tokenized_targets = all_examples.map(
    lambda batch: tokenizer(batch["answer"], truncation=True),
    batched=True,
    remove_columns=["instruction", "answer"],
)

max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])

print(f"Max source length: {max_source_length}")
print(f"Max target length: {max_target_length}")


In [None]:
# Run preprocess_function over every split in batches and drop the raw
# "instruction"/"answer"/"id" columns from the result.
# NOTE(review): preprocess_function reads the module-level `tokenizer`, which is only
# assigned in the "flan-t5-large" section below - on a fresh top-to-bottom run this
# cell fails; confirm the intended cell order.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["instruction", "answer", "id"])

# flan-t5-large

In [None]:
# Checkpoint for this section. `tokenizer` and `model` are module-level names that the
# preprocessing, metric, collator and trainer cells all read.
model_id="google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Output directory / Hub repo name: the model name without its namespace, plus "-qa".
# Taking the LAST path component (not index 1) also works for ids that have no
# "org/" prefix, such as "t5-large".
repository_id = f"{model_id.split('/')[-1]}-qa"

# Fine-tuning configuration: evaluate and checkpoint once per epoch, keep at most two
# checkpoints, reload the best one when training ends, and push every save to the Hub.
training_args = Seq2SeqTrainingArguments(
    # --- locations / reporting ---
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    report_to="tensorboard",
    # --- optimisation ---
    learning_rate=5e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False,
    # --- evaluation (metrics are computed on generated text) ---
    predict_with_generate=True,
    evaluation_strategy="epoch",
    # --- logging ---
    logging_strategy="steps",
    logging_steps=500,
    # --- checkpoints ---
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- Hugging Face Hub ---
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Build the collator here, from the tokenizer/model that are current right now, rather
# than reusing the module-level `data_collator`: that object is tied to whichever
# `tokenizer`/`model` existed when its cell ran and is not refreshed when another
# section loads a different checkpoint.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/ra4wv2/flan-t5-large-qa into local empty directory.


In [None]:
# Fine-tune, then score on the "test" split (the trainer's eval_dataset). Because the
# arguments set load_best_model_at_end=True, the weights evaluated here are the best
# saved checkpoint, not necessarily those of the final epoch.
trainer.train()
trainer.evaluate()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Write the tokenizer files into the output directory, generate the model card, then
# upload to the Hub.
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

trainer.push_to_hub()

# t5-large

In [None]:
# Second experiment: rebind the module-level `model_id`, `tokenizer` and `model` to the
# plain t5-large checkpoint.
# NOTE(review): `tokenized_dataset` and `data_collator` were built before this rebinding
# and are reused as-is below - confirm that is intended.
model_id="t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Output directory / Hub repo name: the model name without any "org/" namespace, plus
# "-qa". Uses the same derivation as the flan-t5-large section so that both plain and
# namespaced model ids are handled identically.
repository_id = f"{model_id.split('/')[-1]}-qa"

# Same recipe as the flan-t5-large run, pointed at this section's repository:
# per-epoch evaluation and checkpointing, at most two checkpoints kept, best checkpoint
# reloaded at the end, every save pushed to the Hub.
training_args = Seq2SeqTrainingArguments(
    # --- locations / reporting ---
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    report_to="tensorboard",
    # --- optimisation ---
    learning_rate=5e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False,
    # --- evaluation (metrics are computed on generated text) ---
    predict_with_generate=True,
    evaluation_strategy="epoch",
    # --- logging ---
    logging_strategy="steps",
    logging_steps=500,
    # --- checkpoints ---
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- Hugging Face Hub ---
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Build a collator for THIS section's tokenizer/model. The module-level `data_collator`
# was created before `tokenizer`/`model` were rebound to t5-large, so reusing it would
# keep the previously loaded checkpoint's objects alive and in use.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/ra4wv2/t5-large-qa into local empty directory.


In [None]:
# Fine-tune, then score on the "test" split (the trainer's eval_dataset). With
# load_best_model_at_end=True the evaluated weights are the restored best checkpoint:
# in the recorded output the final eval_loss equals epoch 2's validation loss, not
# epoch 5's.
trainer.train()
trainer.evaluate()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.127383,77.6174,66.4422,77.0276,77.0702,19.0
2,0.196500,0.124055,77.7328,66.0005,77.1753,77.1453,19.0
3,0.196500,0.131023,77.8688,67.4016,77.5375,77.5445,19.0
4,0.083000,0.138523,78.1193,67.0951,77.5954,77.63,19.0
5,0.047400,0.146416,78.1002,67.0309,77.5527,77.5764,19.0


Several commits (2) will be pushed upstream.


{'eval_loss': 0.12405480444431305,
 'eval_rouge1': 77.7328,
 'eval_rouge2': 66.0005,
 'eval_rougeL': 77.1753,
 'eval_rougeLsum': 77.1453,
 'eval_gen_len': 19.0,
 'eval_runtime': 59.9439,
 'eval_samples_per_second': 10.21,
 'eval_steps_per_second': 1.285,
 'epoch': 5.0}

In [None]:
# Write the tokenizer files into the output directory, generate the model card, then
# upload to the Hub.
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

trainer.push_to_hub()

Several commits (3) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/2.75G [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1684426452.d1ed272f043c.3081.2:   0%|          | 1.00/613 [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1684425287.d1ed272f043c.3081.0:   0%|          | 1.00/8.17k [00:00<?, ?B/…

To https://huggingface.co/ra4wv2/t5-large-qa
   9589ab5..4eb94bf  main -> main

   9589ab5..4eb94bf  main -> main



'https://huggingface.co/ra4wv2/t5-large-qa/commit/4eb94bfa41e4c948a857d2364687e3afbecab97a'