## Setup

In [1]:
%%capture
!pip install protobuf==3.20.3
!pip install absl-py rouge_score nltk
!pip install evaluate
!pip install wandb

### Check GPU

In [2]:
import torch

# Resolve the compute device first; the status report below is derived from it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("GPU is enabled.")
    print(
        "device count: {}, current device: {}".format(
            torch.cuda.device_count(),
            torch.cuda.current_device(),
        )
    )
else:
    print("GPU is not enabled.")

GPU is enabled.
device count: 1, current device: 0


### wandb

In [3]:
# Weights & Biases login (optional): only needed to stream training metrics to wandb.
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

# The API key is read from a Kaggle secret so it never appears in the notebook source.
personal_key_for_api = user_secrets.get_secret("wandb-key")

# Authenticate through the Python API instead of `! wandb login $key`: the secret is
# not interpolated into a shell command line (where it would be visible in the
# process list and subject to shell parsing). The trailing `;` hides the return value.
wandb.login(key=personal_key_for_api);

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


### Hugging Face

In [4]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# The Hugging Face access token is stored as a Kaggle secret under this label.
secret_label = "HFWAV2VEC"
secrets_client = UserSecretsClient()
secret_value = secrets_client.get_secret(secret_label)

# Authenticate this session against the Hugging Face Hub.
login(token=secret_value)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Data

In [5]:
from datasets import load_dataset

# Swahili configuration of the XL-Sum dataset from the Hugging Face Hub.
# The result holds train, test and validation splits.
ds = load_dataset(path="csebuetnlp/xlsum", name="swahili")

Downloading builder script:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

Downloading and preparing dataset xlsum/swahili to /root/.cache/huggingface/datasets/csebuetnlp___xlsum/swahili/2.0.0/518ab0af76048660bcc2240ca6e8692a977c80e384ffb18fdddebaca6daebdce...


Downloading data:   0%|          | 0.00/7.01M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset xlsum downloaded and prepared to /root/.cache/huggingface/datasets/csebuetnlp___xlsum/swahili/2.0.0/518ab0af76048660bcc2240ca6e8692a977c80e384ffb18fdddebaca6daebdce. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## Tokenizer

In [6]:
from transformers import AutoTokenizer

# Tokenizer belonging to the google/mt5-small checkpoint. It is shared by the
# preprocessing step, the data collator and the ROUGE metric later in the notebook.
t5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
def tokenize_sample_data(data, max_input_length=1024, max_label_length=215):
    """Tokenize a batch of articles and reference summaries for seq2seq training.

    Args:
        data: Batch mapping with a "text" entry (source articles) and a
            "summary" entry (reference summaries).
        max_input_length: Articles are truncated to at most this many tokens.
        max_label_length: Summaries are truncated to at most this many tokens.

    Returns:
        A dict with "input_ids" and "attention_mask" for the articles and
        "labels" holding the token ids of the summaries.
    """
    # The original author measured maxima of 14536 (inputs) and 215 (labels) tokens
    # in this dataset; inputs are deliberately capped well below their maximum.
    input_feature = t5_tokenizer(
        data["text"], truncation=True, max_length=max_input_length
    )
    label = t5_tokenizer(
        data["summary"], truncation=True, max_length=max_label_length
    )
    return {
        "input_ids": input_feature["input_ids"],
        "attention_mask": input_feature["attention_mask"],
        "labels": label["input_ids"],
    }

In [8]:
# Tokenize every split in batches and drop the raw columns, so that only the
# fields produced by `tokenize_sample_data` remain.
tokenized_ds = ds.map(
    tokenize_sample_data,
    batched=True,
    batch_size=215,
    remove_columns=["id", "url", "title", "summary", "text"],
)

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

## Fine-tune

In [9]:
from transformers import AutoConfig, AutoModelForSeq2SeqLM

# Start from the pretrained mt5-small configuration and override the
# generation-related settings on it.
# see https://huggingface.co/docs/transformers/main_classes/configuration
mt5_config = AutoConfig.from_pretrained(
    "google/mt5-small",
    max_length=215,
    num_beams=15,
    no_repeat_ngram_size=2,
    length_penalty=0.6,
)

# Load the pretrained weights with that configuration, then move the model to
# the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small", config=mt5_config)
model = model.to(device)



Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Data collator

In [10]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into PyTorch tensor batches.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=t5_tokenizer,
    model=model,
    return_tensors="pt",
)

In [11]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer

# ROUGE scorer loaded through the `evaluate` library; `metrics_func` below calls
# its `compute` method with a custom tokenizer.
rouge_metric = evaluate.load("rouge")

def tokenize_sentence(arg):
    """Split `arg` into the token strings produced by the mT5 tokenizer.

    Passed to ROUGE as its tokenizer, so overlap is measured on the model's own
    subword tokens.
    """
    token_ids = t5_tokenizer(arg)["input_ids"]
    return t5_tokenizer.convert_ids_to_tokens(token_ids)

def _to_numpy_ids(token_ids):
    """Return token ids as a NumPy array, whatever container they arrive in."""
    # NOTE(review): predictions may arrive as a tuple whose first element holds the
    # generated ids (the Hugging Face summarization example guards for this).
    if isinstance(token_ids, tuple):
        token_ids = token_ids[0]
    # torch tensors (possibly on the GPU) must be moved to host memory before
    # NumPy can read them.
    if hasattr(token_ids, "detach"):
        token_ids = token_ids.detach().cpu().numpy()
    return np.asarray(token_ids)


def _one_sentence_per_line(texts):
    """Terminate each text with a sentence mark and put every sentence on its own line."""
    sentence_enders = ("!", "！", "?", "？", ".")
    sentence_splitter = RegexpTokenizer(u'[^!！?？.]*[!！?？.]')
    # Ensure a trailing terminator so the splitter's pattern captures the last sentence.
    terminated = [t if t.endswith(sentence_enders) else t + "." for t in texts]
    return [
        "\n".join(np.char.strip(sentence_splitter.tokenize(t)))
        for t in terminated
    ]


def metrics_func(eval_arg):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_arg: Pair `(preds, labels)` of token-id arrays (NumPy arrays or
            torch tensors). Positions equal to -100 are treated as padding.

    Returns:
        The result of `rouge_metric.compute` for the decoded texts.
    """
    preds, labels = eval_arg
    pad_id = t5_tokenizer.pad_token_id
    # -100 is not a valid token id and cannot be decoded. It can show up in the
    # predictions as well as in the labels, so both are mapped back to the pad id.
    preds = _to_numpy_ids(preds)
    labels = _to_numpy_ids(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    # Convert id tokens to text
    text_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Insert a line break (\n) between sentences for ROUGE scoring (rougeLsum
    # treats each line as one sentence).
    text_preds = _one_sentence_per_line(text_preds)
    text_labels = _one_sentence_per_line(text_labels)
    # compute ROUGE score with custom tokenization
    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence
    )

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [12]:
from torch.utils.data import DataLoader

# Sanity check before training: generate summaries for the first five test
# examples with the untuned model and score them with `metrics_func`.
sample_dataloader = DataLoader(
    tokenized_ds["test"].with_format("torch"),
    batch_size=5,
    collate_fn=data_collator,
)

# Only the first batch is needed.
batch = next(iter(sample_dataloader))
labels = batch["labels"]
with torch.no_grad():
    preds = model.generate(
        batch["input_ids"].to(device),
        max_length=128,
        num_beams=15,
        num_return_sequences=1,
        no_repeat_ngram_size=1,
        remove_invalid_values=True,
    )

metrics_func([preds, labels])

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'rouge1': 0.10799485146892882,
 'rouge2': 0.047367602011564855,
 'rougeL': 0.10799485146892882,
 'rougeLsum': 0.10681229738256348}

In [13]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Output / reporting
    output_dir="mt5-summarize-sw",
    log_level="error",
    logging_steps=10,
    save_steps=500,
    push_to_hub=True,
    # Optimisation schedule
    num_train_epochs=10,
    optim="adafactor",
    learning_rate=5e-4,
    lr_scheduler_type="linear",
    warmup_steps=90,
    weight_decay=0.01,
    # Batching: 2 samples per device x 16 accumulation steps per optimizer update
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=1,
    # Evaluation: generate summaries every 100 steps so ROUGE can be computed
    evaluation_strategy="steps",
    eval_steps=100,
    predict_with_generate=True,
    generation_max_length=128,
)

In [14]:
from transformers import Seq2SeqTrainer

# Evaluate on only the first 20 validation examples.
eval_subset = tokenized_ds["validation"].select(range(20))

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=t5_tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=eval_subset,
    compute_metrics=metrics_func,
)

In [15]:
# Run fine-tuning. With the arguments configured above, evaluation (loss + ROUGE)
# is reported every 100 steps and a checkpoint is saved every 500 steps.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mronojohnmichael[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231104_180358-1gfdbefb[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mstoic-frost-39[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ronojohnmichael/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ronojohnmichael/huggingface/runs/1gfdbefb[0m


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
100,4.0489,3.14218,0.270268,0.123239,0.225146,0.22616
200,3.4076,2.785073,0.279799,0.106943,0.22317,0.228101
300,3.1266,2.707583,0.319773,0.168012,0.258475,0.263642
400,3.0249,2.588176,0.353027,0.180798,0.286876,0.288126
500,2.9205,2.511946,0.328235,0.168977,0.271415,0.274794
600,2.7826,2.480632,0.350622,0.182849,0.277816,0.282127
700,2.7548,2.482805,0.348965,0.192275,0.285294,0.290491
800,2.6747,2.454485,0.351546,0.183705,0.293662,0.296144
900,2.6113,2.436714,0.359433,0.197291,0.298066,0.301815
1000,2.4864,2.411745,0.354232,0.187042,0.283497,0.289065


TrainOutput(global_step=2460, training_loss=2.8623174535549754, metrics={'train_runtime': 5691.527, 'train_samples_per_second': 13.877, 'train_steps_per_second': 0.432, 'total_flos': 6.154223195547648e+16, 'train_loss': 2.8623174535549754, 'epoch': 9.97})

In [16]:
# Upload the fine-tuned model to the Hugging Face Hub repository
# "Jayem-11/mt5-summarize-sw".
model.push_to_hub("Jayem-11/mt5-summarize-sw")

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Jayem-11/mt5-summarize-sw/commit/4da0030e9c7518ea02677371bde50c7a0f744dcc', commit_message='Upload MT5ForConditionalGeneration', commit_description='', oid='4da0030e9c7518ea02677371bde50c7a0f744dcc', pr_url=None, pr_revision=None, pr_num=None)

## Inference

In [17]:
# from transformers import AutoModelForSeq2SeqLM

# model = (AutoModelForSeq2SeqLM
#          .from_pretrained("Jayem-11/mt5-summarize-sw") 
#          .to(device)) 

In [18]:
# from torch.utils.data import DataLoader

# # Predict with test data (first 5 rows)
# sample_dataloader = DataLoader( 
#     tokenized_ds["test"].with_format("torch"),
#     collate_fn=data_collator,
#     batch_size=5)
# for batch in sample_dataloader:
#     with torch.no_grad():  
#         preds = model.generate( 
#             batch["input_ids"].to(device),
#             num_beams=15,   
#             num_return_sequences=1,
#             no_repeat_ngram_size=1,
#             remove_invalid_values=True,
#             max_length=128,
#         )
#     labels = batch["labels"]
#     break

# # Replace -100 (see above)
# labels = np.where(labels != -100, labels, t5_tokenizer.pad_token_id)

# # Convert id tokens to text
# text_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
# text_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

# # Show result
# print("***** Input's Text *****")
# print(ds["test"]["text"][0])
# print("***** Summary Text (True Value) *****")
# print(text_labels[0])
# print("***** Summary Text (Generated Text) *****")
# print(text_preds[0])

In [19]:
# print("***** Input's Text *****")
# print(ds["test"]["text"][2])
# print("***** Summary Text (True Value) *****")
# print(text_labels[2])
# print("***** Summary Text (Generated Text) *****")
# print(text_preds[2])