# Setup


In [1]:
# Report the GPUs visible to this runtime (driver/CUDA version, memory, utilisation)
# so the hardware used for the run is recorded alongside the results.
!nvidia-smi

Thu Jul 20 08:52:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   41C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

## Install Dependencies


In [2]:
!pip install --upgrade --quiet pip

In [3]:
!pip install --upgrade --quiet transformers datasets sentencepiece tqdm accelerate huggingface-hub bitsandbytes

In [4]:
import warnings

warnings.filterwarnings("ignore")

## Log in to Hugging Face


In [5]:
# Configure git to use the "store" credential helper so the Hub login below can
# save the access token for later git pushes.
# NOTE: the "store" helper keeps credentials unencrypted on disk.
!git config --global credential.helper store

In [6]:
import os
from getpass import getpass

from huggingface_hub import login

# Never embed an access token in the notebook: anything committed here is
# readable by everyone with access to the file. Any token that was previously
# pasted into this cell should be revoked in the Hugging Face account settings.
# The token is taken from the HF_TOKEN environment variable when present,
# otherwise the user is prompted for it without echoing the input.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# add_to_git_credential=True also hands the token to the configured git
# credential helper so that pushes to the Hub authenticate automatically.
login(token=token, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Import dependencies


In [15]:
import numpy as np
import os

from evaluate import load

from torch import Tensor, bfloat16

from transformers import (
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    T5ForConditionalGeneration,
    T5TokenizerFast,
)
from datasets import (
    DatasetDict,
    load_dataset,
)

# Load pre-processed dataset


In [8]:
# Fetch only the "train" split of the pre-encoded dataset from the Hub.
# NOTE(review): the recorded download output lists 196,296 train examples while
# the recorded training log reports 100,000 — no subsampling step is visible in
# this notebook; confirm where the reduction happens.
train = load_dataset(
    path="rusano/ELI5_custom_encoded",
    split="train",
)

Downloading readme:   0%|          | 0.00/637 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/rusano___parquet/rusano--ELI5_custom_encoded-28a2d918684a2bd1/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/60.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/979k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/196296 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1507 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/49074 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/rusano___parquet/rusano--ELI5_custom_encoded-28a2d918684a2bd1/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


In [9]:
# Switch the dataset's output format to "torch" (applied in place on `train`).
train.set_format("torch")

# Fine-tune Setup


In [10]:
# Hub identifier of the pretrained checkpoint that supplies the model weights.
CHECKPOINT = "google/t5-v1_1-base"

# Load the weights in bfloat16 and have forward passes return structured
# output objects.
MODEL = T5ForConditionalGeneration.from_pretrained(
    CHECKPOINT,
    torch_dtype=bfloat16,
    return_dict=True,
)

# NOTE(review): the tokenizer is loaded from "t5-base", not from CHECKPOINT —
# confirm this matches the tokenizer used to pre-encode the dataset.
TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

In [11]:
# All knobs for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- run identity and output locations ---
    run_name="Teli5",
    output_dir="./checkpoints",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=1e-2,
    per_device_train_batch_size=8,
    auto_find_batch_size=True,
    # NOTE(review): MODEL is loaded with torch_dtype=bfloat16 while fp16 mixed
    # precision is requested here — confirm this combination is intended.
    fp16=True,
    # gradient_checkpointing=True,  # currently disabled
    # --- logging ---
    log_level="info",
    logging_strategy="steps",
    logging_steps=100,
    report_to="none",
    # --- checkpointing: one checkpoint per 100 steps, at most 10 retained ---
    save_strategy="steps",
    save_steps=100,
    save_total_limit=10,
    save_safetensors=True,
    # --- Hugging Face Hub upload target ---
    push_to_hub=True,
    hub_model_id="rusano/Teli5",
)

In [12]:
# When True, padded label positions are filled with -100 instead of the
# tokenizer's pad id.
ignore_pad_token_for_loss = True

if ignore_pad_token_for_loss:
    label_pad_token_id = -100
else:
    label_pad_token_id = TOKENIZER.pad_token_id

# Collator that batches examples for the seq2seq model, padding labels with
# the id chosen above. No rounding of padded lengths is requested.
DATA_COLLATOR = DataCollatorForSeq2Seq(
    TOKENIZER,
    pad_to_multiple_of=None,
    label_pad_token_id=label_pad_token_id,
    model=MODEL,
)

In [13]:
# Load the "glue" metric in its "mrpc" configuration through `evaluate.load`.
# NOTE(review): MRPC is a sentence-pair classification benchmark — confirm this
# metric is appropriate for a text-generation fine-tune.
metric = load("glue", "mrpc")


def compute_metrics(eval_pred):
    """Compute token-level accuracy from an evaluation prediction pair.

    The previous implementation took an argmax over whatever the model
    returned and passed raw token ids to a binary-classification metric. That
    breaks when the predictions arrive as a tuple and when the labels contain
    the -100 ignore marker used for padded positions.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` is either an array
            whose last axis indexes the vocabulary, or a tuple whose first
            element is that array. ``labels`` is an integer array matching
            the leading axes of the logits, with -100 marking positions that
            must not be scored.

    Returns:
        dict: ``{"accuracy": float}``, the fraction of scored positions whose
        arg-max prediction equals the label. ``0.0`` when nothing is scored.
    """
    logits, labels = eval_pred

    # Models may return extra tensors next to the logits; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.argmax(np.asarray(logits), axis=-1)
    labels = np.asarray(labels)

    # -100 is the label id excluded from the loss, so exclude it here as well.
    scored = labels != -100
    n_scored = int(scored.sum())
    if n_scored == 0:
        return {"accuracy": 0.0}

    n_correct = int((predictions[scored] == labels[scored]).sum())
    return {"accuracy": n_correct / n_scored}

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

# Fine-tuning


In [None]:
# Wire the model, run configuration, training data and collator together.
# No evaluation dataset or metric function is supplied, so this run only trains.
trainer = Trainer(
    args=training_args,
    model=MODEL,
    data_collator=DATA_COLLATOR,
    train_dataset=train,
)

# Start fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

Cloning https://huggingface.co/rusano/Teli5 into local empty directory.
***** Running training *****
  Num examples = 100,000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 18,750
  Number of trainable parameters = 247,577,856
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,35.5787
200,28.2587
300,25.2413
400,23.4662
500,22.6938
600,21.76
700,21.4713
800,20.8987
900,19.1919
1000,18.485


Saving model checkpoint to ./checkpoints/checkpoint-100
Configuration saved in ./checkpoints/checkpoint-100/config.json
Configuration saved in ./checkpoints/checkpoint-100/generation_config.json
Model weights saved in ./checkpoints/checkpoint-100/pytorch_model.bin
Saving model checkpoint to ./checkpoints/checkpoint-200
Configuration saved in ./checkpoints/checkpoint-200/config.json
Configuration saved in ./checkpoints/checkpoint-200/generation_config.json
Model weights saved in ./checkpoints/checkpoint-200/pytorch_model.bin
Saving model checkpoint to ./checkpoints/checkpoint-300
Configuration saved in ./checkpoints/checkpoint-300/config.json
Configuration saved in ./checkpoints/checkpoint-300/generation_config.json
Model weights saved in ./checkpoints/checkpoint-300/pytorch_model.bin
Saving model checkpoint to ./checkpoints/checkpoint-400
Configuration saved in ./checkpoints/checkpoint-400/config.json
Configuration saved in ./checkpoints/checkpoint-400/generation_config.json
Model weig

In [None]:
# Upload the trainer's model to the Hub repository configured through
# `hub_model_id` in the training arguments ("rusano/Teli5").
trainer.push_to_hub()