## T5 Fine-tuning Project with Prefix Tuning (PEFT)

#### Hardware Spec CPU/GPU
CPU    : Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz, 20 cores (40 threads) × 2 ea<br>
Memory : 512 GB<br>
Disk   : / (440 GB)<br>
         /raid (7 TB)<br>
GPU    : NVIDIA V100 16 GB × 8 ea<br>

In [3]:
# Shell escape: print the GPU/driver status table so the hardware used for this run is recorded
!nvidia-smi

Sun Aug 11 02:14:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           On  | 00000000:06:00.0 Off |                    0 |
| N/A   43C    P0              46W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-16GB           On  | 00000000:07:00.0 Off |  

In [4]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import LoraConfig, PrefixTuningConfig, get_peft_config, get_peft_model, get_peft_model_state_dict, TaskType

# ---- Run configuration: everything a reader might want to tune lives here ----

# Must be set before the tokenizer is first used; turns off the tokenizers
# library's internal parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

# Use the GPU when one is visible, otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name_or_path = 't5-large'
tokenizer_name_or_path = 't5-large'

text_column = 'sentence'      # dataset column holding the input text
label_column = 'text_label'   # dataset column holding the label as a string
max_length = 128              # token budget for each input sentence
lr = 1e-2
num_epochs = 5
batch_size = 8

# Seed the RNGs so shuffling and parameter initialisation are repeatable
# under Restart Kernel -> Run All.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
!huggingface-cli login
# https://huggingface.co/settings/tokens
# Login with token
# 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


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/home/dgxuser/.local/bin/hugg

### Data load

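Sentiment-analysis dataset of financial news phrases (`financial_phrasebank`, `sentences_allagree` configuration). Each sentence is labelled `negative`, `neutral`, or `positive`.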

In [5]:
# `load_dataset` is already imported in the imports cell at the top of the notebook.
# This dataset is defined by a loading script, so `datasets` asks for confirmation
# before executing it. Passing `trust_remote_code=True` answers that up front so
# Restart & Run All does not stall on a prompt. Only do this for dataset
# repositories you trust: the flag allows the repository's script to run locally.
dataset = load_dataset("financial_phrasebank", "sentences_allagree", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [6]:
# Carve a 10% validation set out of the 'train' split. The fixed seed keeps the
# split identical across kernel restarts, so the reported validation metrics
# always refer to the same examples.
# NOTE: this cell rebinds `dataset`; re-running it without re-running the load
# cell would split the already-reduced train split a second time.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
dataset['validation'] = dataset.pop('test')

# The 'label' column stores integer class ids; `.names` lists the matching
# class strings, indexed by id.
classes = dataset['train'].features['label'].names
# Add a 'text_label' column with the string form of each label, which is what
# the seq2seq model is trained to generate.
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x['label']]},
    batched=True,
    num_proc=1,
)

dataset['train'][0]

Map: 100%|██████████| 2037/2037 [00:00<00:00, 83115.72 examples/s]
Map: 100%|██████████| 227/227 [00:00<00:00, 43888.03 examples/s]


{'sentence': "Den Bosch-based TomTom is Europe 's largest maker of automotive navigation devices , while Cayman Islands-based Garmin is larger in the U.S. and overall .",
 'label': 1,
 'text_label': 'neutral'}

In [7]:
# Show the label feature definition (class names in id order)
dataset['train'].features['label']

ClassLabel(names=['negative', 'neutral', 'positive'], id=None)

### Load Tokenizer & data preprocessing

In [30]:
# Load the tokenizer from the dedicated config constant, so the tokenizer and
# model checkpoints can be changed independently in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

def preprocess_function(examples):
    """Tokenize a batch of sentences and their string labels for seq2seq training.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; it is indexed with
            the notebook-level ``text_column`` and ``label_column`` names.

    Returns:
        The tokenizer output for the sentences (padded/truncated to
        ``max_length``) with an extra ``'labels'`` entry holding the label
        token ids, padded/truncated to 2 positions.
    """
    source_texts = examples[text_column]
    target_texts = examples[label_column]

    model_inputs = tokenizer(
        source_texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    label_ids = tokenizer(
        target_texts,
        max_length=2,  # NOTE(review): assumes every label fits in 2 token positions - confirm if labels change
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )['input_ids']

    # Swap pad ids for -100 so padded label positions are not scored by the loss
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    model_inputs['labels'] = label_ids

    return model_inputs

In [31]:
# Tokenize every split in batches; this is the dataset the dataloaders are built from.
process_data = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names,  # drop the raw columns so only the tokenizer outputs remain
    load_from_cache_file=False,  # always recompute rather than reuse a cached result
    desc='Running tokenizer'
)

Running tokenizer: 100%|██████████| 2037/2037 [00:00<00:00, 3465.35 examples/s]
Running tokenizer: 100%|██████████| 227/227 [00:00<00:00, 5473.48 examples/s]


In [32]:
# Inspection-only variant of the tokenization step: same call as for `process_data`,
# but the raw columns are kept next to the tokenizer outputs.
process_data2 = dataset.map(
    preprocess_function,
    batched=True,
    # `remove_columns` is deliberately not passed here, so 'sentence', 'label' and 'text_label' are retained
    load_from_cache_file=False,
    desc='Running tokenizer'
)

Running tokenizer: 100%|██████████| 2037/2037 [00:00<00:00, 2734.54 examples/s]
Running tokenizer: 100%|██████████| 227/227 [00:00<00:00, 4811.20 examples/s]


In [33]:
# Display the splits and column names of the variant that kept the raw columns
process_data2

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'text_label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2037
    })
    validation: Dataset({
        features: ['sentence', 'label', 'text_label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 227
    })
})

In [34]:
# Peek at the first validation example (raw text, integer label, string label)
dataset['validation'][0]

{'sentence': 'The orders also include a few high-power drives for the control of seismic compressors .',
 'label': 1,
 'text_label': 'neutral'}

In [41]:
# Settings shared by the training and validation loaders
common_loader_args = dict(
    batch_size=batch_size,
    collate_fn=default_data_collator,
    pin_memory=True,
)

train_dataset, valid_dataset = process_data['train'], process_data['validation']

# Training batches are reshuffled every epoch
train_dataloader = DataLoader(train_dataset, shuffle=True, **common_loader_args)

# Validation keeps the dataset order (no shuffling), which the accuracy cell
# relies on when it pairs predictions with dataset['validation'] labels.
valid_dataloader = DataLoader(valid_dataset, **common_loader_args)

## PEFT Configuration

In [42]:
# Base seq2seq checkpoint that the prefix-tuning adapter will be attached to
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

# Prefix-tuning setup for a seq2seq LM, configured for training
# (inference_mode=False) with 20 virtual tokens.
peft_config = PrefixTuningConfig(
    num_virtual_tokens=20,
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
)

In [43]:
# Wrap the base model according to `peft_config`.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell on its own
# would pass an already-wrapped model back in; re-run the model-loading cell first.
model = get_peft_model(model, peft_config)
# Print trainable vs. total parameter counts as a sanity check
model.print_trainable_parameters()

trainable params: 983,040 || all params: 738,651,136 || trainable%: 0.1331


In [44]:
# One scheduler step is taken per training batch, so the schedule length is
# (batches per epoch) x (epochs).
total_training_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Linear schedule with no warm-up phase
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [45]:
model = model.to(device)

for epoch in range(num_epochs):
    # ---------------- training pass ----------------
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**batch).loss
        total_loss += loss.detach().float()  # detached copy: bookkeeping only
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch
        optimizer.zero_grad()

    # ---------------- validation pass ----------------
    model.eval()
    eval_loss = 0
    eval_preds = []  # rebuilt every epoch; after the loop it holds the last epoch's predictions
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            eval_loss += outputs.loss.detach().float()
            # Predictions are the argmax over the logits of this forward pass
            # (which also receives `labels`), not the result of `generate()`.
            predicted_ids = torch.argmax(outputs.logits, -1).detach().cpu().numpy()
            eval_preds.extend(
                tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
            )

    # Per-epoch summary: mean batch loss and its exponential (perplexity)
    train_epoch_loss = total_loss / len(train_dataloader)
    eval_epoch_loss = eval_loss / len(valid_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    eval_ppl = torch.exp(eval_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 255/255 [00:33<00:00,  7.69it/s]
100%|██████████| 29/29 [00:02<00:00, 12.24it/s]


epoch=0: train_ppl=tensor(7.1212, device='cuda:0') train_epoch_loss=tensor(1.9631, device='cuda:0') eval_ppl=tensor(1.1194, device='cuda:0') eval_epoch_loss=tensor(0.1128, device='cuda:0')


100%|██████████| 255/255 [00:33<00:00,  7.69it/s]
100%|██████████| 29/29 [00:02<00:00, 12.14it/s]


epoch=1: train_ppl=tensor(1.1162, device='cuda:0') train_epoch_loss=tensor(0.1100, device='cuda:0') eval_ppl=tensor(1.0676, device='cuda:0') eval_epoch_loss=tensor(0.0655, device='cuda:0')


100%|██████████| 255/255 [00:33<00:00,  7.68it/s]
100%|██████████| 29/29 [00:02<00:00, 12.18it/s]


epoch=2: train_ppl=tensor(1.0822, device='cuda:0') train_epoch_loss=tensor(0.0790, device='cuda:0') eval_ppl=tensor(1.0498, device='cuda:0') eval_epoch_loss=tensor(0.0486, device='cuda:0')


100%|██████████| 255/255 [00:33<00:00,  7.68it/s]
100%|██████████| 29/29 [00:02<00:00, 12.15it/s]


epoch=3: train_ppl=tensor(1.0681, device='cuda:0') train_epoch_loss=tensor(0.0659, device='cuda:0') eval_ppl=tensor(1.0530, device='cuda:0') eval_epoch_loss=tensor(0.0516, device='cuda:0')


100%|██████████| 255/255 [00:33<00:00,  7.68it/s]
100%|██████████| 29/29 [00:02<00:00, 12.16it/s]

epoch=4: train_ppl=tensor(1.0615, device='cuda:0') train_epoch_loss=tensor(0.0597, device='cuda:0') eval_ppl=tensor(1.0499, device='cuda:0') eval_epoch_loss=tensor(0.0487, device='cuda:0')





## Model evaluation

In [46]:
# Reference labels, in the same order the (unshuffled) validation loader produced predictions
true_labels = dataset['validation']['text_label']

# Exact string match per example, ignoring surrounding whitespace
matches = [pred.strip() == true.strip() for pred, true in zip(eval_preds, true_labels)]
correct = sum(matches)
total = len(matches)

accuracy = correct / total * 100
print(f'accuracy = {accuracy}% on the evaluation dataset')
print(f'eval_pred = {eval_preds[:10]}')
print(f"True_label = {dataset['validation']['text_label'][:10]}")

accuracy = 96.47577092511013% on the evaluation dataset
eval_pred = ['neutral', 'positive', 'neutral', 'positive', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral']
True_label = ['neutral', 'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'neutral']


In [62]:
# Tokenize one example sentence for a manual inference check.
# No padding/truncation arguments are passed here, unlike in preprocess_function.
inputs = tokenizer(
    "The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members .",
    return_tensors="pt",
)

In [63]:
# Move each tokenizer output tensor to the model's device (rebinds `inputs`)
inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

# Generate at most 10 new tokens without tracking gradients, then decode to text
with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

['positive']


In [None]:
# Tokenize a second, hand-written example sentence for a manual inference check
inputs2 = tokenizer(
    "A well-known electronics manufacturer recently discovered a serious battery issue in its newly released smartphone model. Many customers have expressed dissatisfaction, and the company is considering a recall.",
    return_tensors='pt',
)

In [65]:
# Display the tokenized example (input_ids and attention_mask).
# NOTE: the recorded output shows device='cuda:0' because this cell was executed
# after the generation cell below had already moved `inputs2` to the GPU; on a
# fresh top-to-bottom run the tensors shown here would not carry that device tag.
inputs2

{'input_ids': tensor([[   71,   168,    18,  5661, 12800,  4818,  1310,  3883,     3,     9,
           2261,  3322,   962,    16,   165,  6164,  1883,  5626,   825,     5,
           1404,   722,    43,  7103,  1028,  9275,     7,    89,  4787,     6,
             11,     8,   349,    19,  4014,     3,     9,  7881,     5,     1]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [64]:
# The tokenizer output behaves like a dict with two entries: 'input_ids' and
# 'attention_mask'. In the comprehension below, `key` is the entry name and
# `tensor` is the corresponding tensor, which is moved to the model's device.
inputs2 = {key: tensor.to(device) for key, tensor in inputs2.items()}

# Gradient tracking is not needed for inference
with torch.no_grad():
    outputs = model.generate(input_ids=inputs2["input_ids"], max_new_tokens=10)

# outputs.detach().cpu().numpy(): detach from the autograd graph, move the tensor
# GPU -> CPU, and convert it to a NumPy array before decoding.
# skip_special_tokens=True leaves the tokenizer's special tokens (for T5, e.g.
# <pad> and </s>) out of the decoded text.
print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

['negative']
