In [1]:
import os
import json
import sys
import gc
from typing import Tuple
from pathlib import Path 

from tqdm.auto import tqdm, trange
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property

2024-03-10 14:31:47.917790: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-10 14:31:47.940518: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-10 14:31:47.940543: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-10 14:31:47.941152: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-10 14:31:47.945454: I tensorflow/core/platform/cpu_feature_guar

In [2]:
from Get_Data import GetData

In [3]:
dialog_question, dialog_answer = GetData()

In [4]:
df = pd.DataFrame({
    'dialog_question': dialog_question,
    'dialog_answer': dialog_answer
})

In [5]:
class PairsDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        assert idx < len(self.x['input_ids'])
        item = {key: val[idx] for key, val in self.x.items()}
        item['decoder_attention_mask'] = self.y['attention_mask'][idx]
        item['labels'] = self.y['input_ids'][idx]
        return item
    
    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n # * 2

In [6]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        ) 
        batch['labels'] = ybatch['input_ids']
        batch['decoder_attention_mask'] = ybatch['attention_mask']
        
        return {k: torch.tensor(v) for k, v in batch.items()}

In [7]:
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [8]:
def evaluate_model(model, test_dataloader):
    num = 0
    den = 0

    for batch in test_dataloader:
        with torch.no_grad():
            loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
            num += len(batch) * loss.item()
            den += len(batch)
    val_loss = num / den
    return val_loss

In [9]:
def train_loop(
    model, train_dataloader, val_dataloader, 
    max_epochs=30, 
    max_steps=1_000, 
    lr=3e-5,
    gradient_accumulation_steps=1, 
    cleanup_step=100,
    report_step=300,
    window=100,
):
    cleanup()
    optimizer = torch.optim.Adam(params = [p for p in model.parameters() if p.requires_grad], lr=lr)

    ewm_loss = 0
    step = 0
    model.train()

    for epoch in trange(max_epochs):
        print(step, max_steps)
        if step >= max_steps:
            break
        tq = tqdm(train_dataloader)
        for i, batch in enumerate(tq):
            try:
                batch['labels'][batch['labels']==0] = -100
                loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
                loss.backward()
            except Exception as e:
                print('error on step', i, e)
                loss = None
                cleanup()
                continue
            if i and i % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                step += 1
                if step >= max_steps:
                    break

            if i % cleanup_step == 0:
                cleanup()

            w = 1 / min(i+1, window)
            ewm_loss = ewm_loss * (1-w) + loss.item() * w
            tq.set_description(f'loss: {ewm_loss:4.4f}')

            if (i and i % report_step == 0 or i == len(train_dataloader)-1)  and val_dataloader is not None:
                model.eval()
                eval_loss = evaluate_model(model, val_dataloader)
                model.train()
                print(f'epoch {epoch}, step {i}/{step}: train loss: {ewm_loss:4.4f}  val loss: {eval_loss:4.4f}')
                
            if step % 1000 == 0:
                model.save_pretrained(f't5_base_{dname}_{steps}')
        
    cleanup()

In [10]:
def train_model(x, y, model_name, test_size=0.1, batch_size=32, **kwargs):
    model = T5ForConditionalGeneration.from_pretrained(model_name).cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    x1, x2, y1, y2 = train_test_split(x, y, test_size=test_size, random_state=42)
    train_dataset = PairsDataset(tokenizer(x1), tokenizer(y1))
    test_dataset = PairsDataset(tokenizer(x2), tokenizer(y2))
    
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, drop_last=False, shuffle=True, collate_fn=data_collator)
    val_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=False, shuffle=True, collate_fn=data_collator)

    train_loop(model, train_dataloader, val_dataloader, **kwargs)
    return model

In [11]:
model_name = 'Supiri/t5-base-conversation'

In [12]:
datasets = {
    'train': df
}

In [13]:
for steps in [300, 1000, 3000, 10000]:
    for dname, d in datasets.items():
        #print(f'\n\n\n  {dname}  {steps} \n=====================\n\n')
        model = train_model(d['dialog_question'].tolist(), d['dialog_answer'].tolist(), model_name=model_name, batch_size=16, max_epochs=1000, max_steps=steps)
        model.save_pretrained(f't5_base_{dname}_{steps}')

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1000 [00:00<?, ?it/s]

0 300


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 0, step 209/209: train loss: 3.3500  val loss: 10.1905
209 300


  0%|          | 0/210 [00:00<?, ?it/s]

300 300


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1000 [00:00<?, ?it/s]

0 1000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 0, step 209/209: train loss: 3.3533  val loss: 10.4218
209 1000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 1, step 209/418: train loss: 3.0172  val loss: 9.9107
418 1000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 2, step 209/627: train loss: 2.8624  val loss: 9.5195
627 1000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 3, step 209/836: train loss: 2.7217  val loss: 9.4873
836 1000


  0%|          | 0/210 [00:00<?, ?it/s]

1000 1000


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1000 [00:00<?, ?it/s]

0 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 0, step 209/209: train loss: 3.3528  val loss: 10.1032
209 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 1, step 209/418: train loss: 3.0141  val loss: 10.1751
418 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 2, step 209/627: train loss: 2.8651  val loss: 9.9252
627 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 3, step 209/836: train loss: 2.7333  val loss: 9.7995
836 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 4, step 209/1045: train loss: 2.6383  val loss: 9.6960
1045 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 5, step 209/1254: train loss: 2.5457  val loss: 9.6445
1254 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 6, step 209/1463: train loss: 2.4712  val loss: 9.6160
1463 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 7, step 209/1672: train loss: 2.3931  val loss: 9.5893
1672 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 8, step 209/1881: train loss: 2.3180  val loss: 9.8324
1881 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 9, step 209/2090: train loss: 2.2342  val loss: 9.6353
2090 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 10, step 209/2299: train loss: 2.1778  val loss: 9.8804
2299 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 11, step 209/2508: train loss: 2.1193  val loss: 10.0189
2508 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 12, step 209/2717: train loss: 2.0442  val loss: 10.0484
2717 3000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 13, step 209/2926: train loss: 1.9913  val loss: 9.7642
2926 3000


  0%|          | 0/210 [00:00<?, ?it/s]

3000 3000


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1000 [00:00<?, ?it/s]

0 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 0, step 209/209: train loss: 3.3480  val loss: 10.0371
209 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 1, step 209/418: train loss: 3.0195  val loss: 10.1824
418 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 2, step 209/627: train loss: 2.8598  val loss: 9.8388
627 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 3, step 209/836: train loss: 2.7443  val loss: 9.7533
836 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 4, step 209/1045: train loss: 2.6415  val loss: 9.6538
1045 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 5, step 209/1254: train loss: 2.5420  val loss: 9.8033
1254 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 6, step 209/1463: train loss: 2.4761  val loss: 9.4865
1463 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 7, step 209/1672: train loss: 2.3906  val loss: 9.6202
1672 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 8, step 209/1881: train loss: 2.3202  val loss: 9.8419
1881 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 9, step 209/2090: train loss: 2.2385  val loss: 9.7453
2090 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 10, step 209/2299: train loss: 2.1810  val loss: 9.5645
2299 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 11, step 209/2508: train loss: 2.1164  val loss: 9.9678
2508 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 12, step 209/2717: train loss: 2.0510  val loss: 9.6864
2717 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 13, step 209/2926: train loss: 1.9918  val loss: 9.7516
2926 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 14, step 209/3135: train loss: 1.9320  val loss: 9.7387
3135 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 15, step 209/3344: train loss: 1.8933  val loss: 9.7841
3344 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 16, step 209/3553: train loss: 1.8255  val loss: 9.9051
3553 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 17, step 209/3762: train loss: 1.7728  val loss: 9.7463
3762 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 18, step 209/3971: train loss: 1.7348  val loss: 9.9727
3971 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 19, step 209/4180: train loss: 1.6909  val loss: 9.7180
4180 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 20, step 209/4389: train loss: 1.6348  val loss: 9.8795
4389 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 21, step 209/4598: train loss: 1.5889  val loss: 9.7994
4598 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 22, step 209/4807: train loss: 1.5465  val loss: 9.8475
4807 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 23, step 209/5016: train loss: 1.5111  val loss: 9.7397
5016 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 24, step 209/5225: train loss: 1.4756  val loss: 9.4743
5225 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 25, step 209/5434: train loss: 1.4390  val loss: 9.6428
5434 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 26, step 209/5643: train loss: 1.4038  val loss: 9.5766
5643 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 27, step 209/5852: train loss: 1.3644  val loss: 9.4004
5852 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 28, step 209/6061: train loss: 1.3336  val loss: 9.3810
6061 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 29, step 209/6270: train loss: 1.2960  val loss: 9.6215
6270 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 30, step 209/6479: train loss: 1.2540  val loss: 9.5946
6479 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 31, step 209/6688: train loss: 1.2220  val loss: 9.5735
6688 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 32, step 209/6897: train loss: 1.1937  val loss: 9.3925
6897 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 33, step 209/7106: train loss: 1.1616  val loss: 9.2208
7106 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 34, step 209/7315: train loss: 1.1421  val loss: 9.2748
7315 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 35, step 209/7524: train loss: 1.1048  val loss: 9.3317
7524 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 36, step 209/7733: train loss: 1.0936  val loss: 9.0831
7733 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 37, step 209/7942: train loss: 1.0502  val loss: 9.2661
7942 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 38, step 209/8151: train loss: 1.0330  val loss: 9.2444
8151 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 39, step 209/8360: train loss: 1.0112  val loss: 9.1915
8360 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 40, step 209/8569: train loss: 0.9750  val loss: 9.1619
8569 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 41, step 209/8778: train loss: 0.9607  val loss: 9.0020
8778 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 42, step 209/8987: train loss: 0.9307  val loss: 8.9227
8987 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 43, step 209/9196: train loss: 0.9007  val loss: 8.9690
9196 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 44, step 209/9405: train loss: 0.8749  val loss: 9.0965
9405 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 45, step 209/9614: train loss: 0.8594  val loss: 8.8865
9614 10000


  0%|          | 0/210 [00:00<?, ?it/s]

epoch 46, step 209/9823: train loss: 0.8418  val loss: 9.0043
9823 10000


  0%|          | 0/210 [00:00<?, ?it/s]

10000 10000


In [14]:
model.save_pretrained(f'./DZ2model/t5_base')