In [1]:
!pip install torch==1.10.1 transformers==4.14.1 pytorch-lightning==1.5.6 setuptools==59.5.0 numpy pandas

Collecting torch==1.10.1
  Downloading torch-1.10.1-cp38-cp38-manylinux1_x86_64.whl (881.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m881.9/881.9 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting transformers==4.14.1
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m[0m eta [36m0:00:01[0m
[?25hCollecting pytorch-lightning==1.5.6
  Downloading pytorch_lightning-1.5.6-py3-none-any.whl (525 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.4/525.4 KB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
Collecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting setuptools==59.5.0
  Downloading setuptools-59.5.0-py3-none-any.whl (952 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[?25hCollecting charset-normalizer~=2.0.0
  Downloading charset_normalizer-2.0.12-py3-none-any.whl (39 kB)
Collecting certifi>=2017.4.17
  Downloading certifi-2021.10.8-py2.py3-none-any.whl (149 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.2/149.2 KB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.0/139.0 KB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna<4,>=2.5
  Downloading idna-3.3-py3-none-any.whl (61 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 KB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.0/307.0 KB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hC

In [1]:
import csv
import json
import random
from typing import Dict, List, Optional, Tuple
from glob import glob

import torch
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
from pytorch_lightning import Trainer, seed_everything

from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

seed_everything(42)

Global seed set to 42


42

In [2]:
tokenizer = PreTrainedTokenizerFast.from_pretrained('hyunwoongko/kobart', sep_token='<unused0>', cls_token='<unused1>')
model = BartForConditionalGeneration.from_pretrained('hyunwoongko/kobart')

In [3]:
tokenizer.special_tokens_map

{'bos_token': '</s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '<unused0>',
 'pad_token': '<pad>',
 'cls_token': '<unused1>',
 'mask_token': '<mask>'}

In [4]:
vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
len(vocab)

30000

In [5]:
def load_json_data(path: str) -> Tuple[List[str], List[List[str]], List[str]]:
    """Load dialogue summarization dataset json files of https://aihub.or.kr/aidata/30714

    Args:
        path: path of json file
    Returns:
        result of file, which is a tuple of ids, dialogues, summaries
    """
    with open(path) as f:
        data = json.load(f)

    ids = []
    dialogues = []
    summaries = []
    for datum in data["data"]:
        ids.append(datum["header"]["dialogueInfo"]["dialogueID"])

        prev_speaker_id = None
        prev_line = ""
        utts = []
        for dialogue in datum["body"]["dialogue"]:
            utterance = dialogue["utterance"].strip()

            if dialogue["participantID"] == prev_speaker_id:
                prev_line += " " + utterance
            else:
                if prev_line:
                    utts.append(prev_line)
                prev_line = utterance
                prev_speaker_id = dialogue["participantID"]
        if prev_line:
            utts.append(prev_line)

        dialogues.append(utts)
        summaries.append(datum["body"].get("summary"))
    return ids, dialogues, summaries

In [6]:
class DialogueSummarizationDataset(torch.utils.data.Dataset):
    """Dataset for Dialogue Summarization

    Attributes:
        sep_token: token to seperate utterances
        ids: id of each example
        dialogues: dialogue of each example
        summaries: summary of each example
        dialogue_input_ids: dialogue input id tokens of each example
        dialogue_attention_masks: dialogue attention masks of each example
        summary_input_ids: summary input id tokens of each example
        summary_attention_masks: summary attention masks of each example
    """

    def __init__(
        self,
        paths: List[str],
        tokenizer: PreTrainedTokenizerBase,
        dialogue_max_seq_len: int,
        summary_max_seq_len: int,
        use_summary: bool,
    ):
        """
        Args:
            paths: list of dataset paths (tsv or json)
            tokenizer: tokenizer to tokenize dialogue and summary string
            dialogue_max_seq_len: max sequence length of dialouge
            summary_max_seq_len: max sequence length of summary
            use_summary: whether to use summary data or not (should be False for inference)
        """
        super().__init__()

        self.sep_token = tokenizer.sep_token
        (
            self.ids,
            self.dialogues,
            self.summaries,
            self.dialogue_input_ids,
            self.dialogue_attention_masks,
            self.summary_input_ids,
            self.summary_attention_masks,
        ) = self.load_dataset(paths, tokenizer, dialogue_max_seq_len, summary_max_seq_len, use_summary)

    def load_dataset(
        self,
        paths: List[str],
        tokenizer: PreTrainedTokenizerBase,
        dialogue_max_seq_len: int,
        summary_max_seq_len: int,
        use_summary: bool,
    ) -> Tuple[
        List[str],
        List[List[str]],
        List[str],
        List[torch.Tensor],
        List[torch.Tensor],
        Optional[List[torch.Tensor]],
        Optional[List[torch.Tensor]],
    ]:
        """Load dataset files and featurize with tokenizer

        Args:
            paths: list of dataset paths (tsv or json)
            tokenizer: tokenizer to tokenize dialogue and summary string
            dialogue_max_seq_len: max sequence length of dialouge
            summary_max_seq_len: max sequence length of summary
            use_summary: whether to use summary data or not (should be False for inference)
        Returns:
            original ids, dialogues, summaries and input ids and attention masks for dialogues and summaries
        """
        ids, dialogues, summaries = [], [], []
        for path in paths:
            loader_fn = load_tsv_data if path.endswith(".tsv") else load_json_data

            file_ids, file_dialogues, file_summaries = loader_fn(path)
            ids.extend(file_ids)
            dialogues.extend(self.sep_token.join(x) for x in file_dialogues)
            summaries.extend(file_summaries)

        bos = tokenizer.bos_token
        eos = tokenizer.eos_token
        dialogue_inputs = tokenizer(
            [bos + x + eos for x in dialogues],
            padding="max_length",
            truncation=True,
            max_length=dialogue_max_seq_len,
            return_tensors="pt",
            return_token_type_ids=False,
        )

        summary_inputs = (
            tokenizer(
                [bos + x + eos for x in summaries],
                padding="max_length",
                truncation=True,
                max_length=summary_max_seq_len,
                return_tensors="pt",
                return_token_type_ids=False,
            )
            if use_summary
            else {}
        )

        return (
            ids,
            dialogues,
            summaries,
            dialogue_inputs["input_ids"],
            dialogue_inputs["attention_mask"],
            summary_inputs.get("input_ids"),
            summary_inputs.get("attention_mask"),
        )

    def __len__(self) -> int:
        return len(self.dialogue_input_ids)

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        item = {"input_ids": self.dialogue_input_ids[index], "attention_mask": self.dialogue_attention_masks[index]}
        if self.summary_input_ids is not None and self.summary_attention_masks is not None:
            item.update(
                {
                    "decoder_input_ids": self.summary_input_ids[index],
                    "decoder_attention_mask": self.summary_attention_masks[index],
                }
            )
        return item

In [7]:
train_dataset = DialogueSummarizationDataset(
    paths=glob('/home/hywn.dl.study/projects/kobart_lab/kor_conv_data/train/*.json'),
    tokenizer=tokenizer,
    dialogue_max_seq_len=256,
    summary_max_seq_len=64,
    use_summary=True,
)

valid_dataset = DialogueSummarizationDataset(
    paths=glob('/home/hywn.dl.study/projects/kobart_lab/kor_conv_data/valid/*.json'),
    tokenizer=tokenizer,
    dialogue_max_seq_len=256,
    summary_max_seq_len=64,
    use_summary=True,
)

In [8]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128, num_workers=4)
valid_dataloader = DataLoader(valid_dataset, batch_size=256, num_workers=4)

total_steps = len(train_dataloader) * 10

print(total_steps)

21880


In [9]:
total_steps = len(train_dataloader) * 5

print(total_steps)

10940


In [9]:
from lightning_modules import DefaultModule

model_dir = '/home/hywn.dl.study/projects/kobart_lab/output/models'

lightning_module = DefaultModule(
    model, total_steps, 2e-4, 1e-5, 0.05, model_dir
)

In [16]:
output_dir = '/home/hywn.dl.study/projects/kobart_lab/output'

train_loggers = [TensorBoardLogger(output_dir, "", "logs")]

trainer = Trainer(
    logger=train_loggers,
    max_epochs=10,
    log_every_n_steps=100,
    val_check_interval=500,
    accumulate_grad_batches=1,
    callbacks=[LearningRateMonitor(logging_interval="step")],
    gpus=2,
    strategy='dp',
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [17]:
trainer.fit(lightning_module, train_dataloader, valid_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 123 M 
-------------------------------------------------------
123 M     Trainable params
0         Non-trainable params
123 M     Total params
495.440   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/pytorch_lightning/overrides/data_parallel.py", line 63, in forward
    output = super().forward(*inputs, **kwargs)
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 92, in forward
    output = self.module.validation_step(*inputs, **kwargs)
  File "/home/hywn.dl.study/projects/kobart_lab/lightning_modules/default.py", line 82, in validation_step
    accuracy = torchmetrics.functional.accuracy(logits, labels, ignore_index=self.model.config.pad_token_id)
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/torchmetrics/functional/classification/accuracy.py", line 415, in accuracy
    tp, fp, tn, fn = _accuracy_update(
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/torchmetrics/functional/classification/accuracy.py", line 101, in _accuracy_update
    tp, fp, tn, fn = _stat_scores_update(
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/torchmetrics/functional/classification/stat_scores.py", line 112, in _stat_scores_update
    preds, target, _ = _input_format_classification(
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/torchmetrics/utilities/checks.py", line 411, in _input_format_classification
    preds = select_topk(preds, top_k or 1)
  File "/home/hywn.dl.study/miniconda3/envs/kobart_lab/lib/python3.8/site-packages/torchmetrics/utilities/data.py", line 111, in select_topk
    topk_tensor = zeros.scatter(dim, prob_tensor.argmax(dim=dim, keepdim=True), 1.0)
RuntimeError: CUDA out of memory. Tried to allocate 3.65 GiB (GPU 0; 14.76 GiB total capacity; 12.85 GiB already allocated; 25.75 MiB free; 13.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
