The primary code below is based on [akpe12/JP-KR-ocr-translator-for-travel](https://github.com/akpe12/JP-KR-ocr-translator-for-travel).

## Import

In [1]:
from typing import Dict, List
import csv

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import datasets
import torch
from transformers import (
    PreTrainedTokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    BertJapaneseTokenizer,
    GPT2TokenizerFast,
    Seq2SeqTrainer,
    Trainer
)
from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel

from datasets import load_dataset

# Checkpoint name used for the source-side tokenizer (BertJapaneseTokenizer.from_pretrained)
# and as the encoder of the EncoderDecoderModel built in the Model section.
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
# Alternative decoder checkpoint, currently disabled:
# decoder_model_name = "skt/kogpt2-base-v2"
# Checkpoint name used for the target-side tokenizer (GPT2TokenizerFast2.from_pretrained)
# and as the decoder of the EncoderDecoderModel.
decoder_model_name = "openai-community/gpt2"

In [2]:
# Select the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
# (To force CPU execution, assign torch.device("cpu") here instead.)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Cell output: the selected device together with the number of visible CUDA devices.
device, torch.cuda.device_count()

(device(type='cuda', index=0), 1)

In [3]:
class GPT2TokenizerFast2(GPT2TokenizerFast):
    """GPT-2 fast tokenizer whose special-token hook appends the EOS id to a sequence."""

    def build_inputs_with_special_tokens(self, token_ids: List[int]) -> List[int]:
        """Return a new list holding ``token_ids`` followed by ``self.eos_token_id``.

        Args:
            token_ids: Token ids of a single sequence (a list; it is not modified).

        Returns:
            The input ids with one EOS id appended at the end.
        """
        eos_suffix = [self.eos_token_id]
        return token_ids + eos_suffix

# Load the source (encoder-side) and target (decoder-side) tokenizers first ...
src_tokenizer = BertJapaneseTokenizer.from_pretrained(encoder_model_name)
trg_tokenizer = GPT2TokenizerFast2.from_pretrained(decoder_model_name)

# ... then give both the same 512-token length limit in a single statement.
src_tokenizer.model_max_length = trg_tokenizer.model_max_length = 512

# Disabled alternative: a plain GPT2Tokenizer with explicitly declared special tokens.
# trg_tokenizer = GPT2Tokenizer.from_pretrained(decoder_model_name, bos_token='</s>', eos_token='</s>', unk_token='<unk>',
#   pad_token='<pad>', mask_token='<mask>')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'GPT2TokenizerFast2'.


## Data

In [4]:
class PairedDataset:
    """Map-style dataset of (source, target) sentence pairs that are tokenized on access.

    The pairs come from exactly one of two places:

    * a CSV file whose first row is skipped as a header and whose remaining rows
      are expected to hold two columns (source, target), or
    * a ``datasets.Dataset`` whose items expose the ``sourceString`` and
      ``targetString`` keys.
    """

    def __init__(
        self,
        source_tokenizer: PreTrainedTokenizerFast,
        target_tokenizer: PreTrainedTokenizerFast,
        file_path: str = None,
        dataset_raw: datasets.Dataset = None,
        max_length: int = 512,
    ):
        """Store the tokenizers and load the raw pairs.

        Args:
            source_tokenizer: Tokenizer applied to the source sentence.
            target_tokenizer: Tokenizer applied to the target sentence; its
                ``build_inputs_with_special_tokens`` is applied to the target ids.
            file_path: Path of the CSV file to read. Takes precedence over
                ``dataset_raw`` when both are given.
            dataset_raw: Already-loaded dataset used when ``file_path`` is None.
            max_length: Maximum number of token ids per side. The source is
                truncated to ``max_length``; the target is truncated to
                ``max_length - 1`` before ``build_inputs_with_special_tokens`` is
                applied (with ``GPT2TokenizerFast2`` that hook appends one EOS id,
                so the labels then hold at most ``max_length`` ids).

        Raises:
            ValueError: If neither ``file_path`` nor ``dataset_raw`` is given.
        """
        self.src_tokenizer = source_tokenizer
        self.trg_tokenizer = target_tokenizer
        self.max_length = max_length

        if file_path is not None:
            # Explicit UTF-8 keeps decoding independent of the machine's locale, and
            # newline='' is what the csv module requires to parse quoted line breaks.
            with open(file_path, 'r', encoding='utf-8', newline='') as fd:
                reader = csv.reader(fd)
                # Skip the header row; the default avoids StopIteration on an empty file.
                next(reader, None)
                # csv.reader yields [] for blank lines; drop them so they cannot
                # break the two-value unpacking in __getitem__.
                self.data = [row for row in reader if row]
        elif dataset_raw is not None:
            self.data = dataset_raw
        else:
            raise ValueError('file_path or dataset_raw must be specified')

    def __getitem__(self, index: int) -> Dict[str, List[int]]:
        """Tokenize the pair at ``index``.

        Returns:
            The source tokenizer's output (``input_ids``, without attention mask or
            token type ids) extended with a ``labels`` entry holding the target
            token ids after ``build_inputs_with_special_tokens``.

        Raises:
            ValueError: If a CSV row does not unpack into exactly two values.
        """
        if isinstance(self.data, datasets.Dataset):
            # Fetch the record once instead of indexing the dataset once per column.
            record = self.data[index]
            src, trg = record['sourceString'], record['targetString']
        else:
            src, trg = self.data[index]

        embeddings = self.src_tokenizer(
            src,
            return_attention_mask=False,
            return_token_type_ids=False,
            max_length=self.max_length,
            truncation=True,
        )
        # Truncate the target as well (the source already is), leaving one position
        # for the token appended by build_inputs_with_special_tokens.
        target_ids = self.trg_tokenizer(
            trg,
            return_attention_mask=False,
            max_length=self.max_length - 1,
            truncation=True,
        )['input_ids']
        embeddings['labels'] = self.trg_tokenizer.build_inputs_with_special_tokens(target_ids)

        return embeddings

    def __len__(self):
        """Return the number of loaded pairs."""
        return len(self.data)

In [5]:
# DATA_ROOT = '/home/tikim/code/ffat2json/output'
# FILE_FFAC_FULL = 'ffac_full.csv'
# FILE_FFAC_TEST = 'ffac_test.csv'
# FILE_JA_KO_TRAIN = 'ja_ko_train.csv'
# FILE_JA_KO_TEST = 'ja_ko_test.csv'

# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_FFAC_FULL}')
# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_FFAC_TEST}') 

# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_JA_KO_TRAIN}')
# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_JA_KO_TEST}')

In [6]:
# Directory holding the Japanese-English CSV splits. The default is the path on the
# original training machine; set the JAEN_DATA_ROOT environment variable to run the
# notebook elsewhere without editing this cell.
DATA_ROOT = os.environ.get('JAEN_DATA_ROOT', '/home/tikim/dataset/jaen')
FILE_TRAIN = 'train.csv'
FILE_EVAL = 'dev.csv'
FILE_TEST = 'test.csv'  # only referenced by the disabled quick-run variant below

# os.path.join copes with DATA_ROOT values that do or do not end with a separator.
train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=os.path.join(DATA_ROOT, FILE_TRAIN))
eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=os.path.join(DATA_ROOT, FILE_EVAL))

# Disabled quick-run variant: train on the dev split and evaluate on the test split.
# train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_EVAL}')
# eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_TEST}')

In [7]:
# Sanity check: tokenize the first evaluation pair and display its source `input_ids`
# and target `labels` (the labels end with the EOS id appended by GPT2TokenizerFast2).
eval_dataset[0]

{'input_ids': [2, 1212, 896, 11680, 897, 14417, 6167, 882, 829, 3], 'labels': [1820, 6125, 318, 21027, 13, 50256]}

In [8]:
# Be sure to check the column count of each dataset if you encounter "ValueError: too many values to unpack (expected 2)"
# at `src, trg = self.data[index]`.
# Concatenating files with `cat ffac_full.csv tteb_train.csv > ja_ko_train.csv` may be the cause:
# the last row of the first CSV and the first row of the second CSV get merged, which creates a third column (and that raises the ValueError).
# debug_data = train_dataset.data


## Model

In [9]:
# Show the id that is reused below as both the padding id and the decoder start id.
print(f'bos_token_id of trg_tokenizer: {trg_tokenizer.bos_token_id}')

# Combine the pretrained encoder and decoder checkpoints into one seq2seq model.
# `pad_token_id` carries no `encoder_`/`decoder_` prefix; it is passed through as an
# extra keyword argument alongside the two checkpoint names.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=encoder_model_name,
    decoder_pretrained_model_name_or_path=decoder_model_name,
    pad_token_id=trg_tokenizer.bos_token_id,
)

# The decoder starts every target sequence from the target tokenizer's BOS id.
model.config.decoder_start_token_id = trg_tokenizer.bos_token_id

bos_token_id of trg_tokenizer: 50256


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at openai-community/gpt2 and a

In [10]:
# model

In [11]:
# model.config

In [12]:
# for Trainer
import wandb

collate_fn = DataCollatorForSeq2Seq(src_tokenizer, model)
wandb.init(project="fftr-poc1-en", name='jbert+gpt2')

arguments = Seq2SeqTrainingArguments(
    output_dir='dump',
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=6,
    # num_train_epochs=25,
    # per_device_train_batch_size=1,
    # per_device_train_batch_size=30, # takes 40GB
    per_device_train_batch_size=40,
    per_device_eval_batch_size=2,
    # per_device_eval_batch_size=30,
    # per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    gradient_accumulation_steps=4,
    save_total_limit=5,
    dataloader_num_workers=1,
    # fp16=True, # ENABLE if CUDA is enabled
    load_best_model_at_end=True,
    report_to='wandb'
)

trainer = Seq2SeqTrainer(
    model,
    arguments,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msappho192[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Training

In [13]:
# model = EncoderDecoderModel.from_encoder_decoder_pretrained("xlm-roberta-base",  "skt/kogpt2-base-v2")

In [14]:
# Train, then save the model and both tokenizers under dump/best_model.
try:
    trainer.train()
    model.save_pretrained("dump/best_model")
    src_tokenizer.save_pretrained("dump/best_model/src_tokenizer")
    trg_tokenizer.save_pretrained("dump/best_model/trg_tokenizer")
    print('Training finished')
except Exception as e:
    print('Training failed')
    print(e)
    # Re-raise instead of swallowing the error: the cell then shows the full traceback
    # and "Run All" stops here rather than continuing after a failed run.
    raise
finally:
    # Runs on success and on failure, so the wandb run is always closed.
    wandb.finish()



Epoch,Training Loss,Validation Loss
0,2.3416,2.24293
1,2.1642,2.12314
2,2.0466,2.076093
4,1.9735,2.053913
4,1.898,2.048181
5,1.8472,2.047947




Training finished


0,1
eval/loss,█▄▂▁▁▁
eval/runtime,▅▂▆▁█▂
eval/samples_per_second,▃▇▃█▁▇
eval/steps_per_second,▃▇▃█▁▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▆▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.04795
eval/runtime,12.8118
eval/samples_per_second,126.524
eval/steps_per_second,63.301
train/epoch,6.0
train/global_step,101040.0
train/learning_rate,0.0
train/loss,1.8472
train/total_flos,6.188389784112384e+17
train/train_loss,2.12442
