In [1]:
import argparse
import glob
import os
import json
import time
import logging
import random
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    T5Config
)

# Random seed setup
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed handed unchanged to every generator.

    Returns:
        None.
    """
    # CPU-side generators first: stdlib, NumPy's legacy global RNG, torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing more to do on a machine without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hand-written T5 configuration. NOTE(review): no other cell visible in this
# part of the notebook reads `config`; the models below are built with
# from_pretrained() instead -- confirm whether this cell is still needed.
config = T5Config(
    # Vocabulary and special token ids.
    vocab_size=32128,
    pad_token_id=0,
    eos_token_id=1,
    decoder_start_token_id=0,
    # Model dimensions.
    d_model=768,
    d_kv=64,
    d_ff=3072,
    num_layers=12,
    num_heads=12,
    n_positions=512,
    relative_attention_num_buckets=32,
    is_encoder_decoder=True,
    # Regularisation and initialisation.
    dropout_rate=0.1,
    layer_norm_epsilon=1e-06,
    initializer_factor=1.0,
    # Generation default.
    num_beams=4,
)

In [2]:
# Tokenizer and model are loaded from the same checkpoint name so the two
# cannot drift apart.
sono_checkpoint = "sonoisa/t5-base-japanese"
sono_tokenizer = T5Tokenizer.from_pretrained(sono_checkpoint)
sono_model = T5ForConditionalGeneration.from_pretrained(sono_checkpoint)

In [23]:
# Second tokenizer/model pair, loaded from a different checkpoint name.
mega_checkpoint = "megagonlabs/t5-base-japanese-web"
mega_tokenizer = T5Tokenizer.from_pretrained(mega_checkpoint)
mega_model = T5ForConditionalGeneration.from_pretrained(mega_checkpoint)

loading file https://huggingface.co/megagonlabs/t5-base-japanese-web/resolve/main/spiece.model from cache at /home/sibava/.cache/huggingface/transformers/b44045e238b3ddbad8287ecc1d53b5509a75f02885397960b31c3f2b2a0559a5.1c2235a13c1aafde0d32c922fbf478c654952647cfc04095d56141a637da65a9
loading file https://huggingface.co/megagonlabs/t5-base-japanese-web/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/megagonlabs/t5-base-japanese-web/resolve/main/special_tokens_map.json from cache at /home/sibava/.cache/huggingface/transformers/dfdb1428c3fb4430abb5afd7b345af884869781d292f21457d5de9e0cdff7760.c94798918c92ded6aeef2d2f0e666d2cc4145eca1aa6e1336fde07f2e13e2f46
loading file https://huggingface.co/megagonlabs/t5-base-japanese-web/resolve/main/tokenizer_config.json from cache at /home/sibava/.cache/huggingface/transformers/927c1345e925a3ea6162bcd32dd81484cc028e0ce8e03db73260fc3a250d6a27.0b382273d4a85691ebca0b29011791d45fd15d012f3d38eef1f8c69ddbe60cda
loading c

In [28]:
# Resize the model's token embedding matrix to len(mega_tokenizer). The call's
# return value (the embedding module) is echoed as the cell output.
mega_model.resize_token_embeddings(len(mega_tokenizer))

Embedding(32100, 768)

In [3]:
# Toy example pair: the source text carries an <extra_id_0> placeholder and the
# target text supplies the words for it, closed by <extra_id_1>.
source_text = "その<extra_id_0>公園を歩いている"
target_text = "<extra_id_0>犬は<extra_id_1>"

# Keep only the token-id tensors from each encoding.
input_ids = sono_tokenizer(source_text, return_tensors="pt")["input_ids"]
labels = sono_tokenizer(target_text, return_tensors="pt")["input_ids"]

In [33]:
# Same toy example pair, encoded with mega_tokenizer this time.
# NOTE(review): this rebinds `input_ids` and `labels`, which the sono_tokenizer
# cell also assigns -- whichever cell ran last wins.
source_text = "その<extra_id_0>公園を歩いている"
target_text = "<extra_id_0>犬は<extra_id_1>"

input_ids = mega_tokenizer(source_text, return_tensors="pt")["input_ids"]
labels = mega_tokenizer(target_text, return_tensors="pt")["input_ids"]

In [3]:
# Directory holding the pas_data.{train,dev,test}.jsonl files. The default is
# the original author's location; set the PAS_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
PAS_DATA_DIR = os.environ.get("PAS_DATA_DIR", "/home/sibava/PAS-T5/pas_dataset")

# Load the three JSON-lines splits into one dataset dict keyed by split name.
dataset = load_dataset(
    "json",
    data_files={
        split: os.path.join(PAS_DATA_DIR, f"pas_data.{split}.jsonl")
        for split in ("train", "dev", "test")
    },
)

Using custom data configuration default-1ad9e7611711dc62
Reusing dataset json (/home/sibava/.cache/huggingface/datasets/json/default-1ad9e7611711dc62/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 3/3 [00:00<00:00, 43.80it/s]


In [4]:
# Drop these columns from the train and dev splits. The test split is
# deliberately not touched here, exactly as before.
# NOTE: not idempotent -- re-running after the columns are gone is expected to
# fail; reload `dataset` first.
dropped_columns = ('input_tokens', 'gold_arguments', 'case_types', 'alt_type')
for split_name in ("train", "dev"):
    dataset[split_name] = dataset[split_name].remove_columns(list(dropped_columns))

In [5]:
# Ask every split to return the 'input_ids' and 'labels' columns in the 'torch'
# format when indexed (applies to the whole dataset dict, in place).
dataset.set_format(type='torch',columns=['input_ids','labels'])

In [6]:
# Fine-tuning hyperparameters a reader is most likely to tune.
TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 4
NUM_EPOCHS = 3

training_args = TrainingArguments(
    output_dir="./finetune",
    # Training length and per-device batch sizes.
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    # Optimiser: Adafactor at a fixed learning rate (constant schedule).
    optim="adafactor",
    learning_rate=1e-3,
    lr_scheduler_type="constant",
    # Evaluate, log and save on the same 2000-step cadence.
    evaluation_strategy="steps",
    eval_steps=2000,
    logging_steps=2000,
    save_steps=2000,
)

In [8]:
# Inspection only: echo the device that the training arguments resolved to.
training_args.device

device(type='cuda', index=0)

In [7]:
# Pick the splits up front so the Trainer call reads as plain wiring.
train_split = dataset["train"]
dev_split = dataset["dev"]

# Fine-tune sono_model; the dev split is used for the periodic evaluations.
# NOTE(review): no data_collator is passed, so batching relies on the Trainer
# default -- confirm the stored examples are already padded as expected.
trainer = Trainer(
    args=training_args,
    model=sono_model,
    tokenizer=sono_tokenizer,
    eval_dataset=dev_split,
    train_dataset=train_split,
)

In [8]:
# Resume from the step-12000 checkpoint when it exists on disk. On a clean
# machine (or after ./finetune was removed) the directory is missing, and
# passing it unconditionally makes train() fail; in that case fall back to
# training from the start so Restart & Run All still works.
resume_checkpoint = 'finetune/checkpoint-12000'
trainer.train(
    resume_from_checkpoint=resume_checkpoint if os.path.isdir(resume_checkpoint) else None
)

Loading model from finetune/checkpoint-12000).
***** Running training *****
  Num examples = 68737
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12891
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 12000
  Will skip the first 2 epochs then the first 3406 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.
Skipping the first batches: 100%|██████████| 3406/3406 [00:13<00:00, 252.64it/s]


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=12891, training_loss=0.010802198775050786, metrics={'train_runtime': 551.8009, 'train_samples_per_second': 373.705, 'train_steps_per_second': 23.362, 'total_flos': 1.2557381557026816e+17, 'train_loss': 0.010802198775050786, 'epoch': 3.0})

In [9]:
# Persist only the model weights (its state_dict) to a local file.
# NOTE(review): the filename spelling is kept as-is because other code may load
# the file by this exact name.
model_path = 'fast_trainig.pth'
torch.save(obj=sono_model.state_dict(), f=model_path)

In [10]:
# Put the model into evaluation mode. eval() returns the module, so the cell
# echoes the full module repr as its output.
sono_model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr