In [1]:
import json
import torch
from tqdm import tqdm
from torch import nn
from torch.optim import AdamW
from typing import List, Dict
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from data_process import KanbunDataset, kanbun_collate_fn, get_label_map, decode_prediction, character_mark
from model import BaseKanbunModel, OneAuxiliaryTaskModel, TwoAuxiliaryTaskModel, ThreeAuxiliaryTaskModel, FourAuxiliaryTaskModel
from translation import valid_marks, translate_corpus
from evaluation import compute_bleu_score, compute_chrf_score, compute_bert_score, compute_rouge_score, compute_ribes_score, compute_kendalltau_score, compute_pmr_score

  import pynvml  # type: ignore[import]


In [2]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [3]:
l2i, i2l = get_label_map(f"./Kanbuntaikei Data/labels.json")

In [4]:
device = torch.device("cuda")
epoches = 10
batch_size = 64
learning_rate = 5e-5
weight_decay = 0.01

In [5]:
def evaluate(model, data):
    model.eval()
    with torch.no_grad():
        n = 0
        l = 0
        for _, (sentences, labels) in tqdm(enumerate(data), total = len(data)):
            loss, logits_okurigana, logits_particle, logits_position = model(sentences, labels)
            l += loss.item()
            n += logits_okurigana.size(0)
        print("Evaluate loss {}".format(l / n))
    return l / n

def predict(model, data):
    model.eval()
    with torch.no_grad():
        okurigana = []
        particle = []
        position = []
        for _, (sentences, _) in tqdm(enumerate(data), total = len(data)):
            _, logits_okurigana, logits_particle, logits_position = model(sentences, None)
            probabilities_okurigana = nn.functional.softmax(logits_okurigana, dim = 1)
            probabilities_particle = nn.functional.softmax(logits_particle, dim = 1)
            probabilities_position = nn.functional.softmax(logits_position, dim = 1)
            okurigana.extend(torch.argmax(probabilities_okurigana, dim = 1).tolist())
            particle.extend(torch.argmax(probabilities_particle, dim = 1).tolist())
            position.extend(torch.argmax(probabilities_position, dim = 1).tolist())
    return [okurigana, particle, position]

def train(model, train_data, dev_data, optimizer, epoches):
    model.train(True)
    dev_loss = 10
    for e in range(epoches):
        n = 0
        l = 0
        for _, (sentences, labels) in tqdm(enumerate(train_data), total = len(train_data)):
            optimizer.zero_grad()
            loss, logits_okurigana, logits_particle, logits_position = model(sentences, labels)
            loss.backward()
            optimizer.step()
            l += loss.item()
            n += logits_okurigana.size(0)
        print("=" * 20)
        print("Epoch {}".format(e + 1))
        print("Traning loss {}".format(l / n))
    
        current_dev_loss = evaluate(model, dev_data)
        if current_dev_loss <= dev_loss:
            dev_loss = current_dev_loss
        else:
            break

In [6]:
tokenizer = AutoTokenizer.from_pretrained("tohoku-nlp/bert-base-japanese-char")
bert = AutoModel.from_pretrained("tohoku-nlp/bert-base-japanese-char")
loss_function = nn.CrossEntropyLoss()
model = FourAuxiliaryTaskModel(bert, loss_function, 768, [106, 247, 19, 2, 14, 115, 44], ["segmentation", "partofspeech", "dependencyarc", "dependencytype"], (0.6, 0.4), 0.3).to(device)
optimizer = AdamW(model.parameters(), lr = learning_rate, weight_decay = weight_decay)

In [7]:
train_data = KanbunDataset(f"./Kanbuntaikei Data/train.json", "japanese", ["okurigana", "particle", "position", "segmentation", "partofspeech", "dependencyarc", "dependencytype"], tokenizer, l2i, device = device)
dev_data = KanbunDataset(f"./Kanbuntaikei Data/dev.json", "japanese", ["okurigana", "particle", "position", "segmentation", "partofspeech", "dependencyarc", "dependencytype"], tokenizer, l2i, device = device)
test_data = KanbunDataset(f"./Kanbuntaikei Data/test.json", "japanese", ["okurigana", "particle", "position", "segmentation", "partofspeech", "dependencyarc", "dependencytype"], tokenizer, l2i, device = device)
train_loader = DataLoader(dataset = train_data, batch_size = batch_size, shuffle = True, collate_fn = lambda batch:kanbun_collate_fn(batch, tokenizer.pad_token_id))
dev_loader = DataLoader(dataset = dev_data, batch_size = batch_size, shuffle = True, collate_fn = lambda batch:kanbun_collate_fn(batch, tokenizer.pad_token_id))
test_loader = DataLoader(dataset = test_data, batch_size = batch_size, shuffle = False, collate_fn = lambda batch:kanbun_collate_fn(batch, tokenizer.pad_token_id))

In [8]:
train(model, train_loader, dev_loader, optimizer, epoches)

100%|██████████| 117/117 [00:13<00:00,  8.47it/s]


Epoch 1
Traning loss 0.10080696714368656


100%|██████████| 15/15 [00:01<00:00, 14.92it/s]


Evaluate loss 0.07550799205739012


100%|██████████| 117/117 [00:12<00:00,  9.22it/s]


Epoch 2
Traning loss 0.05692008413152901


100%|██████████| 15/15 [00:00<00:00, 15.19it/s]


Evaluate loss 0.049514784607835996


100%|██████████| 117/117 [00:12<00:00,  9.05it/s]


Epoch 3
Traning loss 0.042244988400025496


100%|██████████| 15/15 [00:01<00:00, 14.49it/s]


Evaluate loss 0.04302473375874181


100%|██████████| 117/117 [00:12<00:00,  9.10it/s]


Epoch 4
Traning loss 0.034668156922015914


100%|██████████| 15/15 [00:01<00:00, 14.64it/s]


Evaluate loss 0.04048079188152026


100%|██████████| 117/117 [00:12<00:00,  9.03it/s]


Epoch 5
Traning loss 0.029024016691957432


100%|██████████| 15/15 [00:01<00:00, 14.56it/s]


Evaluate loss 0.03903112949863557


100%|██████████| 117/117 [00:12<00:00,  9.08it/s]


Epoch 6
Traning loss 0.024678608618368937


100%|██████████| 15/15 [00:01<00:00, 12.26it/s]


Evaluate loss 0.0389793316523234


100%|██████████| 117/117 [00:12<00:00,  9.27it/s]


Epoch 7
Traning loss 0.02059439927656301


100%|██████████| 15/15 [00:00<00:00, 15.27it/s]

Evaluate loss 0.039467889519147974





In [9]:
prediction = predict(model, test_loader)
result = character_mark(f"./Kanbuntaikei Data/test.json", decode_prediction(f"./Kanbuntaikei Data/test.json", prediction, i2l))

100%|██████████| 15/15 [00:00<00:00, 15.37it/s]


In [10]:
original_marks, valid_marks, _ = valid_marks(f"./Kanbuntaikei Data/test.json", result)
original_sentences, translated_sentences, valid_count = translate_corpus(original_marks, valid_marks)

In [11]:
compute_bleu_score(original_sentences, translated_sentences)

BLEU = 48.03 77.3/54.5/40.9/30.9 (BP = 1.000 ratio = 1.152 hyp_len = 13860 ref_len = 12034)

In [12]:
compute_chrf_score(original_sentences, translated_sentences)

chrF2 = 46.27

In [13]:
compute_bert_score(original_sentences, translated_sentences)

(0.9109459153035792, 0.9262703106170748, 0.9184405068798763)

In [14]:
compute_rouge_score(original_sentences, translated_sentences)

{'rouge1': {'precision': 0.8932379950047574,
  'recall': 0.7784881205299998,
  'fmeasure': 0.8265151065922299},
 'rouge2': {'precision': 0.6298728729745293,
  'recall': 0.5542495211597761,
  'fmeasure': 0.5857188538978271},
 'rougeL': {'precision': 0.847307743581081,
  'recall': 0.7406523969288453,
  'fmeasure': 0.7853984395245114}}

In [15]:
compute_ribes_score(original_sentences, translated_sentences)

0.490855661440961

In [16]:
compute_kendalltau_score(original_marks, valid_marks)

np.float64(0.9303236486869637)

In [17]:
compute_pmr_score(original_marks, valid_marks)

0.870939893496414

In [18]:
len(original_marks) / len(result)

0.8826695371367062