# Fine-tune a Pretrained Japanese GPT-2 Model for Honorific Translation
By Rina Kawamura

In [None]:
# Import necessary packages
!pip install transformers
!pip install datasets
!pip install sentencepiece

In [None]:
# Data processing (Split, Process, Tokenize)
import json

# Add paths for processed training and validation data to be saved (txt file)
train_path = 
dev_path = 

# Add path to train data (JSON file)
train_file = 

train_json = open(train_file, 'r')
train_data = json.load(train_json)
train_json.close()

# Add path to validation data (JSON file)
dev_file = 

dev_json = open(dev_file, 'r')
dev_data = json.load(dev_json)
dev_json.close()

# Put training data in format of [Start Token (<s>)][Regular Sentence][Separation Token ([SEP])][Honorific Sentence][End Token (</s>)]

def _write_pairs(path, pairs):
  """Write one '<s>{reg}[SEP]{hon}</s>' line per sentence pair to `path`.

  Args:
    path: Destination text file; it is overwritten if it already exists.
    pairs: Iterable of dicts that each hold a 'reg' and a 'hon' string.
  """
  bos_tok, sep_tok, eos_tok = '<s>', '[SEP]', '</s>'
  # Explicit UTF-8 so the written file does not depend on the platform default encoding
  with open(path, 'w', encoding='utf-8') as f:
    for pair in pairs:
      f.write(bos_tok + pair['reg'] + sep_tok + pair['hon'] + eos_tok + '\n')

# Training and validation files share exactly the same line format
_write_pairs(train_path, train_data["data"])
_write_pairs(dev_path, dev_data["data"])


In [None]:
# Load the tokenizer and the pretrained causal language model
from transformers import AutoModelForCausalLM, T5Tokenizer

# The tokenizer and the model weights are loaded from the same checkpoint name
pretrained_name = "rinna/japanese-gpt2-small"

# Japanese gpt2 tokenizer, with lower-casing switched on after loading
tokenizer = T5Tokenizer.from_pretrained(pretrained_name)
tokenizer.do_lower_case = True

# Pretrained model that the later cells finetune
model = AutoModelForCausalLM.from_pretrained(pretrained_name)

In [None]:
# Finetune GPT-II model
from transformers import DataCollator, TextDataset, Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorForLanguageModeling

def _build_text_dataset(text_path):
  """Wrap the text file at `text_path` in a TextDataset (shared tokenizer, block_size=128)."""
  return TextDataset(tokenizer=tokenizer, file_path=text_path, block_size=128)

# Training and validation sets are built from the text files written earlier
train_dataset = _build_text_dataset(train_path)
eval_dataset = _build_text_dataset(dev_path)

# mlm=False turns off masked-language-model collation
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

import os
from transformers.trainer_utils import get_last_checkpoint

# Specify directory to save checkpoints
training_args = TrainingArguments(
    output_dir="gpt2/", 
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
)

# Specify Trainer model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    eval_dataset=eval_dataset,
)

# Resume from the newest checkpoint in output_dir when one exists; otherwise start
# a fresh run. Passing resume_from_checkpoint=True unconditionally makes the very
# first run fail, because there is no checkpoint to load yet.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
  # get_last_checkpoint lists the directory, so only call it once the directory exists
  last_checkpoint = get_last_checkpoint(training_args.output_dir)

trainer.train(resume_from_checkpoint=last_checkpoint)
trainer.save_model()

In [None]:
# Load finetuned model for inference
from transformers import pipeline, GPT2LMHeadModel

# Specify path to saved model and config file
# The bracketed placeholders below must be replaced with real paths before this cell can run.
my_model = GPT2LMHeadModel.from_pretrained([Path to Model File],config=[Path to Config File])
# Text-generation pipeline around the finetuned model; the tokenizer is loaded by name here.
# NOTE(review): the training cell used a T5Tokenizer instance with do_lower_case = True;
# confirm the tokenizer resolved from this name tokenizes the same way.
translator = pipeline('text-generation',model=my_model, tokenizer='rinna/japanese-gpt2-small')

In [None]:
# Get regular test sentences to convert using model
# Write reference sentences to file to compare later

import json

# Specify path to test data (JSON file)
test_file = 

test_json = open(test_file, 'r')
test_data = json.load(test_json)
test_json.close()

# Specify paths to save regular and honorific test sentences
test_ref_path = 
test_reg_path = 

test_reg_sents = []
with open(test_ref_path, 'w') as ref_f, open(test_reg_path, 'w') as reg_f:
  for pair in test_data["data"]:
    test_reg_sents.append(pair['reg'])
    ref_f.write(pair['hon'] + '\n')
    reg_f.write(pair['reg'] + '\n')


In [None]:
# Translate test data
def translate_w_model(sent):
  """Build the '<s>{sent}[SEP]' prompt and return the `translator` pipeline's output for it."""
  prompt = '<s>' + sent + '[SEP]'
  return translator(prompt)

# Keep only the text that follows the last '[SEP]' in the generated output
def process_translated(obj):
  """Return the part of obj[0]['generated_text'] after its final '[SEP]'.

  When the text contains no '[SEP]', the whole text is returned unchanged.
  """
  generated = obj[0]['generated_text']
  _, _, tail = generated.rpartition('[SEP]')
  return tail

# Specify path to save translated honorific sentences
test_out_path = 
with open(test_out_path, 'a') as out_f:
  for sent in test_reg_sents:
    tr = translate_w_model(sent)
    proc = process_translated(tr)
    out_f.write(proc + '\n')

In [None]:
# Install necessary packages to calculate BLEU score
!pip install mecab-python3
!pip install unidic-lite

In [None]:
# Evaluate using BLEU
import MeCab

# Parse translated and reference sentences using Mecab
wakati = MeCab.Tagger("-Owakati")

# Get translated honorific sentences from file
# Specify path to translated sentences
tr_f = open([Path to Translated Sentences], 'r')
tr_data = tr_f.read()
tr_sents = tr_data.split('\n')
tr_f.close()

# Get reference honorific sentences from file
# Specify path to reference honorific sentences
ref_f = open([Path to References Sentences], 'r')
ref_data = ref_f.read()
ref_sents = ref_data.split('\n')
ref_f.close()

# Specify paths to write MeCab-parsed outputs for the translated and reference sentences
with open([Path to Parsed Translated Sentences], 'w') as tr_out, open([Path to Parsed Reference Sentences], 'w') as ref_out:
  for i in range(len(tr_sents)):
    tr_out.write(wakati.parse(tr_sents[i]))
    ref_out.write(wakati.parse(ref_sents[i]))


In [None]:
# Use OpenNMT BLEU Scorer Tool
# Input paths to Mecab-parsed reference and translated sentence files
# The parsed reference file is passed as the script argument; the parsed translated
# file is fed to the script on standard input.
# NOTE(review): no visible cell fetches OpenNMT-py, so the script is assumed to already
# exist at OpenNMT-py/tools/multi-bleu.perl relative to the working directory — confirm.
!perl  OpenNMT-py/tools/multi-bleu.perl [Path to Parsed References Sentences] < [Path to Parsed Translated Sentences]