# Notebook to Train Neural Translation Models from Scratch using OpenNMT
By Rina Kawamura

In [None]:
# Install necessary package for tokenization
!pip install sentencepiece

In [None]:
# Data Processing
# Split train, dev, test files into separate honorific and regular sentence files
import json

# Input path to train data (JSON file)
train_f = 
train_json = open(train_f, 'r')
data_dict = json.load(train_json)
train_json.close()

# Specify paths for regular and honorific files for train data
with open([Path to Honorific Data], 'w') as hon_f, open([Path to Regular Data], 'w') as reg_f:
  for pair in data_dict["data"]:
    hon_f.write(pair["hon"] + '\n')
    reg_f.write(pair["reg"] + '\n')
  
# Input path to validation data (JSON file)
dev_f = "data/dev.json"
dev_json = open(dev_f, 'r')
data_dict = json.load(dev_json)
dev_json.close()

# Specify paths for regular and honorific files for validation data
with open([Path to Honorific Data], 'w') as hon_f, open([Path to Regular Data], 'w') as reg_f:
  for pair in data_dict["data"]:
    hon_f.write(pair["hon"] + '\n')
    reg_f.write(pair["reg"] + '\n')

# Input path to test data (JSON file)
test_f = "data/test.json"
test_json = open(test_f, 'r')
data_dict = json.load(test_json)
test_json.close()

# Specify paths for regular and honorific files for test data
with open([Path to Honorific Data], 'w') as hon_f, open([Path to Regular Data], 'w') as reg_f:
  for pair in data_dict["data"]:
    hon_f.write(pair["hon"] + '\n')
    reg_f.write(pair["reg"] + '\n')

In [None]:
# Create vocab to train on by combining all training data into one file
# (honorific and regular sentences are concatenated so that a single subword
# tokenizer is trained on both styles).
# Specify train honorific and regular files, as well as file to put all training sentences in
!cat [Path to Train Honorific] [Path to Train Regular] > [Path to Train All]

# Subword Tokenization
import sentencepiece as spm

# Train sentencepiece tokenizer
# `sp` is created empty here; the tokenization cell below loads the trained model into it.
sp = spm.SentencePieceProcessor()
# NOTE(review): `--input` is hard-coded to data/basic/train.all, while the `cat`
# command above writes to a user-chosen path -- confirm the two point at the same file.
# NOTE(review): with `--model_prefix=ja` the trainer is expected (per the SentencePiece
# docs) to write `ja.model` and `ja.vocab` into the current working directory -- confirm,
# and use that `ja.model` path when loading the tokenizer later.
spm.SentencePieceTrainer.Train("--input=data/basic/train.all --model_prefix=ja --vocab_size=16000 --character_coverage=0.9995")

In [None]:
# Tokenize all data
# `sp` is the SentencePieceProcessor created in the tokenizer-training cell above,
# so that cell must have been run first in this kernel.
# Specify path to trained tokenizer
sp.load([Path to Tokenizer])
def detok(filename, outputfilename):
    """Tokenize a text file into SentencePiece pieces, line by line.

    Despite its name, this function *tokenizes*: every line of `filename` is
    encoded with the module-level processor `sp` and written to
    `outputfilename` as space-separated pieces. The output contains exactly
    one line per input line, so parallel files stay aligned.

    Args:
        filename: Path to the plain-text input file (one sentence per line).
        outputfilename: Path of the tokenized file to create or overwrite.
    """
    # UTF-8 is set explicitly so results do not depend on the platform locale;
    # `with` guarantees both handles are closed even if encoding fails midway.
    with open(filename, 'r', encoding='utf-8') as f, \
         open(outputfilename, 'w', encoding='utf-8') as fout:
        # Iterating the file (instead of read().split('\n')) avoids the empty
        # trailing element that used to add a spurious blank line to the output.
        for line in f:
            pieces = sp.EncodeAsPieces(line.rstrip('\n'))
            fout.write(' '.join(pieces) + '\n')

# Specify paths for tokenized data
# Each call is detok(input_path, output_path). Note that `detok` encodes plain text
# into space-separated SentencePiece pieces, i.e. it tokenizes despite its name.
detok([Path to Honorific Train Data],[Path to Tokenized Honorific Train Data])
detok([Path to Regular Train Data],[Path to Tokenized Regular Train Data])
detok([Path to Honorific Validation Data],[Path to Tokenized Honorific Validation Data])
detok([Path to Regular Validation Data],[Path to Tokenized Regular Validation Data])
detok([Path to Honorific Test Data],[Path to Tokenized Honorific Test Data])
detok([Path to Regular Test Data],[Path to Tokenized Regular Test Data])

In [None]:
# Use OpenNMT library to train translation model
# Set up OpenNMT
# The `legacy` branch is cloned; the cells below invoke preprocess.py, train.py and
# translate.py from the root of this checkout.
!git clone -b legacy https://github.com/OpenNMT/OpenNMT-py

# Change to OpenNMT directory
# `%cd` changes the kernel's working directory, so relative paths used in every later
# cell (scripts, data files, tokenizer model) are resolved against OpenNMT-py/.
%cd OpenNMT-py
!pip install -r requirements.opt.txt
!python setup.py install

In [None]:
# Preprocess data for training
# Source side = tokenized regular sentences, target side = tokenized honorific
# sentences, i.e. the models are trained to translate regular -> honorific.
# The prefix given to -save_data is what the training cells expect as -data.
# Specify paths
!python preprocess.py -train_src [Path to Tokenized Regular Train Data] -train_tgt [Path to Tokenized Honorific Train Data] -valid_src [Path to Tokenized Regular Validation Data] -valid_tgt [Path to Tokenized Honorific Validation Data] -save_data [Path to Save Preprocessed Data]

In [None]:
# Setup GPU (Change as necessary)
!export CUDA_VISIBLE_DEVICES=0

# Train transformer model 
# Specify paths to preprocessed data and where to save model (Make other alterations where necessary)
!python train.py -data [Path to Preprocessed Data] -save_model [Path to Save Model] \
-layers 6 -rnn_size 512 -word_vec_size 512 \
-transformer_ff 2048 -heads 8 -encoder_type transformer -decoder_type transformer -position_encoding \
-train_steps 5000 -early_stopping 3 -max_generator_batches 2 -dropout 0.1 -batch_size 4096 -batch_type tokens \
-normalization tokens -accum_count 2 -optim adam -adam_beta2 0.998 -decay_method noam \
-warmup_steps 4000 -learning_rate 2 -max_grad_norm 0 -param_init 0 -param_init_glorot \
-label_smoothing 0.1 -valid_steps 500 -save_checkpoint_steps 500 -world_size 1 -gpu_ranks 0 

In [None]:
# Use transformer model to translate test set
# The input (-src) is the tokenized regular test file; the output is still
# SentencePiece-tokenized and is converted back to plain text in the detokenize cell below.
# Specify paths to model, tokenized test data, and save output
!python translate.py -model [Path to Trained Model] -src [Path to Tokenized Regular Test Data] -output [Path to Save Tokenized Translated Sentences] -replace_unk -verbose -gpu 0

In [None]:
# Train basic OpenNMT default model (LSTM)
# No architecture flags are passed, so OpenNMT's default settings are used.
# NOTE(review): this uses the same `[Path to Save Model]` placeholder as the transformer
# training cell; presumably a different prefix is needed so the two models' checkpoints
# do not overwrite each other -- confirm.
# Specify paths to preprocessed data and where to save model (Make other alterations where necessary)
!python train.py -data [Path to Preprocessed Data] -save_model [Path to Save Model] -world_size 1 -gpu_ranks 0 # -train_from [Specify Checkpoint If Any]

In [None]:
# Translate sentences with basic (LSTM) model
# Use a different -output path than the transformer translation cell: the detokenize
# cell below reads the transformer and LSTM outputs from two separate files.
# Specify paths to model, tokenized test data, and save output
!python translate.py -model [Path to Trained Model] -src [Path to Tokenized Regular Test Data] -output [Path to Save Tokenized Translated Sentences] -replace_unk -verbose -gpu 0

In [None]:
# Detokenize translated text
sp = spm.SentencePieceProcessor()
# Specify path to tokenizer
sp.load([Path to Tokenizer (Trained Above)])

# Transformer Model
# Path to tokenized transformer translated sentences
file_name = 
f = open(file_name, 'r')
data = f.read()
sents = data.split('\n')
f.close()

# Specify path to save detokenized sentences
out_f = open([Path to Detokenized Transformer Sentences], 'w')
for sent in sents:
    toks = sent.split(' ')
    tmp = sp.decode_pieces(toks)
    out_f.write(tmp + '\n')
out_f.close()

# Basic (LSTM) Model
# Path to tokenized LSTM translated sentences
file_name = 
f = open(file_name, 'r')
data = f.read()
sents = data.split('\n')
f.close()

# Specify path to save detokenized sentences
out_f = open([Path to Detokenized LSTM Sentences], 'w')
for sent in sents:
    toks = sent.split(' ')
    tmp = sp.decode_pieces(toks)
    out_f.write(tmp + '\n')
out_f.close()

In [None]:
# Install necessary packages to calculate BLEU score
!pip install mecab-python3
!pip install unidic-lite

In [None]:
# Parse using Mecab
import MeCab

# Parse translated and reference sentences using Mecab
wakati = MeCab.Tagger("-Owakati")

# Get reference honorific sentences from file
# Specify path to reference honorific sentences
ref_file_name = 
ref_f = open(ref_file_name, 'r')
ref_data = ref_f.read()
ref_sents = ref_data.split('\n')
ref_f.close()

# Get transformer translated honorific sentences from file
# Specify path to Transformer-translated honorific sentences
tr_file_name = 
tr_f = open(tr_file_name, 'r')
tr_data = tr_f.read()
tr_sents = tr_data.split('\n')
tr_f.close()

# Get LSTM translated honorific sentences from file
# Specify path to LSTM-translated honorific sentences
basic_file_name = 
basic_f = open(basic_file_name, 'r')
basic_data = basic_f.read()
basic_sents = basic_data.split('\n')
basic_f.close()

# Write Mecab parsed sentences to files
with open([Path to Mecab-parsed Transfomer Sentences], 'w') as tr_out, open([Path to Mecab-parsed LSTM Sentences], 'w') as basic_out, open([Path to Mecab-parsed Reference Sentences], 'w') as ref_out:
  for i in range(len(tr_sents)):
    tr_out.write(wakati.parse(tr_sents[i]))
    basic_out.write(wakati.parse(basic_sents[i]))
    ref_out.write(wakati.parse(ref_sents[i]))


In [None]:
# Evaluation using BLEU

# Use OpenNMT BLEU Scorer Tool
# `tools/multi-bleu.perl` is a relative path, so this cell must run with the OpenNMT-py
# checkout as the working directory (set by the `%cd` in the setup cell).
# In each command the reference file is the argument and the system output is fed on stdin.
# Transformer Score
!perl tools/multi-bleu.perl [Path to Mecab-parsed Reference Sentences] < [Path to Mecab-parsed Transformer Sentences]
# LSTM Score
!perl tools/multi-bleu.perl [Path to Mecab-parsed Reference Sentences] < [Path to Mecab-parsed LSTM Sentences]