# Use Trained Models (LSTM, Transformer, Finetuned GPT-II)
By Rina Kawamura

In [None]:
# Setup to use OpenNMT Models
# SentencePiece is used by the later cells to tokenize and detokenize sentences.
!pip install sentencepiece
import sentencepiece as spm

# Specify path to cloned OpenNMT directory
# The translate.py commands in the later cells are run relative to this directory.
%cd [Path to OpenNMT Directory]
# Install the packages listed in requirements.opt.txt, then install from setup.py.
!pip install -r requirements.opt.txt
!python setup.py install

In [None]:
# Tokenize test sentences
# Create the SentencePiece processor; it is stored in the global `sp` that the
# tokenize and detokenize code in this notebook calls.
sp = spm.SentencePieceProcessor()
# Specify path to trained tokenizer
sp.load([Path to Trained Tokenizer])

def detok(filename, outputfilename):
    """Tokenize a text file into space-separated SentencePiece pieces.

    Despite its name, this function *tokenizes*: every line of ``filename``
    is encoded with the module-level ``sp`` processor, and the resulting
    pieces are written, joined by single spaces, as one line of
    ``outputfilename``.

    Args:
        filename: Path of the UTF-8 text file to read, one sentence per line.
        outputfilename: Path of the file to write; existing content is
            overwritten.
    """
    # Context managers close both files even if encoding raises, and the
    # explicit encoding keeps non-ASCII text independent of the locale.
    with open(filename, 'r', encoding='utf-8') as f, \
            open(outputfilename, 'w', encoding='utf-8') as fout:
        # Iterating the file (instead of splitting the whole content on
        # '\n') avoids writing a spurious blank line after a trailing newline.
        for line in f:
            pieces = sp.EncodeAsPieces(line.rstrip('\n'))
            fout.write(' '.join(pieces) + '\n')

# Tokenize regular sentences to translate
# (detok writes each input line as space-separated SentencePiece pieces)
detok([Path to Regular Sentences],[Path to Save Tokenized Regular Sentences])

In [None]:
# Use transformer model to translate human evaluation test set
# Specify GPU
!export CUDA_VISIBLE_DEVICES=0
# Specify path to transformer model, path to tokenized regular sentences, and path to output tokenized translated sentences
!python translate.py -model [Path to Trained Transformer Model] -src [Path to Tokenized Regular Sentences] -output [Path to Save Tokenized Translated Sentences] -replace_unk -verbose -gpu 0

In [None]:
# Use LSTM model to translate
# Specify path to LSTM model, path to tokenized regular sentences, and path to output tokenized translated sentences
# (use an output path different from the transformer run if both outputs are to be kept)
!python translate.py -model [Path to Trained LSTM Model] -src [Path to Tokenized Regular Sentences] -output [Path to Save Tokenized Translated Sentences] -replace_unk -verbose -gpu 0

In [None]:
# Detokenize translated tokens
# Transformer Model
# Specify path to tokenized translated sentences
file_name = 
f = open(file_name, 'r')
data = f.read()
sents = data.split('\n')
f.close()

# Specify path to save detokenized translated sentences
out_f = open([Path to Save Transformer-translated Sentences], 'w')
for sent in sents:
    toks = sent.split(' ')
    tmp = sp.decode_pieces(toks)
    out_f.write(tmp + '\n')
out_f.close()

# LSTM Model
# Specify path to tokenized translated sentences
file_name = 
f = open(file_name, 'r')
data = f.read()
sents = data.split('\n')
f.close()

# Specify path to save detokenized translated sentences
out_f = open([Path to Save LSTM-translated Sentences], 'w')
for sent in sents:
    toks = sent.split(' ')
    tmp = sp.decode_pieces(toks)
    out_f.write(tmp + '\n')
out_f.close()

In [None]:
# Leave OpenNMT Directory
# Moves the working directory up one level (to the parent of the current directory).
%cd ..

In [None]:
# Setup to use finetuned GPT-II model
!pip install transformers
!pip install datasets
!pip install sentencepiece

# Load necessary tokenizer and model
from transformers import T5Tokenizer

# Load Japanese gpt2 model tokenizer
tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt2-small")
# Set explicitly on the tokenizer object after loading.
tokenizer.do_lower_case = True

# Load pretrained model
# NOTE(review): neither `tokenizer` nor `model` is referenced by the later cells
# shown in this notebook (the pipeline is built from `my_model` and a tokenizer
# name) — confirm whether these two objects are still needed.
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-small")

In [None]:
from transformers import pipeline, GPT2LMHeadModel

# Specify path to trained model
# Load the finetuned weights together with the given config file.
my_model = GPT2LMHeadModel.from_pretrained([Path to Finetuned Model],config=[Path to Config File])
# Build a text-generation pipeline around the finetuned model; the tokenizer is
# given by name rather than as an object.
# NOTE(review): this does not reuse the `tokenizer` object that was configured
# with do_lower_case = True in the setup cell — confirm that is intended.
translator = pipeline('text-generation',model=my_model, tokenizer='rinna/japanese-gpt2-small')

In [None]:
reg_sents = []
# Path to regular sentences to translated
test_file = open([Path to Regular Sentences], 'r')
test_data = test_file.readlines()
test_file.close()

for line in test_data:
  reg_sents.append(line.strip())

In [None]:
# Translate test data
def translate_w_model(sent):
  """Run the text-generation pipeline on a single sentence.

  The prompt handed to the pipeline is ``'<s>' + sent + '[SEP]'``.

  Args:
    sent: The sentence to translate, as a string.

  Returns:
    Whatever the module-level ``translator`` pipeline returns for the prompt.
  """
  bos_tok, sep_tok = '<s>', '[SEP]'
  # Named `prompt` rather than `input` so the builtin is not shadowed.
  prompt = bos_tok + sent + sep_tok
  return translator(prompt)

# Process translated data
def process_translated(obj):
  """Extract the text after the last '[SEP]' of the first generation.

  Args:
    obj: A sequence whose first element is a mapping with a
      'generated_text' string.

  Returns:
    The part of the generated text that follows the final '[SEP]'
    marker, or the whole generated text when no marker is present.
  """
  generated = obj[0]['generated_text']
  # rpartition yields the full string as the tail when the separator is absent.
  _head, _sep, tail = generated.rpartition('[SEP]')
  return tail

# Specify path to save translated sentences
test_out_path = 
with open(test_out_path, 'a') as out_f:
  for sent in reg_sents:
    tr = translate_w_model(sent)
    proc = process_translated(tr)
    out_f.write(proc + '\n')