In [14]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from transformers.file_utils import TRANSFORMERS_CACHE

# Load the English -> Japanese Marian checkpoint and its tokenizer.
# NOTE: no `timeout=` keyword is passed here. `from_pretrained` has no
# download-timeout option; an unknown keyword is forwarded to the tokenizer
# constructor instead of the downloader, so it never affected the download.
# (Hub timeouts are configured via huggingface_hub environment variables such
# as HF_HUB_DOWNLOAD_TIMEOUT - confirm against the installed version.)
model_name = 'Helsinki-NLP/opus-mt-en-jap'
tokenizer = MarianTokenizer.from_pretrained(model_name, cache_dir=TRANSFORMERS_CACHE)
model = MarianMTModel.from_pretrained(model_name, cache_dir=TRANSFORMERS_CACHE)

# Sentence to translate.
sentence = "I like an apple."

# Tokenize the sentence into PyTorch tensors the model can consume.
inputs = tokenizer(sentence, return_tensors="pt")

# Run the translation (generation).
translated = model.generate(**inputs)

# Decode every generated sequence back to text and show the result.
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
print(tgt_text)


['わたし は ひとみ の よう に ひとみ を 守 る.']


In [1]:
import pysbd
from transformers import pipeline

# English text to translate.
text = 'I like an apple.'

# Sentence segmenter for English input.
seg_en = pysbd.Segmenter(language="en", clean=False)

# Translation pipeline backed by the 'staka/fugumt-en-ja' checkpoint.
fugu_translator = pipeline('translation', model='staka/fugumt-en-ja')

# Split the text into sentences, translate them, and print the raw
# pipeline result.
segmented_sentences = seg_en.segment(text)
fugu_result = fugu_translator(segmented_sentences)
print(fugu_result)

[{'translation_text': '私の好きな林いのんがおパイレーつじ座'}]


In [2]:
from transformers import MarianMTModel, MarianTokenizer

# Community Marian checkpoint for English -> Japanese (per its name,
# fine-tuned on KDE4), loaded together with its tokenizer.
model_name = 'Hoax0930/marian-finetuned-kde4-en-to-ja'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Sentence to translate.
sentence = "I like an apple."

# Encode the sentence as PyTorch tensors, then let the model generate the
# translated token ids.
inputs = tokenizer(sentence, return_tensors="pt")
translated = model.generate(**inputs)

# Turn each generated sequence back into text (special tokens removed) and
# print the resulting list.
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
print(tgt_text)


tokenizer_config.json:   0%|          | 0.00/382 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/808k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/834k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.83M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

['アップルが好き。']


# 上で試したモデルと比べると，以下の Facebook のモデル（mBART-50 many-to-many）が最も翻訳精度が高いと考えられる．

In [5]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the multilingual mBART-50 many-to-many checkpoint and its tokenizer.
model_name = 'facebook/mbart-large-50-many-to-many-mmt'
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# mBART-50 language codes for this translation direction.
src_lang = "en_XX"
tgt_lang = "ja_XX"

# Sentence to translate.
sentence = "I have a pen."

# A many-to-many model needs to know the source language. Set it explicitly
# instead of relying on whatever the tokenizer defaults to, so the cell stays
# correct if `sentence` / `src_lang` are changed to another language.
tokenizer.src_lang = src_lang

# Tokenize the sentence into PyTorch tensors the model can consume.
inputs = tokenizer(sentence, return_tensors="pt")

# Run the translation, forcing the first generated token to be the target
# language code so the model decodes into that language.
generated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
)
translated = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

print(translated)
