<a href="https://colab.research.google.com/github/sasachichito/knowledge/blob/master/computer/%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86_%E7%BF%BB%E8%A8%B3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 依存関係

In [None]:
# transformersのバージョンは、staka/fugumt-en-jaが4.31以下を必要とする
!pip install transformers[ja]==4.31 argostranslate googletrans==4.0.0-rc1 deep-translator

from urllib.request import urlopen
from bs4 import BeautifulSoup

def webpage_to_text(url, selector):
  """Fetch ``url`` and return the text of every element matching ``selector``.

  The response body is decoded as UTF-8 (undecodable bytes are dropped),
  parsed with BeautifulSoup's ``html.parser`` and queried with
  ``soup.select(selector)``.  Each match contributes
  ``get_text(strip=True)`` and the pieces are concatenated with no
  separator between them.

  Args:
    url: Address handed to ``urlopen``.
    selector: CSS selector handed to ``BeautifulSoup.select``.

  Returns:
    The concatenated text; ``''`` when the selector matches nothing.
  """
  # Only the raw download needs the open response; parse after it is closed.
  with urlopen(url) as res:
    html = res.read().decode('UTF-8', 'ignore')

  soup = BeautifulSoup(html, 'html.parser')
  # join() builds the result in one pass instead of repeated `+=` copies.
  return ''.join(node.get_text(strip=True) for node in soup.select(selector))

def trim_last_halfway_sentence(text, period_char):
  """Drop a trailing incomplete sentence from ``text``.

  Args:
    text: The text to trim.
    period_char: Sentence terminator (e.g. ``'.'`` or ``'。'``).  It may be
      longer than one character; the whole terminator is kept.

  Returns:
    ``text`` unchanged when it already ends with ``period_char``; otherwise
    everything up to and including the last ``period_char``.  When
    ``period_char`` does not occur at all the result is ``''``, because the
    whole text counts as one unfinished sentence.
  """
  # Also covers an empty ``period_char`` (endswith('') is always True), which
  # rpartition() below would reject with ValueError.
  if text.endswith(period_char):
    return text

  # rpartition keeps the full separator, so multi-character terminators are
  # not cut in half; it yields ('', '', text) when the separator is absent.
  head, terminator, _ = text.rpartition(period_char)
  return head + terminator

# 英語→日本語

In [None]:
from transformers import pipeline
import argostranslate.package
import argostranslate.translate
from googletrans import Translator
from deep_translator import GoogleTranslator

def translate_text(text):
  """Translate English ``text`` into Japanese with several engines and print each result.

  The original text is printed first between banner lines.  Then, in order,
  each engine's label is printed followed by its translation:
  staka/fugumt-en-ja, deep_translator, googletrans,
  facebook/mbart-large-50-one-to-many-mmt, argos,
  Helsinki-NLP/opus-mt-en-jap and Helsinki-NLP/opus-tatoeba-en-ja.

  Args:
    text: The English text to translate.

  Returns:
    None; all results are written to stdout.
  """

  def announce(label):
    # Engine label framed by blank lines.
    print(f"\n>>>>> {label}\n")

  def run_hf_pipeline(model_name, **pipeline_options):
    # Build a transformers translation pipeline and print its first result.
    announce(model_name)
    translator = pipeline('translation', model=model_name, **pipeline_options)
    outputs = translator(text)
    translations = [entry['translation_text'] for entry in outputs]
    print(translations[0])

  print('==============origin================')
  print(text)
  print('====================================')

  run_hf_pipeline('staka/fugumt-en-ja')

  announce('deep_translator')
  deep_engine = GoogleTranslator(source='auto', target="ja")
  print(deep_engine.translate(text))

  announce('googletrans')
  goo_result = Translator().translate(text, src="en", dest="ja")
  print(goo_result.text)

  run_hf_pipeline(
      'facebook/mbart-large-50-one-to-many-mmt',
      src_lang='en_XX',
      tgt_lang='ja_XX',
  )

  announce('argos')
  from_code, to_code = "en", "ja"
  # Refresh the Argos package index, then download and install the en->ja
  # package before translating (this happens on every call).
  argostranslate.package.update_package_index()
  candidates = argostranslate.package.get_available_packages()
  package_to_install = next(
      pkg
      for pkg in candidates
      if pkg.from_code == from_code and pkg.to_code == to_code
  )
  argostranslate.package.install_from_path(package_to_install.download())
  print(argostranslate.translate.translate(text, from_code, to_code))

  run_hf_pipeline('Helsinki-NLP/opus-mt-en-jap')
  run_hf_pipeline('Helsinki-NLP/opus-tatoeba-en-ja')


# Pull the CNN article paragraphs, keep the first 800 characters, drop the
# unfinished trailing sentence, and run the English->Japanese comparison.
text = webpage_to_text(
  'https://edition.cnn.com/2024/05/13/politics/takeaways-michael-cohen-testimony-donald-trump-day-16/index.html',
  '.article__content p',
)
translate_text(
  trim_last_halfway_sentence(text[:800], '.'),
)

# 日本語→英語

In [None]:
from transformers import pipeline
import argostranslate.package
import argostranslate.translate
from googletrans import Translator
from deep_translator import GoogleTranslator

def translate_text(text):
  """Translate Japanese ``text`` into English with several engines and print each result.

  The original text is printed first between banner lines.  Then, in order,
  each engine's label is printed, followed by its English translation and a
  Japanese back-translation of that English produced by deep_translator:
  staka/fugumt-ja-en, deep_translator, googletrans,
  facebook/mbart-large-50-many-to-many-mmt, argos and
  Helsinki-NLP/opus-mt-ja-en.

  Args:
    text: The Japanese text to translate.

  Returns:
    None; all results are written to stdout.
  """

  def announce(label):
    # Engine label framed by blank lines.
    print(f"\n>>>>> {label}\n")

  def show_with_back_translation(english):
    # Print the English result, then its Japanese back-translation.
    print(english)
    print(back_translator.translate(english))

  def run_hf_pipeline(model_name, **pipeline_options):
    # Build a transformers translation pipeline and report its first result.
    announce(model_name)
    translator = pipeline('translation', model=model_name, **pipeline_options)
    outputs = translator(text)
    translations = [entry['translation_text'] for entry in outputs]
    show_with_back_translation(translations[0])

  print('==============origin================')
  print(text)
  print('====================================')

  # deep_translator is used to view each English result in Japanese.
  back_translator = GoogleTranslator(source='auto', target="ja")

  run_hf_pipeline('staka/fugumt-ja-en')

  announce('deep_translator')
  deep_engine = GoogleTranslator(source='auto', target="en")
  show_with_back_translation(deep_engine.translate(text))

  announce('googletrans')
  goo_result = Translator().translate(text, src="ja", dest="en")
  show_with_back_translation(goo_result.text)

  run_hf_pipeline(
      'facebook/mbart-large-50-many-to-many-mmt',
      src_lang='ja_XX',
      tgt_lang='en_XX',
  )

  announce('argos')
  from_code, to_code = "ja", "en"
  # Refresh the Argos package index, then download and install the ja->en
  # package before translating (this happens on every call).
  argostranslate.package.update_package_index()
  candidates = argostranslate.package.get_available_packages()
  package_to_install = next(
      pkg
      for pkg in candidates
      if pkg.from_code == from_code and pkg.to_code == to_code
  )
  argostranslate.package.install_from_path(package_to_install.download())
  show_with_back_translation(
      argostranslate.translate.translate(text, from_code, to_code)
  )

  run_hf_pipeline('Helsinki-NLP/opus-mt-ja-en')

# Pull the Nikkei xTECH article paragraphs, keep the first 800 characters,
# drop the unfinished trailing sentence, and run the Japanese->English comparison.
text = webpage_to_text(
  'https://xtech.nikkei.com/atcl/nxt/column/18/02828/050900001/',
  '.articleBody p',
)
translate_text(
  trim_last_halfway_sentence(text[:800], '。'),
)