<a href="https://colab.research.google.com/github/sasachichito/knowledge/blob/master/computer/%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86_%E8%A6%81%E7%B4%84_%E6%8A%BD%E5%87%BA%E5%9E%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[ja]==4.25 deep-translator spacy==3.4.4 ginza==5.2.0 ja_ginza==5.2.0 sumy tinysegmenter
!python -m spacy download ja_core_news_trf
!python -m spacy download ja_core_news_lg

from urllib.request import urlopen
from bs4 import BeautifulSoup

def webpage_to_text(url, selector):
  """Fetch a web page and return the text of the elements matching a CSS selector.

  Args:
    url: Address of the page to download.
    selector: CSS selector handed to BeautifulSoup's ``select``.

  Returns:
    The whitespace-stripped text of every matched element, concatenated
    in document order with no separator. Empty string when nothing matches.
  """
  with urlopen(url) as response:
    # Bytes that are not valid UTF-8 are dropped instead of raising.
    markup = response.read().decode('UTF-8', 'ignore')

  matched_nodes = BeautifulSoup(markup, 'html.parser').select(selector)
  return ''.join(node.get_text(strip=True) for node in matched_nodes)

def trim_last_halfway_sentence(text, period_char):
  """Drop an unfinished trailing sentence from ``text``.

  Args:
    text: The (possibly truncated) text to clean up.
    period_char: The sentence terminator. It may be longer than one
      character; the whole terminator is kept in the result.

  Returns:
    ``text`` unchanged when it already ends with ``period_char``.
    Otherwise the prefix of ``text`` up to and including the last
    occurrence of ``period_char``. When the terminator does not occur at
    all there is no complete sentence, so the empty string is returned.
  """
  if text.endswith(period_char):
    return text

  last_period_index = text.rfind(period_char)
  if last_period_index == -1:
    # No complete sentence is present; everything is a partial sentence.
    return ''

  # Advance by the terminator's full length so a multi-character
  # terminator is not cut in half.
  return text[:last_period_index + len(period_char)]

# 抽出型要約

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer as SumyTokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.utils import get_stop_words
import spacy

# Language name handed to the sumy tokenizers and the stop-word lookup below.
LANGUAGE="japanese"

# Collect the text of every element matching '.articleBody p' on the article page.
text = webpage_to_text('https://xtech.nikkei.com/atcl/nxt/column/18/02252/051400006/', '.articleBody p')
# Keep only the first 800 characters, then drop whatever follows the last '。'
# so the sample does not end in the middle of a sentence.
text = trim_last_halfway_sentence(text[:800], '。')

# Spacy ja_core_news_trf
class SpacyJcntTokenizer(SumyTokenizer):
  """sumy tokenizer that splits words with spaCy's ``ja_core_news_trf`` pipeline.

  Only ``to_words`` is overridden here; everything else comes from the
  sumy base tokenizer.
  """

  # Loaded when the class body executes and shared by every instance.
  nlp = spacy.load("ja_core_news_trf")

  def to_words(self, text):
    """Return the surface text of each token the pipeline yields for ``text``."""
    return [tok.text for tok in self.nlp(text)]

# Spacy GiNZA
class SpacyGinzaTokenizer(SumyTokenizer):
  """sumy tokenizer that splits words with the GiNZA (``ja_ginza``) spaCy pipeline.

  Only ``to_words`` is overridden here; everything else comes from the
  sumy base tokenizer.
  """

  # Loaded when the class body executes and shared by every instance.
  nlp = spacy.load("ja_ginza")

  def to_words(self, text):
    """Return the surface text of each token the pipeline yields for ``text``."""
    return [tok.text for tok in self.nlp(text)]

# Parse the same text three times, once per tokenizer under comparison.
parser_s = PlaintextParser.from_string(text, SumyTokenizer(LANGUAGE))
parser_sj = PlaintextParser.from_string(text, SpacyJcntTokenizer(LANGUAGE))
parser_sg = PlaintextParser.from_string(text, SpacyGinzaTokenizer(LANGUAGE))

# Summarization algorithms to compare, keyed by the label printed for each.
summarizers = {
    "LexRank": LexRankSummarizer(),
    "LSA": LsaSummarizer(),
    "Luhn": LuhnSummarizer(),
    "SumBasic": SumBasicSummarizer(),
    "KL-Sum": KLSummarizer(),
    "Edmundson": EdmundsonSummarizer(),
}

# Every summarizer gets the same stop-word list for the chosen language.
stop_words = get_stop_words(LANGUAGE)
for summarizer in summarizers.values():
    summarizer.stop_words = stop_words

# Edmundson additionally gets its three word sets assigned; the values here
# are placeholders containing only the empty string.
# NOTE(review): presumably the summarizer refuses to run without them — confirm.
edmundson = summarizers["Edmundson"]
edmundson.bonus_words = frozenset([""])
edmundson.stigma_words = frozenset([""])
edmundson.null_words = frozenset([""])

# Number of sentences requested from each summarizer.
summary_sentences = 3

# Heading printed before each tokenizer's result, paired with its parser.
tokenizer_runs = (
    (">>>>> sumy tokenizer (default):", parser_s),
    (">>>>> ja_core_news_trf tokenizer:", parser_sj),
    (">>>>> ginza tokenizer:", parser_sg),
)

for name, summarizer in summarizers.items():
    print(f"{name} algorithm:")
    for heading, parser in tokenizer_runs:
        print("\n" + heading + "\n")
        summary = summarizer(parser.document, summary_sentences)
        for sentence in summary:
            print(sentence)
    print("\n" + "-"*50 + "\n")