In [1]:
from tqdm import tqdm
import urllib.request
import zipfile
from os.path import join, exists
import re
import joblib

In [2]:
def _strip_aozora_markup(text):
    """Reduce the raw contents of an Aozora Bunko text file to bare body text.

    Removes the leading header/legend sections, the trailing colophon, ruby
    readings, editorial annotations and every whitespace character.

    Raises:
        IndexError: if the text does not contain at least two dashed rules.
    """
    # The body is the third segment when the file is cut on runs of 5+ hyphens
    # (the segments before it are presumably the title header and the notation
    # legend). Raw string: '\-' is an invalid escape in a normal literal.
    text = re.split(r'-{5,}', text)[2]
    # Everything from the colophon marker onwards is dropped.
    text = re.split('底本：', text)[0]
    # Ruby base-text marker. Aozora notation uses the full-width bar; the ASCII
    # bar is still removed so previous behaviour is retained.
    text = text.replace('|', '').replace('｜', '')
    # Ruby readings, e.g. 《...》.
    text = re.sub('《.+?》', '', text)
    # Editorial annotations, e.g. ［＃...］.
    text = re.sub('［＃.+?］', '', text)
    # str.split() with no argument cuts on any whitespace (newlines, carriage
    # returns, full-width spaces), so no separate newline clean-up is needed.
    return "".join(text.split())


def load_aozora(url='https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip',
                download_dir='/tmp'):
    """Download an Aozora Bunko zip (cached on disk) and return its sentences.

    The default URL points at Osamu Dazai's "Hashire Merosu":
    https://www.aozora.gr.jp/cards/000035/card1567.html
    Clean-up steps follow:
    https://qiita.com/makaishi2/items/63b7986f6da93dc55edd?utm_source=pocket_mylist#step1

    Args:
        url: Location of the zip archive. The last path component is used as
            the cache file name.
        download_dir: Directory holding the cached archive. The download is
            skipped when the file is already there.

    Returns:
        list[str]: The body text split on "。", with the delimiter removed and
        empty fragments discarded.

    Raises:
        ValueError: if the archive contains no ``.txt`` member.
        IndexError: if the text lacks the expected dashed section rules.
    """
    download_filepath = join(download_dir, url.rsplit('/', 1)[-1])

    if not exists(download_filepath):
        urllib.request.urlretrieve(url, download_filepath)

    with zipfile.ZipFile(download_filepath, 'r') as archive:
        # Only decode the text member; other members (if any) are not Shift_JIS text.
        text_members = [name for name in archive.namelist()
                        if name.lower().endswith('.txt')]
        if not text_members:
            raise ValueError('no .txt member found in ' + download_filepath)
        # Read straight from the archive instead of extracting to disk first.
        text = archive.read(text_members[0]).decode('sjis')

    body = _strip_aozora_markup(text)
    # A body ending in "。" would otherwise yield an empty trailing fragment.
    return [sentence for sentence in body.split("。") if sentence]

In [3]:
# Fetch the novel (reusing the cached zip when present) and split it into "。"-delimited fragments.
texts = load_aozora()

In [4]:
# Peek at the first three fragments to confirm header and ruby markup were stripped.
texts[:3]

['メロスは激怒した', '必ず、かの邪智暴虐の王を除かなければならぬと決意した', 'メロスには政治がわからぬ']

In [5]:
# Number of fragments that will be fed to the parser below.
len(texts)

460

## KNP

In [6]:
from pyknp import KNP

In [7]:
# Module-level parser instance; analyze_knp() reads this global rather than creating its own.
knp = KNP()


In [8]:
def analyze_knp(text: str):
    """Parse ``text`` with the module-level ``knp`` and return its dependency links.

    Args:
        text: The string handed to ``knp.parse``.

    Returns:
        dict: Maps the ``midasi`` of each entry in ``bnst_list()`` that has a
        parent to the ``midasi`` of that parent. Entries without a parent are
        left out. Because the keys are ``midasi`` strings, a later entry with
        the same ``midasi`` replaces an earlier one.
    """
    parsed = knp.parse(text)
    # Pair every chunk with its head once, then keep only the attached ones.
    links = ((chunk, chunk.parent) for chunk in parsed.bnst_list())
    return {chunk.midasi: head.midasi for chunk, head in links if head is not None}

In [9]:
# Smoke test on the first fragment: shows the child -> parent mapping for one sentence.
analyze_knp(texts[0])

{'メロスは': '激怒した'}

In [10]:
%%time
# Parse every fragment in order; tqdm wraps the iterable to show progress.
result = [ analyze_knp(t) for t in tqdm(texts) ]

 25%|██▍       | 114/460 [00:16<00:38,  8.98it/s];; Invalid morpheme ID: kata(28) kei(11)
 41%|████      | 187/460 [00:30<00:53,  5.14it/s];; Invalid morpheme ID: kata(28) kei(11)
 67%|██████▋   | 307/460 [00:49<00:16,  9.11it/s];; Invalid morpheme ID: kata(28) kei(11)
100%|██████████| 460/460 [01:08<00:00,  6.69it/s]

CPU times: user 1.07 s, sys: 151 ms, total: 1.22 s
Wall time: 1min 8s



