In [2]:
import urllib.request as req
import zipfile
import os.path
import MeCab
from gensim import models

# Initialize MeCab, the tagger used by split_words for tokenization.
mecab = MeCab.Tagger()
# NOTE(review): parsing an empty string right after creating the Tagger is
# presumably the common workaround for parseToNode returning broken surface
# strings in some MeCab Python bindings — confirm it is still required.
mecab.parse('')

# Directory in which read_book caches the downloaded zip archives.
save_dir = 'book'

# Load the Doc2Vec model saved at ./aozora.model.
model = models.Doc2Vec.load('./aozora.model')

def read_book(url, zipname):
    """Return the text of a book stored in a zip archive, downloading it if needed.

    The archive is cached as ``save_dir/zipname`` and is fetched from ``url``
    only when that file does not exist yet.

    Args:
        url: Location of the zip archive to download.
        zipname: File name of the cached archive inside ``save_dir``.

    Returns:
        The Shift_JIS-decoded contents of the first ``.txt`` member of the
        archive, or of the first member when no ``.txt`` member exists.

    Raises:
        ValueError: If the archive contains no members.
    """
    save_path = os.path.join(save_dir, zipname)
    if not os.path.exists(save_path):
        # urlretrieve does not create missing directories, so make sure the
        # cache directory exists before the first download.
        parent = os.path.dirname(save_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        print(save_path)
        req.urlretrieve(url, save_path)

    with zipfile.ZipFile(save_path, 'r') as zf:
        names = zf.namelist()
        if not names:
            raise ValueError(f'{save_path} contains no files')
        # Archives may bundle other files (e.g. images) next to the text;
        # prefer the text file instead of blindly taking the first member.
        text_names = [name for name in names if name.lower().endswith('.txt')]
        target = text_names[0] if text_names else names[0]
        return zf.read(target).decode('shift-jis')

def split_words(text):
    """Tokenize ``text`` with MeCab and return the content words as a list.

    Only three parts of speech are kept, decided by the first field of each
    node's comma-separated feature string:

    * nouns (名詞) are kept as their surface form;
    * verbs (動詞) and adjectives (形容詞) are replaced by the seventh
      feature field (presumably the dictionary/base form in an IPADIC-style
      dictionary — confirm against the dictionary in use).

    All other nodes are dropped.
    """
    tokens = []
    current = mecab.parseToNode(text)
    while current is not None:
        features = current.feature.split(',')
        pos = features[0]
        if pos == '名詞':
            tokens.append(current.surface)
        elif pos in ('動詞', '形容詞'):
            tokens.append(features[6])
        current = current.next
    return tokens

def similar(title, url):
    """Print the three documents in the model most similar to the book at ``url``.

    The book is read through ``read_book`` (cached under the last path
    component of ``url``), tokenized with ``split_words``, and turned into a
    vector with the loaded Doc2Vec model. ``title`` is only used in the
    printed heading. Nothing is returned.
    """
    # The cached archive is named after the last path component of the URL.
    archive_name = url.rpartition('/')[2]

    book_text = read_book(url, archive_name)
    doc_vector = model.infer_vector(split_words(book_text))

    print(f'{title}と似た作品は？')
    nearest = model.docvecs.most_similar([doc_vector], topn=3)
    print(nearest)

# (title, archive URL) pairs to look up, processed in this order.
for book_title, book_url in (
    (
        '宮沢　賢治: よだかの星',
        'https://www.aozora.gr.jp/cards/000081/files/473_ruby_467.zip',
    ),
    (
        '芥川　竜之介: 犬と笛',
        'https://www.aozora.gr.jp/cards/000879/files/56_ruby_845.zip',
    ),
    (
        '太宰　治: 純真',
        'https://www.aozora.gr.jp/cards/000035/files/46599_ruby_24668.zip',
    ),
    (
        '夏目　漱石: 一夜',
        'https://www.aozora.gr.jp/cards/000148/files/1086_ruby_5742.zip',
    ),
):
    similar(book_title, book_url)


book/473_ruby_467.zip
宮沢　賢治: よだかの星と似た作品は？
[('芥川\u3000竜之介:老年', 0.9995905756950378), ('芥川\u3000竜之介:羅生門', 0.9995163083076477), ('宮沢\u3000賢治:どんぐりと山猫', 0.9991496801376343)]
book/56_ruby_845.zip
芥川　竜之介: 犬と笛と似た作品は？
[('夏目\u3000漱石:吾輩は猫である', 0.9991558790206909), ('太宰\u3000治:津軽', 0.9983313083648682), ('芥川\u3000竜之介:老年', 0.9983235001564026)]
book/46599_ruby_24668.zip
太宰　治: 純真と似た作品は？
[('芥川\u3000竜之介:羅生門', 0.9993751645088196), ('芥川\u3000竜之介:老年', 0.9990769624710083), ('宮沢\u3000賢治:どんぐりと山猫', 0.9989293217658997)]
book/1086_ruby_5742.zip
夏目　漱石: 一夜と似た作品は？
[('夏目\u3000漱石:虞美人草', 0.9982724189758301), ('夏目\u3000漱石:草枕', 0.9965048432350159), ('宮沢\u3000賢治:注文の多い料理店', 0.9923967123031616)]
