In [6]:
import zipfile
import os.path
import urllib.request as req
import MeCab
from gensim import models
from gensim.models.doc2vec import TaggedDocument

# Initialize MeCab: one shared tagger instance is reused by split_words().
mecab = MeCab.Tagger()
# NOTE(review): presumably the usual warm-up call that works around empty or
# garbled node.surface values in some mecab-python3 versions -- confirm before
# removing.
mecab.parse('')

# Directory where the downloaded zip archives are cached (see read_book()).
save_dir = 'book'

# Aozora Bunko catalogue: one entry per author, each holding the base download
# URL for that author's files and the books (title + zip file name) to fetch.
# Deliberately NOT named `list`, so the builtin stays usable in the rest of the
# notebook.
AOZORA_BOOKS = [
    {
        'author': {
            'name': '宮沢　賢治',
            'url': 'https://www.aozora.gr.jp/cards/000081/files/'
        },
        'book': [
            {'name': '銀河鉄道の夜', 'zipname': '43737_ruby_19028.zip'},
            {'name': '注文の多い料理店', 'zipname': '1927_ruby_17835.zip'},
            {'name': 'セロ弾きのゴーシュ', 'zipname': '470_ruby_3987.zip'},
            {'name': 'やまなし', 'zipname': '46605_ruby_29758.zip'},
            {'name': 'どんぐりと山猫', 'zipname': '43752_ruby_17595.zip'},
        ]
    },
    {
        'author': {
            'name': '芥川　竜之介',
            'url': 'https://www.aozora.gr.jp/cards/000879/files/'
        },
        'book': [
            {'name': '羅生門', 'zipname': '127_ruby_150.zip'},
            {'name': '鼻', 'zipname': '42_ruby_154.zip'},
            {'name': '河童', 'zipname': '69_ruby_1321.zip'},
            {'name': '歯車', 'zipname': '42377_ruby_34744.zip'},
            {'name': '老年', 'zipname': '131_ruby_241.zip'},
        ]
    },
    {
        'author': {
            'name': '太宰　治',
            'url': 'https://www.aozora.gr.jp/cards/000035/files/'
        },
        'book': [
            {'name': '斜陽', 'zipname': '1565_ruby_8220.zip'},
            {'name': '走れメロス', 'zipname': '1567_ruby_4948.zip'},
            {'name': '津軽', 'zipname': '2282_ruby_1996.zip'},
            {'name': 'お伽草紙', 'zipname': '307_ruby_3042.zip'},
            {'name': '人間失格', 'zipname': '301_ruby_5915.zip'},
        ]
    },
    {
        'author': {
            'name': '夏目　漱石',
            'url': 'https://www.aozora.gr.jp/cards/000148/files/'
        },
        'book': [
            {'name': '吾輩は猫である', 'zipname': '789_ruby_5639.zip'},
            {'name': '坊ちゃん', 'zipname': '752_ruby_2438.zip'},
            {'name': '草枕', 'zipname': '776_ruby_6020.zip'},
            {'name': '虞美人草', 'zipname': '761_ruby_1861.zip'},
            {'name': '三四郎', 'zipname': '794_ruby_4237.zip'},
        ]
    },
]


def book_list():
    """Yield an ``(author, book)`` pair for every book in the catalogue.

    Yields:
        tuple: ``(author, book)`` where ``author`` is the author mapping
        (keys ``'name'`` and ``'url'``) and ``book`` is one of that author's
        book mappings (keys ``'name'`` and ``'zipname'``). Pairs come out in
        catalogue order, author by author.
    """
    for novel in AOZORA_BOOKS:
        author = novel['author']
        for book in novel['book']:
            yield author, book

def read_book(author, book, directory=None):
    """Return the text of one book, downloading its zip archive if needed.

    Args:
        author: Mapping with a ``'url'`` key; the zip file name is appended
            to it to form the download URL.
        book: Mapping with a ``'zipname'`` key naming the archive.
        directory: Folder used as the download cache. Defaults to the
            module-level ``save_dir``.

    Returns:
        str: The Shift_JIS-decoded contents of the first ``.txt`` member of
        the archive, or of the first member when no ``.txt`` member exists.

    Raises:
        ValueError: If the archive contains no members at all.
    """
    if directory is None:
        directory = save_dir
    zipname = book['zipname']
    save_path = os.path.join(directory, zipname)
    if not os.path.exists(save_path):
        # The cache folder may not exist on a fresh checkout; urlretrieve
        # does not create missing parent directories.
        if directory:
            os.makedirs(directory, exist_ok=True)
        print(save_path)
        req.urlretrieve(author['url'] + zipname, save_path)

    with zipfile.ZipFile(save_path, 'r') as zf:
        names = zf.namelist()
        # Prefer the text file: an archive may also carry other members
        # (e.g. images) that must not be decoded as the book body.
        text_names = [n for n in names if n.lower().endswith('.txt')]
        candidates = text_names or names
        if not candidates:
            raise ValueError('no files found in archive: ' + save_path)
        return zf.read(candidates[0]).decode('shift-jis')

def split_words(text):
    """Tokenize *text* with MeCab and return its content words.

    Nouns are kept in their surface form. Verbs and adjectives are replaced
    by the value of feature field 6; when that field is missing, empty or
    ``'*'`` the surface form is used instead. Every other part of speech is
    dropped.

    NOTE(review): reading field 6 as the base form assumes an IPAdic-style
    feature layout -- confirm against the dictionary actually installed.

    Args:
        text: The string to analyse.

    Returns:
        list: The selected words, in the order they occur in *text*.
    """
    node = mecab.parseToNode(text)
    wakati_words = []
    while node is not None:
        # Split once per node instead of once per field lookup.
        features = node.feature.split(',')
        hinshi = features[0]
        if hinshi == '名詞':
            wakati_words.append(node.surface)
        elif hinshi in ('動詞', '形容詞'):
            base = features[6] if len(features) > 6 else '*'
            # Presumably '*' marks "no value" (e.g. for unknown words); never
            # emit it as a token.
            wakati_words.append(node.surface if base in ('', '*') else base)
        node = node.next
    return wakati_words

# Turn every novel into a TaggedDocument so that Doc2Vec can consume it.
# Each document is tagged "<author name>:<book title>".
doc_list = []
for author, book in book_list():
    tokens = split_words(read_book(author, book))
    tag = f"{author['name']}:{book['name']}"
    doc_list.append(TaggedDocument(tokens, [tag]))

# Train the model on the tagged documents.
model = models.Doc2Vec(
    doc_list,
    dm=1,
    vector_size=300,
    window=5,
    min_count=1,
)

# Save the trained model to disk.
model.save('aozora.model')

print('モデル作成完了')


book/42377_ruby_34744.zip
book/131_ruby_241.zip
book/1565_ruby_8220.zip
book/1567_ruby_4948.zip
book/2282_ruby_1996.zip
book/307_ruby_3042.zip
book/301_ruby_5915.zip
book/789_ruby_5639.zip
book/752_ruby_2438.zip
book/776_ruby_6020.zip
book/761_ruby_1861.zip
book/794_ruby_4237.zip
モデル作成完了
