In [1]:
import os
import glob
import re
import numpy as np

## 前処理

In [2]:
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter

# Notebook-wide janome pipeline, built once and reused for every file.
t = Tokenizer()
# Keep only tokens whose part of speech is '名詞' (noun).
token_filters = [POSKeepFilter(['名詞'])]
# Empty first argument: no character filters are applied before tokenisation
# (assumes janome's Analyzer(char_filters, tokenizer, token_filters) order — confirm for the installed version).
a = Analyzer([], t, token_filters)

In [3]:
# Read a file, strip noise, and return its tokens as a list (wakati-gaki).
def wakatigaki(file_path, analyzer=None):
    """Return the token surfaces of one text file after noise removal.

    ASCII letters and digits, the punctuation characters ``: ; / + . -`` and
    all whitespace are deleted from the text before it is analysed.

    Args:
        file_path: Path of a UTF-8 encoded text file.
        analyzer: Object with an ``analyze(text)`` method yielding tokens that
            expose a ``surface`` attribute. Defaults to the notebook-level
            analyzer ``a``.

    Returns:
        list: ``token.surface`` for every token the analyzer yields, in order.
    """
    if analyzer is None:
        analyzer = a

    # Only the read needs the file handle; release it before the analysis runs.
    with open(file_path, mode="r", encoding="utf-8") as f:
        text = f.read()

    # Remove noise
    text = re.sub(r'[0-9a-zA-Z]+', '', text)
    text = re.sub(r'[:;/+\.-]', '', text)
    text = re.sub(r'\s', '', text)  # \s already covers \n

    # Morphological analysis -> list of token surfaces
    return [token.surface for token in analyzer.analyze(text)]

In [4]:
# Return the label and the token list of each category (for training data)
def label_wakatigaki_mapper_for_train(dir_list, tokenize=None):
    """Build one ``{'label', 'wakatigaki'}`` record per category directory.

    The label is the last path component of the directory (with or without a
    trailing separator). The tokens of every ``*.txt`` file in the directory,
    except files whose name contains ``LICENSE``, are concatenated into a
    single list. Files are visited in sorted order so the result does not
    depend on the platform's directory listing order.

    Args:
        dir_list: Iterable of category directory paths.
        tokenize: Callable mapping a file path to a list of tokens. Defaults
            to the notebook's ``wakatigaki``.

    Returns:
        list of dict: one ``{'label': [name], 'wakatigaki': [tokens...]}`` per
        directory, in the order of ``dir_list``.
    """
    if tokenize is None:
        tokenize = wakatigaki

    res = []
    for dir_name in dir_list:
        print("処理中:{}".format(dir_name))
        # normpath drops a trailing separator so basename yields the category name.
        label = os.path.basename(os.path.normpath(dir_name))
        words = []
        for file_path in sorted(glob.glob(os.path.join(dir_name, "*.txt"))):
            # Test the file name only, so a parent directory containing
            # "LICENSE" does not silently exclude every file.
            if "LICENSE" in os.path.basename(file_path):
                continue
            words.extend(tokenize(file_path))
        res.append({'label': [label], 'wakatigaki': words})
    print("完了")
    return res

In [5]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [6]:
# One TaggedDocument per training category: the category's concatenated tokens
# become `words`, and its one-element label list becomes `tags`.
tagged_document = list(
    map(
        lambda entry: TaggedDocument(words=entry['wakatigaki'], tags=entry['label']),
        label_wakatigaki_mapper_for_train(glob.glob("./train-text/*/")),
    )
)

処理中:./train-text/dokujo-tsushin/
処理中:./train-text/it-life-hack/
処理中:./train-text/kaden-channel/
処理中:./train-text/livedoor-homme/
処理中:./train-text/movie-enter/
処理中:./train-text/peachy/
処理中:./train-text/smax/
処理中:./train-text/sports-watch/
処理中:./train-text/topic-news/
完了


## モデルの作成と学習

In [7]:
# Train a Doc2Vec model on the tagged training documents.
# Per the gensim API: dm=1 selects the distributed-memory (PV-DM) variant, vector_size is the
# embedding dimensionality, window the context size, and min_count=1 keeps every word.
# NOTE(review): epochs and worker count are left at the library defaults — confirm they suit a
# corpus that holds only one TaggedDocument per category.
model = Doc2Vec(tagged_document, dm=1, vector_size=300, window=5, min_count=1)

In [8]:
# Persist the trained model, then reload it so the following cells run against
# exactly what is stored on disk.
# Create the target directory first: on a fresh checkout ./model may not exist
# and the save would fail.
os.makedirs('./model', exist_ok=True)
model.save('./model/doc2vec.model')
model = Doc2Vec.load('./model/doc2vec.model')

In [9]:
# TODO: Duplicates the training-data mapper; refactor. If the training texts were first
#       merged into a single file per category, this function could be used for training too.
# Return the label and the token list of each file (for test data)
def label_wakatigaki_mapper_for_test(dir_list, tokenize=None):
    """Build one ``{'label', 'wakatigaki'}`` record per test file.

    The label is the last path component of the file's directory (with or
    without a trailing separator). Every ``*.txt`` file in each directory,
    except files whose name contains ``LICENSE``, yields its own record.
    Files are visited in sorted order so the result does not depend on the
    platform's directory listing order.

    Args:
        dir_list: Iterable of category directory paths.
        tokenize: Callable mapping a file path to a list of tokens. Defaults
            to the notebook's ``wakatigaki``.

    Returns:
        list of dict: one ``{'label': [name], 'wakatigaki': [tokens...]}`` per
        file, grouped by directory in the order of ``dir_list``.
    """
    if tokenize is None:
        tokenize = wakatigaki

    res = []
    for dir_name in dir_list:
        print("処理中:{}".format(dir_name))
        # The label depends only on the directory, so compute it once per directory.
        # normpath drops a trailing separator so basename yields the category name.
        label = os.path.basename(os.path.normpath(dir_name))
        for file_path in sorted(glob.glob(os.path.join(dir_name, "*.txt"))):
            # Test the file name only, so a parent directory containing
            # "LICENSE" does not silently exclude every file.
            if "LICENSE" in os.path.basename(file_path):
                continue
            res.append({
                'label': [label],
                'wakatigaki': list(tokenize(file_path)),
            })
    print("完了")
    return res

In [10]:
# Tokenise every file under ./test-text/<category>/. Despite its name, `test_dict` is a
# list holding one {'label', 'wakatigaki'} dict per test file.
test_dict = label_wakatigaki_mapper_for_test(glob.glob('./test-text/*/'))

処理中:./test-text/dokujo-tsushin/
処理中:./test-text/it-life-hack/
処理中:./test-text/kaden-channel/
処理中:./test-text/livedoor-homme/
処理中:./test-text/movie-enter/
処理中:./test-text/peachy/
処理中:./test-text/smax/
処理中:./test-text/sports-watch/
処理中:./test-text/topic-news/
完了


## 予測と評価

In [11]:
# Classify every test document: infer a vector from its tokens, look up the single
# most similar training tag, and record whether it matches the true label.
res_list = []
for doc in test_dict:
    true_label = doc['label'][0]  # label
    inferred_vector = model.infer_vector(doc['wakatigaki'])
    top_hit = model.docvecs.most_similar([inferred_vector], topn=1)[0]
    predicted_label = top_hit[0]  # prediction
    res_list.append({
        'label': true_label,
        'prediction': predicted_label,
        'flg': (predicted_label == true_label),  # correct or not
    })

全体の精度

In [12]:
# Overall accuracy: fraction of test documents whose prediction matched the label.
np.mean([elem['flg'] for elem in res_list])

0.41140529531568226

カテゴリ別精度

In [13]:
# Category names are the sub-directory names of ./train-text. normpath/basename handle
# trailing and platform-specific separators; sorting keeps the report order stable.
cat_list = sorted(
    os.path.basename(os.path.normpath(dir_name))
    for dir_name in glob.glob("./train-text/*/")
)
for cat_name in cat_list:
    # Correctness flags of the test documents whose true label is this category.
    flags = [elem['flg'] for elem in res_list if elem['label'] == cat_name]
    # A category without any test document has no defined accuracy; report nan
    # instead of failing with a division by zero.
    accuracy = sum(flags) / len(flags) if flags else float('nan')
    print("{}:{}".format(cat_name, accuracy))

dokujo-tsushin:0.7413793103448276
it-life-hack:0.1781609195402299
kaden-channel:0.3583815028901734
livedoor-homme:0.0196078431372549
movie-enter:0.4540229885057471
peachy:0.13690476190476192
smax:0.6896551724137931
sports-watch:0.55
topic-news:0.3961038961038961
