In [1]:
from gensim.models import doc2vec
import os
import sys
import logging
import gensim
import numpy as np
import pandas as pd

# Switch into ../src before importing the project-local `module`
# (presumably so it can be found by its bare name — confirm), then move back
# up one level. The relative "data" and "model/..." paths used further down
# are resolved against the directory this leaves the kernel in.
# NOTE(review): both chdir calls are relative to the current working
# directory, so running this cell a second time in the same kernel starts
# from a different directory — confirm it is only executed once.
os.chdir("../src")

from module import swap, read_docs

os.chdir("..")

def train(model_type, model_name):
    """
    Train vectors with gensim and save the resulting model.

    The actual training is delegated to a per-algorithm function; at present
    the only one wired in is ``d2v`` (doc2vec). Any other ``model_type`` just
    prints a notice and nothing is trained.

    model_type: name of the training algorithm; only "doc2vec" is handled.
    model_name: file name handed to ``d2v`` for saving the trained model.

    Side effect: configures the root logger at INFO level with a
    timestamped message format.

    Example:
        train("doc2vec", "doc2vec_tag_1.model")
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Number of training iterations forwarded to the trainer.
    n_iterations = 1

    # Guard clause: bail out early for anything that is not doc2vec.
    if model_type != "doc2vec":
        print("only doc2vec.")
        return

    d2v(model_name, n_iterations)

def d2v(model_name, iter_count):
    """
    Train a Doc2Vec model on the documents from ``read_docs`` and save it.

    The working directory is switched to ``data`` while the corpus is read
    and the model is trained. It is always restored to the directory that
    was current on entry, even if reading or training raises, so a failure
    no longer leaves the kernel stranded inside ``data``.

    model_name: file name the model is saved under, inside ``model/``
        (relative to the working directory on entry).
    iter_count: number of training iterations, passed to gensim as ``iter``.

    Returns None. The trained model is written to ``model/<model_name>``.
    """

    print("prepare data.")
    start_dir = os.getcwd()
    os.chdir("data")
    try:
        # Materialise the corpus once; it is consumed by both build_vocab and train.
        sentences = list(read_docs())

        print("train model.")
        # Fixing the seed is only meaningful with workers=1 (per the gensim documentation).
        # NOTE(review): `iter=` / `model.iter` are the older gensim names for the
        # epoch count; newer gensim releases call this `epochs` — confirm against
        # the installed gensim version before upgrading.
        model = doc2vec.Doc2Vec(min_count=1, seed=1, workers=1, iter=iter_count)
        model.build_vocab(sentences)
        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    finally:
        # Restore the caller's directory on success and on failure alike.
        os.chdir(start_dir)

    print("save model.")
    model.save("model/%s" % model_name)

def test(model_type, model_name):
    """
    Entry point for computing similarities between the files and writing
    out the results.

    Only the first step exists so far: the saved model is loaded through
    ``load_model``. No similarity is computed, nothing is written, and the
    function returns None.

    Example of the intended output file:
        data/csv/doc2vec_tag_1.csv

    model_type: algorithm name forwarded to ``load_model``.
    model_name: saved model file name forwarded to ``load_model``.
    """

    # Loaded but not used yet; the remaining steps are not implemented.
    loaded_model = load_model(model_type, model_name)


def load_model(model_type, model_name):
    """
    Load a previously saved model from the ``model/`` directory.

    model_type: algorithm name; only "doc2vec" is supported.
    model_name: file name of the saved model inside ``model/`` (resolved
        relative to the current working directory).

    Returns the model loaded by ``doc2vec.Doc2Vec.load``.

    Raises:
        ValueError: if ``model_type`` is not a supported algorithm.
    """

    print("load model.")
    if model_type != "doc2vec":
        # Without this check the function fell through to `return model` with
        # `model` never assigned, surfacing as an opaque UnboundLocalError.
        raise ValueError("unsupported model_type: %r (only doc2vec)" % (model_type,))

    return doc2vec.Doc2Vec.load("model/%s" % model_name)


def save_csv():
    """
    Placeholder, presumably for exporting results as CSV (not implemented).

    Takes no arguments, does nothing, and returns None.
    """
    return None

In [2]:
# Show the current working directory; the relative "model/..." and "data/..."
# paths used in this notebook are resolved against it.
os.getcwd()

'/mnt/Data/program/project/NLP/NLP_test'

In [3]:
def calc_sim_dv(dv1, dv2):
    """
    Return the similarity between two document vectors.

    Delegates to ``docvecs.similarity`` on the notebook-level ``model``
    global, so ``model`` must already be assigned when this is called.
    The value is returned, not printed.

    dv1: document tag (``calc`` passes integer positions)
    dv2: document tag (``calc`` passes integer positions)
    """
    doc_vectors = model.docvecs
    return doc_vectors.similarity(dv1, dv2)

In [4]:
def calc():
    """
    Build the pairwise similarity table for all document vectors in ``model``.

    Reads the notebook-level ``model`` global. As a side effect,
    ``model.docvecs.doctags`` is overwritten with the file names found in
    ``data/tmp_file``; those names label the rows and columns of the result.

    Returns a pandas DataFrame whose entry (i, j) is
    ``calc_sim_dv(i, j)`` for i, j in ``range(model.docvecs.count)``.
    """
    doc_vectors = model.docvecs

    # NOTE(review): os.listdir returns entries in arbitrary order; this assumes
    # the listing order matches the order of the trained vectors — confirm.
    doc_vectors.doctags = os.listdir("data/tmp_file")

    # Row i holds the similarity of vector i against every vector j.
    similarity_rows = [
        [calc_sim_dv(row, col) for col in range(doc_vectors.count)]
        for row in range(doc_vectors.count)
    ]

    df = pd.DataFrame(np.array(similarity_rows))
    df.index = doc_vectors.doctags
    df.columns = doc_vectors.doctags

    return df

In [6]:
# Load the saved Doc2Vec model into the notebook-level `model` global, which
# calc_sim_dv() and calc() read directly.
model_name = "doc2vec_tag_1.model"
model = load_model("doc2vec", model_name)

load model.


In [7]:
# Compute and display the pairwise document-similarity table.
calc()

Unnamed: 0,m:A_1.txt,m:A_2.txt,m:B_1.txt,m:B_2.txt,m:C_1.txt,m:C_2.txt,t:a_1.txt,t:a_2.txt,t:b_1.txt,t:b_2.txt,t:c_1.txt,t:c_2.txt
m:A_1.txt,1.0,0.032312,-0.196912,0.06734,0.073744,0.078322,0.223437,0.210962,0.032059,0.094102,0.157412,0.129969
m:A_2.txt,0.032312,1.0,0.231307,0.173392,0.341601,0.181009,0.185303,0.236213,0.166949,-0.031256,0.298171,0.132943
m:B_1.txt,-0.196912,0.231307,1.0,0.09018,0.245516,0.243318,0.283163,0.076652,0.043457,-0.058766,0.119267,-0.001595
m:B_2.txt,0.06734,0.173392,0.09018,1.0,0.123425,0.183956,0.083514,0.098446,0.278242,-0.067396,0.12339,-0.016442
m:C_1.txt,0.073744,0.341601,0.245516,0.123425,1.0,0.347821,0.304701,0.114815,0.329342,0.077163,0.357755,0.263068
m:C_2.txt,0.078322,0.181009,0.243318,0.183956,0.347821,1.0,0.33091,0.075793,0.179568,-0.118901,0.171477,0.30089
t:a_1.txt,0.223437,0.185303,0.283163,0.083514,0.304701,0.33091,1.0,0.003074,0.17374,0.064094,0.126919,0.243154
t:a_2.txt,0.210962,0.236213,0.076652,0.098446,0.114815,0.075793,0.003074,1.0,-0.025346,0.036253,0.335857,0.14879
t:b_1.txt,0.032059,0.166949,0.043457,0.278242,0.329342,0.179568,0.17374,-0.025346,1.0,0.003635,0.279328,0.026027
t:b_2.txt,0.094102,-0.031256,-0.058766,-0.067396,0.077163,-0.118901,0.064094,0.036253,0.003635,1.0,0.04396,0.219531


In [8]:
# Inspect the doctags; calc() replaces them with the file names listed from data/tmp_file.
model.docvecs.doctags

['m:A_1.txt',
 'm:A_2.txt',
 'm:B_1.txt',
 'm:B_2.txt',
 'm:C_1.txt',
 'm:C_2.txt',
 't:a_1.txt',
 't:a_2.txt',
 't:b_1.txt',
 't:b_2.txt',
 't:c_1.txt',
 't:c_2.txt']