In [1]:
from gensim.models import word2vec, fasttext, doc2vec, TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
import logging
import os
import sys
import gensim
import smart_open
import numpy as np
import pandas as pd

In [2]:
# Make the parent of the current working directory the project root and keep
# its absolute path in `home_dir` so later cells can return to it.
# The guard keeps this cell idempotent: when `home_dir` already exists (the cell
# is being re-run) we simply go back to the recorded root instead of climbing
# one more directory level and overwriting `home_dir` with the wrong path.
if "home_dir" in globals():
    os.chdir(home_dir)
else:
    os.chdir("..")
    home_dir = os.getcwd()

In [3]:
# Import the project helpers that live in src/ by temporarily making src/ the
# working directory.
# NOTE(review): this relies on the current directory being on the import path
# of the kernel — confirm.
# The absolute path makes the cell work from any current directory, and the
# try/finally guarantees we return to the project root even when an import
# fails, so a retry does not start from inside src/.
os.chdir(os.path.join(home_dir, "src"))
try:
    from module import set_data, bow_read_docs
    from train import bow, tfidf
finally:
    os.chdir(home_dir)

In [4]:
# Enter the data directory through an absolute path so the cell can be re-run
# safely (a relative chdir("data") raises once we are already inside data/).
os.chdir(os.path.join(home_dir, "data"))
# NOTE(review): set_data() comes from src/module; presumably it prepares the
# files under data/ that the following cells read — confirm.
set_data()

## Bag-of-Words

In [5]:
def vec2dense(vec, num_terms):
    """Expand one sparse document vector into a dense list of length ``num_terms``.

    Args:
        vec: a single sparse document; it is handed to ``corpus2dense`` as a
            one-document corpus.
        num_terms: number of terms requested from ``corpus2dense``, i.e. the
            length of the returned list.

    Returns:
        list: the values of the single document column of the dense matrix.
    """
    dense = corpus2dense([vec], num_terms=num_terms)
    # Documents are laid out as columns; there is exactly one, so take it.
    return list(dense[:, 0])

In [6]:
# Read the documents (presumably lists of tokens — confirm against
# bow_read_docs) and build the vocabulary from them.
corpus = bow_read_docs("tmp_file")
dic = Dictionary(corpus)

# Build the bag-of-words matrix: one dense row of counts per document.
bow_matrix = np.array([
    vec2dense(dic.doc2bow(tokens), len(dic))
    for tokens in corpus
])

In [7]:
# Vocabulary size: number of entries in the dictionary
print("総単語数: ", len(dic))
# Number of documents recorded by the dictionary
print("総文書数: ", dic.num_docs)

総単語数:  690
総文書数:  12


In [8]:
# Wrap the count matrix in a labelled DataFrame: one row per file, one column
# per token.
# NOTE(review): rows are labelled in os.listdir() order, which is assumed to be
# the order in which bow_read_docs() read the files — confirm against src/module.
file_names = os.listdir("tmp_file")
# Column j of the matrix holds the values for token id j, so label the columns
# by walking the ids explicitly instead of relying on token2id's key order
# (which only matches the id order for some ways of building a Dictionary).
bow_df = pd.DataFrame(
    bow_matrix,
    index=file_names,
    columns=[dic[token_id] for token_id in range(len(dic))],
)

## tf-idf

In [9]:
# Re-read the documents and rebuild the vocabulary for the TF-IDF section.
corpus = bow_read_docs("tmp_file")
dic = Dictionary(corpus)

# Sparse bag-of-words vector for every document.
bow_corpus = list(map(dic.doc2bow, corpus))

# Fit TF-IDF on the corpus and apply it to the same corpus straight away.
# Note: `model` ends up bound to the TF-IDF-transformed corpus, not to the
# TfidfModel itself; the following cell iterates over it.
model = TfidfModel(bow_corpus)[bow_corpus]

In [10]:
# Build the TF-IDF matrix: one dense row of weights per document.
tfidf_data = np.array([
    vec2dense(weighted_doc, len(dic))
    for weighted_doc in model
])

In [11]:
# Wrap the TF-IDF matrix in a labelled DataFrame: one row per file, one column
# per token.
# NOTE(review): rows are labelled in os.listdir() order, which is assumed to be
# the order in which bow_read_docs() read the files — confirm against src/module.
file_names = os.listdir("tmp_file")
# Column j of the matrix holds the weights for token id j, so label the columns
# by walking the ids explicitly instead of relying on token2id's key order
# (which only matches the id order for some ways of building a Dictionary).
tfidf_df = pd.DataFrame(
    tfidf_data,
    index=file_names,
    columns=[dic[token_id] for token_id in range(len(dic))],
)

## bowとtfidfの比較

In [12]:
# Preview the first rows of the bag-of-words count table.
bow_df.head()

Unnamed: 0,*,ある,いる,こと,する,そう,それ,それぞれ,それら,なる,...,保守,力不足,北米,域内,実勢,現実,発,航路,近い,高め
m:A_1_wakati.txt,81.0,2.0,1.0,2.0,13.0,1.0,1.0,1.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_2_wakati.txt,78.0,2.0,0.0,1.0,13.0,0.0,0.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_1_wakati.txt,91.0,1.0,1.0,2.0,11.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2_wakati.txt,34.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1_wakati.txt,59.0,4.0,6.0,3.0,16.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Preview the first rows of the TF-IDF table, for comparison with the counts above.
tfidf_df.head()

Unnamed: 0,*,ある,いる,こと,する,そう,それ,それぞれ,それら,なる,...,保守,力不足,北米,域内,実勢,現実,発,航路,近い,高め
m:A_1_wakati.txt,0.0,0.0139,0.003317,0.006633,0.0,0.041877,0.068299,0.020546,0.052843,0.027799,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_2_wakati.txt,0.0,0.014572,0.0,0.003477,0.0,0.0,0.0,0.02154,0.0554,0.014572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_1_wakati.txt,0.0,0.006847,0.003268,0.006535,0.0,0.0,0.0,0.020242,0.052062,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2_wakati.txt,0.0,0.0,0.010763,0.005382,0.0,0.0,0.0,0.0,0.0,0.011277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1_wakati.txt,0.0,0.029909,0.021411,0.010705,0.0,0.0,0.0,0.022105,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## bowの読み込み

In [14]:
from gensim.models import word2vec, fasttext, doc2vec, TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
import logging
import os
import sys
import gensim
import smart_open
import numpy as np
import pandas as pd

# Rebuild the bag-of-words table from the dictionary saved under model/.
# bow_read_docs() and os.listdir() are given paths relative to data/, so enter
# data/ once through an absolute path (works from any current directory).
# The cell leaves data/ as the working directory for the cells that follow.
os.chdir(os.path.join(home_dir, "data"))
corpus = bow_read_docs("tmp_file")
# NOTE(review): rows are labelled in os.listdir() order, which is assumed to be
# the order in which bow_read_docs() read the files — confirm against src/module.
file_names = os.listdir("tmp_file")

# Load the saved dictionary by absolute path instead of changing back to the
# project root just to resolve a relative file name.
dic = Dictionary.load(os.path.join(home_dir, "model", "bow_1.model"))

# One dense row of counts per document.
bow_matrix = np.array([
    vec2dense(dic.doc2bow(tokens), len(dic))
    for tokens in corpus
])

# Column j of the matrix holds the counts for token id j, so label the columns
# by walking the ids explicitly instead of relying on token2id's key order.
bow_df = pd.DataFrame(
    bow_matrix,
    index=file_names,
    columns=[dic[token_id] for token_id in range(len(dic))],
)

In [15]:
# Display the bag-of-words table rebuilt from the saved dictionary.
bow_df

Unnamed: 0,*,ある,いる,こと,する,そう,それ,それぞれ,それら,なる,...,保守,力不足,北米,域内,実勢,現実,発,航路,近い,高め
m:A_1_wakati.txt,81.0,2.0,1.0,2.0,13.0,1.0,1.0,1.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_2_wakati.txt,78.0,2.0,0.0,1.0,13.0,0.0,0.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_1_wakati.txt,91.0,1.0,1.0,2.0,11.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2_wakati.txt,34.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1_wakati.txt,59.0,4.0,6.0,3.0,16.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_2_wakati.txt,91.0,3.0,3.0,2.0,20.0,0.0,0.0,2.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:a_1_wakati.txt,82.0,2.0,5.0,3.0,12.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:a_2_wakati.txt,71.0,0.0,7.0,2.0,15.0,0.0,0.0,1.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:b_1_wakati.txt,74.0,3.0,5.0,0.0,10.0,1.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:b_2_wakati.txt,81.0,1.0,1.0,2.0,10.0,0.0,1.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## tfidfの読み込み

In [16]:
from gensim.models import word2vec, fasttext, doc2vec, TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
import logging
import os
import sys
import gensim
import smart_open
import numpy as np
import pandas as pd

# Rebuild the TF-IDF table from the model and dictionary saved under model/.
# bow_read_docs() and os.listdir() are given paths relative to data/, so enter
# data/ once through an absolute path (works from any current directory).
# The cell leaves data/ as the working directory for the cells that follow.
os.chdir(os.path.join(home_dir, "data"))
corpus = bow_read_docs("tmp_file")
# NOTE(review): rows are labelled in os.listdir() order, which is assumed to be
# the order in which bow_read_docs() read the files — confirm against src/module.
file_names = os.listdir("tmp_file")

# Load the saved artefacts by absolute path instead of changing back to the
# project root just to resolve relative file names.
dic = Dictionary.load(os.path.join(home_dir, "model", "bow_1.model"))
bow_corpus = [dic.doc2bow(tokens) for tokens in corpus]

# Note: `model` ends up bound to the TF-IDF-transformed corpus, not to the
# loaded TfidfModel itself.
model = TfidfModel.load(os.path.join(home_dir, "model", "tfidf_1.model"))[bow_corpus]

# Build the TF-IDF matrix: one dense row of weights per document.
tfidf_data = np.array([
    vec2dense(weighted_doc, len(dic))
    for weighted_doc in model
])

# Column j of the matrix holds the weights for token id j, so label the columns
# by walking the ids explicitly instead of relying on token2id's key order.
tfidf_df = pd.DataFrame(
    tfidf_data,
    index=file_names,
    columns=[dic[token_id] for token_id in range(len(dic))],
)

In [17]:
# Display the TF-IDF table rebuilt from the saved model and dictionary.
tfidf_df

Unnamed: 0,*,ある,いる,こと,する,そう,それ,それぞれ,それら,なる,...,保守,力不足,北米,域内,実勢,現実,発,航路,近い,高め
m:A_1_wakati.txt,0.0,0.0139,0.003317,0.006633,0.0,0.041877,0.068299,0.020546,0.052843,0.027799,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_2_wakati.txt,0.0,0.014572,0.0,0.003477,0.0,0.0,0.0,0.02154,0.0554,0.014572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_1_wakati.txt,0.0,0.006847,0.003268,0.006535,0.0,0.0,0.0,0.020242,0.052062,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2_wakati.txt,0.0,0.0,0.010763,0.005382,0.0,0.0,0.0,0.0,0.0,0.011277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1_wakati.txt,0.0,0.029909,0.021411,0.010705,0.0,0.0,0.0,0.022105,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_2_wakati.txt,0.0,0.015903,0.00759,0.00506,0.0,0.0,0.0,0.031344,0.0,0.015903,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:a_1_wakati.txt,0.0,0.010499,0.012526,0.007516,0.0,0.031632,0.0,0.0,0.0,0.00525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:a_2_wakati.txt,0.0,0.0,0.018687,0.005339,0.0,0.0,0.0,0.016537,0.0,0.016782,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:b_1_wakati.txt,0.0,0.019103,0.015195,0.0,0.0,0.03837,0.0,0.0,0.0,0.031839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:b_2_wakati.txt,0.0,0.006593,0.003146,0.006293,0.0,0.0,0.064789,0.0,0.0,0.019778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
