In [1]:
from gensim.models import word2vec, fasttext, doc2vec, TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
import logging
import os
import sys
import gensim
import smart_open
import numpy as np
import pandas as pd

In [2]:
# Resolve the project root (the parent of the initial working directory) once.
# A bare os.chdir("..") climbs one more level every time this cell is re-run,
# which would silently record the wrong root; on re-execution we therefore just
# return to the root that was recorded the first time.
if "home_dir" in globals():
    os.chdir(home_dir)
else:
    os.chdir("..")
    home_dir = os.getcwd()

In [3]:
# Import the project helpers that live in <home_dir>/src.
# The working directory is switched to src/ while importing (presumably so the
# kernel's cwd entry on sys.path can resolve the modules — TODO confirm).
# An absolute path is used so the cell does not depend on the current directory,
# and the root is restored in `finally` so a failed import cannot leave the
# kernel stranded in src/.
os.chdir(os.path.join(home_dir, "src"))
try:
    from module import set_data, read_docs
    from train import bow, tfidf
finally:
    os.chdir(home_dir)

In [4]:
# Enter <home_dir>/data through an absolute path so that re-running this cell
# does not fail looking for data/data, then run the project's set_data() helper
# from inside that directory.
# NOTE(review): set_data() is defined in src/module.py and is not visible here;
# it presumably prepares the tmp_file/ directory read below — confirm.
os.chdir(os.path.join(home_dir, "data"))
set_data()

## Bag-of-Words

In [5]:
def vec2dense(vec, num_terms):
    """Expand one sparse vector into a dense list with ``num_terms`` entries.

    Args:
        vec: A single sparse document vector in the form accepted by
            ``gensim.matutils.corpus2dense`` (it is wrapped in a one-element
            corpus before conversion).
        num_terms: Number of rows requested from ``corpus2dense``, i.e. the
            length of the returned list.

    Returns:
        list: The values of the only document column of the dense matrix.
    """
    # corpus2dense lays documents out as columns; with a single document the
    # wanted values are column 0.
    dense = corpus2dense([vec], num_terms=num_terms)
    return list(dense[:, 0])

In [6]:
# Load the documents through the project's read_docs helper and build the
# gensim vocabulary from them.
corpus = read_docs(mode="bow")
dic = Dictionary(corpus)

# Build the bag-of-words matrix: one dense row per document.
bow_matrix = np.array(
    [vec2dense(dic.doc2bow(doc), len(dic)) for doc in corpus]
)

In [7]:
# Report the vocabulary size (number of distinct tokens) followed by the number
# of documents the dictionary has processed.
for label, value in (("総単語数: ", len(dic)), ("総文書数: ", dic.num_docs)):
    print(label, value)

総単語数:  1044
総文書数:  12


In [8]:
# Label the bag-of-words matrix: rows by source file name, columns by token.
# NOTE(review): this assumes os.listdir("tmp_file") yields the files in the same
# order in which read_docs() returned the documents. os.listdir order is
# arbitrary, so verify this against read_docs.
file_names = os.listdir("tmp_file")

# Column j of the dense matrix holds the count for token id j, so the token
# labels are ordered by id explicitly instead of relying on the insertion order
# of dic.token2id happening to match the ids.
bow_df = pd.DataFrame(
    bow_matrix,
    index=file_names,
    columns=sorted(dic.token2id, key=dic.token2id.get),
)

## tf-idf

In [9]:
# Re-read the documents and rebuild the vocabulary for the tf-idf section.
corpus = read_docs(mode="bow")
dic = Dictionary(corpus)

# Sparse bag-of-words vector for every document.
bow_corpus = list(map(dic.doc2bow, corpus))

# Fit tf-idf on the bag-of-words corpus and immediately apply it to that same
# corpus; `model` ends up bound to the transformed corpus, not the fitted model.
model = TfidfModel(bow_corpus)[bow_corpus]

In [10]:
# Build the tf-idf matrix: one dense row per transformed document.
tfidf_data = np.array(
    [vec2dense(weighted_doc, len(dic)) for weighted_doc in model]
)

In [11]:
# Label the tf-idf matrix: rows by source file name, columns by token.
# NOTE(review): this assumes os.listdir("tmp_file") yields the files in the same
# order in which read_docs() returned the documents. os.listdir order is
# arbitrary, so verify this against read_docs.
file_names = os.listdir("tmp_file")

# Column j of the dense matrix holds the weight for token id j, so the token
# labels are ordered by id explicitly instead of relying on the insertion order
# of dic.token2id happening to match the ids.
tfidf_df = pd.DataFrame(
    tfidf_data,
    index=file_names,
    columns=sorted(dic.token2id, key=dic.token2id.get),
)

## bowとtfidfの比較

In [12]:
# Preview the first rows of the raw term-count table (compare with tf-idf below).
bow_df.head()

Unnamed: 0,",",.,/,0,000,08,1,11,12,13,...,北米,回っ,域内,実勢,現実,発,約,航路,近い,高め
m:A_1.txt,5.0,1.0,16.0,1.0,1.0,1.0,7.0,2.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_2.txt,2.0,1.0,15.0,0.0,2.0,0.0,5.0,1.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_1.txt,6.0,1.0,17.0,0.0,1.0,0.0,10.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2.txt,0.0,0.0,4.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1.txt,8.0,6.0,18.0,0.0,0.0,0.0,9.0,5.0,9.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Preview the first rows of the tf-idf table (compare with the raw counts above).
tfidf_df.head()

Unnamed: 0,",",.,/,0,000,08,1,11,12,13,...,北米,回っ,域内,実勢,現実,発,約,航路,近い,高め
m:A_1.txt,0.014527,0.006088,0.0,0.036683,0.029232,0.082972,0.042614,0.0,0.038423,0.013539,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_2.txt,0.005985,0.00627,0.0,0.0,0.060214,0.0,0.03135,0.0,0.039573,0.027888,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_1.txt,0.017896,0.00625,0.0,0.0,0.030011,0.0,0.062499,0.0,0.029585,0.013899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1.txt,0.016003,0.02515,0.0,0.0,0.0,0.0,0.037725,0.0,0.059525,0.027965,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## bowの読み込み

In [14]:
# Rebuild the bag-of-words table from the dictionary saved under model/.
#
# The original cell hopped between home_dir and data/ with relative paths, so a
# failure part-way through could leave the kernel in an unexpected directory.
# Here the working directory is set once, via an absolute path, and the saved
# dictionary is addressed absolutely, so no further directory changes are needed.
# The cell still finishes with data/ as the working directory.
# NOTE(review): read_docs() is called from inside data/ as before; it presumably
# reads paths relative to that directory — confirm.
os.chdir(os.path.join(home_dir, "data"))
corpus = read_docs(mode="bow")
dic = Dictionary.load(os.path.join(home_dir, "model", "bow_1.model"))

# One dense row of term counts per document.
bow_matrix = np.array(
    [vec2dense(dic.doc2bow(doc), len(dic)) for doc in corpus]
)

# NOTE(review): row labels assume os.listdir("tmp_file") returns the files in
# the same order as read_docs() returned the documents — os.listdir order is
# arbitrary, so verify against read_docs.
file_names = os.listdir("tmp_file")

# Column j of the matrix corresponds to token id j; order the labels by id
# explicitly, since a loaded dictionary's token2id insertion order is not
# guaranteed to follow the ids.
bow_df = pd.DataFrame(
    bow_matrix,
    index=file_names,
    columns=sorted(dic.token2id, key=dic.token2id.get),
)

In [15]:
# Display the bag-of-words table rebuilt from the saved dictionary.
bow_df

Unnamed: 0,",",.,/,0,000,08,1,11,12,13,...,北米,回っ,域内,実勢,現実,発,約,航路,近い,高め
m:A_1.txt,5.0,1.0,16.0,1.0,1.0,1.0,7.0,2.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_2.txt,2.0,1.0,15.0,0.0,2.0,0.0,5.0,1.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_1.txt,6.0,1.0,17.0,0.0,1.0,0.0,10.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2.txt,0.0,0.0,4.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1.txt,8.0,6.0,18.0,0.0,0.0,0.0,9.0,5.0,9.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_2.txt,4.0,4.0,12.0,0.0,0.0,0.0,7.0,4.0,8.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:a_1.txt,3.0,2.0,11.0,1.0,1.0,0.0,3.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:a_2.txt,6.0,2.0,9.0,0.0,0.0,0.0,2.0,2.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:b_1.txt,1.0,3.0,9.0,1.0,0.0,0.0,2.0,3.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:b_2.txt,7.0,1.0,12.0,1.0,0.0,0.0,7.0,6.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## tfidfの読み込み

In [16]:
# Rebuild the tf-idf table from the dictionary and tf-idf model saved under model/.
#
# The original cell hopped between home_dir and data/ with relative paths, so a
# failure part-way through could leave the kernel in an unexpected directory.
# Here the working directory is set once, via an absolute path, and the saved
# artefacts are addressed absolutely. The cell still finishes with data/ as the
# working directory.
# NOTE(review): read_docs() is called from inside data/ as before; it presumably
# reads paths relative to that directory — confirm.
os.chdir(os.path.join(home_dir, "data"))
corpus = read_docs(mode="bow")

model_dir = os.path.join(home_dir, "model")
tfidf_model = TfidfModel.load(os.path.join(model_dir, "tfidf_1.model"))
dic = Dictionary.load(os.path.join(model_dir, "bow_1.model"))

# Sparse bag-of-words vectors, then the same corpus viewed through the loaded
# tf-idf model. `model` keeps its original final meaning (the transformed
# corpus) for any later cell that reads it.
bow_corpus = [dic.doc2bow(doc) for doc in corpus]
model = tfidf_model[bow_corpus]

# Build the tf-idf matrix: one dense row per transformed document.
tfidf_data = np.array(
    [vec2dense(weighted_doc, len(dic)) for weighted_doc in model]
)

# NOTE(review): row labels assume os.listdir("tmp_file") returns the files in
# the same order as read_docs() returned the documents — os.listdir order is
# arbitrary, so verify against read_docs.
file_names = os.listdir("tmp_file")

# Column j of the matrix corresponds to token id j; order the labels by id
# explicitly, since a loaded dictionary's token2id insertion order is not
# guaranteed to follow the ids.
tfidf_df = pd.DataFrame(
    tfidf_data,
    index=file_names,
    columns=sorted(dic.token2id, key=dic.token2id.get),
)

In [17]:
# Display the tf-idf table rebuilt from the saved dictionary and tf-idf model.
tfidf_df

Unnamed: 0,",",.,/,0,000,08,1,11,12,13,...,北米,回っ,域内,実勢,現実,発,約,航路,近い,高め
m:A_1.txt,0.014527,0.006088,0.0,0.036683,0.029232,0.082972,0.042614,0.0,0.038423,0.013539,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_2.txt,0.005985,0.00627,0.0,0.0,0.060214,0.0,0.03135,0.0,0.039573,0.027888,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_1.txt,0.017896,0.00625,0.0,0.0,0.030011,0.0,0.062499,0.0,0.029585,0.013899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1.txt,0.016003,0.02515,0.0,0.0,0.0,0.0,0.037725,0.0,0.059525,0.027965,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_2.txt,0.011708,0.024533,0.0,0.0,0.0,0.0,0.042932,0.0,0.077419,0.01364,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:a_1.txt,0.006768,0.009454,0.0,0.028484,0.022699,0.0,0.014181,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:a_2.txt,0.014634,0.010221,0.0,0.0,0.0,0.0,0.010221,0.0,0.016128,0.011366,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:b_1.txt,0.00263,0.016533,0.0,0.033207,0.0,0.0,0.011022,0.0,0.043478,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
t:b_2.txt,0.019854,0.005943,0.0,0.03581,0.0,0.0,0.041601,0.0,0.028132,0.026433,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
