In [73]:
from gensim.models import word2vec, fasttext, doc2vec, TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
import logging
import os
import sys
import gensim
import smart_open
import numpy as np
import pandas as pd

In [74]:
# Move to the parent of the current working directory and remember it as
# `home_dir`; the cells below treat it as the project root (they enter its
# "src" and "data" sub-directories).
# The guard keeps the cell idempotent: once `home_dir` is known, re-running
# the cell returns to it instead of climbing one more directory level.
if "home_dir" in globals():
    os.chdir(home_dir)
else:
    os.chdir("..")
    home_dir = os.getcwd()

In [75]:
# Import the project helpers that live in <home_dir>/src.
# The working directory is switched temporarily, presumably because the import
# is resolved relative to the current directory (verify against sys.path).
# An absolute path is used so the cell does not depend on where the kernel
# currently is, and the root is restored even if the import raises.
os.chdir(os.path.join(home_dir, "src"))
try:
    from module import set_data, read_docs
finally:
    os.chdir(home_dir)

In [76]:
# Work from <home_dir>/data for the rest of the notebook; later cells read the
# "tmp_file" directory relative to this location.
# Anchoring the path on `home_dir` keeps the cell re-runnable: a plain
# chdir("data") fails once the kernel is already inside the data directory.
os.chdir(os.path.join(home_dir, "data"))
# NOTE(review): set_data() comes from src/module; what it prepares is not
# visible in this notebook — confirm it is safe to call more than once.
set_data()

## Bag-of-Words

In [77]:
def vec2dense(vec, num_terms):
    """Expand a single sparse vector into a dense list.

    Args:
        vec: one sparse document vector in the form gensim's ``corpus2dense``
            accepts (e.g. the output of ``Dictionary.doc2bow``).
        num_terms: value forwarded to ``corpus2dense`` as ``num_terms``.

    Returns:
        A Python list holding the dense values of ``vec``.
    """
    # Wrap the vector as a one-document corpus; the document occupies the
    # first (and only) column of the resulting matrix.
    dense_matrix = corpus2dense([vec], num_terms=num_terms)
    first_document = dense_matrix[:, 0]
    return list(first_document)

corpus = read_docs(mode="bow")
dic = Dictionary(corpus)

# Build the bag-of-words matrix: each document is converted to a sparse
# bag-of-words vector and then expanded to a dense row of length len(dic).
bow_matrix = np.array([
    vec2dense(dic.doc2bow(tokens), len(dic))
    for tokens in corpus
])

In [78]:
# Print the number of entries in the dictionary (total word count label) and
# the number of documents it has processed (total document count label).
for label, count in (("総単語数: ", len(dic)), ("総文書数: ", dic.num_docs)):
    print(label, count)

総単語数:  1044
総文書数:  12


In [80]:
# Wrap the bag-of-words matrix in a labelled DataFrame:
# one row per document file, one column per token.
# NOTE(review): os.listdir() returns entries in arbitrary order and includes
# every entry in the directory; this assumes read_docs() yields documents in
# exactly that order — confirm against src/module.
file_names = os.listdir("tmp_file")
bow_df = pd.DataFrame(
    bow_matrix,
    index=file_names,
    # Column j of the dense matrix belongs to token id j, so resolve each
    # label by id instead of relying on the iteration order of token2id.
    columns=[dic[token_id] for token_id in range(len(dic))],
)

## TF-IDF

In [81]:
corpus = read_docs(mode="bow")
dic = Dictionary(corpus)

# Sparse bag-of-words vector for every document
bow_corpus = [dic.doc2bow(d) for d in corpus]

# Fit the TF-IDF weighting on the bag-of-words corpus. The fitted model keeps
# its own name so it is not lost when the corpus is transformed.
tfidf_model = TfidfModel(bow_corpus)
# `model` holds the TF-IDF-weighted corpus; the following cell iterates over it.
model = tfidf_model[bow_corpus]

In [82]:
# Build the TF-IDF matrix: expand every weighted document vector in `model`
# to a dense row of length len(dic).
tfidf_data = np.array([
    vec2dense(weighted_doc, len(dic))
    for weighted_doc in model
])

In [83]:
# Wrap the TF-IDF matrix in a labelled DataFrame:
# one row per document file, one column per token.
# NOTE(review): os.listdir() returns entries in arbitrary order and includes
# every entry in the directory; this assumes read_docs() yields documents in
# exactly that order — confirm against src/module.
file_names = os.listdir("tmp_file")
tfidf_df = pd.DataFrame(
    tfidf_data,
    index=file_names,
    # Column j of the dense matrix belongs to token id j, so resolve each
    # label by id instead of relying on the iteration order of token2id.
    columns=[dic[token_id] for token_id in range(len(dic))],
)

## Bag-of-WordsとTF-IDFの比較

In [86]:
# Preview the first five rows of the raw term counts
bow_df.head(n=5)

Unnamed: 0,",",.,/,000,1,10,11,12,13,150,...,設備,認知,質,足許,運転,鉛,銀,顕在,魅力,－
m:B_1.txt,6.0,1.0,17.0,1.0,10.0,3.0,3.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2.txt,0.0,0.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_2.txt,4.0,4.0,12.0,0.0,7.0,0.0,4.0,8.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_1.txt,5.0,1.0,16.0,1.0,7.0,0.0,2.0,4.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1.txt,8.0,6.0,18.0,0.0,9.0,0.0,5.0,9.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# Preview the first five rows of the TF-IDF weights for comparison with the counts
tfidf_df.head(n=5)

Unnamed: 0,",",.,/,000,1,10,11,12,13,150,...,設備,認知,質,足許,運転,鉛,銀,顕在,魅力,－
m:B_1.txt,0.017896,0.00625,0.0,0.030011,0.062499,0.071282,0.0,0.029585,0.013899,0.085181,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:B_2.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_2.txt,0.011708,0.024533,0.0,0.0,0.042932,0.0,0.0,0.077419,0.01364,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:A_1.txt,0.014527,0.006088,0.0,0.029232,0.042614,0.0,0.0,0.038423,0.013539,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m:C_1.txt,0.016003,0.02515,0.0,0.0,0.037725,0.0,0.0,0.059525,0.027965,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
