# 特徴量2 Bag of Words + tf-idf

1-1-bag-of-wordsで算出したBag of Wordsを文書ごとの単語頻度(tf)に正規化し、idfを掛け合わせてtf-idfを算出します
idfの計算方法は[sklearn.TfidfTransformer](https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/feature_extraction/text.py#L941)に準拠しました

idf = log( (1 + N) / (1 + d) ) + 1

ここで N は全文書数、d はその単語が登場する文書数です

0除算を防ぎつつ、idfが0になることを防ぐことで全ての文書に登場する単語を完全には無視しないようになっています

In [1]:
import math
import csv
import pandas as pd
from tqdm import tqdm_notebook as tqdm
# NOTE(review): presumably turns off tqdm's background monitor thread
# (a commonly used workaround) — confirm against the tqdm documentation.
tqdm.monitor_interval = 0

In [2]:
# Load the Bag-of-Words matrix computed in notebook 1-1.
# header=None: the CSV has no header row, so every line is read as data.
bows = pd.read_csv('../../data/bow.csv', header=None) # (500, 49815)

In [3]:
def calc_tfidf(bows):
    """Compute the tf-idf weighted Bag-of-Words matrix.

    Args:
        bows: DataFrame of raw term counts, one row per document and one
            column per vocabulary word.

    Returns:
        DataFrame with the same shape as ``bows`` in which each cell is the
        term frequency of the word in the document multiplied by the word's
        inverse document frequency.
    """
    tf = calc_tf(bows)  # (500, 49815)
    # The idf weights must come from calc_idf; calling calc_tf here would
    # silently produce tf * tf instead of tf * idf.
    idf = calc_idf(bows)  # (49815,)
    # axis=1 aligns the per-word idf Series with the columns of tf, so every
    # document row is scaled by the same per-word weights.
    return tf.mul(idf, axis=1)

In [4]:
def calc_tf(bows):
    """Compute per-document term frequencies from raw term counts.

    Args:
        bows: DataFrame of raw term counts, one row per document and one
            column per vocabulary word.

    Returns:
        DataFrame with the same shape as ``bows`` where each row has been
        divided by that row's total count. A document with no words at all
        yields a row of zeros.
    """
    word_num = bows.sum(axis=1)
    # An empty document has a total of 0; dividing by it would fill the row
    # with NaN. Dividing by 1 instead leaves the (all-zero) row as zeros.
    safe_word_num = word_num.where(word_num != 0, 1)
    return bows.div(safe_word_num, axis=0)

In [5]:
def calc_idf(bows):
    """Compute the smoothed inverse document frequency of every word.

    Uses ``idf = log((1 + N) / (1 + d)) + 1`` where ``N`` is the number of
    documents and ``d`` is the number of documents containing the word. The
    ``1 +`` terms avoid division by zero, and the trailing ``+ 1`` keeps the
    weight of a word that appears in every document from dropping to zero.

    Args:
        bows: DataFrame of raw term counts, one row per document and one
            column per vocabulary word.

    Returns:
        Series indexed by the columns of ``bows`` holding each word's idf.
    """
    doc_num = len(bows)
    # Vectorised "count is non-zero" test; gives the same result as
    # applymap(bool) for numeric counts, without a Python call per cell and
    # without relying on applymap, which is deprecated since pandas 2.1.
    doc_num_has_word = (bows != 0).sum(axis=0)  # (49815,)
    ratio = (1 + doc_num) / (1 + doc_num_has_word)
    idf = ratio.apply(math.log) + 1  # (49815,)
    return idf

In [6]:
# Run calc_tfidf over the loaded Bag-of-Words matrix to build the feature matrix.
bows_tfidf = calc_tfidf(bows)

In [7]:
# Save the result as CSV, written without the index column or a header row.
bows_tfidf.to_csv('../../data/bow_tfidf.csv', index=False, header=False)