# 第9章 トピック

In [1]:
# これまでに定義した関数の読み込み

from chapter01 import get_string_from_file
from chapter02 import get_words_from_file, configure_fonts_for_japanese
from chapter03 import get_words, bows_to_cfs, load_aozora_corpus, get_bows,  add_to_corpus,\
    get_weights, translate_bows, get_tfidfmodel_and_weights
from chapter04 import vsm_search, get_list_from_file
from chapter05 import top_n, get_pr_curve, get_average_precision

## 9.2 潜在的意味インデキシング

In [2]:
# Listing 9.1 #

# Build the document vectors from the documents in data/ch05
book_texts = [
    get_string_from_file('data/ch05/%d.txt' % doc_no)
    for doc_no in range(10)
]
tfidf_model, dic, tfidf_weights = get_tfidfmodel_and_weights(book_texts)

In [3]:
# Listing 9.2 #

from gensim.models import LsiModel

# Number of topics
num_topics = 5

# Build the LSI model from the TF-IDF weighted corpus of Listing 9.1
lsi_model = LsiModel(
    corpus=tfidf_weights,
    id2word=dic,
    num_topics=num_topics,
)

In [4]:
# Listing 9.3 #

# Print topic 0, limited to its first 3 terms, in weight*"term" form
print(lsi_model.print_topic(0, 3))

-0.624*"知能" + -0.484*"人工" + -0.204*"マービン・ミンスキー"


In [5]:
# Listing 9.4 #

from gensim.similarities import MatrixSimilarity

def lsi_search(texts, query, num_topics):
    """Rank documents against a query by similarity in LSI topic space.

    Args:
        texts: Documents to search; passed to get_tfidfmodel_and_weights.
        query: Query string.
        num_topics: Number of topics requested from LsiModel.

    Returns:
        A list of (document index, similarity) pairs sorted by
        similarity in descending order.
    """
    # Compute the term weights based on TF-IDF
    tfidf_model, dic, text_tfidf_weights = get_tfidfmodel_and_weights(texts)

    # Build the LSI model and convert the documents to topic weights
    lsi_model = LsiModel(corpus=text_tfidf_weights, id2word=dic,
                         num_topics=num_topics)
    lsi_weights = lsi_model[text_tfidf_weights]

    # The indexed vectors are keyed by topic id, so their dimensionality is
    # num_topics, not the vocabulary size len(dic).
    index = MatrixSimilarity(lsi_weights, num_features=num_topics)

    # Build the bag-of-words of the query and convert it the same way
    query_bows = get_bows([query], dic)
    query_tfidf_weights = get_weights(query_bows, dic, tfidf_model)
    query_lsi_weights = lsi_model[query_tfidf_weights]

    # Similarity between the query and every indexed document
    sims = index[query_lsi_weights[0]]

    # Sort by similarity in descending order
    return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)

In [6]:
# Listing 9.5 #

from pprint import pprint

# Query string reused by the searches below
query = '人工知能'

# Search based on the TF-IDF model (same as Listing 5.2).
# The search results are ranked by how relevant they are.
tfidf_result = vsm_search(book_texts, query)

# Print the (document number, relevance) pairs
pprint(tfidf_result)

[(1, 0.7481151),
 (8, 0.6076249),
 (5, 0.31722325),
 (0, 0.21160641),
 (2, 0.18004589),
 (3, 0.0),
 (4, 0.0),
 (6, 0.0),
 (7, 0.0),
 (9, 0.0)]


In [7]:
# Listing 9.6 #

# Number of topics passed to the LSI-based search
num_topics = 5

# Run lsi_search on the same documents and query as the TF-IDF search
lsi_result = lsi_search(book_texts, query, num_topics)
pprint(lsi_result)

[(8, 0.99998426),
 (1, 0.99996907),
 (5, 0.9991018),
 (0, 0.5014957),
 (2, 0.40058395),
 (9, 0.0017561095),
 (3, 0.0),
 (6, 0.0),
 (7, 0.0),
 (4, -0.0029632207)]


In [8]:
# Listing 9.7 #

# Correct answers for the search with the query '人工知能' (Section 5.2)
right_answer = [0, 1, 0, 1, 0, 1, 0, 0, 1, 1]

# Compute the rankings (Section 5.3): document numbers in result order
tfidf_ranking = tuple(doc_no for doc_no, _sim in tfidf_result)
lsi_ranking = tuple(doc_no for doc_no, _sim in lsi_result)

# Compare retrieval performance using average precision (Section 5.3)
print('TFIDF: %.4f' % get_average_precision(tfidf_ranking, right_answer))
print('LSI:   %.4f' % get_average_precision(lsi_ranking, right_answer))

TFIDF: 0.8211
LSI:   0.8648


## 9.3 非負値行列因子分解

In [9]:
# Listing 9.8 #

from gensim.models.nmf import Nmf

def nmf_search(texts, query, num_topics, passes=20, random_state=None):
    """Rank documents against a query by similarity in NMF topic space.

    Args:
        texts: Documents to search; passed to get_tfidfmodel_and_weights.
        query: Query string.
        num_topics: Number of topics passed to Nmf.
        passes: Forwarded to Nmf as ``passes``.
        random_state: Forwarded to Nmf as ``random_state``; set it to
            make the result reproducible.

    Returns:
        A list of (document index, similarity) pairs sorted by
        similarity in descending order.
    """
    tfidf_model, dic, text_tfidf_weights = get_tfidfmodel_and_weights(texts)

    # Build the NMF model
    nmf_model = Nmf(corpus=text_tfidf_weights, id2word=dic,
                    num_topics=num_topics, passes=passes,
                    random_state=random_state)

    # Convert the TF-IDF document vectors into topic-based vectors
    nmf_weights = nmf_model[text_tfidf_weights]

    # The indexed vectors are keyed by topic id, so their dimensionality is
    # num_topics, not the vocabulary size len(dic).
    index = MatrixSimilarity(nmf_weights, num_features=num_topics)

    # Build the topic-based vector of the query
    query_bows = get_bows([query], dic)
    query_tfidf_weights = get_weights(query_bows, dic, tfidf_model)
    query_nmf_weights = nmf_model[query_tfidf_weights]

    # Rank the documents by their similarity to the query
    sims = index[query_nmf_weights[0]]
    return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)

In [10]:
# Listing 9.9 #

# Set the number of topics to 5
num_topics = 5

# book_texts and query are the same as before.
# random_state is set so that the result can be reproduced.
nmf_result = nmf_search(book_texts, query, num_topics, random_state=7)
pprint(nmf_result)

[(5, 0.98998123),
 (1, 0.987107),
 (8, 0.9571822),
 (9, 0.75527954),
 (0, 0.14077142),
 (7, 0.14077142),
 (2, 0.07617857),
 (4, 0.009192428),
 (6, 0.0014736673),
 (3, 0.0)]


In [11]:
# Listing 9.10 #

# Keep only the document numbers, in result order, and score the ranking
nmf_ranking = tuple(doc_no for doc_no, _sim in nmf_result)
print('%.4f' % get_average_precision(nmf_ranking, right_answer))

0.8944


## 9.4 潜在的ディリクレ配分法

In [12]:
# Listing 9.11 #

from gensim.models import LdaModel

# Build the LDA model (the topics are computed during this step).
# dic, tfidf_weights, etc. are the ones defined in Listing 9.1.
# random_state is fixed to a specific value (6) for reproducibility.
lda_model = LdaModel(
    corpus=tfidf_weights,
    id2word=dic,
    num_topics=5,
    passes=20,
    random_state=6,
)

In [13]:
# Listing 9.12 #

# Compute the topic distributions from the document vectors
lda_weights = lda_model[tfidf_weights]

# Show the text of document 1
print(book_texts[1])

# Show the topic probability distribution of document 1
pprint(lda_weights[1])

マービン・ミンスキーは，人工知能という分野の黎明期に活躍した研究者で，「人工知能の父」と呼ばれています．

[(0, 0.77690285),
 (1, 0.055816952),
 (2, 0.055802517),
 (3, 0.055735476),
 (4, 0.055742186)]


In [14]:
# Listing 9.13 #

# Show the top 4 words of the probability distribution of topic 0
print(lda_model.print_topic(0, 4))

0.041*"知能" + 0.030*"人工" + 0.021*"ネットワーク" + 0.017*"マービン・ミンスキー"


In [15]:
# Listing 9.14 #

def lda_search(texts, query, num_topics, passes=20, random_state=None):
    """Rank documents against a query by similarity of LDA topic vectors.

    Args:
        texts: Documents to search; passed to get_tfidfmodel_and_weights.
        query: Query string.
        num_topics: Number of topics passed to LdaModel.
        passes: Forwarded to LdaModel as ``passes``.
        random_state: Forwarded to LdaModel as ``random_state``; set it
            to make the result reproducible.

    Returns:
        A list of (document index, similarity) pairs sorted by
        similarity in descending order.
    """
    tfidf_model, dic, text_tfidf_weights = get_tfidfmodel_and_weights(texts)

    # Build the LDA model
    lda_model = LdaModel(corpus=text_tfidf_weights, id2word=dic,
                         num_topics=num_topics, passes=passes,
                         random_state=random_state)

    # Convert the documents into topic-based vectors
    lda_weights = lda_model[text_tfidf_weights]

    # The indexed vectors are keyed by topic id, so their dimensionality is
    # num_topics, not the vocabulary size len(dic).
    index = MatrixSimilarity(lda_weights, num_features=num_topics)

    # Build the topic-based vector of the query
    query_bows = get_bows([query], dic)
    query_tfidf_weights = get_weights(query_bows, dic, tfidf_model)
    query_lda_weights = lda_model[query_tfidf_weights]

    # Rank the documents by their similarity to the query
    sims = index[query_lda_weights[0]]
    return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)

In [16]:
# Listing 9.15 #

# Number of topics passed to the LDA-based search
num_topics = 5

# Search with a fixed random_state, then score the resulting ranking
lda_result = lda_search(book_texts, query, num_topics, random_state=6)
lda_ranking = tuple(doc_no for doc_no, _sim in lda_result)
print('%.4f' % get_average_precision(lda_ranking, right_answer))

0.9633


In [17]:
# Listing 9.16 #

# Set the number of topics to 5
num_topics = 5
# Set the number of search trials to 5
num_trials = 5

# No random_state is passed here, so each trial may give a different score
sum_of_ap = 0.0
for i in range(num_trials):
    lda_result = lda_search(book_texts, query, num_topics)
    lda_ranking = tuple(doc_no for doc_no, _sim in lda_result)
    ap = get_average_precision(lda_ranking, right_answer)
    print('%d: %.4f' % (i, ap))
    sum_of_ap += ap

# Report the mean of the average precisions over all trials
print('平均: %.4f' % (sum_of_ap / num_trials))

0: 0.5677
1: 0.8463
2: 0.5230
3: 0.9183
4: 0.8648
平均: 0.7440
