# 篇章文本匹配

1. 主题提取，比较doc相似度
2. 长文本分类(参考04_文本分类)

In [1]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim

In [2]:
# Fixed seed for LDA training. Without it every run can yield different
# topics (and a different topic order), which makes the outputs shown in
# this notebook impossible to reproduce.
RANDOM_SEED = 42

# Tokens are the maximal runs of word characters; punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')

# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# lower-case and tokenize every document (one token list per document)
texts = [tokenizer.tokenize(doc.lower()) for doc in doc_set]

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix (bag-of-words per document)
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model; random_state pins the random initialisation so that
# re-running the notebook trains the same two topics
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=RANDOM_SEED,
)

In [3]:
# Display both topics of the model, each summarised by its 4 highest-weighted words.
ldamodel.show_topics(num_topics=2, num_words=4)

[(0, '0.087*"to" + 0.087*"my" + 0.047*"brother" + 0.047*"mother"'),
 (1, '0.074*"health" + 0.053*"that" + 0.032*"driving" + 0.032*"is"')]

In [4]:
# Ask for 3 topics with 3 words each. The model was trained with num_topics=2,
# so only the two existing topics come back (see the output below).
ldamodel.show_topics(num_topics=3, num_words=3)

[(0, '0.087*"to" + 0.087*"my" + 0.047*"brother"'),
 (1, '0.074*"health" + 0.053*"that" + 0.032*"driving"')]

保存模型：

In [5]:
# Write the trained model to disk under the name 'lda.model'.
ldamodel.save('lda.model')
# Drop the in-memory model so that the following cell has to restore it from disk.
del ldamodel

加载模型并预测：

In [6]:
# Restore the model that was saved as 'lda.model' and rebind it to `ldamodel`.
ldamodel = gensim.models.ldamodel.LdaModel.load('lda.model')
# Sanity check: the reloaded model should list the same topics as before saving.
ldamodel.show_topics(num_topics=3, num_words=3)

[(0, '0.087*"to" + 0.087*"my" + 0.047*"brother"'),
 (1, '0.074*"health" + 0.053*"that" + 0.032*"driving"')]

In [7]:
# Raw topic matrix as a NumPy array; judging by the output below there is one
# row per topic and one weight per dictionary term.
ldamodel.get_topics()

array([[0.0334055 , 0.0466727 , 0.03333215, 0.03332681, 0.03340471,
        0.01991689, 0.01998265, 0.0466696 , 0.08670212, 0.01998262,
        0.0867081 , 0.01997586, 0.01997579, 0.0199755 , 0.01984536,
        0.01997548, 0.01997629, 0.0199762 , 0.01997623, 0.01997717,
        0.00668887, 0.00668821, 0.00668856, 0.00668809, 0.00669494,
        0.00668847, 0.00668797, 0.01995114, 0.006689  , 0.00668838,
        0.00668861, 0.00669338, 0.01999401, 0.01999387, 0.01999429,
        0.01999382, 0.0199941 , 0.01999413, 0.01999382, 0.01999411,
        0.01999412, 0.01999333, 0.0199936 , 0.01999399, 0.00669491,
        0.00669497, 0.00669473, 0.00669486],
       [0.03180207, 0.01068167, 0.01067352, 0.01068201, 0.03180332,
        0.03202962, 0.01067961, 0.0106866 , 0.01069377, 0.01067967,
        0.01068425, 0.01069043, 0.01069054, 0.01069099, 0.03214349,
        0.01069102, 0.01068974, 0.01068988, 0.01068983, 0.01068833,
        0.03184251, 0.03184355, 0.031843  , 0.03184375, 0.07432368,
   

In [8]:
# For each topic, list its (weight, word) pairs together with one score per topic
# (presumably gensim's topic-coherence value; confirm against the gensim docs).
ldamodel.top_topics(corpus)

[([(0.0867081, 'to'),
   (0.086702116, 'my'),
   (0.0466727, 'brother'),
   (0.046669602, 'mother'),
   (0.033405498, 'brocolli'),
   (0.03340471, 'good'),
   (0.033332147, 'but'),
   (0.033326812, 'eat'),
   (0.019994289, 'do'),
   (0.019994127, 'i'),
   (0.019994117, 'perform'),
   (0.01999411, 'often'),
   (0.019994099, 'feel'),
   (0.019994013, 'at'),
   (0.01999399, 'well'),
   (0.019993873, 'better'),
   (0.019993823, 'drive'),
   (0.019993816, 'never'),
   (0.019993596, 'seems'),
   (0.019993326, 'school')],
  -5.435445407792313),
 ([(0.074323684, 'health'),
   (0.053080745, 'that'),
   (0.032143492, 'driving'),
   (0.032029618, 'is'),
   (0.031975087, 'pressure'),
   (0.031843934, 'may'),
   (0.031843748, 'experts'),
   (0.031843554, 'blood'),
   (0.031843293, 'suggest'),
   (0.031843133, 'increased'),
   (0.031843, 'cause'),
   (0.03184292, 'tension'),
   (0.031842507, 'and'),
   (0.031842303, 'some'),
   (0.03183317, 'say'),
   (0.031832974, 'your'),
   (0.031832885, 'for'),


In [9]:
# Indexing the model with the whole corpus returns a gensim TransformedCorpus
# object (see the repr below) rather than a plain list.
ldamodel[corpus]

<gensim.interfaces.TransformedCorpus at 0x7fa158574e50>

In [10]:
# Build a similarity index over the LDA representation of every corpus document.
index = gensim.similarities.MatrixSimilarity(ldamodel[corpus])
# Persist the index to 'simIndex.index'.
index.save('simIndex.index')
# Display the index object as the cell output.
index

<gensim.similarities.docsim.MatrixSimilarity at 0x7fa158581820>

In [11]:
def get_text_sim(text):
    """Score how similar `text` is to every document held in `index`.

    The text is lower-cased, tokenized with the notebook-level `tokenizer`,
    converted to a bag-of-words with `dictionary`, projected through
    `ldamodel`, and finally looked up in the similarity `index`.

    The bag-of-words is wrapped in a one-element list (a single-document
    corpus) and printed before the lookup.

    Args:
        text: Raw document string to compare against the indexed documents.

    Returns:
        Whatever `index[...]` yields for the one-document corpus; in this
        notebook that is a float32 array with one row of similarities.
    """
    tokens = tokenizer.tokenize(text.lower())
    # Single-document corpus: a list holding exactly one bag-of-words.
    doc_bow = [dictionary.doc2bow(tokens)]
    print(doc_bow)
    return index[ldamodel[doc_bow]]

In [12]:
# Query with doc_a, which is also the first document of the indexed corpus.
get_text_sim(doc_a)

[[(0, 2), (1, 1), (2, 1), (3, 2), (4, 2), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 2)]]


array([[0.99999994, 0.99999577, 0.07365547, 0.9999525 , 0.08877519]],
      dtype=float32)

In [13]:
# Query with doc_e, which is also the last document of the indexed corpus.
get_text_sim(doc_e)

[[(0, 1), (4, 1), (5, 1), (24, 2), (31, 1), (44, 1), (45, 1), (46, 1), (47, 1)]]


array([[0.08877692, 0.09167764, 0.99988496, 0.07906448, 1.        ]],
      dtype=float32)

可以看出，`doc_e` 与 `doc_c` 以及它自身的相似度接近 1，而与 `doc_a`、`doc_b`、`doc_d` 的相似度都不到 0.1；也就是说，`doc_e` 在语义上更接近主题2（以 health 等词为代表的那个主题）。

In [14]:
import os

# Files left on disk by the save cells of this notebook: the LDA model with
# its companion files, plus the similarity index.
artifact_paths = [
    'lda.model',
    'lda.model.state',
    'lda.model.id2word',
    'lda.model.expElogbeta.npy',
    'simIndex.index',
]

for artifact_path in artifact_paths:
    try:
        os.remove(artifact_path)
    except FileNotFoundError:
        # Already removed (e.g. this cell was run twice) or never written;
        # cleanup should stay re-runnable instead of failing here.
        pass

本节完。