Reference tutorial: https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python

In [5]:
import gensim

In [6]:
# Show which attributes and submodules the gensim package exposes.
gensim_names = dir(gensim)
print(gensim_names)

['NullHandler', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_matutils', 'corpora', 'interfaces', 'logger', 'logging', 'matutils', 'models', 'parsing', 'scripts', 'similarities', 'summarization', 'topic_coherence', 'utils']


In [10]:
# Toy corpus: five short sentences, one document per list entry.
raw_documents = [
    "I'm taking the show on the road.",
    "My socks are a force multiplier.",
    "I am the barber who cuts everyone's hair who doesn't cut their own.",
    "Legend has it that the mind is a mad monkey.",
    "I make my own fun.",
]

In [11]:
# Echo the list so the notebook displays the documents as the cell output.
raw_documents

["I'm taking the show on the road.",
 'My socks are a force multiplier.',
 "I am the barber who cuts everyone's hair who doesn't cut their own.",
 'Legend has it that the mind is a mad monkey.',
 'I make my own fun.']

In [12]:
# Report how many documents are in the toy corpus.
num_documents = len(raw_documents)
print("Number of documents:", num_documents)

Number of documents: 5


In [13]:
from nltk.tokenize import word_tokenize

In [14]:
# Tokenize each document with NLTK and lowercase every token, giving one
# list of tokens per document.
gen_docs = [
    list(map(str.lower, word_tokenize(document)))
    for document in raw_documents
]
print(gen_docs)

[['i', "'m", 'taking', 'the', 'show', 'on', 'the', 'road', '.'], ['my', 'socks', 'are', 'a', 'force', 'multiplier', '.'], ['i', 'am', 'the', 'barber', 'who', 'cuts', 'everyone', "'s", 'hair', 'who', 'does', "n't", 'cut', 'their', 'own', '.'], ['legend', 'has', 'it', 'that', 'the', 'mind', 'is', 'a', 'mad', 'monkey', '.'], ['i', 'make', 'my', 'own', 'fun', '.']]


In [15]:
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(documents=gen_docs)

In [16]:
# Display the Dictionary object itself (the cell output is its default repr).
dictionary

<gensim.corpora.dictionary.Dictionary at 0x277fea77278>

In [17]:
# Look up the token stored under integer id 5.
token = dictionary[5]
print(token)

show


In [20]:
# Show the class of the dictionary object as the cell output.
type(dictionary)

gensim.corpora.dictionary.Dictionary

In [26]:
# Iterating the dictionary yields token ids; index it to get each token.
for token_id in dictionary:
    token = dictionary[token_id]
    print(token_id, token)

0 'm
1 .
2 i
3 on
4 road
5 show
6 taking
7 the
8 a
9 are
10 force
11 multiplier
12 my
13 socks
14 's
15 am
16 barber
17 cut
18 cuts
19 does
20 everyone
21 hair
22 n't
23 own
24 their
25 who
26 has
27 is
28 it
29 legend
30 mad
31 mind
32 monkey
33 that
34 fun
35 make


In [23]:
# Reverse lookup: token2id maps a token string to its integer id.
road_id = dictionary.token2id['road']
print(road_id)

4


In [24]:
# Report the number of distinct tokens held by the dictionary.
vocabulary_size = len(dictionary)
print("Number of words in dictionary:", vocabulary_size)

Number of words in dictionary: 36


In [27]:
# Walk the integer ids 0 .. len(dictionary) - 1 and print each id with its token.
vocabulary_size = len(dictionary)
for token_id in range(vocabulary_size):
    print(token_id, dictionary[token_id])

0 'm
1 .
2 i
3 on
4 road
5 show
6 taking
7 the
8 a
9 are
10 force
11 multiplier
12 my
13 socks
14 's
15 am
16 barber
17 cut
18 cuts
19 does
20 everyone
21 hair
22 n't
23 own
24 their
25 who
26 has
27 is
28 it
29 legend
30 mad
31 mind
32 monkey
33 that
34 fun
35 make


#### A corpus is a list of bags of words. 

#### A bag-of-words representation for a document just lists the number of times each word occurs in the document.

In [28]:
# Convert every tokenized document into its bag-of-words form:
# a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, gen_docs))
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2)], [(1, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)], [(1, 1), (2, 1), (7, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2)], [(1, 1), (7, 1), (8, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)], [(1, 1), (2, 1), (12, 1), (23, 1), (34, 1), (35, 1)]]


In [29]:
# Fit a TF-IDF model on the bag-of-words corpus and print its summary.
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

# Count the (token_id, count) entries across all documents by hand, for
# comparison with the num_nnz figure in the summary printed above.
s = sum(len(bow) for bow in corpus)
print(s)

TfidfModel(num_docs=5, num_nnz=47)
47


In [38]:
# Build a similarity index over the TF-IDF-weighted corpus.
#
# The first argument is the prefix under which gensim stores index shards.
# Passing None lets gensim choose a random prefix in the system temp
# directory, so this cell does not depend on a machine-specific folder
# (previously the hard-coded 'c:/test/') existing and being writable.
# num_features must equal the vocabulary size, hence len(dictionary).
sims = gensim.similarities.Similarity(None, tf_idf[corpus],
                                      num_features=len(dictionary))
print(sims)
print(type(sims))

Similarity index with 5 documents in 0 shards (stored under c:/test/)
<class 'gensim.similarities.docsim.Similarity'>


Now take a new sentence and compare it against the indexed documents.

In [43]:
# Tokenize and lowercase the query sentence, mirroring the preprocessing
# applied to the corpus documents.
query_text = "Socks are a force for good."
query_doc = list(map(str.lower, word_tokenize(query_text)))
print(query_doc)

['socks', 'are', 'a', 'force', 'for', 'good', '.']


In [44]:
# Map the query tokens to (token_id, count) pairs using the existing dictionary.
query_doc_bow = dictionary.doc2bow(document=query_doc)
print(query_doc_bow)

[(1, 1), (8, 1), (9, 1), (10, 1), (13, 1)]


In [45]:
# Re-weight the query's bag-of-words vector with the fitted TF-IDF model;
# the result is a list of (token_id, weight) pairs.
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

[(8, 0.31226270667960454), (9, 0.5484803253891997), (10, 0.5484803253891997), (13, 0.5484803253891997)]


In [46]:
# Query the similarity index with the TF-IDF query vector; the output array
# holds one similarity score per indexed document, in corpus order.
sims[query_doc_tf_idf]

array([0.        , 0.84565616, 0.        , 0.06124881, 0.        ],
      dtype=float32)