In [1]:
import gensim
import multiprocessing
from gensim.corpora.csvcorpus import CsvCorpus
from gensim import utils
from gensim.models.word2vec import LineSentence, Word2Vec
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

In [2]:
texts = gensim.models.word2vec.LineSentence('Corpus/dump_mp.csv')

In [3]:
%time word2vec_model = Word2Vec(texts, size=400, window=5, min_count=5, workers=multiprocessing.cpu_count())

CPU times: user 24min 8s, sys: 6.43 s, total: 24min 14s
Wall time: 8min 27s


In [4]:
word2vec_model.init_sims(replace=True)
word2vec_model.save('Corpus/mp.word2vec.model')

In [5]:
dictionary = Dictionary(texts)

In [6]:
dictionary.save('Corpus/mp.dict')
dictionary = Dictionary.load('Corpus/mp.dict')

In [7]:
corpus = [dictionary.doc2bow(text) for text in texts]
MmCorpus.serialize('Corpus/mp.mm', corpus)
%time corpus = MmCorpus('Corpus/mp.mm')

CPU times: user 1.2 s, sys: 116 ms, total: 1.31 s
Wall time: 1.31 s


In [45]:
%time lsi = LsiModel(corpus, id2word=dictionary, num_topics=5)

CPU times: user 2min 46s, sys: 11.4 s, total: 2min 57s
Wall time: 2min 9s


In [46]:
lsi.save('Corpus/mp.lsi.model')

In [47]:
doc = "ativo financeiro"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space

In [48]:
%time index = MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it



CPU times: user 2min 55s, sys: 938 ms, total: 2min 56s
Wall time: 2min 56s


In [49]:
index.save('Corpus/mp.index')
index = MatrixSimilarity.load('Corpus/mp.index')

In [50]:
sims = index[vec_lsi] # perform a similarity query against the corpus
#print(list(enumerate(sims))) # print (document_number, document_similarity) 2-tuples

In [51]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims) # print sorted (document number, similarity score) 2-tuples

[(18855, 0.99951351), (17670, 0.99915648), (34119, 0.99877572), (32597, 0.99873018), (11325, 0.99869323), (11198, 0.99867618), (34116, 0.99855161), (23986, 0.99852705), (32594, 0.99844927), (35039, 0.99835002), (32637, 0.998299), (24500, 0.99825704), (36882, 0.9982214), (34612, 0.99820912), (13119, 0.99820638), (23991, 0.99820578), (13238, 0.99819767), (34649, 0.99814475), (38260, 0.99808776), (32640, 0.99807382), (24501, 0.99804366), (37686, 0.99802703), (24974, 0.99796546), (37684, 0.99786294), (11332, 0.99776691), (36883, 0.99773747), (20490, 0.99773169), (33709, 0.99768662), (5427, 0.99766016), (11327, 0.99763429), (20492, 0.9975338), (24499, 0.9975031), (1001, 0.99742746), (23989, 0.99742377), (36488, 0.99726772), (34654, 0.99724233), (11333, 0.99720281), (25247, 0.99716699), (34618, 0.99716181), (16456, 0.99706686), (20472, 0.99702251), (19218, 0.99700713), (25250, 0.99700439), (34623, 0.99692059), (13241, 0.99680924), (32642, 0.99680531), (37663, 0.99666393), (35034, 0.99666131)

In [52]:
lsi.num_topics

5

In [54]:
lsi.print_topics(5)

[(0,
  u'0.445*"rg" + 0.362*"sao" + 0.278*"paulo" + 0.251*"partir" + 0.249*"dias" + 0.196*"ee" + 0.173*"art" + 0.140*"sp" + 0.138*"lei" + 0.133*"efp"'),
 (1,
  u'-0.556*"rg" + 0.469*"sao" + 0.352*"paulo" + -0.275*"ee" + 0.149*"efp" + 0.148*"art" + 0.112*"ltda" + 0.112*"dias" + -0.110*"peb" + -0.102*"sqc"'),
 (2,
  u'0.395*"ltda" + -0.293*"partir" + -0.272*"efp" + -0.261*"dias" + 0.252*"sp" + -0.239*"art" + 0.207*"valor" + 0.164*"item" + 0.137*"municipal" + 0.129*"processo"'),
 (3,
  u'-0.505*"sao" + -0.363*"paulo" + 0.341*"dias" + 0.318*"partir" + 0.299*"art" + 0.279*"efp" + 0.179*"ltda" + 0.146*"lei" + -0.138*"rg" + -0.130*"ee"'),
 (4,
  u'-0.627*"merc" + -0.573*"panamericanoarr" + -0.519*"sa" + -0.061*"panamericano" + -0.033*"arr" + -0.031*"banco" + -0.020*"aarr" + -0.019*"leas" + -0.019*"dibens" + 0.012*"ltda"')]