In [28]:
# Root logging is configured before gensim is imported: messages at INFO level
# or above are printed as "<time> : <level> : <message>".
# NOTE(review): presumably the ordering is deliberate so that log output emitted
# during the gensim import is formatted too -- confirm before reordering.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Later cells use corpora (Dictionary, MmCorpus), models (LsiModel) and
# similarities (MatrixSimilarity).
from gensim import corpora, models, similarities
# defaultdict backs the token frequency count in the preprocessing step below.
from collections import defaultdict
from pprint import pprint   # pretty-printer

# Nine short sample documents.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Step 1: lowercase every document, split it on whitespace and discard
# the stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    kept_words = []
    for word in document.lower().split():
        if word in stoplist:
            continue
        kept_words.append(word)
    texts.append(kept_words)

# Step 2: count how many times each remaining token occurs over all documents.
frequency = defaultdict(int)
for token in (tok for tokens in texts for tok in tokens):
    frequency[token] = frequency[token] + 1

# Step 3: keep only the tokens that occur more than once overall.
repeated_only = []
for tokens in texts:
    repeated_only.append([tok for tok in tokens if frequency[tok] > 1])
texts = repeated_only

pprint(texts)


[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [5]:
# Build the token <-> integer id dictionary from the tokenized documents.
dictionary = corpora.Dictionary(texts)

# store the dictionary, for future reference
dictionary_path = '/home/robihidayat/Workshop/Python/Project DataBase/deerwester.dict'
dictionary.save(dictionary_path)

print(dictionary)

Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)


In [6]:
# Show the integer id the dictionary assigned to each token.
token_ids = dictionary.token2id
print(token_ids)

{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}


In [7]:
# Turn each tokenized document into a list of (token id, count) pairs.
corpus = []
for text in texts:
    corpus.append(dictionary.doc2bow(text))

# store to disk, for later use
corpus_path = '/home/robihidayat/Workshop/Python/Project DataBase/deerwester.mm'
corpora.MmCorpus.serialize(corpus_path, corpus)

print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(0, 1), (6, 1), (7, 1), (8, 1)], [(2, 1), (6, 2), (8, 1)], [(3, 1), (4, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(5, 1), (10, 1), (11, 1)]]


In [18]:
# convert the corpus to other corpus formats
#corpora.SvmLightCorpus.serialize('/home/robihidayat/Workshop/Python/Project DataBase/corpus.svmlight', corpus)
#corpora.BleiCorpus.serialize('/home/robihidayat/Workshop/Python/Project DataBase/corpus.lda-c', corpus)
#corpora.LowCorpus.serialize('/home/robihidayat/Workshop/Python/Project DataBase/corpus.low', corpus)

In [8]:
# Fit a two-topic LSI model on the bag-of-words corpus.
lsi = models.LsiModel(corpus, num_topics=2, id2word=dictionary)

# Query text: tokenize it the same way as the documents (lowercase,
# whitespace split) and map it to bag-of-words form.
doc = "Human"
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)

# convert the query to LSI space
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 0.22135077844281972), (1, 0.11317961736693796)]


In [22]:
# transform corpus to LSI space and index it
corpus_lsi = lsi[corpus]
index = similarities.MatrixSimilarity(corpus_lsi)

# Optional persistence of the index, left disabled:
#index.save('/home/robihidayat/Workshop/Python/Project DataBase/deerwester.index')
#index = similarities.MatrixSimilarity.load('/home/robihidayat/Workshop/Python/Project DataBase/deerwester.index')

# Similarity of the query against every indexed document, in corpus order.
scores = index[vec_lsi]
print(list(enumerate(scores)))

# (document position, score) pairs ordered from highest to lowest score.
sims = sorted(enumerate(scores), key=lambda pair: -pair[1])
# documents[1]



[(0, 0.96628642), (1, 0.7790606), (2, 0.96472478), (3, 0.98751092), (4, 0.72790748), (5, -0.4319548), (6, -0.41574723), (7, -0.40878829), (8, -0.26876175)]


In [25]:
# Print every document followed by its similarity score, one per line, in the
# order stored in `sims` ((document position, score) pairs).
# A single pre-formatted string is passed to print(...) so the line produces
# the same "<document> <score>" text under the Python 2 print statement and the
# Python 3 print function, matching the print(...) call form of the other cells.
for doc_id, score in sims:
    print('%s %s' % (documents[doc_id], score))

System and human system engineering testing of EPS ,  0.987511
Human machine interface for lab abc computer applications ,  0.966286
The EPS user interface management system ,  0.964725
A survey of user opinion of computer system response time ,  0.779061
Relation of user perceived response time to error measurement ,  0.727907
Graph minors A survey ,  -0.268762
Graph minors IV Widths of trees and well quasi ordering ,  -0.408788
The intersection graph of paths in trees ,  -0.415747
The generation of random binary unordered trees ,  -0.431955
