Added sample code.

sandinmyjoints · Apr 23, 2012 · 17a67f6 · 17a67f6
1 parent 51d21c3
commit 17a67f6
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/gensim_example.py b/gensim_example.py
@@ -0,0 +1,89 @@
+import logging, sys, pprint
+
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # Send log records at INFO level and above to stdout.
+
+
+### Generating a training/background corpus
+from gensim.corpora import TextCorpus, MmCorpus, Dictionary
+
+# Provide a filename or a file-like object as input and TextCorpus will be automatically initialized with a
+# dictionary in `self.dictionary` and will support the `iter` corpus method. For other kinds of corpora, you only
+# need to override `get_texts` and provide your own implementation.
+background_corpus = TextCorpus(input=YOUR_CORPUS)  # YOUR_CORPUS is a placeholder that this script never defines; substitute your own filename or file-like object.
+
+background_corpus.dictionary.save("my_dict.dict")  # Important -- save the dictionary generated by the corpus, or future operations will not be able to map results back to original words.
+
+MmCorpus.serialize("background_corpus.mm", background_corpus)  # Persist the background corpus to disk in Matrix Market format. The file can be very large for a big corpus.
+
+
+### Generating a large training/background corpus using Wikipedia
+from gensim.corpora import WikiCorpus, wikicorpus
+
+articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download
+
+wiki_corpus = WikiCorpus(articles)  # This will take many hours! The result is Wikipedia as a bag-of-words (BOW) sparse matrix.
+wiki_corpus.dictionary.save("wiki_dict.dict")  # Save the dictionary built from the dump; it is loaded again below with Dictionary.load("wiki_dict.dict").
+
+MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  # Persist the corpus in Matrix Market format; it is reopened below as MmCorpus("wiki_corpus.mm"). File will be several GBs.
+
+
+### Working with persisted corpus and dictionary
+bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive the corpus that was serialized to "wiki_corpus.mm" above
+
+dictionary = Dictionary.load("wiki_dict.dict")  # Load the dictionary that was saved to "wiki_dict.dict" above
+
+
+### Transformations among vector spaces
+from gensim.models import LsiModel, LogEntropyModel
+
+logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary)  # Log Entropy weights frequencies of all document features in the corpus
+
+tokenize_func = wikicorpus.tokenize  # The tokenizer used to create the Wikipedia corpus
+document = "Some text to be transformed."
+bow_document = dictionary.doc2bow(tokenize_func(document))  # First, tokenize document using the same tokenization as was used on the background corpus, and then convert it to BOW representation using the dictionary created when generating the background corpus.
+logent_document = logent_transformation[[bow_document]]  # converts a single document to log entropy representation. document must be in the same vector space as corpus.
+
+documents = ["Some iterable", "containing multiple", "documents", "..."]
+bow_documents = (dictionary.doc2bow(tokenize_func(document)) for document in documents)  # use a generator expression because...
+logent_documents = logent_transformation[bow_documents]  # ...transformation is done during iteration of documents using generators, so this uses constant memory
+
+### Chained transformations
+logent_corpus = MmCorpus(corpus=logent_transformation[bow_corpus], id2word=dictionary)  # builds corpus from iterating over documents of bow_corpus as transformed to log entropy representation. Will also take many hours with Wikipedia corpus.
+
+lsi_transformation = LsiModel(corpus=logent_corpus, id2word=dictionary, num_features=400)  # creates LSI transformation model from log entropy corpus representation. Takes several hours with Wikipedia corpus.
+
+lsi_transformation = LsiModel(corpus=logent_transformation[bow_corpus], id2word=dictionary, num_features=400)  # Performs same operation as above, but with implicit chaining
+
+# Can persist transformation models, too.
+logent_transformation.save("logent.model")
+lsi_transformation.save("lsi.model")
+
+
+### Similarities (the best part)
+from gensim.similarities import Similarity
+
+documents = ["A bear walked in the dark forest.",
+             "Tall trees have many more leaves than short bushes.",
+             "A starship may someday travel across vast reaches of space to other stars.",
+             "Difference is the concept of how two or more entities are not the same."]
+# A corpus can be anything, as long as iterating over it produces a representation of the corpus documents as vectors.
+corpus = (dictionary.doc2bow(tokenize_func(document)) for document in documents)
+
+index = Similarity(corpus=lsi_transformation[logent_transformation[corpus]], num_features=400, output_prefix="shard")
+
+print "Index corpus:"
+pprint.pprint(documents)
+
+print "Similarities of index corpus documents to one another:"
+pprint.pprint([s for s in index])
+
+query = "In the face of ambiguity, refuse the temptation to guess."
+sims_to_query = index[lsi_transformation[logent_transformation[dictionary.doc2bow(tokenize_func(query))]]]
+print "Similarities of index corpus documents to '%s'" % query
+pprint.pprint(sims_to_query)
+
+best_score = max(sims_to_query)
+index = sims_to_query.tolist().index(best_score)
+most_similar_doc = documents[index]
+print "The document most similar to the query is '%s' with a score of %.2f." % (most_similar_doc, best_score)