In [1]:
import logging

# Emit timestamped records at INFO level and above through the root logger.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# 1. From Strings to Vectors

In [19]:
from gensim import corpora

# Toy corpus: nine documents, each one a single sentence.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

Tokenize the documents, then remove common words (using a toy stoplist) and words that appear only once in the corpus.

In [20]:
from collections import defaultdict
from pprint import pprint

# Step 1: lowercase each document, split it on whitespace, and drop stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)
pprint(texts)

# Step 2: count how often each remaining token occurs across the whole corpus.
frequency = defaultdict(int)
for tokens in texts:
    for word in tokens:
        frequency[word] += 1
pprint(frequency)

# Step 3: keep only the tokens that occur more than once in the corpus.
frequent_texts = []
for tokens in texts:
    frequent_texts.append([word for word in tokens if frequency[word] > 1])
texts = frequent_texts
pprint(texts)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]
defaultdict(<class 'int'>,
            {'abc': 1,
             'applications': 1,
             'binary': 1,
             'computer': 2,
             'engineering': 1,
             'eps': 2,
             'error': 1,
             'generation': 1,
             'graph': 3,
             'human': 2,
             'interface': 2,
             'intersection': 1,
             'iv': 1,
             'lab': 1,
             'machine': 1,
       

To convert documents to vectors, we’ll use a document representation called bag-of-words.

In [21]:
# Build the token -> integer id mapping from the filtered texts, then save it to disk.
dictionary = corpora.Dictionary(documents=texts)
dictionary.save('01data/deerwester.dict')
print(dictionary)

2018-03-14 15:58:27,552 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-14 15:58:27,555 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2018-03-14 15:58:27,556 : INFO : saving Dictionary object under 01out/deerwester.dict, separately None
2018-03-14 15:58:27,560 : INFO : saved 01out/deerwester.dict


Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [22]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [23]:
# doc2bow() counts the occurrences of each distinct word, replaces the word
# with its integer id, and returns the result as a sparse vector of
# (word_id, count) pairs.
new_doc = 'Human computer interaction'
new_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(new_tokens)
print(new_vec)

[(0, 1), (1, 1)]


The sparse vector [(0, 1), (1, 1)] therefore reads: in the document “Human computer interaction”, the words computer (id 0) and human (id 1) appear once; the other ten dictionary words appear (implicitly) zero times.

In [24]:
# Convert every tokenized document to its bag-of-words vector, then write the
# resulting corpus to disk in Matrix Market format.
corpus = list(map(dictionary.doc2bow, texts))
corpora.MmCorpus.serialize('01data/deerwester.mm', corpus)
pprint(corpus)

2018-03-14 15:58:30,442 : INFO : storing corpus in Matrix Market format to 01out/deerwester.mm
2018-03-14 15:58:30,445 : INFO : saving sparse matrix to 01out/deerwester.mm
2018-03-14 15:58:30,447 : INFO : PROGRESS: saving document #0
2018-03-14 15:58:30,449 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2018-03-14 15:58:30,453 : INFO : saving MmCorpus index to 01out/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


# 2. Corpus Streaming – One Document at a Time
Assume the documents are stored in a file on disk, one document per line. Gensim only requires that a corpus be able to return one document vector at a time.

In [None]:
class MyCorpus(object):
    """Iterable corpus that reads ``01data/mycorpus.txt`` one line at a time.

    Each line of the file is treated as one document: it is lowercased, split
    on whitespace, and converted to a bag-of-words vector by the module-level
    ``dictionary``. Every call to ``iter()`` reopens the file, so the corpus
    can be iterated more than once.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(...)`` for each line of the corpus file."""
        # The `with` block closes the file as soon as iteration finishes (or
        # the generator is closed early) instead of leaving the handle open
        # until it is garbage collected.
        with open('01data/mycorpus.txt') as corpus_file:
            for line in corpus_file:
                yield dictionary.doc2bow(line.lower().split())