In [19]:
### Calling this will fail immediately: the body runs eagerly and calls os.walk, but os was never imported
def fetch_documents(top_directory):
    """
    Eagerly walk ``top_directory`` and return the bare file name of the
    first ``.txt`` file encountered, or ``None`` when there is none.

    Despite its name this is a plain function, not a generator: the whole
    body executes as soon as the function is called.
    """
    # descend through every sub-directory below top_directory
    for _root, _dirs, names in os.walk(top_directory):
        for name in names:
            if name.endswith('.txt'):
                # stop at the very first match and hand back its name
                return name

In [20]:
# A regular function executes its body at call time, so the os.walk call
# inside fetch_documents runs right here and raises the NameError shown below.
exec_obj = fetch_documents("./dataset")
#print(obj)

NameError: name 'os' is not defined

In [1]:
### Defining and calling this will not fail: a generator's body is evaluated lazily, only when the first item is requested
def iter_documents(top_directory):
    """
    Generator: lazily walk ``top_directory`` and yield the bare file name
    of every ``.txt`` file found, no matter how deeply it is nested.

    Nothing in the body runs until the first item is requested.
    """
    for _root, _dirs, names in os.walk(top_directory):
        # keep only the text documents of the current directory
        txt_names = (name for name in names if name.endswith('.txt'))
        for name in txt_names:
            yield name

In [4]:
# Calling a generator function only builds the generator object; none of its
# body (and therefore no os.walk call) has run yet, so no error is raised here.
iter_obj = iter_documents("./dataset")
iter_obj

<generator object iter_documents at 0x0000028777A0AE60>

In [5]:
## this will now fail when we try to fetch a doc: the first next() actually runs
## the generator body, which reaches os.walk while os has never been imported
next(iter_obj)

NameError: name 'os' is not defined

### Let's do something meaningful...

The task is to tokenize all the .txt files in the dataset directory.

#### Why tokenization?
All the algorithms, from Bag of Words to LDA to Word2Vec, take tokens as their input.

We need to transform our text files into tokens so that we can pass them to these algorithms in downstream operations.

In [6]:
def iter_documents(top_directory):
    """
    Generator: iterate over all relevant documents, yielding one
    tokenized document at a time.

    Parameters
    ----------
    top_directory : str
        Root directory; it is searched recursively with ``os.walk``.

    Yields
    ------
    The result of ``gensim.utils.tokenize(document, lower=True,
    errors='ignore')`` for each ``.txt`` file, i.e. the lower-cased tokens
    of that document.
    """
    # find all .txt documents, no matter how deep under top_directory
    for root, dirs, files in os.walk(top_directory):
        for fname in files:
            if not fname.endswith('.txt'):
                continue
            # Read each document as one big byte string. The `with` block
            # closes the handle as soon as the read is done instead of leaking
            # one open file per document. Binary mode leaves decoding to
            # gensim, so errors='ignore' really does skip undecodable bytes
            # rather than open() raising UnicodeDecodeError on Python 3.
            with open(os.path.join(root, fname), 'rb') as handle:
                document = handle.read()
            # break document into utf8 tokens
            yield gensim.utils.tokenize(document, lower=True, errors='ignore')

##### We need to create a class that implements `__iter__`; otherwise single-use generators are our only option

What are generators? They are language-provided iterators that can be consumed only once. Refer to the notebook on Generators.

In [7]:
class TxtSubdirsCorpus(object):
    """
    Streamed corpus over the documents found below a directory.

    Every pass over an instance produces one bag-of-words vector per
    document. Documents are pulled through generators one at a time, so
    the corpus as a whole never has to sit in RAM.
    """

    def __init__(self, top_dir):
        # first pass over the documents: build the mapping used to turn
        # documents into sparse vectors
        token_stream = iter_documents(top_dir)
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(token_stream)

    def __iter__(self):
        """
        Generator method: each call starts a fresh walk over the documents,
        which is what lets the corpus be iterated more than once.
        """
        for doc_tokens in iter_documents(self.top_dir):
            # convert this document's tokens into its sparse vector
            bow_vector = self.dictionary.doc2bow(doc_tokens)
            yield bow_vector

In [None]:
# that's it! the streamed corpus of sparse vectors is ready
corpus = TxtSubdirsCorpus('./dataset/')

# print the corpus vectors, one bag-of-words vector per document
for vector in corpus:
    # call form of print: valid on Python 3, and prints the same single
    # value on Python 2 (the bare `print vector` statement is a SyntaxError
    # on Python 3)
    print(vector)