In [2]:
import os, gensim

def iter_documents(top_directory):
    """Walk ``top_directory`` and yield one tokenized document per ``.txt`` file.

    Every file whose name ends in ``.txt`` anywhere below ``top_directory`` is
    read in full as UTF-8 text and passed to
    ``gensim.utils.tokenize(document, lower=True)``; the result of that call
    (an iterable of tokens) is yielded unchanged, one per file.

    Documents are produced in ``os.walk`` order.
    NOTE(review): that order is not sorted here, so any per-document alignment
    done by callers (e.g. time slices) depends on directory listing order.

    Args:
        top_directory: Root directory to search recursively.

    Yields:
        The token iterable for each ``.txt`` file found.

    Raises:
        OSError: If a matching file cannot be opened or read.
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    for root, dirs, files in os.walk(top_directory):
        for file in files:
            if not file.endswith('.txt'):
                continue
            # Read inside a ``with`` block so the handle is closed before the
            # generator is suspended at ``yield``; the previous bare
            # ``open(...).read()`` left closing to the garbage collector.
            with open(os.path.join(root, file), encoding='utf8') as handle:
                document = handle.read()  # the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True)  # or whatever tokenization suits you

class MyCorpus(object):
    """Streamed bag-of-words corpus over the documents below a directory.

    Construction makes one pass over ``iter_documents(top_dir)`` to build a
    ``gensim.corpora.Dictionary`` and then prunes it with
    ``Dictionary.filter_extremes``.  Each iteration over the instance makes a
    fresh pass over ``iter_documents(top_dir)`` and yields
    ``dictionary.doc2bow(tokens)`` for every document, so documents are never
    all held in memory by this class.

    Attributes are documented in ``__init__``.
    """

    def __init__(self, top_dir, no_below=1, no_above=0.5, keep_n=30000):
        """Build and prune the dictionary for the documents under ``top_dir``.

        Args:
            top_dir: Directory handed to ``iter_documents``.
            no_below: Forwarded to ``filter_extremes``; defaults to the value
                this class previously hard-coded (1).
            no_above: Forwarded to ``filter_extremes``.  The original call did
                not pass it, so 0.5 (gensim's documented default) keeps the
                previous behaviour.
            keep_n: Forwarded to ``filter_extremes``; defaults to the value
                this class previously hard-coded (30000).
        """
        self.top_dir = top_dir  # remembered so __iter__ can re-walk the directory
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        # Pruning thresholds are parameters now; check the gensim API docs for
        # their exact meaning before changing them.
        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)

    def __iter__(self):
        """Yield ``self.dictionary.doc2bow(tokens)`` for each document, in ``iter_documents`` order."""
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

# Streamed corpus over the 'Annual_Text' directory; iterating it yields one bag-of-words per document.
corpus = MyCorpus('Annual_Text') # constructing it also builds and prunes corpus.dictionary

In [3]:
import logging
import os
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np

# Location of the DTM executable, published through the DTM_PATH environment
# variable.  A DTM_PATH that is already set wins, so the notebook can run on
# another machine without editing this cell; the literal below is only the
# fallback (a path on the original author's machine).
os.environ.setdefault(
    'DTM_PATH',
    r"C:\Users\cgn31\OneDrive\Documents\GitHub\Capstone-Project\download test\dtm-win64.exe",
)

# setdefault() leaves an existing-but-empty value untouched, so this guard
# still catches a blank DTM_PATH.
if not os.environ.get('DTM_PATH', None):
    raise ValueError("SKIP: You need to set the DTM path")

In [4]:
# Passed to DtmModel as its third positional argument: five entries of 1.
# NOTE(review): presumably the number of documents in each time slice, in
# corpus order, so the entries should sum to the document count — confirm
# against the DtmModel documentation.
time_seq = [1] * 5

In [5]:
# Path handed to DtmModel, read from the DTM_PATH environment variable
# (raises KeyError if the variable is not set).
dtm_path = os.environ['DTM_PATH']

In [6]:
# Build the dynamic topic model through gensim's DtmModel wrapper around the
# external executable at `dtm_path`.
model = DtmModel(
    dtm_path,                   # executable path from DTM_PATH
    corpus,                     # streamed bag-of-words documents
    time_seq,                   # per-slice sequence defined in an earlier cell
    num_topics=2,
    id2word=corpus.dictionary,  # same dictionary that produced the bag-of-words ids
    initialize_lda=True,
)

In [7]:
# Up to 10 words for topic id 1 at time index 1.
# NOTE(review): both indices look zero-based (i.e. second topic, second
# slice) — confirm that this is the slice intended.
topics = model.show_topic(
    topicid=1,
    time=1,
    num_words=10,
)



In [8]:
# Display the result as the cell output; the recorded output shows (weight, word) pairs.
topics

[(0.03664654251207798, 'cty'),
 (0.02252856189870575, 'grandfathered'),
 (0.01980893607634881, 'purdue'),
 (0.01980893607634881, 'pharma'),
 (0.014654836145355487, 'omni'),
 (0.013961373636768297, 'comm'),
 (0.013047094744340184, 'flipkart'),
 (0.012314654499869652, 'super'),
 (0.010664254330411353, 'saye'),
 (0.010062931414443576, 'cir')]