In [1]:
import os, gensim

def iter_documents(top_directory):
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for file in filter(lambda file: file.endswith('.txt'), files):
            document = open(os.path.join(root, file), encoding='utf8').read() # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True) # or whatever tokenization suits you

class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=1, keep_n=30000) # check API docs for pruning params

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

            
folder = 'WMT_XML'       

corpus = MyCorpus(folder) # create a dictionary

In [2]:
import logging
import os
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np

os.environ['DTM_PATH'] = r"dtm-win64.exe"

if not os.environ.get('DTM_PATH', None):
    raise ValueError("SKIP: You need to set the DTM path")

In [3]:
import glob

time_seq = []

for year in range(2015, 2020):
    docs_num = len(glob.glob(folder + "\\" + str(year) + "*"))
    print(f"Year {year} has {docs_num} documents")
    time_seq.append(docs_num)

Year 2015 has 139 documents
Year 2016 has 142 documents
Year 2017 has 204 documents
Year 2018 has 222 documents
Year 2019 has 183 documents


In [4]:
time_seq

[139, 142, 204, 222, 183]

In [5]:
dtm_path = os.environ['DTM_PATH']

In [6]:
num_topics = 5

model = DtmModel(dtm_path, corpus, time_seq, num_topics=num_topics,
                 id2word=corpus.dictionary, initialize_lda=True)

ValueError: mismatched timeslices %890 for corpus of len 1655

In [None]:
topic_list = []
show_top = 7

for id in range(num_topics):
    topic = [model.show_topic(topicid=id, time=time, topn=show_top) for time in range(5)]
    topic_list.append(topic)

In [None]:
topic_list[0]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
x = range(2015,2015+len(time_seq))

for index, topic in enumerate(topic_list):
    plt.title("Topic " + str(index+1))

    for word in range(show_top):
        prob = [sub[word] for sub in topic]
        values = [probability[0] for probability in prob]
        plt.plot(x, values, label = prob[0][1])
    
    plt.xticks(np.arange(2015, 2020, step = 1))
    plt.ylabel("Word Distribution")
    plt.xlabel("Year")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    plt.show()

In [None]:
for topic in range(num_topics):
    print(f'Topic {topic}')
    print(model.print_topics()[topic])
    print()