In [2]:
import os, gensim

def iter_documents(top_directory):
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for file in filter(lambda file: file.endswith('.txt'), files):
            document = open(os.path.join(root, file), encoding='utf8').read() # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True) # or whatever tokenization suits you

class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=1, keep_n=30000) # check API docs for pruning params

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

In [106]:
walmart_corpus = MyCorpus('WMT_XML')
amazon_corpus  = MyCorpus('AMZN_XML')
costco_corpus  = MyCorpus('COST_XML')
target_corpus  = MyCorpus('TGT_XML')
kroger_corpus  = MyCorpus('KR_XML')

In [107]:
from gensim.models import LdaModel
ntopic = 20

lda_walmart = LdaModel(walmart_corpus, num_topics=ntopic, id2word=walmart_corpus.dictionary)
lda_amazon  = LdaModel(amazon_corpus , num_topics=ntopic, id2word=amazon_corpus.dictionary)
lda_costco  = LdaModel(costco_corpus , num_topics=ntopic, id2word=costco_corpus.dictionary)
lda_target  = LdaModel(target_corpus , num_topics=ntopic, id2word=target_corpus.dictionary)
lda_kroger  = LdaModel(kroger_corpus , num_topics=ntopic, id2word=kroger_corpus.dictionary)

In [108]:
names = [("Walmart", walmart_corpus, lda_walmart), 
         ("Amazon", amazon_corpus, lda_amazon),
         ("Costco", costco_corpus, lda_costco), 
         ("Target", target_corpus, lda_target), 
         ("Kroger", kroger_corpus, lda_kroger)]

In [109]:
import itertools
names_prod = list(itertools.product(names, names))

In [110]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [111]:
from collections import OrderedDict

def average_topic(flat_list):

    d = OrderedDict()
    for prob, topic in flat_list:
        d.setdefault(topic, []).append(prob)

    d = [(sum(v) / len(v), k) for k, v in d.items()]
    
    return(d)

In [116]:
for i in range(20):
    print(lda_walmart.show_topic(topicid=i))

[('po', 0.010568354), ('overflow', 0.010198054), ('proxy', 0.009976957), ('middle', 0.009881891), ('director', 0.009625579), ('meeting', 0.0087029645), ('publisher', 0.008557172), ('incentive', 0.00793756), ('role', 0.007853022), ('hidden', 0.0077880383)]
[('middle', 0.02719071), ('hidden', 0.026834177), ('overflow', 0.02457924), ('role', 0.013327985), ('solid', 0.012283826), ('publisher', 0.012236), ('fae', 0.012065034), ('topic', 0.011734632), ('codification', 0.011656206), ('meeting', 0.009445529)]
[('publisher', 0.026780529), ('role', 0.025339868), ('overflow', 0.024932716), ('solid', 0.023256136), ('hidden', 0.022070892), ('middle', 0.019262971), ('codification', 0.014741223), ('topic', 0.012385511), ('double', 0.01102255), ('prefix', 0.010890941)]
[('hidden', 0.019747863), ('overflow', 0.01911846), ('solid', 0.016834969), ('span', 0.016690116), ('indenture', 0.015170936), ('role', 0.014171551), ('middle', 0.0135618225), ('codification', 0.013368087), ('publisher', 0.009387346), (

In [113]:
import operator

for index, pair in enumerate(names_prod[:5]):
    model_txt  = pair[0][0]
    corpus_txt = pair[1][0]
    
    model  = pair[0][2]
    corpus = pair[1][1]
    
    print(f"{model_txt} model applied to {corpus_txt} documents")
    tag = [model.get_document_topics(item) for item in corpus]
    tag = [tup[::-1] for tup in flatten(tag)]
    
    topic_avg = average_topic(tag)
    topic_avg = sorted(topic_avg, key = lambda x: x[1])
    
    print(topic_avg)
    print()

Walmart model applied to Walmart documents
[(0.5351285361469938, 0), (0.3512907636097886, 1), (0.2870668436993252, 2), (0.6492803201852542, 3), (0.43556032329797745, 4), (0.38744891180910845, 5), (0.1578330531483516, 6), (0.275230953656137, 7), (0.5797262836633057, 8), (0.3702181575113329, 9), (0.18750847125839856, 10), (0.4245526646781299, 11), (0.4487048659939319, 12), (0.40819737014289087, 13), (0.7536786735057831, 14), (0.5062841631401822, 15), (0.3428046562605434, 16), (0.09892254579989683, 17), (0.561461015811397, 18), (0.4102288049590938, 19)]

Walmart model applied to Amazon documents
[(0.2984454673786576, 0), (0.1552486481765906, 1), (0.06550457083027471, 2), (0.12270808423107321, 3), (0.07920647785067558, 6), (0.03276656794228724, 7), (0.18427568819310705, 8), (0.043299071863293646, 9), (0.049793027341365814, 10), (0.05468397711714109, 11), (0.048771288358803955, 12), (0.025112731692691643, 13), (0.36130399197122476, 15), (0.026213195830307626, 16), (0.14118828591616714, 18),