In [2]:
import os, gensim

def iter_documents(top_directory):
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for file in filter(lambda file: file.endswith('.txt'), files):
            document = open(os.path.join(root, file), encoding='utf8').read() # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True) # or whatever tokenization suits you

class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=1, keep_n=30000) # check API docs for pruning params

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

In [25]:
walmart_corpus = MyCorpus('WMT_XML')
amazon_corpus  = MyCorpus('AMZN_XML')
costco_corpus  = MyCorpus('COST_XML')
target_corpus  = MyCorpus('TGT_XML')
kroger_corpus  = MyCorpus('KR_XML')

In [37]:
from gensim.models import LdaModel
lda_walmart = LdaModel(walmart_corpus, num_topics=10, id2word=walmart_corpus.dictionary)
lda_amazon  = LdaModel(amazon_corpus , num_topics=10, id2word=amazon_corpus.dictionary)
lda_costco  = LdaModel(costco_corpus , num_topics=10, id2word=costco_corpus.dictionary)
lda_target  = LdaModel(target_corpus , num_topics=10, id2word=target_corpus.dictionary)
lda_kroger  = LdaModel(kroger_corpus , num_topics=10, id2word=kroger_corpus.dictionary)

In [80]:
names = [("Walmart", walmart_corpus, lda_walmart), 
         ("Amazon", amazon_corpus, lda_amazon),
         ("Costco", costco_corpus, lda_costco), 
         ("Target", target_corpus, lda_target), 
         ("Kroger", kroger_corpus, lda_kroger)]


In [81]:
import itertools
names_prod = list(itertools.product(names, names))

In [38]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [67]:
from collections import OrderedDict

def average_topic(flat_list):

    d = OrderedDict()
    for prob, topic in flat_list:
        d.setdefault(topic, []).append(prob)

    d = [(sum(v) / len(v), k) for k, v in d.items()]
    
    return(d)

In [103]:
import operator

for index, pair in enumerate(names_prod[:5]):
    model_txt  = pair[0][0]
    corpus_txt = pair[1][0]
    
    model  = pair[0][2]
    corpus = pair[1][1]
    
    print(f"{model_txt} model applied to {corpus_txt} documents")
    tag = [model.get_document_topics(item) for item in corpus]
    tag = [tup[::-1] for tup in flatten(tag)]
    
    topic_avg = average_topic(tag)
    topic_avg = sorted(topic_avg, key = lambda x: x[1])
    
    print(topic_avg)
    print()

Walmart model applied to Walmart documents
[(0.31775094853448016, 0), (0.6921117425738991, 1), (0.4539582208888803, 2), (0.587554996434067, 3), (0.5921518906521109, 4), (0.3510176306590438, 5), (0.23573007574304938, 6), (0.03703961428254843, 7), (0.4609524873235533, 8), (0.587985881648603, 9)]

Walmart model applied to Amazon documents
[(0.08312106970697641, 0), (0.2005708650976885, 1), (0.41248021469348006, 2), (0.23937216152747473, 3), (0.2566982574125593, 4), (0.019593778997659683, 5), (0.050565017946064474, 6), (0.10522330237122682, 7), (0.14487355951111294, 8), (0.08065227537216353, 9)]

Walmart model applied to Costco documents
[(0.06984300389885903, 0), (0.24439339786065076, 1), (0.34193369271760843, 2), (0.3664987232941802, 4), (0.02445489577949047, 6), (0.25081570621293325, 8), (0.09493264649063349, 9)]

Walmart model applied to Target documents
[(0.02922035888251331, 0), (0.2843753030716345, 1), (0.4404372685784652, 2), (0.025487614950786035, 3), (0.2778961125406481, 4), (0.0