In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.concat(map(pd.read_csv, 
                   ['walmart.csv', 
                    'target.csv',
                    'amazon.csv',
                    'costco.csv',
                    'kroger.csv']),ignore_index = True)

df.loc[:,'tokens_final'] = \
df.loc[:,'tokens_final'].apply(lambda x: literal_eval(x))

def replace_in_list(lis, old, new):
    for i in range(len(lis)):
        if lis[i] == old:
            lis[i] = new
    return(lis)

df['tokens_final'] = \
df['tokens_final'].map(lambda x: replace_in_list(x,"employee", "associate"))

df['tokens_final'] = \
df['tokens_final'].map(lambda x: replace_in_list(x,"guest", "customer"))

df['tokens_final'] = \
df['tokens_final'].map(lambda x: replace_in_list(x,"consumer", "customer"))

remove_words =['performance','officer','value','award','amount',
               'cash','chairman','vice','option','president',
               'base','inc.','grant','voting','election','unit',
               'audit','benefit','date','service','management',
               'include','number','name','person','proposal',
               'section','report','cost','receive','pension',
               'rate','interest','fuel','rate','serve','sale','pension']

df["tokens_final"] = \
    df["tokens_final"].map(lambda x: \
    [word for word in x if word.lower() not in remove_words])

In [3]:
df

Unnamed: 0,year,company,tokens_final
0,2010,Walmart,"[street, website, www.walmartstores.com, notic..."
1,2010,Walmart,"[wal-mart, store, street, website, www.walmart..."
2,2010,Walmart,"[nonqualified, compensation, potential, paymen..."
3,2010,Walmart,"[compensation, store, compensation, amend, jan..."
4,2010,Walmart,"[nominee, statement, company, ratification, ap..."
...,...,...,...
3710,2019,Kroger,"[company, termination, cause, treatment, provi..."
3711,2019,Kroger,"[purpose, approve, company, provide, provision..."
3712,2019,Kroger,"[security, restriction, represent, book, accou..."
3713,2019,Kroger,"[case, jurisdiction, approve, discretion, prov..."


In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.test.utils import common_corpus, common_dictionary

import pprint
pp = pprint.PrettyPrinter(indent=4)

In [5]:
#ran once to write files

# import os
# os.mkdir("temp_text")

# companies = df["company"].drop_duplicates().values

# for company in companies:
#     os.mkdir("temp_text\\" + company)
    
# for index, row in df.iterrows():
#     company = row["company"]
#     year = row["year"]
#     text = ' '.join(row["tokens_final"])
    
#     with open(f"temp_text\\{company}\\{year}_{company}_{index}.txt", 'w', encoding='utf8') as f:
#         f.write(text)

In [6]:
import os, gensim

def iter_documents(top_directory):
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for file in filter(lambda file: file.endswith('.txt'), files):
            document = open(os.path.join(root, file), encoding='utf8').read() # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True) # or whatever tokenization suits you

class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=1, keep_n=30000) # check API docs for pruning params

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

In [7]:
walmart_corpus = MyCorpus('temp_text/walmart')
amazon_corpus  = MyCorpus('temp_text/amazon')
costco_corpus  = MyCorpus('temp_text/costco')
target_corpus  = MyCorpus('temp_text/target')
kroger_corpus  = MyCorpus('temp_text/kroger')

In [8]:
from gensim.models import LdaModel
ntopic = 10

lda_walmart = LdaModel(walmart_corpus, num_topics=ntopic, id2word=walmart_corpus.dictionary)
lda_amazon  = LdaModel(amazon_corpus , num_topics=ntopic, id2word=amazon_corpus.dictionary)
lda_costco  = LdaModel(costco_corpus , num_topics=ntopic, id2word=costco_corpus.dictionary)
lda_target  = LdaModel(target_corpus , num_topics=ntopic, id2word=target_corpus.dictionary)
lda_kroger  = LdaModel(kroger_corpus , num_topics=ntopic, id2word=kroger_corpus.dictionary)

In [9]:
names = [("Walmart", walmart_corpus, lda_walmart), 
         ("Amazon", amazon_corpus, lda_amazon),
         ("Costco", costco_corpus, lda_costco), 
         ("Target", target_corpus, lda_target), 
         ("Kroger", kroger_corpus, lda_kroger)]

In [10]:
import itertools
names_prod = list(itertools.product(names, names))

In [11]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [12]:
from collections import OrderedDict

def average_topic(flat_list):

    d = OrderedDict()
    for prob, topic in flat_list:
        d.setdefault(topic, []).append(prob)

    d = [(sum(v) / len(v), k) for k, v in d.items()]
    
    return(d)

In [16]:
for company in names:
    for i in range(ntopic):
        pp.pprint(lda_walmart.show_topic(topicid=i))
        print()

[   ('member', 0.020815318),
    ('experience', 0.015948776),
    ('cngc', 0.014865066),
    ('business', 0.011187044),
    ('incentive', 0.010436272),
    ('corporation', 0.009863281),
    ('review', 0.009304732),
    ('governance', 0.009019247),
    ('group', 0.008369606),
    ('rule', 0.008257974)]

[   ('column', 0.017674008),
    ('equity', 0.0146302385),
    ('incentive', 0.010269762),
    ('payment', 0.009645595),
    ('information', 0.00938826),
    ('table', 0.008672465),
    ('review', 0.0083433),
    ('douglas', 0.008292766),
    ('rule', 0.00827823),
    ('market', 0.007920543)]

[   ('transaction', 0.016856004),
    ('member', 0.014321849),
    ('review', 0.012210391),
    ('policy', 0.011638354),
    ('goal', 0.011564127),
    ('incentive', 0.01017085),
    ('cngc', 0.009424425),
    ('accountant', 0.009326033),
    ('recipient', 0.009256202),
    ('target', 0.008195261)]

[   ('transaction', 0.014363549),
    ('business', 0.0126234),
    ('material', 0.01241821),
    ('a

In [27]:
names_prod[5]

(('Amazon',
  <__main__.MyCorpus at 0x175fb1a2f28>,
  <gensim.models.ldamodel.LdaModel at 0x175fb21e390>),
 ('Walmart',
  <__main__.MyCorpus at 0x175fb18d320>,
  <gensim.models.ldamodel.LdaModel at 0x175faec3f28>))

In [30]:
import operator

for index, pair in enumerate(names_prod):
    model_txt  = pair[0][0]
    corpus_txt = pair[1][0]
    
    model  = pair[0][2]
    corpus = pair[1][1]
    
    print(f"{model_txt} model applied to {corpus_txt} documents")
    
    try:
        tag = [model.get_document_topics(item) for item in corpus]
        tag = [tup[::-1] for tup in flatten(tag)]

        topic_avg = average_topic(tag)
        topic_avg = sorted(topic_avg, key = lambda x: x[1])

        pp.pprint(topic_avg)
    except:
        print("Fail to converge")
        
    print()

Walmart model applied to Walmart documents
[   (0.5750969839551383, 0),
    (0.6567462415793367, 1),
    (0.46689547139305804, 2),
    (0.4433712647719817, 3),
    (0.462936506491735, 4),
    (0.4790720435708886, 5),
    (0.37562689524410026, 6),
    (0.4259581234158769, 7),
    (0.5650457358206896, 8),
    (0.5063072928142819, 9)]

Walmart model applied to Amazon documents
[   (0.45825801297443075, 0),
    (0.15812429751488655, 1),
    (0.19264146202275978, 2),
    (0.2131068251193176, 3),
    (0.13568668933585287, 4),
    (0.16309470708171528, 5),
    (0.07787389209603562, 6),
    (0.06566434313676187, 7),
    (0.27153033536711807, 8),
    (0.2864854831229612, 9)]

Walmart model applied to Costco documents
[   (0.47039772111312145, 0),
    (0.18984703453915083, 1),
    (0.16073144665669378, 2),
    (0.1693489032157627, 3),
    (0.18867502465995178, 4),
    (0.2093837199328871, 5),
    (0.0680996984243393, 6),
    (0.07958730074266593, 7),
    (0.23288092675929267, 8),
    (0.30212513