In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.concat(map(pd.read_csv, 
                   ['walmart.csv', 
                    'target.csv',
                    'amazon.csv',
                    'costco.csv',
                    'kroger.csv']),ignore_index = True)

df.loc[:,'tokens_final'] = \
df.loc[:,'tokens_final'].apply(lambda x: literal_eval(x))

def replace_in_list(lis, old, new):
    for i in range(len(lis)):
        if lis[i] == old:
            lis[i] = new
    return(lis)

df['tokens_final'] = \
df['tokens_final'].map(lambda x: replace_in_list(x,"employee", "associate"))

df['tokens_final'] = \
df['tokens_final'].map(lambda x: replace_in_list(x,"guest", "customer"))

df['tokens_final'] = \
df['tokens_final'].map(lambda x: replace_in_list(x,"consumer", "customer"))

remove_words =['performance','officer','value','award','amount',
               'cash','chairman','vice','option','president',
               'base','inc.','grant','voting','election','unit',
               'audit','benefit','date','service','management',
               'include','number','name','person','proposal',
               'section','report','cost','receive','pension',
               'rate','interest','fuel','rate','serve','sale','pension']

df["tokens_final"] = \
    df["tokens_final"].map(lambda x: \
    [word for word in x if word.lower() not in remove_words])

In [10]:
df

Unnamed: 0,year,company,tokens_final
0,2010,Walmart,"[street, website, www.walmartstores.com, notic..."
1,2010,Walmart,"[wal-mart, store, street, website, www.walmart..."
2,2010,Walmart,"[nonqualified, compensation, potential, paymen..."
3,2010,Walmart,"[compensation, store, compensation, amend, jan..."
4,2010,Walmart,"[nominee, statement, company, ratification, ap..."
...,...,...,...
3710,2019,Kroger,"[company, termination, cause, treatment, provi..."
3711,2019,Kroger,"[purpose, approve, company, provide, provision..."
3712,2019,Kroger,"[security, restriction, represent, book, accou..."
3713,2019,Kroger,"[case, jurisdiction, approve, discretion, prov..."


In [11]:
import gensim
from gensim.utils import simple_preprocess
from gensim.test.utils import common_corpus, common_dictionary

import pprint
pp = pprint.PrettyPrinter(indent=4)

In [43]:
import os
os.mkdir("temp_text")

companies = df["company"].drop_duplicates().values

for company in companies:
    os.mkdir("temp_text\\" + company)

In [44]:
for index, row in df.iterrows():
    company = row["company"]
    year = row["year"]
    text = ' '.join(row["tokens_final"])
    
    with open(f"temp_text\\{company}\\{year}_{company}_{index}.txt", 'w', encoding='utf8') as f:
        f.write(text)

In [45]:
import os, gensim

def iter_documents(top_directory):
    """Iterate over all documents, yielding a document (=list of utf8 tokens) at a time."""
    for root, dirs, files in os.walk(top_directory):
        for file in filter(lambda file: file.endswith('.txt'), files):
            document = open(os.path.join(root, file), encoding='utf8').read() # read the entire document, as one big string
            yield gensim.utils.tokenize(document, lower=True) # or whatever tokenization suits you

class MyCorpus(object):
    def __init__(self, top_dir):
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir))
        self.dictionary.filter_extremes(no_below=1, keep_n=30000) # check API docs for pruning params

    def __iter__(self):
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

In [47]:
walmart_corpus = MyCorpus('temp_text/walmart')
amazon_corpus  = MyCorpus('temp_text/amazon')
costco_corpus  = MyCorpus('temp_text/costco')
target_corpus  = MyCorpus('temp_text/target')
kroger_corpus  = MyCorpus('temp_text/kroger')

In [70]:
from gensim.models import LdaModel
ntopic = 10

lda_walmart = LdaModel(walmart_corpus, num_topics=ntopic, id2word=walmart_corpus.dictionary)
lda_amazon  = LdaModel(amazon_corpus , num_topics=ntopic, id2word=amazon_corpus.dictionary)
lda_costco  = LdaModel(costco_corpus , num_topics=ntopic, id2word=costco_corpus.dictionary)
lda_target  = LdaModel(target_corpus , num_topics=ntopic, id2word=target_corpus.dictionary)
lda_kroger  = LdaModel(kroger_corpus , num_topics=ntopic, id2word=kroger_corpus.dictionary)

In [71]:
names = [("Walmart", walmart_corpus, lda_walmart), 
         ("Amazon", amazon_corpus, lda_amazon),
         ("Costco", costco_corpus, lda_costco), 
         ("Target", target_corpus, lda_target), 
         ("Kroger", kroger_corpus, lda_kroger)]

In [72]:
import itertools
names_prod = list(itertools.product(names, names))

In [73]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [74]:
from collections import OrderedDict

def average_topic(flat_list):

    d = OrderedDict()
    for prob, topic in flat_list:
        d.setdefault(topic, []).append(prob)

    d = [(sum(v) / len(v), k) for k, v in d.items()]
    
    return(d)

In [75]:
for i in range(ntopic):
    print(lda_walmart.show_topic(topicid=i))
    print()

[('incentive', 0.02142479), ('defer', 0.021224383), ('cngc', 0.014651998), ('program', 0.009155399), ('income', 0.0088225305), ('payment', 0.008607056), ('result', 0.008327849), ('contribution', 0.008182822), ('account', 0.007996664), ('goal', 0.0074817943)]

[('incentive', 0.017756734), ('target', 0.011836097), ('goal', 0.009922712), ('equity', 0.009922202), ('accountant', 0.009170052), ('governance', 0.008469286), ('associate', 0.008039322), ('rule', 0.0077500935), ('risk', 0.0073270584), ('result', 0.0072643426)]

[('payment', 0.015311355), ('employment', 0.011849031), ('business', 0.010751324), ('income', 0.009644408), ('incentive', 0.009532485), ('time', 0.009353391), ('agreement', 0.008744033), ('vest', 0.008499161), ('equity', 0.007924628), ('hold', 0.0073880786)]

[('incentive', 0.014110572), ('member', 0.012632844), ('policy', 0.011758959), ('program', 0.01056016), ('associate', 0.0093910275), ('transaction', 0.008442447), ('material', 0.008108177), ('make', 0.007444198), ('me

In [76]:
import operator

for index, pair in enumerate(names_prod[:5]):
    model_txt  = pair[0][0]
    corpus_txt = pair[1][0]
    
    model  = pair[0][2]
    corpus = pair[1][1]
    
    print(f"{model_txt} model applied to {corpus_txt} documents")
    tag = [model.get_document_topics(item) for item in corpus]
    tag = [tup[::-1] for tup in flatten(tag)]
    
    topic_avg = average_topic(tag)
    topic_avg = sorted(topic_avg, key = lambda x: x[1])
    
    print(topic_avg)
    print()

Walmart model applied to Walmart documents
[(0.4597179211639598, 0), (0.45101957516513824, 1), (0.46591635599475484, 2), (0.4539851728729459, 3), (0.47907849926864954, 4), (0.5195802708865324, 5), (0.5420506339043056, 6), (0.5447015820229624, 7), (0.5301722562034971, 8), (0.4870988519213508, 9)]

Walmart model applied to Amazon documents
[(0.1327859936425319, 0), (0.09570725506969861, 1), (0.11437531503970208, 2), (0.21399529271626047, 3), (0.16768148763415713, 4), (0.118088659318164, 5), (0.43415580976493, 6), (0.31699287521226677, 7), (0.1512126382088886, 8), (0.2313662926963669, 9)]

Walmart model applied to Costco documents
[(0.1514108526579877, 0), (0.1483152644231734, 1), (0.11793755789772725, 2), (0.21332072737345167, 3), (0.15815852240969738, 4), (0.16531541884966916, 5), (0.3547402023309245, 6), (0.2781293787574642, 7), (0.15612627776746435, 8), (0.33317492798267945, 9)]

Walmart model applied to Target documents
[(0.14421049265130872, 0), (0.148442068794178, 1), (0.1789753571