# Culture Measures Based on Company Reviews

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re, math, csv
# nltk.download('wordnet')
from string import punctuation
from functools import partial
import concurrent.futures as cf
from collections import defaultdict

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Load your dataset.

In [2]:
# Load the cleaned review data; later cells read the `employer` and `pros` columns from it.
# NOTE(review): in the preview printed below, the `pros` values read like drawbacks and the
# `cons` values read like benefits -- confirm the two columns are not swapped in clean_gs.csv.
df = pd.read_csv('clean_gs.csv')
df.head()

Unnamed: 0,employer,id,pros,cons
0,American Express,44001,Still not big enough in market place,"Great brand , Good leadership , Clear business..."
1,Eventum IT Solutions,44004,Nothing important on my point of view.,"Learn new technologies, helpful people, good m..."
2,Eventum IT Solutions,44004,Alot of friends working together which isn't v...,Very good opportunities to learn technologies
3,Eventum IT Solutions,44004,Working hours are not good and need to add the...,You can learn technically a lot in this company.
4,Eventum IT Solutions,44004,No Real Cons at all,- Very friendly environment.\r\n- Highly exper...


The following function will remove the company names from their respective reviews.

In [3]:
# English stop-word list from NLTK; normalize_doc() filters tokens against this module-level name.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus downloaded beforehand -- confirm setup.
stopwords = nltk.corpus.stopwords.words('english')

def comp_name_out(data, col_to_search, col_reviews, companies):
    """
    Lower-case each listed company's reviews and strip the company's own
    name from them.

    Parameters
    ----------
    data : DataFrame holding the reviews. It is modified in place.
    col_to_search : name of the column with the company names.
    col_reviews : name of the column with the review text.
    companies : iterable with the company names to process (list, set,
        Series, tuple, etc.). Entries that are not strings (e.g. NaN for a
        missing employer) are skipped.

    Returns
    -------
    The same `data` object that was passed in.

    Notes
    -----
    Only rows whose company appears in `companies` are lower-cased; the
    name is removed as a literal (non-regex), lower-cased substring.
    """
    for company in companies:
        # A missing employer (NaN/None) has no name to strip and no `.lower()`.
        if not isinstance(company, str):
            continue
        condition = (data[col_to_search] == company)
        # Nothing to rewrite when no row belongs to this company.
        if not condition.any():
            continue
        data.loc[condition, col_reviews] = (
            data.loc[condition, col_reviews]
            .str.lower()
            .str.replace(company.lower(), '', regex=False)
        )

    return data

Function to get the root of the word. You can get all three (lemma, stem, and snow) or use them separately with the partial functions below.

In [4]:
def root_of_word(docs, root_word_method='lemma'):
    """
    Reduce every token of a document to its root form.

    Parameters
    ----------
    docs : str
        A single document; it is tokenized with nltk.word_tokenize.
    root_word_method : str
        'lemma' (WordNet lemmatizer), 'stemm' (Porter stemmer) or
        'snowball' (English Snowball stemmer).

    Returns
    -------
    str
        The reduced tokens joined by single spaces.

    Raises
    ------
    ValueError
        If `root_word_method` is not one of the three supported values.
    """
    # Build only the reducer that was requested, and reject an unknown method
    # up front instead of failing later on an unassigned result variable.
    if root_word_method == 'lemma':
        reduce_word = nltk.stem.WordNetLemmatizer().lemmatize
    elif root_word_method == 'stemm':
        reduce_word = nltk.stem.PorterStemmer().stem
    elif root_word_method == 'snowball':
        reduce_word = nltk.stem.SnowballStemmer('english').stem
    else:
        raise ValueError(
            "root_word_method must be 'lemma', 'stemm' or 'snowball', got %r"
            % (root_word_method,)
        )

    tokens = nltk.word_tokenize(docs)
    return ' '.join([reduce_word(w) for w in tokens])

# Single-argument versions so they can be mapped over a collection of documents.
stemming = partial(root_of_word, root_word_method='stemm')
snowball = partial(root_of_word, root_word_method='snowball')

The following function helps with the preprocessing of the data. In this notebook it runs before the lemmatizer/stemmer step (the reviews are normalized first and then reduced to their root words). If you want to keep stopwords and take them out at a later stage, uncomment the first `filtered_tokens` line below and comment out the second one.

In [5]:
def normalize_doc(doc):
    """
    Normalize a single document (string).

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, the text is lower-cased and stripped, tokenized with
    nltk.word_tokenize, and tokens found in the module-level `stopwords`
    list are dropped. The remaining tokens are returned joined by single
    spaces.
    """
    # `flags` has to be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional re.I|re.A (== 258) would only cap the
    # number of removed characters at 258 and never apply the flags.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
#     filtered_tokens = [token for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopwords]
    doc = ' '.join(filtered_tokens)
    return doc

Functions to get the top topics and to run the LDA models.

In [6]:
def show_topics(vectorizer, lda_model, n_words=15):
    """
    Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing `get_feature_names_out()` (or the
        older `get_feature_names()`).
    lda_model : fitted model exposing `components_`, one row of term weights
        per topic.
    n_words : number of terms to keep per topic.

    Returns
    -------
    list of numpy arrays, one per topic, each holding up to `n_words` terms
    ordered from the largest weight to the smallest.
    """
    # Newer scikit-learn releases only provide get_feature_names_out(); fall
    # back to the legacy accessor so older installations keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    keywords = np.array(list(vocabulary))

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes argsort return the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Coarse grid of topic counts that the first modelling pass maps get_models over.
our_range = 2, 10, 50, 100, 150, 200, 250, 300

def get_models(topics, tf, tup_num):
    """
    Fit one online LDA model and score it with comph.

    Parameters
    ----------
    topics : number of topics (n_components) for the model.
    tf : tuple whose first element is the company name and which also holds
        the company's sparse matrix.
    tup_num : position inside `tf` of the sparse matrix to fit on.

    Returns
    -------
    tuple (company name, topics, comph of the row-normalised topic-word
    matrix, fitted model).
    """
    estimator = LatentDirichletAllocation(
        n_components=topics,
        max_iter=100,
        learning_method='online',
        learning_offset=10.,
        random_state=1234,
    )
    fitted = estimator.fit(tf[tup_num])
    # Divide every topic row by its own total so each row sums to one.
    row_totals = fitted.components_.sum(axis=1)[:, np.newaxis]
    topic_word_probs = fitted.components_ / row_totals
    return (tf[0], topics, comph(topic_word_probs), fitted)

In [7]:
def jsd(p, q, base=np.e): # Jensen-Shannon divergence between two vectors, used to compute compH
    '''
        Implementation of pairwise `jsd` based on
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence

        p, q : array-likes of non-negative weights with the same shape. They
            are rescaled to sum to one before being compared, so raw counts
            are accepted as well as probabilities.
        base : logarithm base of the result (natural log by default).

        Returns the average of KL(p || m) and KL(q || m), where m is the
        equal-weight mixture of the two normalised distributions. Note that
        this is the divergence itself, not its square root.
    '''
    # Imported locally so the function does not rely on `sp.stats` having been
    # loaded as a side effect of some other import.
    from scipy.stats import entropy

    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # Normalise before mixing: averaging vectors with different totals would
    # weight the mixture towards the larger one and distort the result.
    p = p / p.sum(axis=0, keepdims=True)
    q = q / q.sum(axis=0, keepdims=True)
    m = (p + q) / 2
    return entropy(p, m, base=base) / 2 + entropy(q, m, base=base) / 2

def conth(data): # function to measure content heterogeneity given a topic (prob) matrix
    """
    Return the inverse of the average per-row sum of squared entries.

    `data` is a DataFrame; its `.values` are squared element-wise, summed row
    by row, averaged over the number of rows, and the reciprocal is returned.
    """
    squared_rows = np.square(data.values)
    per_row_concentration = map(sum, squared_rows)
    mean_concentration = sum(per_row_concentration) / data.shape[0]
    return 1 / mean_concentration

In [8]:
def comph(data):
    """
    Mean pairwise Jensen-Shannon divergence between the rows of `data`.

    Parameters
    ----------
    data : 2D array (or any sequence where `data[i]` is the i-th row) whose
        rows are the distributions to compare.

    Returns
    -------
    float
        Average of jsd(data[x], data[y]) over every unordered pair of
        distinct rows. Pairs whose divergence is NaN are left out.

    Raises
    ------
    ValueError
        If there is no valid pair to average (fewer than two rows, or every
        pairwise divergence is NaN).
    """
    n_rows = len(data)
    distance_list = []
    # Each unordered pair is visited once (x < y); the diagonal and the
    # mirrored half of the distance matrix never contribute to the mean, so
    # they are not computed at all.
    for x in range(n_rows):
        for y in range(x + 1, n_rows):
            distance = jsd(data[x], data[y])
            if not math.isnan(distance):
                distance_list.append(distance)

    if not distance_list:
        raise ValueError(
            "comph needs at least two rows with a defined pairwise divergence"
        )

    return sum(distance_list) / float(len(distance_list))

Create an array with the unique employers in the dataset.

In [9]:
companies = df['employer'].unique()

Remove the company names from the reviews, and extract the reviews into a numpy array.

In [10]:
%%time

# comp_name_out lower-cases the `pros` reviews of every listed employer and strips the
# employer's own name from them; it modifies `df` in place and returns the same frame.
df = comp_name_out(df, 'employer', 'pros', companies)
# Plain array of review texts, used as the input of the parallel preprocessing step.
data_pros = df['pros'].values

CPU times: user 1min 34s, sys: 386 ms, total: 1min 34s
Wall time: 1min 34s


The text preprocessing of the corpus takes place in parallel. You first normalize the reviews and then take the root of the words.

In [17]:
%%time

with cf.ProcessPoolExecutor() as e:
    # First pass: normalise every raw review (strip symbols, lower-case, drop stop words).
    data_pros_cleaned = e.map(normalize_doc, data_pros)
    # Second pass: Porter-stem the normalised text; list() collects the results,
    # which Executor.map yields in input order.
    data_pros_cleaned = list(e.map(stemming, data_pros_cleaned))

# Store the cleaned text next to the original reviews (same row order as data_pros).
df['pros_clean'] = data_pros_cleaned

CPU times: user 36.7 s, sys: 9.99 s, total: 46.7 s
Wall time: 47.2 s


Here you count the reviews of each company and keep the names of the companies of interest. So far, only companies with 2 to 4 reviews (more than 1 and fewer than 5) make it to the modeling stage.

In [18]:
# Number of reviews per employer.
comps_of_interest = df.employer.value_counts()
# Keep employers with more than 1 and fewer than 5 reviews (i.e. 2 to 4); only their names are retained.
comps_of_interest = (comps_of_interest[(comps_of_interest < 5) & (comps_of_interest > 1)]).index
len(comps_of_interest), comps_of_interest

(3207,
 Index(['Oney', 'Symantec', 'The NPD Group', 'Immedis', 'TMNA Services',
        'Sprout Social', 'Isabel Group', 'Dachser', 'Noble Studios',
        'LogiSense',
        ...
        'Jooble', 'EG', 'LeasePlan', 'Flying Tiger Copenhagen', 'Barhale',
        'Rapyd', 'Klohn Crippen Berger', 'SwiftAnt IT Solutions', 'GRASS',
        'Minami Studios'],
       dtype='object', length=3207))

Select only the employers that meet the condition above by creating a boolean with True for yes and False for no.

In [19]:
cond2 = df['employer'].isin(comps_of_interest) # boolean mask: True for rows of a selected employer
df_interest = df[cond2].copy() # independent copy holding only the selected employers' rows
unique_ids = df_interest['employer'].unique() # the selected employers, in order of first appearance

The following loop will create sparse matrices for all companies and return a list of tuples with the name of the company, its sparse matrix, and the fitted vectorizer.

In [20]:
%%time

# One (company, document-term matrix, fitted vectorizer) tuple per company; each company
# gets its own vocabulary because a fresh CountVectorizer is fitted on its reviews only.
# NOTE(review): CountVectorizer presumably raises when a company's cleaned reviews contain
# no usable token -- confirm this cannot happen for the selected companies.
vectorizers_list = []
for comp_id in unique_ids:
    cond = (df_interest['employer'] == comp_id) # condition to get a dataset for each company
    revs_clean = df_interest.loc[cond, 'pros_clean'].values # get an array of reviews for such company
    count_vect = CountVectorizer() # instantiate a vectorizer
    vect = count_vect.fit_transform(revs_clean) # fit it to the selected company reviews
    vectorizers_list.append((comp_id, vect, count_vect))

CPU times: user 3.75 s, sys: 35.9 ms, total: 3.79 s
Wall time: 3.86 s


Calculate the total words in the dictionary of review words, and get the percentage of words in the final dictionary that can be found in the full corpus.

In [15]:
# TotalWords_vectorizer = CountVectorizer()
# TotalWords_tf = TotalWords_vectorizer.fit_transform(data_pros)
# totWords = len(TotalWords_vectorizer.get_feature_names())
# tf_vectorizer = CountVectorizer(max_df = 0.90, min_df=0.01)
# tf = tf_vectorizer.fit_transform(data_pros_cleaned)
# tf_feature_names = tf_vectorizer.get_feature_names()

# percVoc = len(tf_feature_names) / totWords * 100
# percVoc

Run the models in parallel and return a dictionary with the output of the get_models function for each company.

In [None]:
%%time

bad_output = {}
output_dictionary = {} # dictionary for the output

for sparse_tup in vectorizers_list:
    # to run get models in parallel, some parameters have to be fixed into a new function
    # so we do that here with partial
    partial_func = partial(get_models, tf=sparse_tup, tup_num=1)
    try:
        # you will run several topics at a time for each company
        with cf.ProcessPoolExecutor() as e:
            output = list(e.map(partial_func, our_range))
        output_dictionary[sparse_tup[0]] = output
    except:
        bad_tup = vectorizers_list.pop(vectorizers_list.index(sparse_tup))
        bad_output[bad_tup[0]] = bad_tup

In [None]:
type(bad_output)

You will now iterate over the output from above, add each dataset into a list, and then concatenate them all into one dataset.

In [22]:
%%time

# Build one small frame per company from its list of get_models result tuples
# (company, topic count, comph value, fitted model), then stack them all.
dfs_list = []
for data in output_dictionary.keys(): # you can use the keys to get the data
    temp_df = pd.DataFrame.from_dict(output_dictionary[data])
    dfs_list.append(temp_df)

output_dfs = pd.concat(dfs_list)
# The 'coherence' column holds the value get_models computes with comph(), i.e. the mean
# pairwise Jensen-Shannon divergence between the model's topic-word distributions.
output_dfs.columns = ['company', 'topics', 'coherence', 'models']
output_dfs.head(10)

CPU times: user 45.4 ms, sys: 11.4 ms, total: 56.8 ms
Wall time: 59.9 ms


Unnamed: 0,company,topics,coherence,models
0,Hays,2,0.274745,LatentDirichletAllocation(learning_method='onl...
1,Hays,10,0.238256,LatentDirichletAllocation(learning_method='onl...
2,Hays,50,0.166055,LatentDirichletAllocation(learning_method='onl...
3,Hays,100,0.151692,LatentDirichletAllocation(learning_method='onl...
4,Hays,150,0.128355,LatentDirichletAllocation(learning_method='onl...
5,Hays,200,0.118317,LatentDirichletAllocation(learning_method='onl...
6,Hays,250,0.075295,LatentDirichletAllocation(learning_method='onl...
7,Hays,300,0.067352,LatentDirichletAllocation(learning_method='onl...
0,Boston Consulting Group,2,0.204476,LatentDirichletAllocation(learning_method='onl...
1,Boston Consulting Group,10,0.230088,LatentDirichletAllocation(learning_method='onl...


The following loop iterates over the new dataframe, searches for the top 2 topic numbers based on highest coherence, and appends to a list a tuple containing the company, a tuple with the top two topic numbers, and the company's sparse matrix from the original `vectorizers_list`.

In [23]:
sorted_topics = []
# The company name is read from each vectorizers_list entry instead of zipping with
# unique_ids: the two sequences stop lining up as soon as a failed company has been
# removed from vectorizers_list, which would pair names with the wrong matrices.
for tup in vectorizers_list:
    comp = tup[0]
    the_data = output_dfs[output_dfs['company'] == comp] # get an exclusive dataset for a company
    if len(the_data) < 2:
        # No (or too few) fitted models for this company, so there is no top two to pick.
        continue
    # Sort by label-independent values; the two best topic counts end up last.
    top_topics = the_data.sort_values('coherence')['topics'].values
    sorted_topics.append((comp, (top_topics[-2], top_topics[-1]), tup[1])) # company, top two topic counts, sparse matrix

Now run the `get_models` function again over the new space of topics. You will need to:
1. sort the tuple with the top two topics.
2. create a linearly spaced array with 10 elements between the top 2 topics, turn it into integers, make the array a set to eliminate any duplicates that might arise if there is a 2 in the top two topics, and then turn that into a list.
3. get your fixed partial function again
4. the output is the same as before

In [24]:
%%time

output_dictionary2 = {} # company name -> list of get_models result tuples for the refined topic counts

for tup in sorted_topics:
    start, end = sorted(tup[1]) # since the top 2 topics might not be sorted, sort them first
    # Ten evenly spaced values between the two topic counts, truncated to integers and de-duplicated.
    # NOTE(review): a set does not guarantee ascending order, so the models may not be
    # fitted in increasing topic order -- confirm this is acceptable.
    the_range = list(set(np.linspace(start, end, 10).astype(int))) 
    # tup is (company, (topic_a, topic_b), sparse matrix), so index 2 selects the matrix.
    partial_func = partial(get_models, tf=tup, tup_num=2)
    with cf.ProcessPoolExecutor() as e:
        output = list(e.map(partial_func, the_range))
    output_dictionary2[tup[0]] = output

CPU times: user 1.6 s, sys: 4.13 s, total: 5.73 s
Wall time: 7min 54s


Create multiple dataframes from dictionaries again and collapse them into 1.

In [25]:
%%time

# Same stacking as for the first pass, now over the refined results.
# Note that this rebinds `dfs_list` and `output_dfs`, replacing the first-pass frame.
dfs_list = []
for data in output_dictionary2.keys():
    temp_df = pd.DataFrame.from_dict(output_dictionary2[data])
    dfs_list.append(temp_df)

output_dfs = pd.concat(dfs_list)
# Columns follow the order of the tuple returned by get_models.
output_dfs.columns = ['company', 'topics', 'coherence', 'models']
output_dfs.head(15)

CPU times: user 58.6 ms, sys: 11.9 ms, total: 70.5 ms
Wall time: 72.6 ms


Unnamed: 0,company,topics,coherence,models
0,Hays,2,0.274745,LatentDirichletAllocation(learning_method='onl...
1,Hays,3,0.265255,LatentDirichletAllocation(learning_method='onl...
2,Hays,4,0.219521,LatentDirichletAllocation(learning_method='onl...
3,Hays,5,0.243558,LatentDirichletAllocation(learning_method='onl...
4,Hays,6,0.262954,LatentDirichletAllocation(learning_method='onl...
5,Hays,7,0.249298,LatentDirichletAllocation(learning_method='onl...
6,Hays,8,0.240442,LatentDirichletAllocation(learning_method='onl...
7,Hays,9,0.219684,LatentDirichletAllocation(learning_method='onl...
8,Hays,10,0.238256,LatentDirichletAllocation(learning_method='onl...
0,Boston Consulting Group,32,0.22457,LatentDirichletAllocation(learning_method='onl...


Search for the best topic based on the new output, and get the top 10 words per topic. At the moment, you are only adding 1 of the topics for each company but you can change this by removing the indexing in `top_topics` below.

In [26]:
%%time

best_topics_model = defaultdict(tuple) # the output goes here

for company, vrizer in zip(unique_ids, vectorizers_list):
    cond = output_dfs['company'] == company # get each company
    filtered_data = output_dfs[cond] # to get a single dataframe
    the_topic = int(filtered_data.loc[filtered_data['coherence'].idxmax(), 'topics']) # get the best topic based on max coherence
    the_model = filtered_data.loc[filtered_data['coherence'].idxmax(), 'models'] # get the best model based on max coherence
    top_topics = show_topics(vrizer[2], the_model, 10) # get the top 10 words for each topic in each company
    best_topics_model[company] = (the_topic, the_model, top_topics[0])

CPU times: user 183 ms, sys: 28.3 ms, total: 211 ms
Wall time: 211 ms


Check out your output.

In [27]:
best_topics_model

defaultdict(tuple,
            {'Hays': (2,
              LatentDirichletAllocation(learning_method='online', max_iter=100,
                                        n_components=2, random_state=1234),
              array(['de', 'que', 'la', 'el', 'lo', 'en', 'para', 'es', 'manag', 'te'],
                    dtype='<U19')),
             'Boston Consulting Group': (36,
              LatentDirichletAllocation(learning_method='online', max_iter=100,
                                        n_components=36, random_state=1234),
              array(['onli', 'one', 'get', 'project', 'und', 'stori', 'reprimand',
                     'divers', 'im', 'auf'], dtype='<U24')),
             'Oracle': (20,
              LatentDirichletAllocation(learning_method='online', max_iter=100,
                                        n_components=20, random_state=1234),
              array(['corpor', 'decis', 'process', 'sometim', 'feel', 'cog', 'strong',
                     'rule', 'organis', 'understand'], dty

Get the probabilities dataframes for each company and add them to a dictionary.

In [28]:
#generate matrix summarizing distribution of docs (reviews) over topics
docs_of_probas = defaultdict(pd.DataFrame)

for tup in vectorizers_list:
    docs_of_probas[tup[0]] = pd.DataFrame(best_topics_model[tup[0]][1].transform(tup[1]))

Test it with any company.

In [29]:
docs_of_probas['Hays'].head()

Unnamed: 0,0,1
0,0.074576,0.925424
1,0.414625,0.585375
2,0.131323,0.868677
3,0.015325,0.984675
4,0.064864,0.935136


### Calculate the measures of interest

In [30]:
%%time

comP_h_results = defaultdict(float) # company -> comph value

# comph averages the pairwise jsd between the rows of each company's review-by-topic matrix.
for company, proba_df in docs_of_probas.items():
    comP_h_results[company] = comph(proba_df.values)

CPU times: user 1min 59s, sys: 337 ms, total: 2min
Wall time: 2min


In [31]:
comP_h_results

defaultdict(float,
            {'Hays': 0.15636654768515945,
             'Boston Consulting Group': 0.46648964955638167,
             'Oracle': 0.478968100200936,
             'Philips': 0.5726772280642697,
             'IBM': 0.5139404811169749,
             'Amazon': 0.33162896146287035,
             'Orange': 0.48196172764276923,
             'DXC Technology': 0.5419433639470068,
             'Deloitte': 0.55674284690722,
             'Citi': 0.5635069996183281,
             'Microsoft': 0.5397208589996931,
             'Altran Group': 0.23670762559293304,
             'NTT': 0.5886421360935645,
             'Continental': 0.05630794090764686,
             'Thales': 0.6012950497663045,
             'Thermo Fisher Scientific': 0.5917394824443586,
             'Google': 0.5706160051317885,
             'Nokia': 0.5749893790231159,
             'Ericsson-Worldwide': 0.5749165685067223,
             'Procter & Gamble': 0.5330230949281948,
             'ABB': 0.5748935030627028,
       

In [32]:
%%time


comT_h_results = defaultdict(float) # company -> conth value

# conth expects the DataFrame itself (it reads `.values` and `.shape`), not the raw array.
for company, proba_df in docs_of_probas.items():
    comT_h_results[company] = conth(proba_df)

CPU times: user 65.9 ms, sys: 1.7 ms, total: 67.6 ms
Wall time: 66.8 ms


In [33]:
comT_h_results

defaultdict(float,
            {'Hays': 1.15990630696603,
             'Boston Consulting Group': 1.5273851689351268,
             'Oracle': 1.5290793170685046,
             'Philips': 1.3521083775745097,
             'IBM': 1.480713987271778,
             'Amazon': 1.5031065497406941,
             'Orange': 1.4141583176362618,
             'DXC Technology': 1.3233398744456017,
             'Deloitte': 1.5290347104782267,
             'Citi': 1.397235559085598,
             'Microsoft': 1.5557032540745293,
             'Altran Group': 1.1383887181516652,
             'NTT': 1.2821028386420261,
             'Continental': 1.1503332294182536,
             'Thales': 1.2368725879167042,
             'Thermo Fisher Scientific': 1.3139249617242328,
             'Google': 1.3848285769709197,
             'Nokia': 1.3921235975819903,
             'Ericsson-Worldwide': 1.3925109933961002,
             'Procter & Gamble': 1.3516376642093169,
             'ABB': 1.3285467519560543,
             '

In [34]:
def ent_avg(probMatrix):
    """
    Average entropy of the entries `probMatrix[0] ... probMatrix[len - 1]`.

    Each indexed entry is passed to scipy's entropy function on its own and
    the mean of the resulting values is returned.
    """
    per_entry_entropy = [
        sp.stats.entropy(probMatrix[position])
        for position in range(len(probMatrix))
    ]
    return np.mean(per_entry_entropy)

In [35]:
%%time

entropy_avg_results = defaultdict(float) # company -> ent_avg value

# ent_avg takes the raw array so that indexing by position returns one review's row.
for company, proba_df in docs_of_probas.items():
    entropy_avg_results[company] = ent_avg(proba_df.values)

CPU times: user 144 ms, sys: 2.17 ms, total: 147 ms
Wall time: 147 ms


In [36]:
entropy_avg_results

defaultdict(float,
            {'Hays': 0.25135673394381847,
             'Boston Consulting Group': 0.98287822415024,
             'Oracle': 0.909744182722897,
             'Philips': 0.9012621672696225,
             'IBM': 0.9283546982620743,
             'Amazon': 0.8260683717319083,
             'Orange': 0.708551114672254,
             'DXC Technology': 0.7686060257191989,
             'Deloitte': 1.1169513340931527,
             'Citi': 1.0123599254882198,
             'Microsoft': 1.069389442006894,
             'Altran Group': 0.2293772321887769,
             'NTT': 0.7454709264978693,
             'Continental': 0.24634447937508844,
             'Thales': 0.7089317576354653,
             'Thermo Fisher Scientific': 0.8965154993454862,
             'Google': 0.9873047542527129,
             'Nokia': 1.0012034203667712,
             'Ericsson-Worldwide': 1.0119539909111064,
             'Procter & Gamble': 0.7714543303000102,
             'ABB': 0.8171575765752437,
             

In [37]:
# function to compute the cross-entropy of two probability distributions
def cross_entropy(p, q):
    """
    Cross-entropy of `q` relative to `p`, in bits (base-2 logarithm).

    Parameters
    ----------
    p, q : sequences of probabilities; `q` must be at least as long as `p`.
        Neither input is modified.

    Returns
    -------
    -sum(p[i] * log2(q[i])) over the positions of `p`, computed after adding
    1e-12 to every entry so that a zero probability does not produce log(0).
    """
    # The smoothing offset is applied to fresh arrays. Adding it to the inputs
    # themselves would change the caller's data, and the offsets would pile up
    # when the same rows are compared many times.
    p = np.asarray(p, dtype=float) + 1e-12
    q = np.asarray(q, dtype=float) + 1e-12

    return -sum([p[i] * np.log2(q[i]) for i in range(len(p))])

# function to compute the average cross-entropy of a matrix
def avg_crossEnt(probMatrix):
    """
    Mean cross-entropy over every ordered pair of distinct entries of
    `probMatrix`.

    Cross-entropy is not symmetric, so both cross_entropy(a, b) and
    cross_entropy(b, a) take part in the average.
    """
    size = len(probMatrix)
    pair_values = [
        cross_entropy(probMatrix[first], probMatrix[second])
        for first in range(size)
        for second in range(size)
        if first != second
    ]
    return np.mean(pair_values)

In [38]:
%%time

cross_entropy_results = defaultdict(float) # company -> avg_crossEnt value

# Average cross-entropy over all ordered pairs of reviews of each company.
for company, proba_df in docs_of_probas.items():
    cross_entropy_results[company] = avg_crossEnt(proba_df.values)
    
# avg_crossEnt(docs_topics_df.values)

CPU times: user 3min 5s, sys: 470 ms, total: 3min 6s
Wall time: 3min 7s


In [39]:
cross_entropy_results

defaultdict(float,
            {'Hays': 1.6315595042873519,
             'Boston Consulting Group': 6.855773551378029,
             'Oracle': 6.369534734002661,
             'Philips': 8.523173231911164,
             'IBM': 6.880260728543732,
             'Amazon': 4.446740058161019,
             'Orange': 5.729308382583838,
             'DXC Technology': 7.475667923235373,
             'Deloitte': 8.774646798356992,
             'Citi': 8.586383204505747,
             'Microsoft': 7.988058224658032,
             'Altran Group': 2.2218737902786168,
             'NTT': 8.45915008216844,
             'Continental': 0.8037882588572419,
             'Thales': 8.95540857906868,
             'Thermo Fisher Scientific': 9.047061148966662,
             'Google': 8.576177298839772,
             'Nokia': 8.769721655648048,
             'Ericsson-Worldwide': 8.874126661607887,
             'Procter & Gamble': 6.932245553702421,
             'ABB': 8.13213977591657,
             'Hewlett Packard E