In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import nltk, re
# nltk.download('wordnet')
import scipy as sp
import math
import numpy as np
from string import punctuation
import csv
from functools import partial
import concurrent.futures as cf
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

%matplotlib inline

In [None]:
# pd.read_parquet('nameoffile', compression='gzip')

In [2]:
# netflix_df = pd.read_pickle("Netflix_Data")

In [5]:
df = pd.read_csv('clean_gs.csv')
df.head()

Unnamed: 0,employer,id,pros,cons
0,American Express,44001,Still not big enough in market place,"Great brand , Good leadership , Clear business..."
1,Eventum IT Solutions,44004,Nothing important on my point of view.,"Learn new technologies, helpful people, good m..."
2,Eventum IT Solutions,44004,Alot of friends working together which isn't v...,Very good opportunities to learn technologies
3,Eventum IT Solutions,44004,Working hours are not good and need to add the...,You can learn technically a lot in this company.
4,Eventum IT Solutions,44004,No Real Cons at all,- Very friendly environment.\r\n- Highly exper...


The following function will remove the company names from their respective reviews.

In [6]:
stopwords = nltk.corpus.stopwords.words('english')

def comp_name_out(data, col_to_search, col_reviews, companies_list):
    
    """
    This function takes in a dataframe, the name of the column with all of 
    the companies, the name of the column with the reviews, and an iterable
    with the companies names that are in the dataset. The latter could be a list,
    set, Series, tuple, etc.
    """
    
    for company in companies_list:
        condition = (data[col_to_search] == company)
        data.loc[condition, col_reviews] = data.loc[condition, col_reviews].str.lower().str.strip(company.lower())
    
    return data

The following function helps with the preprocessing of the data. It runs after the lemmatizer, stemmer, snowball, etc.

In [7]:
def normalize_doc(doc):
    """
    This function normalizes your list of documents by taking only
    words, numbers, and spaces in between them. It then filters out
    stop words if you want to.
    """
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
#     filtered_tokens = [token for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopwords]
    doc = ' '.join(filtered_tokens)
    return doc

Function to get the root of the word. You can get all three (lemma, stem, and snow) or use them separately with the partial functions below.

In [8]:
def root_of_word(docs, root_word_method='lemma'):
    porter_stemmer = nltk.stem.PorterStemmer()
    snowball_stemmer = nltk.stem.SnowballStemmer('english')
    lemma = nltk.wordnet.WordNetLemmatizer()
    
    tokens = nltk.word_tokenize(docs)
    
    if root_word_method == 'lemma':
        doc = ' '.join([lemma.lemmatize(w) for w in tokens])
    elif root_word_method == 'stemm':
        doc = ' '.join([porter_stemmer.stem(w) for w in tokens])
    elif root_word_method == 'snowball':
        doc = ' '.join([snowball_stemmer.stem(w) for w in tokens])
        
    return doc

stemming = partial(root_of_word, root_word_method='stemm')
snowball = partial(root_of_word, root_word_method='snowball')

In [9]:
def jsd(p, q, base=np.e): # JS distance between probability vectors, used to compute compH
    '''
        Implementation of pairwise `jsd` based on  
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
    '''
    p = np.asarray(p)
    q = np.asarray(q)
    m = (1 / 2 * (p + q))
    return sp.stats.entropy(p, m, base) / 2 +  sp.stats.entropy(q, m, base) / 2

In [10]:
def conth(data): # function to measure content heterogeneity given a topic (prob) matrix
    return (1 / ((sum(map(sum, np.square(data.values)))) / data.shape[0]))

In [11]:
def comph(data): 
    #Transform probMatrix_df into 2D array
        
    df = pd.DataFrame()
    for x in range(len(data)): 
        jsd_list = []
        for y in range(len(data)): 
            jsd_list.append(jsd(data[x], data[y]))
        df[str(x)] = jsd_list

    #Get df lower diagonal
    mask = np.ones(df.shape, dtype='bool')
    mask[np.triu_indices(len(df))] = False
    df_lower_diagonal = df[(df>-1) & mask]
    
    distance_list = []
    for k in range(len(df)): 
    #Transform each column of df_lower_diagonal into list
        column_list = df_lower_diagonal[str(k)].values.tolist()
        #Drop nan values from column_list - to retain only actual values from lower diagonal 
        column_lower_diagonal_list = [l for l in column_list if (math.isnan(l) == False)]
        for d in column_lower_diagonal_list: 
            distance_list.append(d)
            
    return sum(distance_list) / float(len(distance_list))

Remove the company names from the reviews, and extract the reviews into a numpy array.

In [12]:
companies = df['employer'].unique()
companies[:4]

array(['American Express', 'Eventum IT Solutions', 'Hays',
       'Verizon\xa0Connect'], dtype=object)

In [13]:
%%time

df = comp_name_out(df, 'employer', 'pros', companies)
data_pros = df['pros'].values

CPU times: user 2min 4s, sys: 708 ms, total: 2min 4s
Wall time: 2min 6s


The text preprocessing of the corpus takes place in parallel.

In [20]:
%%time

with cf.ProcessPoolExecutor() as e:
    data_pros_cleaned = e.map(stemming, data_pros)
    data_pros_cleaned = list(e.map(normalize_doc, data_pros_cleaned))

df['pros_clean'] = data_pros_cleaned

Calculate the total words in the dictionary of review words.

In [15]:
TotalWords_vectorizer = CountVectorizer()
TotalWords_tf = TotalWords_vectorizer.fit_transform(data_pros)
totWords = len(TotalWords_vectorizer.get_feature_names())

Get the cleaned dictionary.

In [135]:
df.employer.value_counts().head(60)

Amazon                              561
Oracle                              422
Microsoft                           349
Siemens                             343
Dell Technologies                   338
IBM                                 337
EY                                  324
PwC                                 300
SAP                                 295
Citi                                253
Accenture                           250
DXC Technology                      249
Cisco Systems                       232
Google                              221
Deloitte                            215
Ericsson-Worldwide                  211
Nokia                               204
Capgemini                           198
Procter & Gamble                    191
IQVIA                               191
McKinsey & Company                  185
Vodafone                            169
Honeywell                           164
Intel Corporation                   163
Philips                             162


In [47]:
comps_of_interest = df.employer.value_counts()
comps_of_interest = (comps_of_interest[comps_of_interest > 100]).index
comps_of_interest

Index(['Amazon', 'Oracle', 'Microsoft', 'Siemens', 'Dell Technologies', 'IBM',
       'EY', 'PwC', 'SAP', 'Citi', 'Accenture', 'DXC Technology',
       'Cisco Systems', 'Google', 'Deloitte', 'Ericsson-Worldwide', 'Nokia',
       'Capgemini', 'Procter & Gamble', 'IQVIA', 'McKinsey & Company',
       'Vodafone', 'Honeywell', 'Intel Corporation', 'Philips',
       'Hewlett Packard Enterprise | HPE', 'Hays', 'Orange', 'Nielsen', 'ABB',
       'Schneider Electric', 'Roche', 'Boston Consulting Group', 'VMware',
       'Salesforce', 'Continental', 'Altran Group', 'Johnson & Johnson',
       'Thales', 'Marriott International', 'Synopsys', 'J.P. Morgan', 'Luxoft',
       'KPMG', 'Amdocs', 'Cognizant Technology Solutions', 'Shell', 'NCR',
       'NTT', 'Verizon', 'JLL', 'Thermo Fisher Scientific', 'Société Générale',
       'HP Inc.', 'Mondelēz International', 'Uber'],
      dtype='object')

In [48]:
cond2 = df['employer'].isin(comps_of_interest)
df.loc[cond2].head()

Unnamed: 0,employer,id,pros,cons,pros_clean
6,Hays,44012,- sink or swim (but this is the nature of most...,- Clear career path.\r\n- Motivated management...,sink swim thi natur recruit firm
7,Hays,44012,- management does not listen to advice from no...,- Excellent view from the office\r\n- Friendly...,manag doe listen advic nonmanageri staff lack ...
8,Hays,44012,it has a result-oriented nature,Great opportunity to learn the recruitment pro...,ha resultori natur
9,Hays,44012,- high pressure culture instead of high perfor...,- Location\r\n- International transfer\r\n- Af...,high pressur cultur instead high perform work ...
10,Hays,44012,those who are most successful at hays are indi...,Hays' culture is build on meritocracy. The Com...,success hay individu look onli meet expect exceed


In [49]:
df_interest = df[cond2].copy()
df_interest.shape

(10380, 5)

In [53]:
df_interest.loc[df_interest['employer'] == 'Hays', 'id'].unique()

array([ 44012,  44217,  50043,     49,    299,  44921,  45078,  55507,
        61101,  72119,  72338,  80275,  83451,  74278,  99683, 106752,
        86243, 122908, 126224, 110150, 127512, 110590,  18125,  33433,
       125242, 105872, 156170, 156784,  20989,  39478,  73626,  77815,
        40902, 130784, 151854, 155333,  14069,   4757,  51658,  78820,
        79094, 153812])

In [55]:
unique_ids = df_interest['employer'].unique()
unique_ids[:4]

array(['Hays', 'Boston Consulting Group', 'Oracle', 'Philips'],
      dtype=object)

In [62]:
%%time

vectorizers_list = []
for comp_id in unique_ids:
    cond = (df_interest['employer'] == comp_id)
    revs_clean = df_interest.loc[cond, 'pros_clean'].values
    vect = CountVectorizer().fit_transform(revs_clean)
    vectorizers_list.append((comp_id, vect))

CPU times: user 336 ms, sys: 48.9 ms, total: 385 ms
Wall time: 417 ms


In [64]:
vectorizers_list[:4]

[('Hays',
  <155x1290 sparse matrix of type '<class 'numpy.int64'>'
  	with 2417 stored elements in Compressed Sparse Row format>),
 ('Boston Consulting Group',
  <146x916 sparse matrix of type '<class 'numpy.int64'>'
  	with 1750 stored elements in Compressed Sparse Row format>),
 ('Oracle',
  <422x1631 sparse matrix of type '<class 'numpy.int64'>'
  	with 4710 stored elements in Compressed Sparse Row format>),
 ('Philips',
  <162x1015 sparse matrix of type '<class 'numpy.int64'>'
  	with 1758 stored elements in Compressed Sparse Row format>)]

In [16]:
# tf_vectorizer = CountVectorizer(max_df = 0.90, min_df=0.01)
# tf = tf_vectorizer.fit_transform(data_pros_cleaned)
# tf_feature_names = tf_vectorizer.get_feature_names()

Percentage of words in the final dictionary that can be found in the full corpus.

In [17]:
percVoc = len(tf_feature_names) / totWords * 100
percVoc

0.39332742564867934

Function to parallelize the models.

In [65]:
our_range = range(2, 300, 5)

def get_models(topics, tf):
    lda = LatentDirichletAllocation(n_components=topics, max_iter=200, learning_method='batch', learning_offset=10.,evaluate_every=2, random_state=1234)
    lda_model = lda.fit(tf[1])
    topicsOverWords = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
    return (tf[0], topics, comph(topicsOverWords), lda_model)

Run the models in parallel.

In [66]:
%%time

output_dictionary = {}
for sparse_tup in vectorizers_list:
    partial_func = partial(get_models, tf=sparse_tup)
    with cf.ProcessPoolExecutor() as e:
        output = list(e.map(partial_func, our_range))
    output_dictionary[sparse_tup[0]] = output

CPU times: user 15.8 s, sys: 32 s, total: 47.8 s
Wall time: 1h 55min 7s


In [84]:
%%time

dfs_list = []
for data in output_dictionary.keys():
    temp_df = pd.DataFrame.from_dict(output_dictionary[data])
    dfs_list.append(temp_df)

CPU times: user 65.9 ms, sys: 38.6 ms, total: 104 ms
Wall time: 138 ms


In [88]:
output_dfs = pd.concat(dfs_list)
output_dfs.columns = ['company', 'topics', 'coherence', 'models']
output_dfs.head()

Unnamed: 0,company,topics,coherence,models
0,Hays,2,0.280552,"LatentDirichletAllocation(evaluate_every=2, ma..."
1,Hays,7,0.255732,"LatentDirichletAllocation(evaluate_every=2, ma..."
2,Hays,12,0.276,"LatentDirichletAllocation(evaluate_every=2, ma..."
3,Hays,17,0.289898,"LatentDirichletAllocation(evaluate_every=2, ma..."
4,Hays,22,0.294427,"LatentDirichletAllocation(evaluate_every=2, ma..."


In [89]:
from collections import defaultdict

In [91]:
companies

array(['American Express', 'Eventum IT Solutions', 'Hays', ...,
       'Elkem Silicones', 'Clarity Insights', "Dr. Reddy's"], dtype=object)

In [93]:
best_topics_model = defaultdict(tuple)
for company in unique_ids:
    cond = output_dfs['company'] == company
    filtered_data = output_dfs[cond]
    the_topic = int(filtered_data.loc[filtered_data['coherence'].idxmax(), 'topics'])
    the_model = filtered_data.loc[filtered_data['coherence'].idxmax(), 'models']
    best_topics_model[company] = (the_topic, the_model)

In [99]:
best_topics_model

defaultdict(tuple,
            {'Hays': (72,
              LatentDirichletAllocation(evaluate_every=2, max_iter=200, n_components=72,
                                        random_state=1234)),
             'Boston Consulting Group': (62,
              LatentDirichletAllocation(evaluate_every=2, max_iter=200, n_components=62,
                                        random_state=1234)),
             'Oracle': (137,
              LatentDirichletAllocation(evaluate_every=2, max_iter=200, n_components=137,
                                        random_state=1234)),
             'Philips': (87,
              LatentDirichletAllocation(evaluate_every=2, max_iter=200, n_components=87,
                                        random_state=1234)),
             'IBM': (92,
              LatentDirichletAllocation(evaluate_every=2, max_iter=200, n_components=92,
                                        random_state=1234)),
             'Amazon': (222,
              LatentDirichletAllocation(evaluat

In [136]:
vectorizers_list[0]

('Hays',
 <155x1290 sparse matrix of type '<class 'numpy.int64'>'
 	with 2417 stored elements in Compressed Sparse Row format>)

In [119]:
#generate matrix summarizing distribution of docs (reviews) over topics
docs_of_probas = defaultdict(pd.DataFrame)

for tup in vectorizers_list:
    docs_of_probas[tup[0]] = pd.DataFrame(best_topics_model[tup[0]][1].transform(tup[1]))

In [121]:
docs_of_probas['Hays'].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71
0,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.859127,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984,0.001984
1,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.975347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347,0.000347
2,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.753472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472,0.003472
3,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.000408,0.970997
4,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.890432,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543,0.001543


### Calculate the measures of interest

In [123]:
%%time

comP_h_results = defaultdict(float)

for company, proba_df in docs_of_probas.items():
    comP_h_results[company] = comph(proba_df.values)

CPU times: user 2min 21s, sys: 617 ms, total: 2min 22s
Wall time: 2min 24s


In [125]:
comP_h_results

defaultdict(float,
            {'Hays': 0.59938204560807,
             'Boston Consulting Group': 0.5850236786653754,
             'Oracle': 0.6009032983444482,
             'Philips': 0.5984505264414839,
             'IBM': 0.5874039260169989,
             'Amazon': 0.6069402771068312,
             'Orange': 0.5992706973171229,
             'DXC Technology': 0.598646183202596,
             'Deloitte': 0.5924741579571319,
             'Citi': 0.5875625671644225,
             'Microsoft': 0.598481816057546,
             'Altran Group': 0.6033602192174002,
             'NTT': 0.591552064338981,
             'Continental': 0.5891079382813795,
             'Thales': 0.6115898394023912,
             'Thermo Fisher Scientific': 0.5814171282318377,
             'Google': 0.5820191871204352,
             'Nokia': 0.5911527192422398,
             'Ericsson-Worldwide': 0.5907156058395693,
             'Procter & Gamble': 0.5918299813830346,
             'ABB': 0.5876714099182295,
             'H

In [124]:
%%time


comT_h_results = defaultdict(float)

for company, proba_df in docs_of_probas.items():
    comT_h_results[company] = conth(proba_df)

CPU times: user 217 ms, sys: 5.57 ms, total: 222 ms
Wall time: 273 ms


In [126]:
comT_h_results

defaultdict(float,
            {'Hays': 1.3564581467313805,
             'Boston Consulting Group': 1.3773753978354506,
             'Oracle': 1.3960180432589377,
             'Philips': 1.3222144330859782,
             'IBM': 1.3906218097859446,
             'Amazon': 1.5290729922388016,
             'Orange': 1.2842297789079136,
             'DXC Technology': 1.3334018748516523,
             'Deloitte': 1.4045421606294648,
             'Citi': 1.4406680576615685,
             'Microsoft': 1.3865926685905663,
             'Altran Group': 1.3019209535252456,
             'NTT': 1.269429126080505,
             'Continental': 1.3440344548653722,
             'Thales': 1.2376411761239516,
             'Thermo Fisher Scientific': 1.348286884044163,
             'Google': 1.3933435193800052,
             'Nokia': 1.3720229430315414,
             'Ericsson-Worldwide': 1.3595152088343005,
             'Procter & Gamble': 1.3446942266382103,
             'ABB': 1.296036059699818,
             

In [128]:
def ent_avg(probMatrix):
    entropy_list = []
    for i in range(len(probMatrix)): 
        entropy_list.append(sp.stats.entropy(probMatrix[i]))
    return np.mean(entropy_list)

In [129]:
%%time

entropy_avg_results = defaultdict(float)

for company, proba_df in docs_of_probas.items():
    entropy_avg_results[company] = ent_avg(proba_df.values)

CPU times: user 235 ms, sys: 6.31 ms, total: 242 ms
Wall time: 298 ms


In [130]:
entropy_avg_results

defaultdict(float,
            {'Hays': 0.9347030644082308,
             'Boston Consulting Group': 0.940904311154074,
             'Oracle': 1.0858373795464102,
             'Philips': 0.9475818681114859,
             'IBM': 1.070987978065448,
             'Amazon': 1.2557512025405413,
             'Orange': 0.8347193130021504,
             'DXC Technology': 0.9096828829844992,
             'Deloitte': 1.0500737999881138,
             'Citi': 1.139216960597608,
             'Microsoft': 1.0691058942009501,
             'Altran Group': 0.8511341941971913,
             'NTT': 0.7138935886016018,
             'Continental': 0.9433195860921886,
             'Thales': 0.7442552037795543,
             'Thermo Fisher Scientific': 0.9359049204254245,
             'Google': 1.0206889088327655,
             'Nokia': 1.0592845262753936,
             'Ericsson-Worldwide': 1.0574781404568672,
             'Procter & Gamble': 0.9839316990623069,
             'ABB': 0.804199290029007,
             '

In [131]:
# function to compute the cross-entropy of two probability distributions
def cross_entropy(p, q):
    for i in range(len(p)):
        p[i] = p[i]+1e-12
    for i in range(len(q)):
        q[i] = q[i]+1e-12

    return -sum([p[i] * np.log2(q[i]) for i in range(len(p))])

# function to compute the average cross-entropy of a matrix
def avg_crossEnt(probMatrix): 
#    NOTE: Cross entropy is not symmetric. 
#    This function takes both cross-entropy(p,q) and cross-entropy(q,p) 
#    into account when computing the avg
    crossEntropy_list = []
    for i in range(len(probMatrix)):
        for j in range(len(probMatrix)): 
            if i != j:
                crossEntropy_list.append(cross_entropy(probMatrix[i], probMatrix[j]))
    return np.mean(crossEntropy_list)

In [132]:
%%time

cross_entropy_results = defaultdict(float)

for company, proba_df in docs_of_probas.items():
    cross_entropy_results[company] = avg_crossEnt(proba_df.values)
    
# avg_crossEnt(docs_topics_df.values)

CPU times: user 11min 19s, sys: 1.36 s, total: 11min 21s
Wall time: 11min 27s


In [133]:
cross_entropy_results

defaultdict(float,
            {'Hays': 9.45118768215931,
             'Boston Consulting Group': 8.992603143668836,
             'Oracle': 10.236883386216952,
             'Philips': 9.581171680357297,
             'IBM': 9.482374507187536,
             'Amazon': 11.02774639866361,
             'Orange': 9.159664224451324,
             'DXC Technology': 9.314463527304103,
             'Deloitte': 9.574724640288334,
             'Citi': 9.711687249342075,
             'Microsoft': 10.043317095352132,
             'Altran Group': 9.36795758414793,
             'NTT': 8.352029313034441,
             'Continental': 8.988820556634364,
             'Thales': 9.478352622134727,
             'Thermo Fisher Scientific': 8.590687919835862,
             'Google': 8.979765369278256,
             'Nokia': 9.66275035995434,
             'Ericsson-Worldwide': 9.654204974175151,
             'Procter & Gamble': 9.39231563555692,
             'ABB': 8.476985118333474,
             'Hewlett Packard Ent