In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import nltk, re
# nltk.download('wordnet')
import scipy as sp
import math
import numpy as np
from string import punctuation
import csv
from functools import partial
import matplotlib.pyplot as plt
from collections import namedtuple

pd.set_option('display.max_columns', None)

%matplotlib inline

In [19]:
netflix_df = pd.read_pickle("Netflix_Data")

The following function will remove the company names from their respective reviews.

In [20]:
stopwords = nltk.corpus.stopwords.words('english')

def comp_name_out(data, col_to_search, col_reviews, companies_list):
    
    """
    This function takes in a dataframe, the name of the column with all of 
    the companies, the name of the column with the reviews, and an iterable
    with the companies names that are in the dataset. The latter could be a list,
    set, Series, tuple, etc.
    """
    
    for company in companies_list:
        condition = (data[col_to_search] == company)
        data.loc[condition, col_reviews] = data.loc[condition, col_reviews].str.lower().str.strip(company.lower())
    
    return data

The following function helps with the preprocessing of the data. It runs after the lemmatizer, stemmer, snowball, etc.

In [21]:
def normalize_doc(doc):
    """
    This function normalizes your list of documents by taking only
    words, numbers, and spaces in between them. It then filters out
    stop words if you want to.
    """
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
#     filtered_tokens = [token for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopwords]
    doc = ' '.join(filtered_tokens)
    return doc

corp_normalizer = np.vectorize(normalize_doc)

Function to get the root of the word. You can get all three (lemma, stem, and snow) or use them separately with the partial functions below. The function above and the function below have been vectorized, meaning, it is like a loop where you only need to pass your iterable to it and it applies the function to every single element.

In [22]:
def root_of_word(docs, root_word_method='lemma'):
    porter_stemmer = nltk.stem.PorterStemmer()
    snowball_stemmer = nltk.stem.SnowballStemmer('english')
    lemma = nltk.wordnet.WordNetLemmatizer()
    
    tokens = nltk.word_tokenize(docs)
    
    if root_word_method == 'lemma':
        doc = ' '.join([lemma.lemmatize(w) for w in tokens])
    elif root_word_method == 'stemm':
        doc = ' '.join([porter_stemmer.stem(w) for w in tokens])
    elif root_word_method == 'snowball':
        doc = ' '.join([snowball_stemmer.stem(w) for w in tokens])
        
    return doc

stemming = np.vectorize(partial(root_of_word, root_word_method='stemm'))
snowball = np.vectorize(partial(root_of_word, root_word_method='snowball'))

In [23]:
def jsd(p, q, base=np.e): # JS distance between probability vectors, used to compute compH
    '''
        Implementation of pairwise `jsd` based on  
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
    '''
    p = np.asarray(p)
    q = np.asarray(q)
    m = (1 / 2 * (p + q))
    return sp.stats.entropy(p, m, base) / 2 +  sp.stats.entropy(q, m, base) / 2

In [24]:
def conth(data): # function to measure content heterogeneity given a topic (prob) matrix
    return (1 / ((sum(map(sum, np.square(data.values)))) / data.shape[0]))

In [25]:
def comph(data): 
    #Transform probMatrix_df into 2D array
        
    df = pd.DataFrame()
    for x in range(len(data)): 
        jsd_list = []
        for y in range(len(data)): 
            jsd_list.append(jsd(data[x], data[y]))
        df[str(x)] = jsd_list

    #Get df lower diagonal
    mask = np.ones(df.shape, dtype='bool')
    mask[np.triu_indices(len(df))] = False
    df_lower_diagonal = df[(df>-1) & mask]
    
    distance_list = []
    for k in range(len(df)): 
    #Transform each column of df_lower_diagonal into list
        column_list = df_lower_diagonal[str(k)].values.tolist()
        #Drop nan values from column_list - to retain only actual values from lower diagonal 
        column_lower_diagonal_list = [l for l in column_list if (math.isnan(l) == False)]
        for d in column_lower_diagonal_list: 
            distance_list.append(d)
            
    return sum(distance_list) / float(len(distance_list))

Remove the company names from the reviews, and extract the reviews into a numpy array.

In [26]:
comp_list = ['Netflix', 'amazon'] 
netflix_df = comp_name_out(netflix_df, 'employerName', 'pros', comp_list)
data_pros = netflix_df['pros'].values

Apply the preprocessing functions in combination.

In [28]:
%%time
data_pros_cleaned = corp_normalizer(stemming(data_pros))

CPU times: user 749 ms, sys: 6.51 ms, total: 756 ms
Wall time: 761 ms


array(['work talent ppl around',
       'reedom respons treat like adult part pro team highli function compani well respect ha super posit brand awar never abl go anywher without get pepper rave happi custom comment love wear compani logo gear thi reason matter netflix dead wood everyon someth veri import compani would nt matter make differ feel good realli make differ matter role compani often repeat employe olymp team mean nt thi 247 life focu gold varsiti team play win veri good still balanc class learn need good worklif balanc veri import compani think best thing subtl thi vacationno holiday stuff work work seem realiz everyon work hard time night weekend often ton hard work want take time vacat long holiday day whatever happen discret nt ask nt get permiss one keep track never heard thi kind polici imagin doest moral feel like treat like grown rule alon think stand beyond ani org',
       'great colleagu incred realli'], dtype='<U1247')

Calculate the total words in the dictionary of review words.

In [29]:
TotalWords_vectorizer = CountVectorizer()
TotalWords_tf = TotalWords_vectorizer.fit_transform(data_pros)
totWords = len(TotalWords_vectorizer.get_feature_names())
totWords

2130

Get the cleaned dictionary.

In [30]:
tf_vectorizer = CountVectorizer(max_df = 0.90, min_df=0.01)
tf = tf_vectorizer.fit_transform(data_pros_cleaned)
tf_feature_names = tf_vectorizer.get_feature_names()

Percentage of words in the final dictionary that can be found in the full corpus.

In [31]:
percVoc = len(tf_feature_names) / totWords * 100
percVoc

17.981220657276996

Our tuple that will collect the topic number, the dissimilarity average, and the model.

In [32]:
Collect = namedtuple("Models", "topics diss_avg models")
output = Collect([], [], [])

Every model runs in parallel.

In [33]:
%%time

for topics in range(2, 300, 5): 
    
    lda = LatentDirichletAllocation(n_components=topics, max_iter=200, 
                                    learning_method='batch', learning_offset=10.,
                                    evaluate_every=2, random_state=1234, n_jobs=-1)
    
    lda_model = lda.fit(tf)
    topicsOverWords = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
    
    output.topics.append(topics)
    output.diss_avg.append(comph(topicsOverWords))
    output.models.append(lda_model)
    
    print(f"Done with topic {topics}")

Done with topic 2
Done with topic 7
Done with topic 12
Done with topic 17
Done with topic 22
Done with topic 27
Done with topic 32
Done with topic 37
Done with topic 42
Done with topic 47
Done with topic 52
Done with topic 57
Done with topic 62
Done with topic 67
Done with topic 72
Done with topic 77
Done with topic 82
Done with topic 87
Done with topic 92
Done with topic 97
Done with topic 102
Done with topic 107
Done with topic 112
Done with topic 117
Done with topic 122
Done with topic 127
Done with topic 132
Done with topic 137
Done with topic 142
Done with topic 147
Done with topic 152
Done with topic 157
Done with topic 162
Done with topic 167
Done with topic 172
Done with topic 177
Done with topic 182
Done with topic 187
Done with topic 192
Done with topic 197
Done with topic 202
Done with topic 207
Done with topic 212
Done with topic 217
Done with topic 222
Done with topic 227
Done with topic 232
Done with topic 237
Done with topic 242
Done with topic 247
Done with topic 252
Do

Here is the final dataframe with all three outputs.

In [34]:
out_df = pd.DataFrame(zip(*output), columns=['topics', 'coherence', 'models'])
out_df.head()

Unnamed: 0,topics,coherence,models
0,2,0.309527,"LatentDirichletAllocation(evaluate_every=2, ma..."
1,7,0.358032,"LatentDirichletAllocation(evaluate_every=2, ma..."
2,12,0.423534,"LatentDirichletAllocation(evaluate_every=2, ma..."
3,17,0.424758,"LatentDirichletAllocation(evaluate_every=2, ma..."
4,22,0.457503,"LatentDirichletAllocation(evaluate_every=2, ma..."


In [35]:
optimal_topics = int(out_df.loc[out_df['coherence'].idxmax(), 'topics'])
optimal_topics

162

In [36]:
out_df[out_df['topics'] == optimal_topics]

Unnamed: 0,topics,coherence,models
32,162,0.552373,"LatentDirichletAllocation(evaluate_every=2, ma..."


No need to run the model again since we already saved all of them in the dataframe.

In [37]:
best_model = out_df.loc[out_df['topics'] == optimal_topics, 'models'].iloc[0]

Now transform the corpus to get the probability matrix.

In [38]:
#generate matrix summarizing distribution of docs (reviews) over topics
docs_topics_df = pd.DataFrame(best_model.transform(tf))

### Calculate the measures of interest

In [39]:
%%time

comP_H = comph(docs_topics_df.values)
comP_H

CPU times: user 25.4 s, sys: 138 ms, total: 25.6 s
Wall time: 25.8 s


0.6001764879390206

In [40]:
%%time

conT_H = conth(docs_topics_df)
conT_H

CPU times: user 20.5 ms, sys: 1.37 ms, total: 21.8 ms
Wall time: 22.6 ms


2.068080930909674

In [48]:
def ent_avg(probMatrix):
    entropy_list = []
    for i in range(len(probMatrix)): 
        entropy_list.append(sp.stats.entropy(probMatrix[i]))
    entropy_avg = np.mean(entropy_list)
    return entropy_avg    

In [49]:
ent_avg(docs_topics_df.values)

1.5340804507420907

In [57]:
# function to compute the cross-entropy of two probability distributions
def cross_entropy(p, q):
    for i in range(len(p)):
        p[i] = p[i]+1e-12
    for i in range(len(q)):
        q[i] = q[i]+1e-12

    return -sum([p[i] * np.log2(q[i]) for i in range(len(p))])

# function to compute the average cross-entropy of a matrix
def avg_crossEnt(probMatrix): 
#    NOTE: Cross entropy is not symmetric. 
#    This function takes both cross-entropy(p,q) and cross-entropy(q,p) 
#    into account when computing the avg
    crossEntropy_list = []
    for i in range(len(probMatrix)):
        for j in range(len(probMatrix)): 
            if i != j:
                crossEntropy_list.append(cross_entropy(probMatrix[i], probMatrix[j]))
    return np.mean(crossEntropy_list)

In [58]:
%%time

avg_crossEnt(docs_topics_df.values)

10.590204388743519