In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import nltk, re
# nltk.download('wordnet')
import scipy as sp
import math
import numpy as np
from string import punctuation
import csv
from functools import partial
import concurrent.futures as cf
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

%matplotlib inline

In [None]:
pd.read_parquet('nameoffile', compression='gzip')

In [2]:
netflix_df = pd.read_pickle("Netflix_Data")

The following function will remove the company names from their respective reviews.

In [3]:
stopwords = nltk.corpus.stopwords.words('english')

def comp_name_out(data, col_to_search, col_reviews, companies_list):
    
    """
    This function takes in a dataframe, the name of the column with all of 
    the companies, the name of the column with the reviews, and an iterable
    with the companies names that are in the dataset. The latter could be a list,
    set, Series, tuple, etc.
    """
    
    for company in companies_list:
        condition = (data[col_to_search] == company)
        data.loc[condition, col_reviews] = data.loc[condition, col_reviews].str.lower().str.strip(company.lower())
    
    return data

The following function helps with the preprocessing of the data. It runs after the lemmatizer, stemmer, snowball, etc.

In [4]:
def normalize_doc(doc):
    """
    This function normalizes your list of documents by taking only
    words, numbers, and spaces in between them. It then filters out
    stop words if you want to.
    """
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
#     filtered_tokens = [token for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopwords]
    doc = ' '.join(filtered_tokens)
    return doc

Function to get the root of the word. You can get all three (lemma, stem, and snow) or use them separately with the partial functions below.

In [5]:
def root_of_word(docs, root_word_method='lemma'):
    porter_stemmer = nltk.stem.PorterStemmer()
    snowball_stemmer = nltk.stem.SnowballStemmer('english')
    lemma = nltk.wordnet.WordNetLemmatizer()
    
    tokens = nltk.word_tokenize(docs)
    
    if root_word_method == 'lemma':
        doc = ' '.join([lemma.lemmatize(w) for w in tokens])
    elif root_word_method == 'stemm':
        doc = ' '.join([porter_stemmer.stem(w) for w in tokens])
    elif root_word_method == 'snowball':
        doc = ' '.join([snowball_stemmer.stem(w) for w in tokens])
        
    return doc

stemming = partial(root_of_word, root_word_method='stemm')
snowball = partial(root_of_word, root_word_method='snowball')

In [6]:
def jsd(p, q, base=np.e): # JS distance between probability vectors, used to compute compH
    '''
        Implementation of pairwise `jsd` based on  
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
    '''
    p = np.asarray(p)
    q = np.asarray(q)
    m = (1 / 2 * (p + q))
    return sp.stats.entropy(p, m, base) / 2 +  sp.stats.entropy(q, m, base) / 2

In [7]:
def conth(data): # function to measure content heterogeneity given a topic (prob) matrix
    return (1 / ((sum(map(sum, np.square(data.values)))) / data.shape[0]))

In [8]:
def comph(data): 
    #Transform probMatrix_df into 2D array
        
    df = pd.DataFrame()
    for x in range(len(data)): 
        jsd_list = []
        for y in range(len(data)): 
            jsd_list.append(jsd(data[x], data[y]))
        df[str(x)] = jsd_list

    #Get df lower diagonal
    mask = np.ones(df.shape, dtype='bool')
    mask[np.triu_indices(len(df))] = False
    df_lower_diagonal = df[(df>-1) & mask]
    
    distance_list = []
    for k in range(len(df)): 
    #Transform each column of df_lower_diagonal into list
        column_list = df_lower_diagonal[str(k)].values.tolist()
        #Drop nan values from column_list - to retain only actual values from lower diagonal 
        column_lower_diagonal_list = [l for l in column_list if (math.isnan(l) == False)]
        for d in column_lower_diagonal_list: 
            distance_list.append(d)
            
    return sum(distance_list) / float(len(distance_list))

Remove the company names from the reviews, and extract the reviews into a numpy array.

In [9]:
comp_list = ['Netflix', 'amazon'] 
netflix_df = comp_name_out(netflix_df, 'employerName', 'pros', comp_list)
data_pros = netflix_df['pros'].values

The text preprocessing of the corpus takes place in parallel.

In [11]:
%%time

with cf.ProcessPoolExecutor() as e:
    data_pros_cleaned = e.map(stemming, data_pros)
    data_pros_cleaned = list(e.map(normalize_doc, data_pros_cleaned))
data_pros_cleaned[:3]

CPU times: user 947 ms, sys: 16.9 ms, total: 964 ms
Wall time: 1.07 s


array(['work talent ppl around',
       'reedom respons treat like adult part pro team highli function compani well respect ha super posit brand awar never abl go anywher without get pepper rave happi custom comment love wear compani logo gear thi reason matter netflix dead wood everyon someth veri import compani would nt matter make differ feel good realli make differ matter role compani often repeat employe olymp team mean nt thi 247 life focu gold varsiti team play win veri good still balanc class learn need good worklif balanc veri import compani think best thing subtl thi vacationno holiday stuff work work seem realiz everyon work hard time night weekend often ton hard work want take time vacat long holiday day whatever happen discret nt ask nt get permiss one keep track never heard thi kind polici imagin doest moral feel like treat like grown rule alon think stand beyond ani org',
       'great colleagu incred realli'], dtype='<U1247')

Calculate the total words in the dictionary of review words.

In [12]:
TotalWords_vectorizer = CountVectorizer()
TotalWords_tf = TotalWords_vectorizer.fit_transform(data_pros)
totWords = len(TotalWords_vectorizer.get_feature_names())

2130

Get the cleaned dictionary.

In [13]:
tf_vectorizer = CountVectorizer(max_df = 0.90, min_df=0.01)
tf = tf_vectorizer.fit_transform(data_pros_cleaned)
tf_feature_names = tf_vectorizer.get_feature_names()

Percentage of words in the final dictionary that can be found in the full corpus.

In [14]:
percVoc = len(tf_feature_names) / totWords * 100
percVoc

17.981220657276996

Function to parallelize the models.

In [None]:
our_range = range(2, 300, 5)

def get_models(topics):
    lda = LatentDirichletAllocation(n_components=topics, max_iter=200, 
                                    learning_method='batch', learning_offset=10.,
                                    evaluate_every=2, random_state=1234)
    
    lda_model = lda.fit(tf)
    topicsOverWords = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
    
    
    return (topics, comph(topicsOverWords), lda_model)

Run the models in parallel.

In [19]:
%%time

with cf.ProcessPoolExecutor() as e:
    output = list(e.map(get_models, our_range))

CPU times: user 153 ms, sys: 181 ms, total: 335 ms
Wall time: 6min 50s


In [23]:
out_df = pd.DataFrame(output, columns=['topics', 'coherence', 'models'])
out_df.head()

Unnamed: 0,topics,coherence,models
0,2,0.309376,"LatentDirichletAllocation(evaluate_every=2, ma..."
1,7,0.359481,"LatentDirichletAllocation(evaluate_every=2, ma..."
2,12,0.418974,"LatentDirichletAllocation(evaluate_every=2, ma..."
3,17,0.447498,"LatentDirichletAllocation(evaluate_every=2, ma..."
4,22,0.444542,"LatentDirichletAllocation(evaluate_every=2, ma..."


In [24]:
optimal_topics = int(out_df.loc[out_df['coherence'].idxmax(), 'topics'])
optimal_topics

172

In [30]:
out_df[out_df['topics'] == optimal_topics]

Unnamed: 0,topics,coherence,models
34,172,0.584355,"LatentDirichletAllocation(evaluate_every=2, ma..."


In [25]:
best_model = out_df.loc[out_df['topics'] == optimal_topics, 'models'].iloc[0]

In [26]:
#generate matrix summarizing distribution of docs (reviews) over topics
docs_topics_df = pd.DataFrame(best_model.transform(tf))

### Calculate the measures of interest

In [39]:
%%time

comP_H = comph(docs_topics_df.values)
comP_H

CPU times: user 25.4 s, sys: 138 ms, total: 25.6 s
Wall time: 25.8 s


0.6001764879390206

In [40]:
%%time

conT_H = conth(docs_topics_df)
conT_H

CPU times: user 20.5 ms, sys: 1.37 ms, total: 21.8 ms
Wall time: 22.6 ms


2.068080930909674

In [48]:
def ent_avg(probMatrix):
    entropy_list = []
    for i in range(len(probMatrix)): 
        entropy_list.append(sp.stats.entropy(probMatrix[i]))
    entropy_avg = np.mean(entropy_list)
    return entropy_avg    

In [49]:
ent_avg(docs_topics_df.values)

1.5340804507420907

In [57]:
# function to compute the cross-entropy of two probability distributions
def cross_entropy(p, q):
    for i in range(len(p)):
        p[i] = p[i]+1e-12
    for i in range(len(q)):
        q[i] = q[i]+1e-12

    return -sum([p[i] * np.log2(q[i]) for i in range(len(p))])

# function to compute the average cross-entropy of a matrix
def avg_crossEnt(probMatrix): 
#    NOTE: Cross entropy is not symmetric. 
#    This function takes both cross-entropy(p,q) and cross-entropy(q,p) 
#    into account when computing the avg
    crossEntropy_list = []
    for i in range(len(probMatrix)):
        for j in range(len(probMatrix)): 
            if i != j:
                crossEntropy_list.append(cross_entropy(probMatrix[i], probMatrix[j]))
    return np.mean(crossEntropy_list)

In [58]:
%%time

avg_crossEnt(docs_topics_df.values)

10.590204388743519