In [1]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
import nltk
# nltk.download('wordnet')
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from joblib import parallel_backend
from dask.distributed import Client
import joblib
from sklearn.preprocessing import FunctionTransformer
from functools import partial
import scipy as sp
import math, re
import numpy as np
import csv
import concurrent.futures as cf
import matplotlib.pyplot as plt
from collections import defaultdict
# from numba import jit

pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
netflix_df = pd.read_csv("netflix.csv")
netflix_df.shape

(693, 39)

In [3]:
import string

def tokenize_snow(text):
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        if token in string.punctuation: continue
        yield stem.stem(token)
        
def tokenize_lemma(text):
    stem = nltk.wordnet.WordNetLemmatizer()
    text = text.lower()

    for token in nltk.word_tokenize(text):
        if token in string.punctuation: continue
        yield stem.stem(token)
        
def tokenize_stem(text):
    stem = nltk.stem.PorterStemmer()
    text = text.lower()

    for token in nltk.word_tokenize(text):
        if token in string.punctuation: continue
        yield stem.stem(token)

In [4]:
# @jit(nopython=True)
def jsd(p, q, base=np.e): # JS distance between probability vectors, used to compute compH
    '''
        Implementation of pairwise `jsd` based on  
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
    '''
    p = np.asarray(p)
    q = np.asarray(q)
    m = 1 / 2 * (p + q)
    
    return sp.stats.entropy(p, m, base=base) / 2 +  sp.stats.entropy(q, m, base=base) / 2

In [5]:
# @jit(nopython=True)
def conth(p_mtx_df): # function to measure content heterogeneity given a topic (prob) matrix
    '''
    How is this review spread across the topics.
    Then you take the average values across the reviews
    Herfindall index
    Assuming the reviews are about culture
    Are people on average focus on a few cultural values (topics) when they write their review
    '''
    return (1 / ((sum(map(sum, np.square(p_mtx_df.values)))) / p_mtx_df.shape[0]))

Needs to have an additional argument. Or be completely redisigned

In [35]:
def comph(probMatrix_df, arr_or_df='df'): 

    if arr_or_df == 'df':
        probMatrix = probMatrix_df
    else:
        probMatrix = probMatrix_df.values
    
    df = pd.DataFrame()
    for x in range(len(probMatrix)): 
        jsd_list = []
        for y in range(len(probMatrix)): 
            jsd_list.append(jsd(probMatrix[x], probMatrix[y]))
        df[str(x)] = jsd_list

    #Get df lower diagonal
    mask = np.ones(df.shape, dtype='bool')
    mask[np.triu_indices(len(df))] = False
    df_lower_diagonal = df[(df>-1) & mask]
    
    distance_list = []
    for k in df.columns: 
    #Transform each column of df_lower_diagonal into list
        col_array = df_lower_diagonal.loc[df_lower_diagonal[k].notna(), k].values
        for d in col_array:
            distance_list.append(d)

    return (sum(distance_list) / len(distance_list))

In [8]:
def comp_name_out(data, col_to_search, col_reviews, companies_list):
    
    """
    This function takes in a dataframe, the name of the column with all of 
    the companies, the name of the column with the reviews, and an iterable
    with the companies names that are in the dataset. The latter could be a list,
    set, Series, tuple, etc.
    """
    
    for company in companies_list:
        condition = (data[col_to_search] == company)
        data.loc[condition, col_reviews] = data.loc[condition, col_reviews].str.lower().str.strip(company.lower())
    
    return data

In [9]:
comp_list = ['Netflix', 'amazon'] 
netflix_df = comp_name_out(netflix_df, 'employerName', 'pros', comp_list)

In [10]:
data_pros = netflix_df['pros'].values
data_pros[0]

'you will be working with the most talented ppl around.'

In [11]:
def normalize_doc(doc):
    """
    This function normalizes your list of documents by taking only
    words, numbers, and spaces in between them. It then filters out
    stop words if you want to.
    """
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
    filtered_tokens = [token for token in tokens]
#     filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

corp_normalizer = np.vectorize(normalize_doc)

In [12]:
with cf.ProcessPoolExecutor() as executor:
    data_pros_cleaned = executor.map(normalize_doc, data_pros)
    data_pros_cleaned = list(data_pros_cleaned)

TotalWords_vectorizer = CountVectorizer(stop_words='english')
TotalWords_tf = TotalWords_vectorizer.fit_transform(data_pros_cleaned)
totWords = len(TotalWords_vectorizer.get_feature_names())
# totWords


tf_vectorizer = CountVectorizer(tokenizer=tokenize_snow, max_df = 0.90, min_df=0.01)

tf = tf_vectorizer.fit_transform(data_pros_cleaned)

tf_feature_names = tf_vectorizer.get_feature_names()

percVoc = len(tf_feature_names) / totWords * 100
percVoc

16.52046783625731

In [14]:
%%time

i = 0    
output=np.zeros((60,3))

#totWordsPerdocument = np.sum(tf_matrix, axis=1)
for topics in range(2,300,5): 
    
    lda = LatentDirichletAllocation(n_components=topics, max_iter=200, 
                                    learning_method='online',random_state=1234, n_jobs=-1)
    lda_fit = lda.fit(tf)
    #output normalized matrix with distributions of topics over words
    #normalized
    topicsOverWords = lda_fit.components_ / lda_fit.components_.sum(axis=1)[:, np.newaxis]
    topicsDissim_avg = comph(topicsOverWords)

#store results per firm   
    output[i,0] = topics
    output[i,1] = topicsDissim_avg 
    output[i,2] = percVoc
  
    i = i+1
    
    print(f'Done with topic {topics}')

Done with topic 2
Done with topic 7
Done with topic 12
Done with topic 17
Done with topic 22
Done with topic 27
Done with topic 32
Done with topic 37
Done with topic 42
Done with topic 47
Done with topic 52
Done with topic 57
Done with topic 62
Done with topic 67
Done with topic 72
Done with topic 77
Done with topic 82
Done with topic 87
Done with topic 92
Done with topic 97
Done with topic 102
Done with topic 107
Done with topic 112
Done with topic 117
Done with topic 122
Done with topic 127
Done with topic 132
Done with topic 137
Done with topic 142
Done with topic 147
Done with topic 152
Done with topic 157
Done with topic 162
Done with topic 167
Done with topic 172
Done with topic 177
Done with topic 182
Done with topic 187
Done with topic 192
Done with topic 197
Done with topic 202
Done with topic 207
Done with topic 212
Done with topic 217
Done with topic 222
Done with topic 227
Done with topic 232
Done with topic 237
Done with topic 242
Done with topic 247
Done with topic 252
Do

In [None]:
%%time


output = defaultdict(np.float32)

for i, topics in enumerate(range(2, 300, 5)):

    lda = LatentDirichletAllocation(n_components=topics, max_iter=200, learning_method='online', 
                                    random_state=1234, n_jobs=-1)
    ldamo = lda.fit(tf)

    #output normalized matrix with distributions of topics over words
    #normalized
    topicsOverWords = ldamo.components_ / ldamo.components_.sum(axis=1)[:, np.newaxis]
    topicsDissim_avg = comph(topicsOverWords)

    #store results per firm   
    output[i] = (topics, topicsDissim_avg, percVoc)
    print(f'Done with topic {topics}')

In [None]:
%%time

# to use joblib with dask we need a client
# client = Client(processes=False)
mo_list = []


for topics in range(2, 300, 5):
    with joblib.parallel_backend('dask'):
        lda = LatentDirichletAllocation(n_components=topics, max_iter=200, learning_method='batch', 
                                        learning_offset=10., evaluate_every=2, random_state=1234)#, n_jobs=-1)
        ldamo = lda.fit(tf)
        mo_list.append((topics, ldamo))

In [79]:
tops = (ldamo.components_ / ldamo.components_.sum(axis=1)[:, np.newaxis])

In [15]:
topics_coherence_df = pd.DataFrame(output, columns=['topics', 'coherence', 'voc%'])
topics_coherence_df.head()

Unnamed: 0,topics,coherence,voc%
0,2.0,0.299548,16.520468
1,7.0,0.390344,16.520468
2,12.0,0.432444,16.520468
3,17.0,0.418613,16.520468
4,22.0,0.424082,16.520468


In [16]:
optimal_topics = int(topics_coherence_df.loc[topics_coherence_df['coherence'].idxmax(), 'topics'])
optimal_topics

12

In [17]:
lda = LatentDirichletAllocation(n_components=optimal_topics,
                                max_iter=200, 
                                learning_method='batch', 
                                learning_offset=10.,
                                evaluate_every=2,
                                random_state=1234,
                                n_jobs=-1)

In [18]:
#generate matrix summarizing distribution of docs (reviews) over topics
probMatrix = lda.fit_transform(tf)

In [19]:
probMatrix

array([[8.33362002e-03, 8.33358403e-03, 8.33344468e-03, ...,
        8.33378128e-03, 4.36523807e-01, 8.33370994e-03],
       [4.10284195e-02, 3.68742235e-04, 3.68753442e-04, ...,
        2.51008316e-01, 7.04644549e-01, 3.68751822e-04],
       [1.66668211e-02, 1.66667274e-02, 8.16664791e-01, ...,
        1.66668440e-02, 1.66667517e-02, 1.66668555e-02],
       ...,
       [2.77782128e-02, 2.77792629e-02, 2.77779631e-02, ...,
        2.77780380e-02, 2.77779938e-02, 2.77782421e-02],
       [4.62984546e-03, 4.57358786e-01, 4.62973700e-03, ...,
        4.62983456e-03, 4.62990910e-03, 4.62979304e-03],
       [9.25970965e-03, 9.25946955e-03, 8.98145585e-01, ...,
        9.25931826e-03, 9.25932319e-03, 9.25934628e-03]])

In [20]:
docs_topics_df = pd.DataFrame(data = probMatrix)
docs_topics_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.008334,0.008334,0.008333,0.236404,0.252069,0.008334,0.008333,0.008334,0.008334,0.008334,0.436524,0.008334
1,0.041028,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.000369,0.251008,0.704645,0.000369
2,0.016667,0.016667,0.816665,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667
3,0.000794,0.058097,0.000794,0.000794,0.000794,0.000794,0.054277,0.074014,0.000794,0.442668,0.169775,0.196407
4,0.001302,0.093446,0.17107,0.001302,0.001302,0.079256,0.001302,0.001302,0.001302,0.310491,0.001302,0.336622


In [21]:
%%time

comp_H = comph(docs_topics_df, arr_or_df='arr')
comp_H

CPU times: user 25.5 s, sys: 96.8 ms, total: 25.6 s
Wall time: 25.8 s


0.4907799214962477

In [22]:
%%time

conT_H = conth(docs_topics_df)
conT_H

CPU times: user 2.61 ms, sys: 48 µs, total: 2.66 ms
Wall time: 2.63 ms


1.7702037303029805

In [25]:
def ent_avg(probMatrix): 
    import statistics
    entropy_list = []
    for i in range(len(probMatrix)): 
        entropy_list.append(entropy(probMatrix[i]))
    entropy_avg = statistics.mean(entropy_list)
    return entropy_avg    

ent_avg(docs_topics_df)

NameError: name 'entropy' is not defined

In [24]:
# function to compute the cross-entropy of two probability distributions
def cross_entropy(p, q):
    for i in range(len(p)):
        p[i] = p[i]+1e-12
    for i in range(len(q)):
        q[i] = q[i]+1e-12

    return -sum([p[i]*log2(q[i]) for i in range(len(p))])

In [None]:
# function to compute the average cross-entropy of a matrix
def avg_crossEnt(probMatrix): 
#    NOTE: Cross entropy is not symmetric. 
#    This function takes both cross-entropy(p,q) and cross-entropy(q,p) 
#    into account when computing the avg
    crossEntropy_list = []
    for i in range(len(probMatrix)):
        for j in range(len(probMatrix)): 
            if i != j:
                crossEntropy_list.append(cross_entropy(probMatrix[i], probMatrix[j]))
    crossEntropy_avg = statistics.mean(crossEntropy_list)
    return crossEntropy_avg    