In [1]:
# Import modules
import pandas as pd
import re
import gensim
from gensim.utils import simple_preprocess

In [2]:
df = pd.read_csv('reviewproscons.csv',index_col=0)
df.head()

  mask |= (ar1 == a)


Unnamed: 0,review
0,"Great Benefits like healthcare and 401k, nice ..."
1,Prestigious organization with great vision and...
2,Global company making variety of products. Ben...
3,Great and relaxed atmosphere\r\nTeam members a...
4,


In [3]:
# Remove the nulls
df = df.dropna()

In [4]:
# Remove punctuation
df['review_cleaned'] = df['review'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the reviews to lowercase
df['review_cleaned'] = df['review_cleaned'].map(lambda x: x.lower())

# Print out the first rows of reviews
df['review_cleaned'].head()

0    great benefits like healthcare and 401k nice c...
1    prestigious organization with great vision and...
2    global company making variety of products bene...
3    great and relaxed atmosphere\r\nteam members a...
5    personally i love my team and my boss is great...
Name: review_cleaned, dtype: object

In [5]:
keywords_filter = ['culture','value','philosophy','belief']

In [6]:
df['contained'] = df['review'].apply(lambda x: 1 if any(s in x for s in keywords_filter) else 0)
df.head()

Unnamed: 0,review,review_cleaned,contained
0,"Great Benefits like healthcare and 401k, nice ...",great benefits like healthcare and 401k nice c...,0
1,Prestigious organization with great vision and...,prestigious organization with great vision and...,0
2,Global company making variety of products. Ben...,global company making variety of products bene...,0
3,Great and relaxed atmosphere\r\nTeam members a...,great and relaxed atmosphere\r\nteam members a...,0
5,"Personally, I love my team and my boss is grea...",personally i love my team and my boss is great...,0


In [7]:
df_filter = df[df['contained']==1]
df_filter.head(5)

Unnamed: 0,review,review_cleaned,contained
8,"Lots of benefits, good culture.Commute can be ...",lots of benefits good culturecommute can be br...,1
10,"Great 401k match, relaxed culture at workSome ...",great 401k match relaxed culture at worksome p...,1
11,Benefits and ability to try / learn many thing...,benefits and ability to try / learn many thing...,1
24,Make a few good productsThis company is one of...,make a few good productsthis company is one of...,1
28,Beautiful building and roof-top garden. Good l...,beautiful building and roof-top garden good lo...,1


In [28]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=False))  # deacc=True removes punctuations
data = df_filter.review_cleaned.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['lots', 'of', 'benefits', 'good', 'culturecommute', 'can', 'be', 'brutal', 'due', 'to', 'increasing', 'austin', 'traffic', 'around', 'this', 'location', 'can', 'be', 'tough', 'to', 'get', 'position', 'here', 'without', 'many', 'inside', 'connections']]


In [30]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [31]:
# NLTK Stop words
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [32]:
import spacy
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

[['lot', 'benefit', 'good', 'culturecommute', 'brutal', 'due', 'increase', 'austin', 'traffic', 'location', 'tough', 'get', 'position', 'many', 'inside', 'connection']]


In [33]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]


In [34]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [35]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.050*"management" + 0.050*"employee" + 0.028*"pay" + 0.024*"value" + '
  '0.020*"good" + 0.020*"benefit" + 0.018*"high" + 0.017*"low" + 0.014*"level" '
  '+ 0.013*"company"'),
 (1,
  '0.083*"core" + 0.034*"bank" + 0.027*"safety" + 0.025*"integrity" + '
  '0.023*"college" + 0.020*"commitment" + 0.018*"weak" + 0.017*"outstanding" + '
  '0.016*"regional" + 0.013*"region"'),
 (2,
  '0.030*"day" + 0.025*"time" + 0.022*"hour" + 0.022*"work" + 0.016*"get" + '
  '0.015*"manager" + 0.013*"pay" + 0.011*"job" + 0.011*"store" + 0.011*"week"'),
 (3,
  '0.091*"not" + 0.049*"can" + 0.037*"blame" + 0.037*"do" + 0.027*"recently" + '
  '0.024*"situation" + 0.024*"contract" + 0.020*"disconnect" + 0.016*"relax" + '
  '0.016*"payroll"'),
 (4,
  '0.083*"team" + 0.033*"member" + 0.027*"fun" + 0.021*"free" + 0.017*"event" '
  '+ 0.016*"project" + 0.013*"food" + 0.012*"intern" + 0.011*"load" + '
  '0.010*"hike"'),
 (5,
  '0.036*"culture" + 0.028*"company" + 0.017*"opportunity" + 0.016*"great" + '
  '0

In [36]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3821753020928752


In [42]:
from numba import jit
@jit
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [38]:
#Hyperparameter Tuning
#Number of Topics (K)
#Dirichlet hyperparameter alpha: Document-Topic Density
#Dirichlet hyperparameter beta: Word-Topic Density

In [39]:
import numpy as np
import tqdm
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [40]:
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

  0%|          | 0/540 [00:00<?, ?it/s]

ValueError: Stop argument for islice() must be None or an integer: 0 <= x <= sys.maxsize.

In [None]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared