In [1]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import pickle

from nltk.corpus import stopwords
# NLTK's English stop-word list, augmented in place with a few extra
# tokens that should also be filtered out of this corpus.
stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 're', 'edu', 'use']

#load pkl in a list
def load_corpus(pkl_path):
    """Load a pickled mapping and return its values as a list.

    Args:
        pkl_path: Path to a pickle file whose top-level object provides
            ``.values()`` (e.g. a ``dict``).

    Returns:
        list: The values of the unpickled object, in its iteration order.

    Note:
        ``pickle.load`` can execute arbitrary code while deserializing;
        only call this on files from a trusted source.
    """
    # The context manager closes the file handle even if unpickling raises.
    with open(pkl_path, "rb") as pkl_file:
        rfc_dict = pickle.load(pkl_file)
    return list(rfc_dict.values())

# load_corpus returns the values of the pickled mapping as a list; each
# value is treated here as one document string.
cleaned_rfcs = load_corpus('callflow_pkl.pkl')

# Tokenize every document by splitting on single spaces.
data_words = [document.split(' ') for document in cleaned_rfcs]

In [2]:
# Train the phrase detectors. A higher threshold yields fewer phrases.
bigram = gensim.models.phrases.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
# The trigram detector is trained on the output of the bigram detector.
trigram = gensim.models.phrases.Phrases(
    bigram[data_words],
    threshold=100,
)

# Phraser wrappers: a faster way to get a sentence clubbed as a
# bigram/trigram than indexing the Phrases models directly.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example

# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop the tokens listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is converted with ``str()``
            before being passed to ``simple_preprocess``, so a list of
            tokens is tokenized from its string representation.

    Returns:
        list[list[str]]: One token list per input document, containing only
        the tokens that are not in the module-level ``stop_words``.
    """
    # Build the lookup set once per call: testing membership against the
    # stop-word list directly costs a linear scan for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Return each document in ``texts`` after applying ``bigram_mod`` to it."""
    bigrammed = []
    for tokens in texts:
        bigrammed.append(bigram_mod[tokens])
    return bigrammed

def make_trigrams(texts):
    """Return each document after applying ``bigram_mod`` then ``trigram_mod``."""
    trigrammed = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        trigrammed.append(trigram_mod[with_bigrams])
    return trigrammed

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, which must be defined before this
    function is called. See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of coarse POS tags (``token.pos_``
            values) to keep. The default is an immutable tuple so it cannot
            be modified accidentally between calls.

    Returns:
        list[list[str]]: For every document, the ``lemma_`` of each token
        whose ``pos_`` is in ``allowed_postags``.
    """
    texts_out = []
    for sent in texts:
        # spaCy takes raw text, so the token list is re-joined first.
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out



In [3]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Form bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the spaCy English pipeline by its full package name. The 'en'
# shortcut link was removed in spaCy v3; the explicit name also works on v2
# once the model package is installed. The parser and NER components are
# disabled (for efficiency).
# Install with: python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(type(data_lemmatized))
print(data_lemmatized[0])

<class 'list'>
['exampl', 'suppos', 'host', 'receiv', 'router', 'advertis', 'router', 'router', 'lifetim_second', 'default', 'router', 'prefer', 'medium', 'bodi', 'router', 'advertis', 'contain', 'rout', 'inform', 'option', 'rout', 'lifetim_second', 'rout', 'prefer', 'low', 'process', 'router', 'advertis', 'type', 'host', 'router', 'default', 'router', 'list', 'lifetim_second', 'type', 'host', 'receiv', 'router', 'advertis', 'entri', 'router', 'default', 'router', 'list', 'medium', 'prefer', 'lifetim_second', 'type', 'host', 'entri', 'rout', 'tabl', 'router', 'low', 'prefer', 'lifetim_second', 'process', 'router', 'advertis', 'type', 'host', 'may', 'transient', 'state', 'rout', 'tabl', 'router', 'medium', 'prefer', 'lifetim_second']


In [4]:
# Build the gensim dictionary from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts that get vectorized.
texts = data_lemmatized

# Term-document frequency: convert every document with doc2bow.
corpus = list(map(id2word.doc2bow, texts))

# View
print('corpus type: ', type(corpus))

corpus type:  <class 'list'>


In [5]:
# Train the LDA topic model; random_state is fixed so runs are repeatable.
num_topics = 40
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Show the topics returned by print_topics(), then apply the model to the corpus.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
# coherence and perplexity : metrics for topic model

[(31,
  '0.081*"xro" + 0.039*"int" + 0.033*"string_xro" + 0.030*"ssrc" + '
  '0.021*"unicod" + 0.019*"begin" + 0.015*"normal" + 0.015*"bitcount" + '
  '0.015*"dft" + 0.015*"bitbuf"'),
 (11,
  '0.075*"radiu" + 0.068*"repli" + 0.042*"supportedmethod" + 0.034*"dhcpv" + '
  '0.028*"ge" + 0.025*"otherwis" + 0.025*"server" + 0.020*"fal" + '
  '0.020*"deleg" + 0.012*"solicit"'),
 (29,
  '0.075*"auth" + 0.067*"historyinfo" + 0.064*"hmacalgo" + 0.064*"chunk" + '
  '0.048*"select" + 0.032*"initrandom" + 0.032*"cookieack" + '
  '0.032*"initackrandom" + 0.032*"cookieecho" + 0.028*"idi"'),
 (15,
  '0.069*"offset" + 0.053*"bind" + 0.047*"cbname" + 0.046*"gscbflag" + '
  '0.046*"gs" + 0.046*"gsauthzid" + 0.046*"gsnonstdflag" + 0.046*"saslnam" + '
  '0.046*"gsheader" + 0.026*"xid"'),
 (22,
  '0.098*"sourc" + 0.078*"xselement" + 0.059*"hash" + 0.056*"xscomplextyp" + '
  '0.045*"minoccur" + 0.040*"xsexten" + 0.032*"xscomplexcont" + '
  '0.032*"xssequenc" + 0.020*"xml_version" + 0.017*"encodingutf"'),
 (

In [6]:
# Perplexity, as reported by the model's log_perplexity on the training corpus.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Topic coherence using the 'c_v' measure over the lemmatized texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Perplexity:  -7.6388953032837215

Coherence Score:  0.48971456019746384


In [7]:
# Turn on inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()

# Prepare the visualization from the trained model, corpus and dictionary.
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)

# Leaving `vis` as the final expression makes the notebook display it.
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
