In [1]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import pickle

from nltk.corpus import stopwords
# NLTK's English stop-word list followed by a few extra tokens to filter out.
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use',
]

#load pkl in a list
def load_corpus(pkl_path):
    """Load a pickled mapping and return its values as a list.

    Args:
        pkl_path: Path of the pickle file to read. The unpickled object
            must expose ``.values()`` (e.g. a dict).

    Returns:
        list: The mapping's values, in the mapping's iteration order.

    Note:
        ``pickle.load`` can execute arbitrary code while deserializing;
        only use this on files from a trusted source.
    """
    # Read from the path the caller passed in, and let the context manager
    # close the file handle even if unpickling fails.
    with open(pkl_path, "rb") as pkl_file:
        rfc_dict = pickle.load(pkl_file)
    return list(rfc_dict.values())

# load_corpus() returns the values stored in the pickle (presumably one cleaned
# RFC text per entry — verify against how the pickle was written). Each value
# is split on single spaces into a list of tokens.
cleaned_rfcs = load_corpus('rfc_pkl.pkl')
data_words = [rfc_text.split(' ') for rfc_text in cleaned_rfcs]

In [2]:
# Phrase detection over the tokenized corpus. A higher threshold yields fewer
# phrases.
bigram = gensim.models.phrases.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
# Phraser wraps a trained Phrases model; it is the faster way to get a
# sentence clubbed as a bigram/trigram.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram model is trained on the corpus after the bigram model has been
# applied to it.
trigram = gensim.models.phrases.Phrases(
    bigram[data_words],
    threshold=100,
)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document with ``simple_preprocess`` and drop stop words.

    Args:
        texts: Iterable of documents. A document may be a string or a
            list/tuple of tokens; anything else is converted with ``str()``.

    Returns:
        list[list[str]]: For every document, the tokens produced by
        ``simple_preprocess`` that are not in the module-level ``stop_words``.
    """
    # Snapshot the stop words into a set once per call so each membership
    # test is a hash lookup instead of a scan of the whole list.
    stop_set = set(stop_words)

    def as_text(doc):
        # Token sequences are joined with spaces instead of being passed
        # through str(): the list repr escapes characters such as newlines
        # and form feeds, and the escape letters would then be tokenized as
        # if they were part of the neighbouring words.
        if isinstance(doc, (list, tuple)):
            return " ".join(str(tok) for tok in doc)
        return str(doc)

    return [
        [word for word in simple_preprocess(as_text(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize token lists, keeping only tokens with an allowed POS tag.

    Each token list in ``texts`` is joined with single spaces and run through
    the module-level ``nlp`` pipeline; the result for that document is the
    ``lemma_`` of every token whose ``pos_`` is in ``allowed_postags``.

    Tag reference: https://spacy.io/api/annotation
    """
    def lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [lemmas_of(sent) for sent in texts]



In [3]:
# spaCy pipeline used by lemmatization(); the parser and NER components are
# disabled for efficiency.
# Model install: python3 -m spacy download en
# NOTE(review): the 'en' shortcut name is presumably only available on
# spaCy 2.x — confirm against the installed spaCy version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Filter stop words, then merge detected bigrams into single tokens.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [4]:
# Build the gensim dictionary from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is created from.
texts = data_lemmatized

# Term-document frequency: one doc2bow() result per document.
corpus = list(map(id2word.doc2bow, texts))

# Quick look at what was built.
print('corpus type: ', type(corpus))

corpus type:  <class 'list'>


In [5]:
# Train the LDA topic model on the bag-of-words corpus.
num_topics = 50
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Pretty-print whatever print_topics() returns with its default arguments.
pprint(lda_model.print_topics())

# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]
# coherence and perplexity : metrics for topic model

[(21,
  '0.043*"nfsv" + 0.035*"lock" + 0.032*"file" + 0.029*"oper" + 0.026*"lfb" + '
  '0.021*"filehandl" + 0.019*"case" + 0.018*"open" + 0.017*"stateid" + '
  '0.016*"compon"'),
 (13,
  '0.050*"locat" + 0.028*"inform" + 0.027*"element" + 0.027*"xml" + '
  '0.022*"xselement" + 0.021*"presenc" + 0.021*"datum" + 0.017*"devic" + '
  '0.013*"provid" + 0.013*"schema"'),
 (10,
  '0.073*"printer" + 0.038*"job" + 0.031*"ipp" + 0.027*"telnet" + '
  '0.019*"print" + 0.012*"log" + 0.011*"user" + 0.011*"rfc" + 0.010*"suppli" + '
  '0.010*"command"'),
 (46,
  '0.025*"mrb" + 0.022*"code" + 0.021*"decod" + 0.016*"xsdelement" + '
  '0.012*"broker" + 0.011*"hip" + 0.011*"puzzl" + 0.011*"track" + 0.010*"silk" '
  '+ 0.009*"return"'),
 (42,
  '0.071*"flow" + 0.034*"inform" + 0.033*"jw" + 0.027*"templat" + 0.027*"rfc" '
  '+ 0.025*"element" + 0.023*"select" + 0.021*"export" + 0.021*"record" + '
  '0.020*"process"'),
 (1,
  '0.026*"interv" + 0.020*"line" + 0.019*"current" + 0.019*"second" + '
  '0.017*"int

In [6]:
# Log-perplexity of the model, evaluated on the corpus it was trained on.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# 'c_v' coherence, computed from the lemmatized texts and the dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Perplexity:  -6.954321521155356

Coherence Score:  0.44048501159229575


In [7]:
# Enable inline rendering, prepare the visualization data from the model,
# corpus and dictionary, and leave `vis` as the cell's last expression so the
# notebook displays it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
