In [15]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [19]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [20]:
df = pd.read_excel('final_dataset.xlsx', sheet_name='Sheet1')

In [24]:
# Convert to list
data = df.Short_description.values.tolist()

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]


In [33]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))



In [34]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [41]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

#def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    #"""https://spacy.io/api/annotation"""
   # texts_out = []
   # for sent in texts:
    #    doc = nlp(" ".join(sent)) 
    #    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    #return texts_out

In [43]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# nlp = spacy.load('en', disable=['parser', 'ner']) -- error 
# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [47]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# human readable format of corpus
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ci', 1),
  ('ended', 1),
  ('erp', 1),
  ('error', 1),
  ('fi', 1),
  ('finance', 1),
  ('sap', 1),
  ('status', 1)]]

In [48]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [49]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.106*"job" + 0.091*"sap" + 0.089*"prd" + 0.089*"erp" + 0.089*"sales" + '
  '0.088*"ci" + 0.087*"sd" + 0.087*"distribution" + 0.082*"set" + '
  '0.071*"status"'),
 (1,
  '0.241*"sc" + 0.185*"backlog" + 0.184*"utility" + 0.164*"report" + '
  '0.047*"asn" + 0.021*"rpt" + 0.021*"shipable" + 0.011*"jc_zmm_mth_mb" + '
  '0.011*"mb" + 0.010*"austin"'),
 (2,
  '0.111*"data" + 0.099*"bpc" + 0.088*"prd" + 0.073*"redwood" + 0.055*"cost" + '
  '0.055*"ended" + 0.055*"sap" + 0.054*"ci" + 0.051*"nw" + 0.051*"atl"'),
 (3,
  '0.105*"job" + 0.102*"redwood" + 0.101*"prd" + 0.100*"ci" + 0.100*"sap" + '
  '0.099*"ok" + 0.098*"ended" + 0.097*"warehouse" + 0.096*"business" + '
  '0.096*"bw"'),
 (4,
  '0.042*"dp" + 0.042*"picks" + 0.042*"ot" + 0.042*"items" + 0.042*"extractio" '
  '+ 0.038*"post" + 0.036*"upload" + 0.035*"rma" + 0.030*"clearing" + '
  '0.026*"records"'),
 (5,
  '0.147*"chain" + 0.110*"file" + 0.100*"set" + 0.084*"update" + '
  '0.080*"status" + 0.079*"atp" + 0.073*"commit" + 0.047*"

In [51]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -3.4637621085271872

Coherence Score:  0.42801766317212653


In [54]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [55]:
mallet_path = 'mallet-2.0.8/bin/mallet' 
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [56]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(8,
  [('ci', 0.2722252898950856),
   ('ended', 0.23854224185532855),
   ('sap', 0.23743787962451685),
   ('job', 0.13970182219768085),
   ('error', 0.10712313638873551),
   ('generated', 0.0016565433462175593),
   ('bi', 0.0011043622308117063),
   ('dd_sap_atlut', 0.0005521811154058532),
   ('dp', 0.0005521811154058532),
   ('zpc_sf_bbur', 0.0005521811154058532)]),
 (5,
  [('pi', 0.157298277425204),
   ('process', 0.15639165911151406),
   ('netweaver', 0.14460562103354488),
   ('integration', 0.14460562103354488),
   ('job', 0.09066183136899365),
   ('event', 0.0829555757026292),
   ('raised', 0.0815956482320943),
   ('idoc', 0.0385312783318223),
   ('ecc', 0.030371713508612876),
   ('redwood', 0.027198549410698096)]),
 (16,
  [('redwood', 0.24603174603174602),
   ('warehouse', 0.1984126984126984),
   ('business', 0.19701213818860877),
   ('bw', 0.19701213818860877),
   ('prd', 0.15779645191409897),
   ('inventory', 0.0009337068160597573),
   ('cb_sap_', 0.0009337068160597573),
   ('