In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [4]:
df = pd.read_excel('final_dataset.xlsx', sheet_name='Sheet1')

In [5]:
# Convert to list
data = df.Short_description.values.tolist()

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]


In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [9]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [13]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
import en_core_web_sm
nlp = en_core_web_sm.load()
#nlp = spacy.load('en', disable=['parser', 'ner']) # error 
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [14]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# human readable format of corpus
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('end', 1),
  ('erp', 1),
  ('error', 1),
  ('fi', 1),
  ('finance', 1),
  ('sap', 1),
  ('status', 1)]]

In [15]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [16]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.166*"crm" + 0.091*"set" + 0.091*"status" + 0.084*"sap" + 0.078*"vistex" + '
  '0.074*"chm" + 0.071*"management" + 0.071*"channel" + 0.045*"record" + '
  '0.038*"doc"'),
 (1,
  '0.252*"zfico_a" + 0.165*"_" + 0.089*"item" + 0.084*"extractio" + '
  '0.084*"pick" + 0.068*"zfico_c" + 0.043*"c_non_sap_atlut" + 0.030*"cb_sap" + '
  '0.027*"ca_prb" + 0.011*"w_sap_atlut"'),
 (2,
  '0.306*"datum" + 0.175*"report" + 0.155*"cost" + 0.122*"centre" + '
  '0.085*"refresh" + 0.085*"transaction" + 0.033*"document" + 0.015*"asn" + '
  '0.000*"zmultiupdate" + 0.000*"extended"'),
 (3,
  '0.283*"raise" + 0.283*"event" + 0.106*"plant" + 0.050*"output" + '
  '0.047*"inbound" + 0.035*"invoice" + 0.035*"proforma" + 0.025*"gts" + '
  '0.021*"interface" + 0.018*"recon"'),
 (4,
  '0.075*"upload" + 0.062*"clearing" + 0.044*"program" + 0.035*"gtn" + '
  '0.029*"ok" + 0.029*"drs" + 0.029*"decision" + 0.029*"result" + '
  '0.029*"usage" + 0.029*"insp"'),
 (5,
  '0.121*"file" + 0.116*"backlog" + 0.102*"st" +

In [17]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -3.3678008137528797

Coherence Score:  0.4721403434574917


In [23]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis
pyLDAvis.show(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [27]:
mallet_path = 'mallet-2.0.8/bin/mallet' 
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

CalledProcessError: Command 'mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input C:\Users\USER\AppData\Local\Temp\b98ad0_corpus.txt --output C:\Users\USER\AppData\Local\Temp\b98ad0_corpus.mallet' returned non-zero exit status 1

In [56]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(8,
  [('ci', 0.2722252898950856),
   ('ended', 0.23854224185532855),
   ('sap', 0.23743787962451685),
   ('job', 0.13970182219768085),
   ('error', 0.10712313638873551),
   ('generated', 0.0016565433462175593),
   ('bi', 0.0011043622308117063),
   ('dd_sap_atlut', 0.0005521811154058532),
   ('dp', 0.0005521811154058532),
   ('zpc_sf_bbur', 0.0005521811154058532)]),
 (5,
  [('pi', 0.157298277425204),
   ('process', 0.15639165911151406),
   ('netweaver', 0.14460562103354488),
   ('integration', 0.14460562103354488),
   ('job', 0.09066183136899365),
   ('event', 0.0829555757026292),
   ('raised', 0.0815956482320943),
   ('idoc', 0.0385312783318223),
   ('ecc', 0.030371713508612876),
   ('redwood', 0.027198549410698096)]),
 (16,
  [('redwood', 0.24603174603174602),
   ('warehouse', 0.1984126984126984),
   ('business', 0.19701213818860877),
   ('bw', 0.19701213818860877),
   ('prd', 0.15779645191409897),
   ('inventory', 0.0009337068160597573),
   ('cb_sap_', 0.0009337068160597573),
   ('