In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import os, sys
import string

from gensim import utils
from gensim.models import word2vec, keyedvectors
from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences
from gensim.models import LdaModel
from gensim.models import LsiModel

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import spacy 
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings


In [2]:
file = 'cleaned_workorders_nov15.csv'
df_file= pd.read_csv('cleaned_workorders_nov15.csv')
# df_file.text

In [3]:
df_file.head()
cols = ['HybridID', 'text', 'ChoiceString']
df = pd.DataFrame(df_file, columns = cols)
# print(df['text'][:2500].tolist())

In [4]:
text_list = df['text'].to_string()
text_list
# texts = text_list



In [13]:
tags_list = ['<p>' ,'<* p>' , '<p*>',
             '<b>', '</b>', '\n',
             '<ul>','</ul>','<li>',
             '</li>','<br>', '</br>', 
             '-','<strong>','</strong>',
             '<span*>','</span>','<a href*>',
             '</a>', '<em>','</em>', '&', 'nbsp', '&nbsp', 'â', 'p', 'nan']

for tag in tags_list:
    df.text.replace(to_replace=tag, value='', regex=True, inplace=True)

tokens = word_tokenize(text_list)  
# convert to lower case
tokens = [w.lower() for w in tokens] # remove punctuation from each word

table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]   # remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]
# filter out stop words

stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]

words=[s for s in words if not s in tags_list]
texts = [words]
# print(texts)

In [6]:
dictionary = Dictionary(texts)
# dictionary
corpus = [dictionary.doc2bow(text) for text in texts]
# corpus

In [7]:
import warnings
warnings.filterwarnings('ignore')

# Topic Modeling with Latent Dirichlet Allocation (LDA)


There are two layers in this model — documents and tokens — and the size or dimensionality of the document vectors is the number of tokens in the corpus vocabulary. This approach has a number of disadvantages:

* Document vectors tend to be large (one dimension for each token ⇒ lots of dimensions)
* They also tend to be very sparse. Any given document only contains a small fraction of all tokens in the vocabulary, so most values in the document's token vector are 0.
* The dimensions are fully independent from each other — there's no sense of connection between related tokens, such as knife and fork.

## LDA injects a third layer into this conceptual model. 

    Documents are represented as a mixture of a pre-defined number of topics, and the topics are represented as a mixture of the individual tokens in the vocabulary. The number of topics is a model hyper-parameter selected by me. 

### LDA makes a prior assumption that the (document, topic) and (topic, token) mixtures follow Dirichlet probability distributions. 



## This assumption encourages documents to consist mostly of a handful of topics, and topics to consist mostly of a modest set of the tokens.
   
![LDA.png](attachment:LDA.png)
[Source](http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb#topic=0&lambda=1&term=)

In [8]:
np.random.seed(1) # setting random seed to get the same results each time.
ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=30)
ldamodel.top_topics(corpus)

[([(0.017259656, 'door'),
   (0.016448947, 'water'),
   (0.014094589, 'kitchen'),
   (0.009400658, 'working'),
   (0.00878327, 'replace'),
   (0.008748789, 'needs'),
   (0.008574101, 'ac'),
   (0.008432431, 'need'),
   (0.008044791, 'leaking'),
   (0.008019682, 'bathroom'),
   (0.007423643, 'tenant'),
   (0.00731644, 'sink'),
   (0.00701993, 'toilet'),
   (0.005870474, 'leak'),
   (0.005703835, 'light'),
   (0.005551222, 'unit'),
   (0.005347853, 'room'),
   (0.00525461, 'new'),
   (0.005221681, 'air'),
   (0.0051896735, 'back')],
  1.0000889005818406e-12),
 ([(0.012178018, 'door'),
   (0.011604878, 'kitchen'),
   (0.011506881, 'leaking'),
   (0.009788029, 'needs'),
   (0.009132907, 'bathroom'),
   (0.009042634, 'replace'),
   (0.008835839, 'unit'),
   (0.008576839, 'need'),
   (0.008467101, 'ac'),
   (0.008220635, 'working'),
   (0.008096297, 'repair'),
   (0.00807816, 'sink'),
   (0.0077279964, 'toilet'),
   (0.006708423, 'water'),
   (0.00655111, 'new'),
   (0.006204741, 'light'),
 

In [9]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

## HDP the Hierarchical Dirichlet Process is an *unsupervised topic model* which figures out the number of topics on it's own.



In [10]:
from gensim.models import HdpModel
from gensim.models.wrappers import LdaMallet
import pyLDAvis.gensim

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdpmodel.show_topics()


[(0,
  '0.016*water + 0.014*door + 0.010*kitchen + 0.010*bathroom + 0.009*leaking + 0.009*needs + 0.009*replace + 0.008*sink + 0.008*working + 0.008*ac + 0.008*need + 0.007*unit + 0.007*toilet + 0.007*repair + 0.007*new + 0.006*leak + 0.006*tenant + 0.006*front + 0.006*light + 0.006*air'),
 (1,
  '0.000*cleand + 0.000*thermablast + 0.000*glas + 0.000*okien + 0.000*trickey + 0.000*police + 0.000*bee + 0.000*tested + 0.000*immersion + 0.000*fixt + 0.000*except + 0.000*sugg + 0.000*appare + 0.000*fam + 0.000*newphoneline + 0.000*lakeshore + 0.000*torcon + 0.000*stow + 0.000*genessee + 0.000*circuit'),
 (2,
  '0.000*getitdone + 0.000*clem + 0.000*bryson + 0.000*catches + 0.000*yemi + 0.000*escrines + 0.000*higher + 0.000*jayesh + 0.000*tubshower + 0.000*pans + 0.000*attention + 0.000*vies + 0.000*thoroughl + 0.000*modified + 0.000*loca + 0.000*unblock + 0.000*burrowing + 0.000*waste + 0.000*entrada + 0.000*laptop'),
 (3,
  '0.000*planning + 0.000*empower + 0.000*beh + 0.000*pex + 0.000*bob

In [11]:
pyLDAvis.gensim.prepare(hdpmodel, corpus, dictionary)

# Wait, what am I looking at again?

#### There are a lot of moving parts in the visualization. Here's a brief summary:

* On the left, there is a plot of the __"distance" between all of the topics__ (labeled as the Intertopic Distance Map)
    * The plot is rendered in two dimensions according a multidimensional scaling (MDS) algorithm. Topics that are generally __similar should be appear close together on the plot__, while dissimilar topics should appear far apart.
    * The relative __size of a topic's circle in the plot corresponds to the relative frequency of the topic__ in the corpus.
    * An individual topic may be selected for closer scrutiny by clicking on its circle, or entering its number in the "selected topic" box in the upper-left.
* On the right, there is a bar chart showing __top terms__.
    * When no topic is selected in the plot on the left, the bar chart shows the top-30 most "salient" terms in the corpus. A term's saliency is a measure of both how frequent the term is in the corpus and how "distinctive" it is in distinguishing between different topics.
    * When a particular topic is selected, the bar chart changes to show the top-30 most "relevant" terms for the selected topic. The relevance metric is controlled by the parameter λ, which can be adjusted with a slider above the bar chart.
        * Setting the λ parameter close to __1.0__ (the default) will rank the terms solely according to their __probability within the topic__.
        * Setting λ close to 0.0 will rank the terms solely according to their __"distinctiveness" or "exclusivity" within the topic — i.e., terms that occur only in this topic, and do not occur in other topics__.
        * Setting λ to values between 0.0 and 1.0 will result in an intermediate ranking, weighting term probability and exclusivity accordingly.
        
* Rolling the mouse over a term in the bar chart on the right will cause the topic circles to resize in the plot on the left, to show the strength of the relationship between the topics and the selected term.

A more detailed explanation of the pyLDAvis visualization can be found [here](https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf). Unfortunately, though the data used by gensim and pyLDAvis are the same, they don't use the same ID numbers for topics. If you need to match up topics in gensim's LdaMulticore object and pyLDAvis' visualization, you have to dig through the terms manually.

## Analyzing our LDA model

The interactive visualization pyLDAvis produces is helpful for both:

1. Better understanding and interpreting individual topics, and
2. Better understanding the relationships between the topics.

For (1), you can manually select each topic to view its top most freqeuent and/or "relevant" terms, using different values of the λ parameter. This can help when you're trying to assign a human interpretable name or "meaning" to each topic.

For (2), exploring the Intertopic Distance Plot can help you learn about how topics relate to each other, including potential higher-level structure between groups of topics.

In our plot, there is a stark divide along the x-axis, with two topics far to the left and most of the remaining 48 far to the right. Inspecting the two outlier topics provides a plausible explanation: both topics contain many non-English words, while most of the rest of the topics are in English. So, one of the main attributes that distinguish the reviews in the dataset from one another is their language.

This finding isn't entirely a surprise. In addition to English-speaking cities, the Yelp dataset includes reviews of businesses in Montreal and Karlsruhe, Germany, often written in French and German, respectively. Multiple languages isn't a problem for our demo, but for a real NLP application, you might need to ensure that the text you're processing is written in English (or is at least tagged for language) before passing it along to some downstream processing. If that were the case, the divide along the x-axis in the topic plot would immediately alert you to a potential data quality issue.

The y-axis separates two large groups of topics — let's call them "super-topics" — one in the upper-right quadrant and the other in the lower-right quadrant. These super-topics correlate reasonably well with the pattern we'd noticed while naming the topics:

The super-topic in the lower-right tends to be about food. It groups together the burger & fries, breakfast, sushi, barbecue, and greek topics, among others.

The super-topic in the upper-right tends to be about other elements of the restaurant experience. It groups together the ambience & seating, location & time, family, and customer service topics, among others.

So, in addition to the 50 direct topics the model has learned, our analysis suggests a higher-level pattern in the data. Restaurant reviewers in the Yelp dataset talk about two main things in their reviews, in general: (1) the food, and (2) their overall restaurant experience. For this dataset, this is a very intuitive result, and we probably didn't need a sophisticated modeling technique to tell it to us. When working with datasets from other domains, though, such high-level patterns may be much less obvious from the outset — and that's where topic modeling can help.

## Describing text with LDA

Beyond data exploration, one of the key uses for an LDA model is _providing a compact, quantitative description of natural language text_. Once an LDA model has been trained, it can be used to represent free text as a mixture of the topics the model learned from the original corpus. This mixture can be interpreted as a __probability distribution__ across the topics, so the LDA representation of a paragraph of text might look like 50% Topic A, 20% Topic B, 20% Topic C, and 10% Topic D.

To use an LDA model to generate a vector representation of new text, you'll need to apply any text preprocessing steps you used on the model's training corpus to the new text, too. For our model, the preprocessing steps we used include:

Using spaCy to remove punctuation and lemmatize the text
Applying our first-order phrase model to join word pairs
Applying our second-order phrase model to join longer phrases
Removing stopwords
Creating a bag-of-words representation
Once you've applied these preprocessing steps to the new text, it's ready to pass directly to the model to create an LDA representation. The lda_description(...) function will perform all these steps for us, including printing the resulting topical description of the input text.

## LSI : Latent Semantic Indexing 
 
It is a popular information retreival method which works by decomposing the original matrix of words to maintain key topics. Gensim's implementation uses an SVD.

In [12]:
model_lsi = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
model_lsi.show_topics(num_topics=10)  # Showing only the top 5 topics

[(0,
  '0.316*"water" + 0.285*"door" + 0.206*"kitchen" + 0.198*"bathroom" + 0.185*"leaking" + 0.175*"needs" + 0.172*"replace" + 0.166*"sink" + 0.165*"working" + 0.163*"ac"')]