# FAQ Topic Generation

Topic generation using Latent Dirichlet Allocation (LDA)

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

# Importing Dataset

Importing necessary modules

In [1]:
import pandas as pd

Reading the input data

In [2]:
# Load faqtrain.xlsx into a DataFrame; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_excel("faqtrain.xlsx")

# Data preprocessing

Importing necessary modules

In [3]:
import nltk
nltk.download("stopwords")
import re
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import spacy

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Creating a list of all question titles

In [4]:
# Pull the "Question Title" column out of the DataFrame as a plain Python list.
data = df["Question Title"].values.tolist()

### Defining clean-up functions

Removing e-mails

In [5]:
def remove_emails(text):
    """Strip e-mail-like tokens from every string in ``text``.

    Parameters
    ----------
    text : iterable of str
        Sentences to clean.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order. Every run of
        non-whitespace characters containing ``@`` is removed, together with
        at most one whitespace character directly after it.
    """
    # Raw string: "\S" and "\s" inside a normal literal are invalid escape
    # sequences (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12+).
    email_pattern = r"\S*@\S*\s?"
    return [re.sub(email_pattern, "", sent) for sent in text]

Removing extra spaces

In [6]:
def remove_extra_spaces(text):
    """Collapse every run of whitespace in each string to a single space.

    Parameters
    ----------
    text : iterable of str
        Sentences to clean.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order. Tabs and newlines
        count as whitespace; leading/trailing whitespace is collapsed to one
        space, not stripped.
    """
    # Raw string: "\s" inside a normal literal is an invalid escape sequence
    # (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12+).
    whitespace_run = r"\s+"
    return [re.sub(whitespace_run, " ", sent) for sent in text]

Removing quotes

In [7]:
def remove_quotes(text):
    """Delete every single-quote character from each string in ``text``.

    Returns a new list holding one cleaned string per input, order preserved.
    """
    cleaned = []
    for sentence in text:
        cleaned.append(re.sub("\'", "", sentence))
    return cleaned

Tokenizing, lowercasing and removing punctuation

In [8]:
def tokenize_and_clean(text):
    """Lazily tokenize each entry of ``text``.

    Every entry is coerced with ``str()`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``. This is a generator: nothing is
    processed until it is iterated, and it yields one token list per entry.
    """
    for sentence in text:
        tokens = gensim.utils.simple_preprocess(str(sentence), deacc=True)
        yield tokens

Stopwords removal

In [9]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
def remove_stopwords(text):
    """Drop stop words from every document in ``text``.

    Parameters
    ----------
    text : iterable
        Documents to filter. Each one is passed through ``str()`` and
        re-tokenized with ``simple_preprocess``, so token lists (via their
        string representation) and raw strings are both accepted.

    Returns
    -------
    list of list of str
        One token list per document, without any token that appears in the
        module-level ``stop_words``.
    """
    # Build the lookup set once per call: testing membership against the
    # `stop_words` list would rescan the whole list for every token. Reading
    # the global here (not at definition time) keeps later edits to
    # `stop_words` effective.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in text
    ]

Creating bigrams

In [10]:
def make_bigrams(text, min_count=5, threshold=100):
    """Merge frequently co-occurring token pairs into single bigram tokens.

    Parameters
    ----------
    text : iterable of list of str
        Tokenized documents. Used both to train the phrase model and as the
        documents that get transformed.
    min_count : int, optional
        Forwarded to ``gensim.models.Phrases`` (default 5).
    threshold : float, optional
        Forwarded to ``gensim.models.Phrases`` (default 100).

    Returns
    -------
    list
        One transformed document per input document, in the same order.
    """
    # Materialize once: the documents are walked twice (training, then
    # transformation), which would silently exhaust a generator.
    docs = list(text)
    # Train on the documents that were passed in rather than on a
    # notebook-level global, so the function works for any corpus and does
    # not depend on cell execution order.
    bigram = gensim.models.Phrases(docs, min_count=min_count, threshold=threshold)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in docs]

Lemmatizing text

In [11]:
def lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level ``nlp`` callable, which must be defined before this function
    is called. A token's ``lemma_`` is kept when its ``pos_`` is in
    ``allowed_postags``.

    Returns a list with one list of lemmas per input document.
    """
    def lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [lemmas_of(sent) for sent in text]

### Calling all functions

In [12]:
# String-level clean-up, innermost call first: e-mails, whitespace runs, quotes.
data = remove_quotes(remove_extra_spaces(remove_emails(data)))

# Token-level clean-up: tokenize, drop stop words, then merge bigrams.
data_words = remove_stopwords(list(tokenize_and_clean(data)))
data_words_bigrams = make_bigrams(data_words)

# `nlp` must exist before lemmatization() is called: that function looks it
# up as a global at call time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Topic modelling

Creating dictionary

In [13]:
# Build the gensim Dictionary (the token <-> integer id mapping) from the
# lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

Creating corpus

In [14]:
# Run every lemmatized document through the dictionary's doc2bow to obtain
# the corpus that the model is trained on.
texts = data_lemmatized
corpus = list(map(id2word.doc2bow, texts))

Building the model

In [15]:
# Train the LDA model on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Model analysis

Importing necessary modules

In [16]:
from gensim.models import CoherenceModel

Printing topics

In [17]:
# Pretty-print the topics as (topic id, weighted keyword string) pairs.
pprint(lda_model.print_topics())
# Apply the trained model to the corpus.
# NOTE(review): gensim presumably returns a lazily evaluated per-document view
# here rather than computing everything up front — confirm before relying on it.
doc_lda = lda_model[corpus]

[(0,
  '0.245*"air" + 0.087*"need" + 0.071*"coin" + 0.052*"cool" + 0.046*"flow" + '
  '0.039*"buy" + 0.036*"dry" + 0.032*"business" + 0.016*"year" + '
  '0.009*"start"'),
 (1,
  '0.073*"buyer" + 0.040*"wheat" + 0.039*"center" + 0.034*"purchase" + '
  '0.000*"cashew_nuts" + 0.000*"seed" + 0.000*"bacoco" + 0.000*"power" + '
  '0.000*"cord" + 0.000*"grass"'),
 (2,
  '0.106*"sell" + 0.095*"test" + 0.060*"online" + 0.000*"old" + '
  '0.000*"apparatus" + 0.000*"go" + 0.000*"mechanic" + 0.000*"could" + '
  '0.000*"nebula" + 0.000*"payment"'),
 (3,
  '0.142*"light" + 0.086*"lead" + 0.010*"white" + 0.000*"pair" + 0.000*"metal" '
  '+ 0.000*"scrap" + 0.000*"tube_light" + 0.000*"watt" + 0.000*"set" + '
  '0.000*"car"'),
 (4,
  '0.000*"axel" + 0.000*"changeover_switch" + 0.000*"salfee" + 0.000*"bokaro" '
  '+ 0.000*"alambagh" + 0.000*"micropore" + 0.000*"residence" + 0.000*"cft" + '
  '0.000*"contract" + 0.000*"vl"'),
 (5,
  '0.071*"home" + 0.059*"supplier" + 0.051*"list" + 0.032*"silk" + '
  '0.0

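Compute log perplexity (gensim's `log_perplexity` returns the per-word likelihood bound; the perplexity itself is `2^(-bound)`)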

In [18]:
# Evaluate the model on the same corpus it was trained on.
# NOTE(review): gensim documents log_perplexity() as returning the per-word
# likelihood bound (perplexity = 2 ** -bound), so the printed label may be
# misleading — confirm and consider relabelling.
print('Perplexity: ', lda_model.log_perplexity(corpus))

Perplexity:  -19.349753593068733


Compute coherence score

In [19]:
# Score the trained model with the 'c_v' coherence measure, using the
# lemmatized documents and the dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.49526484124977904


# Model visualization

Importing necessary modules

In [20]:
import pyLDAvis
import pyLDAvis.gensim

Visualizing topic - keywords relationship

In [21]:
# Switch pyLDAvis to inline notebook rendering, then build the visualization
# from the trained model, the bag-of-words corpus and the dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# A bare expression on the last line makes the notebook display it.
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
