# FAQ Topic Generation

Topic generation using Latent Dirichlet Allocation (LDA)

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

# Importing Dataset

Importing necessary modules

In [None]:
import pandas as pd

Reading the input data

In [None]:
# Load the FAQ training spreadsheet; the path is relative to the working directory.
df = pd.read_excel("faqtrain.xlsx")

# Data preprocessing

Importing necessary modules

In [None]:
import nltk
nltk.download("stopwords")
import re
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import spacy

Creating a list of all question titles

In [None]:
# Pull the "Question Title" column out as a plain Python list, one entry per row.
# NOTE(review): rows with a missing title would presumably arrive here as
# non-string values (NaN) — confirm the column has no blanks before cleaning.
data = df["Question Title"].values.tolist()

### Defining clean-up functions

Removing e-mails

In [None]:
def remove_emails(text):
    """Strip e-mail addresses from every string in ``text``.

    Any run of non-whitespace characters containing ``@`` is removed,
    together with one trailing whitespace character when present.

    Args:
        text: Iterable of strings.

    Returns:
        A new list holding the cleaned strings in their original order.
    """
    # Raw string: "\S" and "\s" are not valid Python string escapes, so the
    # non-raw spelling triggers an invalid-escape-sequence warning.
    email_pattern = re.compile(r"\S*@\S*\s?")
    return [email_pattern.sub("", sent) for sent in text]

Removing extra spaces

In [None]:
def remove_extra_spaces(text):
    """Collapse every run of whitespace in each string to a single space.

    Tabs and newlines count as whitespace. Leading and trailing runs are
    collapsed to one space as well; they are not stripped.

    Args:
        text: Iterable of strings.

    Returns:
        A new list holding the normalised strings in their original order.
    """
    # Raw string: "\s" is not a valid Python string escape, so the non-raw
    # spelling triggers an invalid-escape-sequence warning.
    whitespace_run = re.compile(r"\s+")
    return [whitespace_run.sub(" ", sent) for sent in text]

Removing quotes

In [None]:
def remove_quotes(text):
    """Delete every single-quote character (') from each string in ``text``.

    Double quotes are left untouched. Returns a new list in the same order.
    """
    cleaned = []
    for sentence in text:
        cleaned.append(re.sub("'", "", sentence))
    return cleaned

Tokenizing, lowercasing and removing punctuation

In [None]:
def tokenize_and_clean(text):
    """Lazily tokenize each entry of ``text``.

    Every entry is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``. This is a generator: one
    token list is yielded per input entry, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in text
    )

Stopwords removal

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
def remove_stopwords(text):
    """Drop English stop words from every document in ``text``.

    Each document is converted with ``str()`` and re-tokenized by
    ``simple_preprocess`` before filtering. When a document is already a
    token list, ``str()`` yields its list representation, which is what
    gets re-tokenized.

    Args:
        text: Iterable of documents (strings or token lists).

    Returns:
        A list with one token list per document, stop words removed.
    """
    # Built per call so later changes to the module-level ``stop_words`` list
    # are still honoured; set membership avoids scanning the whole list for
    # every token.
    stop_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in text
    ]

Creating bigrams

In [None]:
def make_bigrams(text):
    """Train a gensim phrase model on ``text`` and apply it to the same documents.

    Args:
        text: Iterable of token lists, one per document.

    Returns:
        A list with one transformed token sequence per document, in input
        order, as produced by the trained phrase model.
    """
    # Materialise once: the documents are iterated twice (training, then
    # transformation), which would exhaust a one-shot iterator.
    docs = list(text)
    # Train on the argument rather than on a module-level variable, so the
    # function works for any input and does not depend on notebook state.
    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in docs]

Lemmatizing text

In [None]:
def lemmatization(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, which must be defined before this
    function is called with non-empty input.

    Args:
        text: Iterable of token lists, one per document.
        allowed_postags: Container of coarse POS tags (compared against
            ``token.pos_``) whose tokens are kept. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A list with one list of lemmas per document, in input order.
    """
    return [
        [
            token.lemma_
            for token in nlp(" ".join(sent))
            if token.pos_ in allowed_postags
        ]
        for sent in text
    ]

### Calling all functions

In [None]:
# Text-level clean-up, innermost call first: e-mails, then repeated
# whitespace, then single quotes.
data = remove_quotes(remove_extra_spaces(remove_emails(data)))

# Tokenize every title, then filter out English stop words.
data_words = remove_stopwords(list(tokenize_and_clean(data)))
data_words_bigrams = make_bigrams(data_words)

# The spaCy pipeline is loaded with the parser and NER components disabled;
# it must exist before lemmatization() is called.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Topic modelling

Creating dictionary

In [None]:
# Build the gensim Dictionary from the lemmatized documents; it is reused
# below to create the corpus and passed to the LDA model as ``id2word``.
id2word = corpora.Dictionary(data_lemmatized)

Creating corpus

In [None]:
# Convert every lemmatized document to its bag-of-words form via the dictionary.
texts = data_lemmatized
corpus = list(map(id2word.doc2bow, texts))

Building the model

In [None]:
# Train a 20-topic LDA model on the bag-of-words corpus.
# random_state is fixed so repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Model analysis

Importing necessary modules

In [None]:
from gensim.models import CoherenceModel

Printing topics

In [None]:
# Pretty-print the model's topics (print_topics is called with its defaults).
pprint(lda_model.print_topics())
# Apply the trained model to the training corpus.
# NOTE(review): doc_lda is not referenced again in the visible part of this file.
doc_lda = lda_model[corpus]

Compute perplexity

In [None]:
# NOTE: per gensim's documentation, log_perplexity() returns the per-word
# likelihood bound, not the perplexity itself; perplexity is 2 ** (-bound).
# The value printed here is therefore the bound (typically negative).
print('Perplexity: ', lda_model.log_perplexity(corpus))

Compute coherence score

In [None]:
# Score the trained model with the 'c_v' coherence measure, using the
# lemmatized documents and the dictionary built from them.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

# Model visualization

Importing necessary modules

In [None]:
import pyLDAvis
import pyLDAvis.gensim

Visualizing topic–keyword relationships

In [None]:
# Switch pyLDAvis to notebook mode before preparing the visualization.
pyLDAvis.enable_notebook()
# Prepare the visualization data from the model, corpus and dictionary.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare expression as the last line of the cell so the notebook displays it.
vis