# Topic modelling

## 0.0 Imports

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dglover\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## 1.0 Run

In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['use', 'show', 'however', 'approach', 'well', 'provide',' present', 'include', 'word', 'nlp'])

In [4]:
# Import Dataset
df = pd.read_csv('..\\data\\arxiv_papers_full_v2.csv', index_col=0)
#print(df.target_names.unique())
df.head()

Unnamed: 0,Title,PDF URL,Author,DOI,Published Date,Summary,Journal Ref,Primary Category,Category,Entry ID
0,Natural Language Processing using Hadoop and K...,http://arxiv.org/pdf/1608.04434v1,"[arxiv.Result.Author('Emre Erturk'), arxiv.Res...",,2016-08-15 23:09:21+00:00,"Natural language processing, as a data analyti...",,cs.CL,['cs.CL'],http://arxiv.org/abs/1608.04434v1
1,Integrating AI Planning with Natural Language ...,http://arxiv.org/pdf/2202.07138v2,"[arxiv.Result.Author('Kebing Jin'), arxiv.Resu...",,2022-02-15 02:19:09+00:00,Natural language processing (NLP) aims at inve...,,cs.AI,"['cs.AI', 'cs.CL']",http://arxiv.org/abs/2202.07138v2
2,Simple Natural Language Processing Tools for D...,http://arxiv.org/pdf/1906.11608v2,[arxiv.Result.Author('Leon Derczynski')],,2019-06-27 13:15:12+00:00,This technical note describes a set of baselin...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1906.11608v2
3,Towards the Study of Morphological Processing ...,http://arxiv.org/pdf/2006.16212v1,"[arxiv.Result.Author('Mirinso Shadang'), arxiv...",,2020-06-29 17:24:09+00:00,There is no or little work on natural language...,In proceeding of Regional International Confer...,cs.CL,['cs.CL'],http://arxiv.org/abs/2006.16212v1
4,Natural Language Understanding with Distribute...,http://arxiv.org/pdf/1511.07916v1,[arxiv.Result.Author('Kyunghyun Cho')],,2015-11-24 23:23:13+00:00,This is a lecture note for the course DS-GA 30...,,cs.CL,"['cs.CL', 'stat.ML']",http://arxiv.org/abs/1511.07916v1


In [5]:
df.shape

(3558, 10)

In [6]:
df['Summary'] = df['Summary'].map(lambda x: x.lower().replace('natural langauge processing', ''))

In [7]:
# Convert to list
data = df['Summary'].values.tolist()

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [8]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [9]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [10]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
#nlp = spacy.load('en', disable=['parser', 'ner'])
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])



In [12]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [35]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [36]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.071*"intent" + 0.057*"change" + 0.056*"unit" + 0.050*"fast" + '
  '0.044*"precision" + 0.042*"meaning" + 0.039*"external" + 0.036*"rnn" + '
  '0.030*"update" + 0.029*"bidirectional"'),
 (1,
  '0.000*"sphere" + 0.000*"linguistics" + 0.000*"poll" + 0.000*"ascribing" + '
  '0.000*"consolidate" + 0.000*"watermark" + 0.000*"watermarke" + '
  '0.000*"watermarking" + 0.000*"opportunitie" + 0.000*"decentralized"'),
 (2,
  '0.211*"entity" + 0.075*"event" + 0.062*"relation" + 0.059*"protein" + '
  '0.037*"verify" + 0.037*"claim" + 0.035*"ontology" + 0.025*"molecular" + '
  '0.025*"molecule" + 0.021*"think"'),
 (3,
  '0.214*"bias" + 0.060*"gender" + 0.053*"encourage" + 0.048*"determine" + '
  '0.046*"usually" + 0.045*"video" + 0.037*"gender_bia" + 0.034*"term_memory" '
  '+ 0.033*"long_short" + 0.022*"dictionary"'),
 (4,
  '0.050*"language" + 0.048*"model" + 0.027*"task" + 0.017*"natural" + '
  '0.017*"processing" + 0.016*"text" + 0.012*"use" + 0.012*"method" + '
  '0.012*"dataset" + 0.

In [37]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -11.235099735501308

Coherence Score:  0.43171519090320076


In [38]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


In [39]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']


  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


In [40]:
df_dominant_topic.tail(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
3548,3548,4,0.6922,"language, model, task, natural, processing, text, use, method, dataset, performance","[self_supervise, learning, speech, representation, successfully, apply, various, downstream, tas..."
3549,3549,4,0.6967,"language, model, task, natural, processing, text, use, method, dataset, performance","[large, language, model, llm, revolutionize, natural, language, processing, demand, massive, res..."
3550,3550,4,0.7468,"language, model, task, natural, processing, text, use, method, dataset, performance","[text, summarization, essential, task, natural, language, processing, researcher, develop, vario..."
3551,3551,4,0.7304,"language, model, task, natural, processing, text, use, method, dataset, performance","[sentiment_analysis, vital, tool, uncover, insight, financial, article, news, social_media, shap..."
3552,3552,4,0.6051,"language, model, task, natural, processing, text, use, method, dataset, performance","[recent, research, demonstrate, task, fine_tune, multi_modal, large, language, model, llm, use, ..."
3553,3553,4,0.5646,"language, model, task, natural, processing, text, use, method, dataset, performance","[large, document, write, juridical, language, difficult, interpret, long, sentence, lead, intric..."
3554,3554,4,0.843,"language, model, task, natural, processing, text, use, method, dataset, performance","[promise, performance, various, natural, language, processing, task, current, system, vulnerable..."
3555,3555,4,0.7402,"language, model, task, natural, processing, text, use, method, dataset, performance","[recent, advancement, speech, emotion, recognition, ser, model, state, art, deep, learning, appr..."
3556,3556,4,0.6258,"language, model, task, natural, processing, text, use, method, dataset, performance","[recent_advance, large, language, model, lead, renew, interest, natural, language, processing, h..."
3557,3557,4,0.7214,"language, model, task, natural, processing, text, use, method, dataset, performance","[pre_traine, model, czech, natural, language, processing, often, evaluate, purely, linguistic, t..."


In [41]:
df_dominant_topic['Dominant_Topic'].value_counts()

4     3529
13      29
Name: Dominant_Topic, dtype: int64

In [42]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,4,0.907,"language, model, task, natural, processing, text, use, method, dataset, performance","[deep, large, pre_traine, model, state, art, various, natural, language, processing, task, huge,..."
1,13,0.6178,"system, natural, process, semantic, sentence, information, word, use, analysis, user","[write, specification, computer, program, easy, take, account, disparate, conceptual, world, app..."


In [43]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,4,0.907,"language, model, task, natural, processing, text, use, method, dataset, performance","[deep, large, pre_traine, model, state, art, various, natural, language, processing, task, huge,..."
1,13,0.6178,"system, natural, process, semantic, sentence, information, word, use, analysis, user","[write, specification, computer, program, easy, take, account, disparate, conceptual, world, app..."


In [44]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,4,"language, model, task, natural, processing, text, use, method, dataset, performance",,
1,4,"language, model, task, natural, processing, text, use, method, dataset, performance",,
2,4,"language, model, task, natural, processing, text, use, method, dataset, performance",,
3,4,"language, model, task, natural, processing, text, use, method, dataset, performance",,
4,4,"language, model, task, natural, processing, text, use, method, dataset, performance",3529.0,0.9918
...,...,...,...,...
3553,4,"language, model, task, natural, processing, text, use, method, dataset, performance",,
3554,4,"language, model, task, natural, processing, text, use, method, dataset, performance",,
3555,4,"language, model, task, natural, processing, text, use, method, dataset, performance",,
3556,4,"language, model, task, natural, processing, text, use, method, dataset, performance",,
