# Topic modelling

## 0.0 Imports

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dglover\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## 1.0 Run

In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['use', 'show', 'however', 'approach', 'well', 'provide',' present', 'include', 'word', 'nlp', 'natural', 'language', 'processing'])

In [4]:
# Import Dataset
df = pd.read_csv('..\\data\\50_summaries.csv', index_col=0)
#print(df.target_names.unique())
df.head()

Unnamed: 0_level_0,Summary
Filename,Unnamed: 1_level_1
1809.09190v1.pdf,In this paper we formulate audio play hot n c...
1903.10625v2.pdf,Finite State Transducers FST are an efficient...
1904.04307v1.pdf,The quantification of semantic similarity bet...
1809.02794v3.pdf,This paper focuses on the aim of semantic rol...
1805.01083v1.pdf,The mainconstructusedinextractionlanguagesand...


In [5]:
df.reset_index(inplace = True)

In [6]:
# Import Dataset
df2 = pd.read_csv('..\\data\\arxiv_papers_full_v2.csv', index_col=0)

In [7]:
df2.head(2)

Unnamed: 0,Title,PDF URL,Author,DOI,Published Date,Summary,Journal Ref,Primary Category,Category,Entry ID
0,Natural Language Processing using Hadoop and K...,http://arxiv.org/pdf/1608.04434v1,"[arxiv.Result.Author('Emre Erturk'), arxiv.Res...",,2016-08-15 23:09:21+00:00,"Natural language processing, as a data analyti...",,cs.CL,['cs.CL'],http://arxiv.org/abs/1608.04434v1
1,Integrating AI Planning with Natural Language ...,http://arxiv.org/pdf/2202.07138v2,"[arxiv.Result.Author('Kebing Jin'), arxiv.Resu...",,2022-02-15 02:19:09+00:00,Natural language processing (NLP) aims at inve...,,cs.AI,"['cs.AI', 'cs.CL']",http://arxiv.org/abs/2202.07138v2


In [8]:
df2.rename(columns = {'Summary' : 'Abstract'}, inplace=True)

In [9]:
df2['Filename'] = df2['PDF URL'].map(lambda x: x.split('/')[-1] + '.pdf')

In [10]:
df = df.merge(df2, how = 'left', on = 'Filename', indicator = True)

In [11]:
df['_merge'].value_counts()

both          50
left_only      0
right_only     0
Name: _merge, dtype: int64

In [12]:
df.isna().sum()

Filename             0
Summary              0
Title                0
PDF URL              0
Author               0
DOI                 47
Published Date       0
Abstract             0
Journal Ref         44
Primary Category     0
Category             0
Entry ID             0
_merge               0
dtype: int64

In [13]:
df.shape

(50, 13)

In [14]:
df['Summary'] = df['Summary'].map(lambda x: x.lower().replace('natural langauge processing', ''))

In [15]:
# Convert to list
data = df['Summary'].values.tolist()

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [16]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [17]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [18]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [19]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
#nlp = spacy.load('en', disable=['parser', 'ner'])
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])



In [20]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [21]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [22]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.023*"use" + 0.015*"model" + 0.013*"semantic" + 0.012*"task" + '
  '0.010*"framework" + 0.009*"system" + 0.008*"word" + 0.008*"propose" + '
  '0.008*"lexical" + 0.008*"work"'),
 (1,
  '0.019*"model" + 0.013*"task" + 0.013*"use" + 0.011*"propose" + '
  '0.009*"write" + 0.009*"paper" + 0.007*"system" + 0.007*"neural" + '
  '0.007*"set" + 0.007*"base"'),
 (2,
  '0.019*"system" + 0.010*"lithium" + 0.010*"lm" + 0.010*"usage" + '
  '0.010*"sentence" + 0.007*"user" + 0.007*"rich" + 0.007*"free" + '
  '0.007*"learn" + 0.007*"content"'),
 (3,
  '0.018*"use" + 0.013*"method" + 0.010*"sentence" + 0.010*"research" + '
  '0.010*"classification" + 0.008*"task" + 0.008*"present" + '
  '0.008*"application" + 0.008*"learn" + 0.005*"model"'),
 (4,
  '0.025*"model" + 0.016*"task" + 0.015*"text" + 0.012*"work" + 0.012*"base" + '
  '0.012*"project" + 0.012*"key" + 0.010*"paper" + 0.010*"feature" + '
  '0.009*"method"')]


In [23]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.804613259867314

Coherence Score:  0.34664226052529784


In [24]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


In [25]:
import pandas as pd

def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [26]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


In [27]:
df_topic_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,1,0.99,"model, task, use, propose, write, paper, syste...","[paper, audio, play, hot, cold, set, alarm, pm..."
1,1,0.9945,"model, task, use, propose, write, paper, syste...","[state, transducer, efficient, way, represent,..."
2,0,0.9957,"use, model, semantic, task, framework, system,...","[quantification, semantic, similarity, word, u..."
3,4,0.995,"model, task, text, work, base, project, key, p...","[paper, focus, aim, semantic, role, label, lab..."
4,0,0.9883,"use, model, semantic, task, framework, system,...","[also, new, level, incorporating, advance, wor..."
5,3,0.9804,"use, method, sentence, research, classificatio...","[arabic, popular, msa, arabicdialectad, berevi..."
6,4,0.9938,"model, task, text, work, base, project, key, p...","[efficient, encoding, context, speak, computat..."
7,0,0.9937,"use, model, semantic, task, framework, system,...","[recent, work, rer, seek, make, image, refer, ..."
8,0,0.9923,"use, model, semantic, task, framework, system,...","[hybrid, wordcharact, model, abstractive, summ..."
9,0,0.9933,"use, model, semantic, task, framework, system,...","[regular, expression, re, widely, use, network..."


In [28]:
df

Unnamed: 0,Filename,Summary,Title,PDF URL,Author,DOI,Published Date,Abstract,Journal Ref,Primary Category,Category,Entry ID,_merge
0,1809.09190v1.pdf,in this paper we formulate audio play hot n c...,From Audio to Semantics: Approaches to end-to-...,http://arxiv.org/pdf/1809.09190v1,"[arxiv.Result.Author('Parisa Haghani'), arxiv....",,2018-09-24 19:46:24+00:00,Conventional spoken language understanding sys...,,eess.AS,"['eess.AS', 'cs.CL', 'cs.SD']",http://arxiv.org/abs/1809.09190v1,both
1,1903.10625v2.pdf,finite state transducers fst are an efficient...,Neural Grammatical Error Correction with Finit...,http://arxiv.org/pdf/1903.10625v2,"[arxiv.Result.Author('Felix Stahlberg'), arxiv...",,2019-03-25 23:05:11+00:00,Grammatical error correction (GEC) is one of t...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1903.10625v2,both
2,1904.04307v1.pdf,the quantification of semantic similarity bet...,Word Similarity Datasets for Thai: Constructio...,http://arxiv.org/pdf/1904.04307v1,"[arxiv.Result.Author('Ponrudee Netisopakul'), ...",,2019-04-08 19:18:09+00:00,Distributional semantics in the form of word e...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1904.04307v1,both
3,1809.02794v3.pdf,this paper focuses on the aim of semantic rol...,Explicit Contextual Semantics for Text Compreh...,http://arxiv.org/pdf/1809.02794v3,"[arxiv.Result.Author('Zhuosheng Zhang'), arxiv...",,2018-09-08 12:34:59+00:00,Who did what to whom is a major focus in natur...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1809.02794v3,both
4,1805.01083v1.pdf,the mainconstructusedinextractionlanguagesand...,Scalable Semantic Querying of Text,http://arxiv.org/pdf/1805.01083v1,"[arxiv.Result.Author('Xiaolan Wang'), arxiv.Re...",,2018-05-03 01:57:31+00:00,We present the KOKO system that takes declarat...,,cs.DB,"['cs.DB', 'cs.CL']",http://arxiv.org/abs/1805.01083v1,both
5,1903.02784v1.pdf,arabic isrecognisedasthe4th most popular lang...,Arabic natural language processing: An overview,http://arxiv.org/pdf/1903.02784v1,"[arxiv.Result.Author('Imane Guellil'), arxiv.R...",10.1016/j.jksuci.2019.02.006,2019-03-07 09:22:35+00:00,Arabic is recognised as the 4th most used lang...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1903.02784v1,both
6,1807.00267v1.pdf,an efficient approach to encoding context for...,An Efficient Approach to Encoding Context for ...,http://arxiv.org/pdf/1807.00267v1,"[arxiv.Result.Author('Raghav Gupta'), arxiv.Re...",,2018-07-01 04:11:18+00:00,"In task-oriented dialogue systems, spoken lang...",,cs.CL,['cs.CL'],http://arxiv.org/abs/1807.00267v1,both
7,1805.11818v1.pdf,recent work on rer has sought to make an imag...,Visual Referring Expression Recognition: What ...,http://arxiv.org/pdf/1805.11818v1,"[arxiv.Result.Author('Volkan Cirik'), arxiv.Re...",,2018-05-30 06:03:21+00:00,We present an empirical analysis of the state-...,,cs.CL,"['cs.CL', 'cs.AI', 'cs.CV', 'cs.NE']",http://arxiv.org/abs/1805.11818v1,both
8,1802.09968v2.pdf,a hybrid wordcharacter model for abstractive ...,A Hybrid Word-Character Approach to Abstractiv...,http://arxiv.org/pdf/1802.09968v2,"[arxiv.Result.Author('Chieh-Teng Chang'), arxi...",,2018-02-27 15:31:11+00:00,Automatic abstractive text summarization is an...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1802.09968v2,both
9,1805.05588v1.pdf,regular expressions res are widely used in ne...,Marrying up Regular Expressions with Neural Ne...,http://arxiv.org/pdf/1805.05588v1,"[arxiv.Result.Author('Bingfeng Luo'), arxiv.Re...",,2018-05-15 06:40:44+00:00,The success of many natural language processin...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1805.05588v1,both


In [29]:
# Assuming 'original_df' is your original dataframe and 'lda_output' is the output from LDA
merged_df = df.merge(df_topic_sents_keywords, left_index=True, right_index=True)

In [30]:
merged_df.head()

Unnamed: 0,Filename,Summary,Title,PDF URL,Author,DOI,Published Date,Abstract,Journal Ref,Primary Category,Category,Entry ID,_merge,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,1809.09190v1.pdf,in this paper we formulate audio play hot n c...,From Audio to Semantics: Approaches to end-to-...,http://arxiv.org/pdf/1809.09190v1,"[arxiv.Result.Author('Parisa Haghani'), arxiv....",,2018-09-24 19:46:24+00:00,Conventional spoken language understanding sys...,,eess.AS,"['eess.AS', 'cs.CL', 'cs.SD']",http://arxiv.org/abs/1809.09190v1,both,1,0.99,"model, task, use, propose, write, paper, syste...","[paper, audio, play, hot, cold, set, alarm, pm..."
1,1903.10625v2.pdf,finite state transducers fst are an efficient...,Neural Grammatical Error Correction with Finit...,http://arxiv.org/pdf/1903.10625v2,"[arxiv.Result.Author('Felix Stahlberg'), arxiv...",,2019-03-25 23:05:11+00:00,Grammatical error correction (GEC) is one of t...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1903.10625v2,both,1,0.9945,"model, task, use, propose, write, paper, syste...","[state, transducer, efficient, way, represent,..."
2,1904.04307v1.pdf,the quantification of semantic similarity bet...,Word Similarity Datasets for Thai: Constructio...,http://arxiv.org/pdf/1904.04307v1,"[arxiv.Result.Author('Ponrudee Netisopakul'), ...",,2019-04-08 19:18:09+00:00,Distributional semantics in the form of word e...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1904.04307v1,both,0,0.9957,"use, model, semantic, task, framework, system,...","[quantification, semantic, similarity, word, u..."
3,1809.02794v3.pdf,this paper focuses on the aim of semantic rol...,Explicit Contextual Semantics for Text Compreh...,http://arxiv.org/pdf/1809.02794v3,"[arxiv.Result.Author('Zhuosheng Zhang'), arxiv...",,2018-09-08 12:34:59+00:00,Who did what to whom is a major focus in natur...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1809.02794v3,both,4,0.995,"model, task, text, work, base, project, key, p...","[paper, focus, aim, semantic, role, label, lab..."
4,1805.01083v1.pdf,the mainconstructusedinextractionlanguagesand...,Scalable Semantic Querying of Text,http://arxiv.org/pdf/1805.01083v1,"[arxiv.Result.Author('Xiaolan Wang'), arxiv.Re...",,2018-05-03 01:57:31+00:00,We present the KOKO system that takes declarat...,,cs.DB,"['cs.DB', 'cs.CL']",http://arxiv.org/abs/1805.01083v1,both,0,0.9883,"use, model, semantic, task, framework, system,...","[also, new, level, incorporating, advance, wor..."


In [38]:
merged_df.to_csv('../data/Dominant_topics_of_summaries.csv')

In [31]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


In [32]:
df_dominant_topic.to_csv('../data/Dominant_topics_of_absracts.csv')

In [33]:
df_dominant_topic.tail(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
40,40,3,0.9919,"use, method, sentence, research, classificatio...","[uniblock, scoring, filter, information, train..."
41,41,3,0.9886,"use, method, sentence, research, classificatio...","[troll, public, commurum, fix, offensive, adve..."
42,42,4,0.9905,"model, task, text, work, base, project, key, p...","[base, contextual, emotion, classifier, contex..."
43,43,0,0.9889,"use, model, semantic, task, framework, system,...","[program, synthesis, semantic, parsing, learn,..."
44,44,1,0.9932,"model, task, use, propose, write, paper, syste...","[unilateral, contract, term, service, play, su..."
45,45,4,0.9943,"model, task, text, work, base, project, key, p...","[outofvocabulary, embed, imputation, ground, g..."
46,46,0,0.9905,"use, model, semantic, task, framework, system,...","[evaluation, basic, module, isolate, spelling,..."
47,47,0,0.9913,"use, model, semantic, task, framework, system,...","[pretraining, framework, understanding, framew..."
48,48,0,0.99,"use, model, semantic, task, framework, system,...","[new, model, achieve, considerable, deal, spec..."
49,49,1,0.993,"model, task, use, propose, write, paper, syste...","[aim, build, neural, network, model, task, err..."


In [34]:
df_dominant_topic['Dominant_Topic'].value_counts()

0    13
4    13
1    11
3     8
2     5
Name: Dominant_Topic, dtype: int64

In [35]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.9959,"use, model, semantic, task, framework, system,...","[novel, teaching, parallel, distribute, comput..."
1,1,0.9959,"model, task, use, propose, write, paper, syste...","[automate, technique, model, check, violation,..."
2,2,0.9937,"system, lithium, lm, usage, sentence, user, ri...","[lithium, extract, rich, set, develop, content..."
3,3,0.9952,"use, method, sentence, research, classificatio...","[robot, perform, simple, fetchandcarry, task, ..."
4,4,0.9957,"model, task, text, work, base, project, key, p...","[plan, base, framework, essay, generation, aim..."


In [36]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0,0.9959,"use, model, semantic, task, framework, system, word, propose, lexical, work","[novel, teaching, parallel, distribute, computing, concept, use, memebase, programming, lolcode,..."
1,1,0.9959,"model, task, use, propose, write, paper, system, neural, set, base","[automate, technique, model, check, violation, probability, reach, error, state, exceed, use, ve..."
2,2,0.9937,"system, lithium, lm, usage, sentence, user, rich, free, learn, content","[lithium, extract, rich, set, develop, content, lithium, high, involve, heavy, usage, jargon, th..."
3,3,0.9952,"use, method, sentence, research, classification, task, present, application, learn, model","[robot, perform, simple, fetchandcarry, task, many, potential, application, eldercare, describe,..."
4,4,0.9957,"model, task, text, work, base, project, key, paper, feature, method","[plan, base, framework, essay, generation, aim, understand, represent, mean, topic, argue, gener..."


In [37]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,1,"model, task, use, propose, write, paper, system, neural, set, base",13.0,0.26
1,1,"model, task, use, propose, write, paper, system, neural, set, base",11.0,0.22
2,0,"use, model, semantic, task, framework, system, word, propose, lexical, work",5.0,0.1
3,4,"model, task, text, work, base, project, key, paper, feature, method",8.0,0.16
4,0,"use, model, semantic, task, framework, system, word, propose, lexical, work",13.0,0.26
5,3,"use, method, sentence, research, classification, task, present, application, learn, model",,
6,4,"model, task, text, work, base, project, key, paper, feature, method",,
7,0,"use, model, semantic, task, framework, system, word, propose, lexical, work",,
8,0,"use, model, semantic, task, framework, system, word, propose, lexical, work",,
9,0,"use, model, semantic, task, framework, system, word, propose, lexical, work",,
