# Topic modelling

## 0.0 Imports

In [67]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dglover\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## 1.0 Run

In [69]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [70]:
# Import Dataset
df = pd.read_csv('..\\data\\summaries_full.csv', index_col=0)
#print(df.target_names.unique())
df.head()

Unnamed: 0_level_0,Summary
PDF URL,Unnamed: 1_level_1
http://arxiv.org/pdf/1608.04434v1,Hadoop is one of the platform s that can proce...
http://arxiv.org/pdf/2202.07138v2,Integrating AI Planning with Natural Language ...
http://arxiv.org/pdf/1906.11608v2,The tools are machine learning based using nat...
http://arxiv.org/pdf/2006.16212v1,The name Tangkhul also known as Hao or Ihao re...
http://arxiv.org/pdf/1511.07916v1,This is a lecture note for the course DS GA at...


In [71]:
df.shape

(3557, 1)

In [72]:
df.dropna(inplace=True)

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = list(df['Summary'])
vectorizer = CountVectorizer(decode_error='ignore')
X = vectorizer.fit_transform(corpus)

In [74]:
vectorizer.get_feature_names_out()

array(['aa', 'aaaa', 'aaai', ..., 'zwnj', 'zynq', 'zzy'], dtype=object)

In [75]:
CV_df = pd.DataFrame(X.toarray())
CV_df.columns = vectorizer.get_feature_names_out()
#CV_df.head()

In [76]:
CV_df.shape

(3556, 24780)

In [77]:
CV_stopwords = []
for col in CV_df.columns:
    if sum(CV_df[col]) > 0.25 * (CV_df.shape[0]):
        CV_stopwords.append(col)

In [78]:
CV_stopwords

['also',
 'an',
 'and',
 'are',
 'as',
 'at',
 'based',
 'be',
 'by',
 'can',
 'data',
 'for',
 'from',
 'has',
 'have',
 'in',
 'information',
 'is',
 'it',
 'language',
 'languages',
 'learning',
 'model',
 'models',
 'natural',
 'new',
 'nlp',
 'of',
 'on',
 'or',
 'our',
 'paper',
 'processing',
 'published',
 'study',
 'such',
 'task',
 'tasks',
 'text',
 'that',
 'the',
 'this',
 'to',
 'university',
 'use',
 'used',
 'using',
 'we',
 'with']

In [79]:
extra_stopwords = ['author','article','use', 'show', 'however', 'approach', 'well', 'provide',' present', 'include', 'word', 'nlp', 'natural', 'language', 'processing']
full_stopwords = CV_stopwords + extra_stopwords
stop_words.extend(full_stopwords)

In [80]:
df.reset_index(inplace = True)

In [81]:
# Import Dataset
df2 = pd.read_csv('..\\data\\arxiv_papers_full_v2.csv', index_col=0)

In [82]:
df2.head(2)

Unnamed: 0,Title,PDF URL,Author,DOI,Published Date,Summary,Journal Ref,Primary Category,Category,Entry ID
0,Natural Language Processing using Hadoop and K...,http://arxiv.org/pdf/1608.04434v1,"[arxiv.Result.Author('Emre Erturk'), arxiv.Res...",,2016-08-15 23:09:21+00:00,"Natural language processing, as a data analyti...",,cs.CL,['cs.CL'],http://arxiv.org/abs/1608.04434v1
1,Integrating AI Planning with Natural Language ...,http://arxiv.org/pdf/2202.07138v2,"[arxiv.Result.Author('Kebing Jin'), arxiv.Resu...",,2022-02-15 02:19:09+00:00,Natural language processing (NLP) aims at inve...,,cs.AI,"['cs.AI', 'cs.CL']",http://arxiv.org/abs/2202.07138v2


In [83]:
df2.rename(columns = {'Summary' : 'Abstract'}, inplace=True)

In [84]:
#df2['Filename'] = df2['PDF URL'].map(lambda x: x.split('/')[-1] + '.pdf')

In [85]:
df = df.merge(df2, how = 'left', on = 'PDF URL', indicator = True)

In [86]:
df['_merge'].value_counts()

both          3568
left_only        0
right_only       0
Name: _merge, dtype: int64

In [87]:
df.isna().sum()

PDF URL                0
Summary                0
Title                  0
Author                 0
DOI                 3165
Published Date         0
Abstract               0
Journal Ref         3052
Primary Category       0
Category               0
Entry ID               0
_merge                 0
dtype: int64

In [88]:
df.shape

(3568, 12)

In [89]:
df['Abstract'] = df['Abstract'].map(lambda x: str(x).lower().replace('natural langauge processing', ''))

In [90]:
# Convert to list
data = df['Abstract'].values.tolist()

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [91]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [92]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [93]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ']):#,  'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [94]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_trigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
#nlp = spacy.load('en', disable=['parser', 'ner'])
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])



In [95]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [96]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [97]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.029*"llm" + 0.024*"research" + 0.013*"field" + 0.012*"knowledge" + '
  '0.012*"attention" + 0.011*"challenge" + 0.011*"application" + 0.010*"large" '
  '+ 0.009*"potential" + 0.008*"various"'),
 (1,
  '0.036*"semantic" + 0.020*"structure" + 0.016*"representation" + '
  '0.014*"entity" + 0.013*"sentence" + 0.011*"relation" + 0.010*"memory" + '
  '0.009*"legal" + 0.009*"process" + 0.008*"property"'),
 (2,
  '0.020*"performance" + 0.015*"propose" + 0.014*"dataset" + 0.013*"method" + '
  '0.012*"pre_traine" + 0.011*"state" + 0.011*"training" + 0.011*"domain" + '
  '0.010*"large" + 0.010*"art"'),
 (3,
  '0.014*"system" + 0.011*"speech" + 0.011*"embedding" + 0.010*"result" + '
  '0.009*"different" + 0.009*"feature" + 0.009*"method" + 0.009*"present" + '
  '0.009*"word" + 0.009*"sentence"'),
 (4,
  '0.019*"system" + 0.018*"process" + 0.014*"human" + 0.013*"generate" + '
  '0.013*"user" + 0.011*"question" + 0.010*"image" + 0.008*"framework" + '
  '0.008*"graph" + 0.008*"dialogue"')]


In [98]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Perplexity:  -7.714437952072352

Coherence Score:  0.3769663286712831


In [99]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


In [100]:
import pandas as pd

def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [101]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


In [102]:
df_topic_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,3,0.4399,"system, speech, embedding, result, different, ...","[analytic, relate, technology, widely, many, r..."
1,0,0.3579,"llm, research, field, knowledge, attention, ch...","[aim, investigate, interaction, agent, human, ..."
2,0,0.3794,"llm, research, field, knowledge, attention, ch...","[technical, note, describe, set, baseline, too..."
3,3,0.6887,"system, speech, embedding, result, different, ...","[little, work, tangkhul, current, work, humble..."
4,0,0.4473,"llm, research, field, knowledge, attention, ch...","[note, course, understanding, distribute, repr..."
...,...,...,...,...
3563,1,0.2932,"semantic, structure, representation, entity, s...","[large, document, write, juridical, difficult,..."
3564,2,0.8751,"performance, propose, dataset, method, pre_tra...","[promise, performance, various, current, syste..."
3565,2,0.4089,"performance, propose, dataset, method, pre_tra...","[recent_advancement, speech, ser, state, art, ..."
3566,0,0.3257,"llm, research, field, knowledge, attention, ch...","[recent_advance, large, lead, renew, interest,..."


In [103]:
df

Unnamed: 0,PDF URL,Summary,Title,Author,DOI,Published Date,Abstract,Journal Ref,Primary Category,Category,Entry ID,_merge
0,http://arxiv.org/pdf/1608.04434v1,Hadoop is one of the platform s that can proce...,Natural Language Processing using Hadoop and K...,"[arxiv.Result.Author('Emre Erturk'), arxiv.Res...",,2016-08-15 23:09:21+00:00,"natural language processing, as a data analyti...",,cs.CL,['cs.CL'],http://arxiv.org/abs/1608.04434v1,both
1,http://arxiv.org/pdf/2202.07138v2,Integrating AI Planning with Natural Language ...,Integrating AI Planning with Natural Language ...,"[arxiv.Result.Author('Kebing Jin'), arxiv.Resu...",,2022-02-15 02:19:09+00:00,natural language processing (nlp) aims at inve...,,cs.AI,"['cs.AI', 'cs.CL']",http://arxiv.org/abs/2202.07138v2,both
2,http://arxiv.org/pdf/1906.11608v2,The tools are machine learning based using nat...,Simple Natural Language Processing Tools for D...,[arxiv.Result.Author('Leon Derczynski')],,2019-06-27 13:15:12+00:00,this technical note describes a set of baselin...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1906.11608v2,both
3,http://arxiv.org/pdf/2006.16212v1,The name Tangkhul also known as Hao or Ihao re...,Towards the Study of Morphological Processing ...,"[arxiv.Result.Author('Mirinso Shadang'), arxiv...",,2020-06-29 17:24:09+00:00,there is no or little work on natural language...,In proceeding of Regional International Confer...,cs.CL,['cs.CL'],http://arxiv.org/abs/2006.16212v1,both
4,http://arxiv.org/pdf/1511.07916v1,This is a lecture note for the course DS GA at...,Natural Language Understanding with Distribute...,[arxiv.Result.Author('Kyunghyun Cho')],,2015-11-24 23:23:13+00:00,this is a lecture note for the course ds-ga 30...,,cs.CL,"['cs.CL', 'stat.ML']",http://arxiv.org/abs/1511.07916v1,both
...,...,...,...,...,...,...,...,...,...,...,...,...
3563,http://arxiv.org/pdf/2307.01211v1,An automated method for the ontologicalreprese...,An automated method for the ontological repres...,"[arxiv.Result.Author('Giampaolo Bella'), arxiv...",,2023-06-30 09:04:47+00:00,large documents written in juridical language ...,,cs.AI,"['cs.AI', 'cs.CL']",http://arxiv.org/abs/2307.01211v1,both
3564,http://arxiv.org/pdf/2307.01488v1,SCAT modifies ran driven augmentations of the ...,SCAT: Robust Self-supervised Contrastive Learn...,"[arxiv.Result.Author('Junjie Wu'), arxiv.Resul...",,2023-07-04 05:41:31+00:00,despite their promising performance across var...,,cs.CL,['cs.CL'],http://arxiv.org/abs/2307.01488v1,both
3565,http://arxiv.org/pdf/2307.06090v1,Large language models LLMs have revolutionised...,Can Large Language Models Aid in Annotating Sp...,"[arxiv.Result.Author('Siddique Latif'), arxiv....",,2023-07-12 11:27:40+00:00,despite recent advancements in speech emotion ...,,cs.SD,"['cs.SD', 'eess.AS']",http://arxiv.org/abs/2307.06090v1,both
3566,http://arxiv.org/pdf/2307.07051v1,Predictive Power Varies with Clinical Note Typ...,Making the Most Out of the Limited Context Len...,"[arxiv.Result.Author('Hongyi Zheng'), arxiv.Re...",,2023-07-13 20:04:05+00:00,recent advances in large language models have ...,Association for Computational Linguistics - St...,cs.CL,"['cs.CL', 'cs.IR', 'cs.LG']",http://arxiv.org/abs/2307.07051v1,both


In [104]:
# Assuming 'original_df' is your original dataframe and 'lda_output' is the output from LDA
merged_df = df.merge(df_topic_sents_keywords, left_index=True, right_index=True)

In [105]:
merged_df.head()

Unnamed: 0,PDF URL,Summary,Title,Author,DOI,Published Date,Abstract,Journal Ref,Primary Category,Category,Entry ID,_merge,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,http://arxiv.org/pdf/1608.04434v1,Hadoop is one of the platform s that can proce...,Natural Language Processing using Hadoop and K...,"[arxiv.Result.Author('Emre Erturk'), arxiv.Res...",,2016-08-15 23:09:21+00:00,"natural language processing, as a data analyti...",,cs.CL,['cs.CL'],http://arxiv.org/abs/1608.04434v1,both,3,0.4399,"system, speech, embedding, result, different, ...","[analytic, relate, technology, widely, many, r..."
1,http://arxiv.org/pdf/2202.07138v2,Integrating AI Planning with Natural Language ...,Integrating AI Planning with Natural Language ...,"[arxiv.Result.Author('Kebing Jin'), arxiv.Resu...",,2022-02-15 02:19:09+00:00,natural language processing (nlp) aims at inve...,,cs.AI,"['cs.AI', 'cs.CL']",http://arxiv.org/abs/2202.07138v2,both,0,0.3579,"llm, research, field, knowledge, attention, ch...","[aim, investigate, interaction, agent, human, ..."
2,http://arxiv.org/pdf/1906.11608v2,The tools are machine learning based using nat...,Simple Natural Language Processing Tools for D...,[arxiv.Result.Author('Leon Derczynski')],,2019-06-27 13:15:12+00:00,this technical note describes a set of baselin...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1906.11608v2,both,0,0.3794,"llm, research, field, knowledge, attention, ch...","[technical, note, describe, set, baseline, too..."
3,http://arxiv.org/pdf/2006.16212v1,The name Tangkhul also known as Hao or Ihao re...,Towards the Study of Morphological Processing ...,"[arxiv.Result.Author('Mirinso Shadang'), arxiv...",,2020-06-29 17:24:09+00:00,there is no or little work on natural language...,In proceeding of Regional International Confer...,cs.CL,['cs.CL'],http://arxiv.org/abs/2006.16212v1,both,3,0.6887,"system, speech, embedding, result, different, ...","[little, work, tangkhul, current, work, humble..."
4,http://arxiv.org/pdf/1511.07916v1,This is a lecture note for the course DS GA at...,Natural Language Understanding with Distribute...,[arxiv.Result.Author('Kyunghyun Cho')],,2015-11-24 23:23:13+00:00,this is a lecture note for the course ds-ga 30...,,cs.CL,"['cs.CL', 'stat.ML']",http://arxiv.org/abs/1511.07916v1,both,0,0.4473,"llm, research, field, knowledge, attention, ch...","[note, course, understanding, distribute, repr..."


In [106]:
merged_df['Dominant_Topic'].value_counts()

2    1391
3    1169
4     452
0     336
1     220
Name: Dominant_Topic, dtype: int64

In [107]:
merged_df.loc[merged_df['Dominant_Topic'] == 0].sort_values('Perc_Contribution', ascending=False)[['Summary','Title','Perc_Contribution']].head(10)

Unnamed: 0,Summary,Title,Perc_Contribution
36,A survey paper proposes a clearer view of natu...,"Natural Language Reasoning, A Survey",0.7702
136,This study provides an overview of the history...,Sejarah dan Perkembangan Teknik Natural Langua...,0.7223
3066,Natural Language QA Approaches using Reasoning...,Natural Language QA Approaches using Reasoning...,0.6729
231,Exploring the Landscape of Natural Language Pr...,Exploring the Landscape of Natural Language Pr...,0.6586
1634,A A Bibliometric Review of Large Language Mode...,A Bibliometric Review of Large Language Models...,0.6464
2508,Dual use the intentional harmful reuse of tech...,Thorny Roses: Investigating the Dual Use Dilem...,0.6438
1802,A Survey of the Usages of Deep Learning for Na...,A Survey of the Usages of Deep Learning in Nat...,0.6351
1817,A Survey of the Usages of Deep Learning for Na...,A Survey of the Usages of Deep Learning in Nat...,0.6351
1816,A Survey of the Usages of Deep Learning for Na...,A Survey of the Usages of Deep Learning in Nat...,0.6351
1801,A Survey of the Usages of Deep Learning for Na...,A Survey of the Usages of Deep Learning in Nat...,0.6351


In [108]:
merged_df.loc[merged_df['Dominant_Topic'] == 1].sort_values('Perc_Contribution', ascending=False)[['Summary','Title','Perc_Contribution']].head(10)

Unnamed: 0,Summary,Title,Perc_Contribution
28,The thesis develops the translation between ca...,Categorical Tools for Natural Language Processing,0.7087
3075,ArXiv v math CT Apr Volume Issue ISSN Pregroup...,Lambek pregroups are Frobenius spiders in preo...,0.7053
234,Graph Interpolation Grammars are a rule based ...,Graph Interpolation Grammars: a Rule-based App...,0.6972
2758,The cen tral role of the lexicon in Meaning T ...,A Formal Look at Dependency Grammars and Phras...,0.6745
80,In this chapter we introduce a new dialogical ...,"Logical Semantics, Dialogical Argumentation, a...",0.6735
1914,Quantized Detector Networks QDN is a descripti...,Quantized Detector Networks: A review of recen...,0.6622
3034,Category Theory for Quantum encompasses quantu...,Category Theory for Quantum Natural Language P...,0.6487
2937,Program equivalence is the fulcrum for reasoni...,Recursive Session Logical Relations,0.6444
25,Paper defines event expression over sentences ...,Natural Language Understanding Based on Semant...,0.6417
1572,For any monoid M the family of languages accep...,Rational semigroup automata,0.6187


In [109]:
merged_df.loc[merged_df['Dominant_Topic'] == 2].sort_values('Perc_Contribution', ascending=False)[['Summary','Title','Perc_Contribution']].head(10)

Unnamed: 0,Summary,Title,Perc_Contribution
2168,Deep and large pre trained models are the stat...,XtremeDistilTransformers: Task Transfer for Ta...,0.8853
1161,SwitchPrompt Learning Domain Speci c Gated Sof...,SwitchPrompt: Learning Domain-Specific Gated S...,0.8801
3564,SCAT modifies ran driven augmentations of the ...,SCAT: Robust Self-supervised Contrastive Learn...,0.8751
3411,Back Translated Task Adaptive Pretraining Impr...,Back-Translated Task Adaptive Pretraining: Imp...,0.8484
1876,Recent work has shown that deep transformer la...,GPT-3 Models are Poor Few-Shot Learners in the...,0.8475
39,Cross lingual Adaption Model Agnostic Meta Lea...,Cross-lingual Adaption Model-Agnostic Meta-Lea...,0.8405
1325,NTK approximating MLP Fusion for Efficient Lan...,NTK-approximating MLP Fusion for Efficient Lan...,0.8257
511,Fine Tuning Transformers Vocabulary Transfer T...,Fine-Tuning Transformers: Vocabulary Transfer,0.8196
2194,Prompt Tuning for Discriminative Pre trained L...,Prompt Tuning for Discriminative Pre-trained L...,0.8172
654,HyPe Better Pre trained Language Model Fine tu...,HyPe: Better Pre-trained Language Model Fine-t...,0.8149


In [110]:
merged_df.loc[merged_df['Dominant_Topic'] == 3].sort_values('Perc_Contribution', ascending=False)[['Summary','Title','Perc_Contribution']].head(10)

Unnamed: 0,Summary,Title,Perc_Contribution
3200,The aim of this challenge is to build a multi ...,Multi-Speaker Multi-Lingual VQTTS System for L...,0.8278
801,Exploiting Transliterated Words for Finding Si...,Exploiting Transliterated Words for Finding Si...,0.8168
532,An annotated dataset and tagger tool for the l...,UzbekTagger: The rule-based POS tagger for Uzb...,0.7859
82,Automatic Language Identi cation for RomanceLa...,Automatic Language Identification for Romance ...,0.7854
2535,Building a Lemmatizer and a Spell checker for ...,Building a Lemmatizer and a Spell-checker for ...,0.7837
2092,Experiments with POS Tagging Code mixed Indian...,Experiments with POS Tagging Code-mixed Indian...,0.78
628,This paper reports about our work in the NLP o...,JU_KS@SAIL_CodeMixed-2017: Sentiment Analysis ...,0.7757
1443,The models hauWE Hausa Word s Embedding are bi...,hauWE: Hausa Words Embedding for Natural Langu...,0.7615
1102,Mapping supervised bilingual word embeddings f...,Mapping Supervised Bilingual Word Embeddings f...,0.7613
3317,The complexities of Arabic language in morphol...,Improving Sentiment Analysis in Arabic Using W...,0.7603


In [111]:
merged_df.loc[merged_df['Dominant_Topic'] == 4].sort_values('Perc_Contribution', ascending=False)[['Summary','Title','Perc_Contribution']].head(10)

Unnamed: 0,Summary,Title,Perc_Contribution
41,Most musical programming languages are develop...,Composition by Conversation,0.7401
601,Interactive Task and Concept Learning from Nat...,Interactive Task and Concept Learning from Nat...,0.7328
262,The approach is general and could be applied t...,Processing Natural Language About Ongoing Actions,0.7322
1009,Current version of Business process management...,Towards ontology based BPMN Implementation,0.7273
1783,Automating a factory where robots are involved...,"Towards VEsNA, a Framework for Managing Virtua...",0.7052
1484,The database systems course is offered as part...,Towards Enhancing Database Education: Natural ...,0.6893
2540,Intelligent Home D Automatic D House Design fr...,Intelligent Home 3D: Automatic 3D-House Design...,0.6833
2565,Knowledge graphs KG have become an important d...,Visual Diagrammatic Queries in ViziQuer: Overv...,0.6792
682,Sense Plan Ask is a novel approach for generat...,SPA: Verbal Interactions between Agents and Av...,0.676
602,The paper intends to present a review on Objec...,Object Oriented Analysis using Natural Languag...,0.6649


In [112]:
merged_df.to_csv('../data/Dominant_topics_of_summaries_v2.csv')

# Archive

In [113]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

In [None]:
df_dominant_topic.to_csv('../data/Dominant_topics_of_absracts.csv')

In [None]:
df_dominant_topic.tail(10)

In [None]:
df_dominant_topic['Dominant_Topic'].value_counts()

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

In [None]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics