In [30]:
import pandas as pd
import numpy as np

from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LdaModel,LsiModel, CoherenceModel
from gensim import corpora
from gensim.utils import simple_preprocess

import re
import string
from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from scipy.sparse import csr_matrix



**Data used:** topic-modeling-for-research-articles on Kaggle

In [53]:
data = pd.read_csv('/kaggle/input/topic-modeling-for-research-articles/train.csv')
abstracts = data['ABSTRACT'].tolist() 
print('Loaded Documents: {}'.format(len(abstracts)))
print(abstracts[0:2])

Loaded Documents: 20972
["  Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at two levels: global, i.e. identifiying condition presence for the\nsubject, and local, i.e. detecting condition effect on each individual\nmeasurement extracted from the subject's data. While global inference is widely\nused, local inference, which can be used to form subject-specific effect maps,\nis rarely used because existing models often yield noisy detections composed of\ndispersed isolated islands. In this article, we propose a reconstruction\nmethod, named RSM, to improve subject-specific detections of predictive\nmodeling approaches and in particular, binary classifiers. RSM specifically\naims to reduce noise due to sampling error associated with using a finite\nsample of examples to train classifiers. The proposed method is a wrapper-type\nalgorithm that can be used with different bina

## Preprocess Data

In [58]:
#no of documents reduced for computation
n = 5000

re_punctuation = re.compile('['+string.punctuation+']')
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

preproc_abstract = []
for abstract in tqdm(np.random.choice(abstracts, n)):
    abstract = BeautifulSoup(abstract, 'lxml').get_text().lower()
    abstract = re_punctuation.sub(' ', abstract)
    abstract = simple_preprocess(abstract,deacc=True,min_len=3)
    abstract = [lemmatizer.lemmatize(word) for word in abstract]
    abstract = [word for word in abstract if word not in stop]
    preproc_abstract.append(abstract)


100%|██████████| 5000/5000 [00:07<00:00, 631.76it/s]


In [59]:
wordfreq = Counter()
for comment in preproc_abstract:
    wordfreq.update(abstract)
print('Unique Words In Abstracts: {}'.format(len(wordfreq)))


minwords = 5
text = [[word for word in comment if wordfreq[word] > minwords] for comment in preproc_abstract]



Unique Words In Abstracts: 52


In [60]:
dictionary = corpora.Dictionary(text)
vocab = [dictionary[i] for i in dictionary.keys()]
print('Documents: {}'.format(len(text)))

corpus = [dictionary.doc2bow(doc) for doc in text]

def sparse_matrix(corpus, n):
    data, row, col = [], [], []
    for cc, doc in enumerate(corpus):
        for word in doc:
            row.append(cc)
            col.append(word[0])
            data.append(word[1])
    X = csr_matrix((np.array(data), (np.array(row), np.array(col))), shape=(cc+1, n))
    return X

X = sparse_matrix(corpus, len(dictionary))
print('Train Shape:\t{}'.format(X.shape))

Documents: 5000
Train Shape:	(5000, 52)


In [61]:
topic_no = 6
print('Number of topics:\t{}'.format(topic_no))

Number of topics:	6


# **LDA**

In [82]:
lda_model = LdaModel(num_topics=topic_no,
                        id2word=dictionary,
                        random_state=100,
                        update_every=1,
                        chunksize=100,
                        passes=10,
                        alpha='auto',
                        per_word_topics=True
                     )

        
# Plot the topics
for i, topic in enumerate(lda_model.get_topics().argsort(axis=1)[:, -10:][:, ::-1], 1):
    print('Topic {}: {}'.format(i, ' '.join([vocab[id] for id in topic])))

Topic 1: conclusion anderson time real basis support critical single density chain
Topic 2: double density renormalization time critical impurity enables ordered generic respectively
Topic 3: enables conclusion case dynamic impurity noninteracting entropy time choice entanglement
Topic 4: real time double single matrix state enables saturation entanglement support
Topic 5: set critical generic calculation anderson conclusion renormalization respectively double orbitals
Topic 6: critical impurity spectrum generic time long entropy real quench density


In [83]:
print(lda_model.print_topics())

[(0, '0.023*"conclusion" + 0.022*"anderson" + 0.022*"time" + 0.022*"real" + 0.022*"basis" + 0.021*"support" + 0.021*"critical" + 0.021*"single" + 0.021*"density" + 0.021*"chain"'), (1, '0.023*"double" + 0.023*"density" + 0.023*"renormalization" + 0.022*"time" + 0.021*"critical" + 0.021*"impurity" + 0.021*"enables" + 0.021*"ordered" + 0.021*"generic" + 0.021*"respectively"'), (2, '0.025*"enables" + 0.023*"conclusion" + 0.022*"case" + 0.022*"dynamic" + 0.022*"impurity" + 0.022*"noninteracting" + 0.021*"entropy" + 0.021*"time" + 0.021*"choice" + 0.021*"entanglement"'), (3, '0.025*"real" + 0.024*"time" + 0.024*"double" + 0.022*"single" + 0.022*"matrix" + 0.021*"state" + 0.021*"enables" + 0.021*"saturation" + 0.021*"entanglement" + 0.021*"support"'), (4, '0.024*"set" + 0.022*"critical" + 0.022*"generic" + 0.022*"calculation" + 0.022*"anderson" + 0.022*"conclusion" + 0.022*"renormalization" + 0.021*"respectively" + 0.021*"double" + 0.021*"orbitals"'), (5, '0.027*"critical" + 0.024*"impurity"

# **LSA**

In [84]:
from gensim.models import LsiModel
# bow = [dictionary.doc2bow(text) for text in corpus]
lsa_model = LsiModel(corpus, num_topics=topic_no, id2word=dictionary)
    
    
print(lsa_model.print_topics(num_topics=6, num_words=10))

[(0, '0.945*"model" + 0.200*"time" + 0.130*"state" + 0.100*"study" + 0.077*"set" + 0.075*"dynamic" + 0.066*"case" + 0.064*"real" + 0.056*"linear" + 0.048*"density"'), (1, '0.834*"time" + -0.299*"model" + 0.254*"state" + 0.162*"study" + 0.160*"energy" + 0.143*"dynamic" + 0.129*"case" + 0.100*"set" + 0.091*"real" + 0.089*"linear"'), (2, '0.818*"state" + -0.433*"time" + 0.188*"energy" + 0.185*"study" + 0.104*"group" + 0.099*"density" + -0.092*"model" + 0.084*"set" + 0.075*"case" + 0.073*"matrix"'), (3, '-0.493*"set" + 0.411*"state" + -0.398*"study" + -0.360*"group" + -0.341*"matrix" + -0.280*"case" + 0.199*"time" + -0.147*"linear" + -0.098*"density" + -0.086*"find"'), (4, '-0.885*"energy" + 0.304*"set" + 0.226*"state" + -0.190*"density" + 0.114*"group" + -0.079*"spectrum" + 0.072*"time" + 0.062*"matrix" + -0.052*"regime" + -0.038*"find"'), (5, '0.745*"group" + -0.636*"set" + -0.110*"energy" + 0.107*"matrix" + 0.065*"study" + -0.056*"real" + -0.041*"multi" + -0.038*"single" + 0.035*"time" 

In [88]:
for i, topic in enumerate(lsa_model.get_topics().argsort(axis=1)[:, -10:][:, ::-1], 1):
    print('Topic {}: {}'.format(i, ' '.join([vocab[id] for id in topic])))

Topic 1: model time state study set dynamic case real linear density
Topic 2: time state study energy dynamic case set real linear matrix
Topic 3: state energy study group density set case matrix multi single
Topic 4: state time model long dynamic quenched gapped noninteracting impurity orbitals
Topic 5: set state group time matrix multi linear generic chain dynamic
Topic 6: group matrix study time case dynamic model behavior critical double


# **Coherence Score for both**

In [86]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=text, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score for LDA: ', coherence_lda)


Coherence Score for LDA:  0.4639572197958572


In [87]:
coherence_model_lsa = CoherenceModel(model=lsa_model, texts=text, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('\nCoherence Score for LSA: ', coherence_lsa)


Coherence Score for LSA:  0.41536494353071324
