In [3]:
from gensim.models import LdaMulticore,TfidfModel,CoherenceModel
from gensim.corpora import Dictionary
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from wordcloud import WordCloud
import re 
from nltk.corpus import stopwords
from string import punctuation
import nltk
from nltk.stem import WordNetLemmatizer

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
data = pd.read_csv('/content/drive/MyDrive/Papers.csv')
data.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [6]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
data = data[['id','title','paper_text']]
data.isnull().sum()

id            0
title         0
paper_text    0
dtype: int64

In [8]:
stuff_to_be_removed = list(stopwords.words('english')) + list(punctuation)
new_words = ['this','model','loss','graph']
stuff_to_be_removed.extend(new_words)

In [9]:
def preprocess(message):
    message = message.lower()
    #Remove links 
    message = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                    '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', message)
    # Remove extra spaces 
    message = re.sub(' +', ' ', message)
    # Remove mentions 
    message =re.sub("(@[A-Za-z0-9_]+)","", message)
    # Remove all non alphanumeric characters 
    message = re.sub("^[A-Za-z0-9_-]*$", "", message)
    # Remove stopwords and lemmatize
    lemmatizer = WordNetLemmatizer()
    message = ' '.join([lemmatizer.lemmatize(token) for token in message.split() if token not in stuff_to_be_removed and len(token)>=4])
    return message.split()

In [10]:
preprocessed_docs = data['paper_text'].apply(preprocess)

In [11]:
corpus = ' '.join([' '.join(item) for idx, item in preprocessed_docs.iteritems()])

In [13]:
dictionary = Dictionary(preprocessed_docs)

In [14]:
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

In [15]:
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [17]:
lda_model_tfidf = LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [18]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print(f'Topic: {idx} Word: {topic}')
    print('\n')

Topic: 0 Word: 0.000*"network" + 0.000*"image" + 0.000*"neuron" + 0.000*"kernel" + 0.000*"spike" + 0.000*"policy" + 0.000*"node" + 0.000*"regret" + 0.000*"action" + 0.000*"bound"


Topic: 1 Word: 0.000*"kernel" + 0.000*"clustering" + 0.000*"neuron" + 0.000*"network" + 0.000*"matrix" + 0.000*"bound" + 0.000*"training" + 0.000*"theorem" + 0.000*"image" + 0.000*"policy"


Topic: 2 Word: 0.000*"image" + 0.000*"kernel" + 0.000*"object" + 0.000*"network" + 0.000*"node" + 0.000*"bound" + 0.000*"policy" + 0.000*"neuron" + 0.000*"feature" + 0.000*"theorem"


Topic: 3 Word: 0.000*"policy" + 0.000*"image" + 0.000*"network" + 0.000*"neuron" + 0.000*"object" + 0.000*"spike" + 0.000*"action" + 0.000*"cluster" + 0.000*"regret" + 0.000*"kernel"


Topic: 4 Word: 0.000*"neuron" + 0.000*"policy" + 0.000*"network" + 0.000*"image" + 0.000*"kernel" + 0.000*"latent" + 0.000*"tree" + 0.000*"classifier" + 0.000*"posterior" + 0.000*"node"


Topic: 5 Word: 0.000*"kernel" + 0.000*"policy" + 0.000*"bound" + 0.000*