In [6]:
# Importing modules
import pandas as pd

# Read data into papers.
# The CSV location can be overridden with the NIPS_PAPERS_CSV environment
# variable so the notebook can run on other machines; when the variable is
# unset, the original local path is used unchanged.
import os

papers_csv_path = os.environ.get(
    'NIPS_PAPERS_CSV',
    r'C:\Users\HP\Desktop\jupyterfile\NIPS Papers\papers.csv',
)
papers = pd.read_csv(papers_csv_path)
# Print head
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [7]:
# Remove the columns that are not used by the rest of the notebook
# (`columns=` already targets the column axis, so no `axis` argument is needed)
papers = papers.drop(columns=['id', 'title', 'abstract',
                              'event_type', 'pdf_name', 'year'])
# Sample only 10 papers - for demonstration purposes.
# A fixed random_state makes the sample, and everything derived from it,
# reproducible across kernel restarts.
papers = papers.sample(10, random_state=100)
# Print out the first rows of papers
papers.head()

Unnamed: 0,paper_text
1421,Learning Graphical Models\nwith Mercer Kernels...
6028,Using Social Dynamics to Make Individual Predi...
1995,Gradient Flow Independent Component\nAnalysis ...
2438,Using Deep Belief Nets to Learn Covariance Ker...
877,Error-correcting Codes on a Bethe-like Lattice...


In [8]:
# Load the regular expression library
import re
# Remove punctuation (commas, periods, exclamation marks, question marks).
# The pattern is a raw string, and '.' needs no backslash inside a character
# class, so the regex contains no invalid Python string escapes.
papers['paper_text_processed'] = (
    papers['paper_text']
    .str.replace(r'[,.!?]', '', regex=True)
    # Convert the paper text to lowercase
    .str.lower()
)
# Print out the first rows of the processed text
papers['paper_text_processed'].head()

1421    learning graphical models\nwith mercer kernels...
6028    using social dynamics to make individual predi...
1995    gradient flow independent component\nanalysis ...
2438    using deep belief nets to learn covariance ker...
877     error-correcting codes on a bethe-like lattice...
Name: paper_text_processed, dtype: object

In [10]:
import gensim
from gensim.utils import simple_preprocess
def sent_to_words(sentences):
    """Lazily tokenize documents with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of documents; each item is converted with ``str()``
            before tokenization.

    Yields:
        The token list returned by ``gensim.utils.simple_preprocess`` for each
        document, in input order.
    """
    for raw_doc in sentences:
        # deacc=True strips accent marks from the tokens
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens
# Pull the cleaned paper texts out of the DataFrame as a plain Python list
data = papers['paper_text_processed'].tolist()
# Tokenize every document (materializes the generator into a list of token lists)
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])

[['learning', 'graphical', 'models', 'with', 'mercer', 'kernels', 'francis', 'bach', 'division', 'of', 'computer', 'science', 'university', 'of', 'california', 'berkeley', 'ca', 'fbach', 'csberkeleyedu', 'michael', 'jordan', 'computer', 'science', 'and', 'statistics', 'university', 'of', 'california', 'berkeley', 'ca', 'jordan', 'csberkeleyedu', 'abstract', 'we', 'present', 'class', 'of', 'algorithms', 'for', 'learning', 'the', 'structure', 'of', 'graphical', 'models', 'from', 'data', 'the', 'algorithms', 'are', 'based', 'on', 'measure', 'known', 'as', 'the', 'kernel', 'generalized', 'variance', 'kgv', 'which', 'essentially', 'allows', 'us', 'to', 'treat', 'all', 'variables', 'on', 'an', 'equal', 'footing', 'as', 'gaussians', 'in', 'feature', 'space', 'obtained', 'from', 'mercer', 'kernels', 'thus', 'we', 'are', 'able', 'to', 'learn', 'hybrid', 'graphs', 'involving', 'discrete', 'and', 'continuous', 'variables', 'of', 'arbitrary', 'type', 'we', 'explore', 'the', 'computational', 'prope




In [12]:
# Build the bigram and trigram models.
# Both use the same scoring threshold: higher threshold fewer phrases.
phrase_threshold = 100
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=phrase_threshold)
# The trigram model is trained on the bigram-transformed corpus
trigram = gensim.models.Phrases(bigram[data_words], threshold=phrase_threshold)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [13]:
# NLTK Stop words
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
# NLTK's English stop words plus a few extra tokens to filter out
extra_stop_words = ['from', 'subject', 're', 'edu', 'use']
stop_words = stopwords.words('english') + extra_stop_words
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop every token listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each document is converted with ``str()``
            and tokenized by ``simple_preprocess``. NOTE(review): callers in
            this notebook pass token lists, so ``str(doc)`` is the list's repr
            and the tokens are recovered by re-tokenizing it.

    Returns:
        A list with one token list per input document, in input order, with
        the stop words removed.
    """
    # Build the lookup set once per call; testing membership against the
    # module-level list would rescan it for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]
def make_bigrams(texts):
    """Apply the bigram phraser ``bigram_mod`` to every document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in input order.
    """
    bigrammed = []
    for tokens in texts:
        bigrammed.append(bigram_mod[tokens])
    return bigrammed
def make_trigrams(texts):
    """Apply the bigram phraser, then the trigram phraser, to every document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for each document, in
        input order.
    """
    trigrammed = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        trigrammed.append(trigram_mod[with_bigrams])
    return trigrammed
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, which must be defined before this
    function is called.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Container of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between
            calls. See https://spacy.io/api/annotation for the tag set.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document,
        in input order, restricted to tokens whose ``pos_`` is allowed.
    """
    texts_out = []
    for sent in texts:
        # spaCy works on raw text, so rebuild a string from the tokens
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
import spacy
# Load the spaCy English model with the parser and NER components disabled
# (they are not needed here, which keeps processing faster)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Do lemmatization keeping only noun, adj, vb, adv
content_pos_tags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=content_pos_tags)
print(data_lemmatized[:1])

[['learn', 'graphical_model', 'mercer', 'kernel', 'francis', 'bach', 'division', 'computer_science', 'university', 'california', 'berkeley', 'ca', 'fbach', 'csberkeleyedu', 'michael', 'jordan', 'computer_science', 'statistic', 'university', 'california', 'berkeley', 'ca', 'jordan', 'csberkeleyedu', 'abstract', 'present', 'class', 'algorithm', 'learn', 'structure', 'graphical_model', 'datum', 'algorithm', 'base', 'measure', 'know', 'kernel', 'generalized_variance', 'kgv', 'essentially', 'allow', 'treat', 'variable', 'equal', 'foot', 'gaussian', 'feature_space', 'obtain', 'mercer', 'kernel', 'thus', 'able', 'learn', 'hybrid', 'graph', 'involve', 'discrete', 'continuous_variables', 'arbitrary', 'type', 'explore', 'computational', 'property', 'approach', 'show', 'kernel', 'trick', 'compute', 'relevant', 'statistic', 'linear', 'time', 'illustrate', 'framework', 'experiment', 'involve', 'discrete', 'continuous', 'datum', 'introduction', 'graphical_model', 'compact', 'efficient', 'way', 'repr




In [15]:
import gensim.corpora as corpora
# Create Dictionary: maps every token in the lemmatized documents to an integer id
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus: the lemmatized documents are the texts of the corpus
texts = data_lemmatized
# Term Document Frequency: one bag-of-words per document
corpus = list(map(id2word.doc2bow, texts))
# View
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 14), (13, 2), (14, 14), (15, 2), (16, 2), (17, 2), (18, 4), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 5), (25, 18), (26, 1), (27, 4), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 5), (36, 1), (37, 1), (38, 1), (39, 1), (40, 3), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 15), (47, 1), (48, 5), (49, 1), (50, 8), (51, 1), (52, 3), (53, 7), (54, 1), (55, 1), (56, 2), (57, 2), (58, 2), (59, 1), (60, 6), (61, 12), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 2), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 12), (75, 1), (76, 1), (77, 4), (78, 1), (79, 1), (80, 2), (81, 2), (82, 4), (83, 1), (84, 1), (85, 2), (86, 1), (87, 3), (88, 2), (89, 2), (90, 1), (91, 3), (92, 2), (93, 1), (94, 1), (95, 1), (96, 4), (97, 1), (98, 1), (99, 3), (100, 1), (101, 11), (102, 2), (103, 1), (104, 1), (105, 1), (106, 5), (107, 13), (108, 3), (109, 4), 

In [16]:
# Build LDA model
# Training settings are gathered in one place so they are easy to tune
lda_params = dict(
    num_topics=10,
    random_state=100,  # fixed seed for the model
    chunksize=100,
    passes=10,
    per_word_topics=True,
)
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, **lda_params)

In [17]:
from pprint import pprint
# Print the keywords of the topics
topic_keywords = lda_model.print_topics()
pprint(topic_keywords)
# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.023*"view" + 0.019*"classifier" + 0.018*"set" + 0.016*"learn" + '
  '0.014*"example" + 0.013*"use" + 0.013*"training" + 0.012*"xv" + '
  '0.012*"multi_view" + 0.010*"class"'),
 (1,
  '0.030*"model" + 0.020*"structure" + 0.018*"permutation" + 0.016*"rim" + '
  '0.009*"algorithm" + 0.009*"datum" + 0.008*"element" + 0.008*"gmm" + '
  '0.008*"set" + 0.008*"parameter"'),
 (2,
  '0.031*"model" + 0.031*"individual" + 0.017*"use" + 0.015*"prediction" + '
  '0.012*"disease" + 0.012*"trajectory" + 0.011*"time" + 0.011*"subtype" + '
  '0.009*"datum" + 0.008*"zi"'),
 (3,
  '0.001*"model" + 0.001*"use" + 0.001*"datum" + 0.001*"set" + '
  '0.001*"individual" + 0.000*"learn" + 0.000*"show" + 0.000*"kernel" + '
  '0.000*"time" + 0.000*"feature"'),
 (4,
  '0.015*"algorithm" + 0.014*"individual" + 0.012*"sample" + 0.012*"time" + '
  '0.012*"search" + 0.011*"tree" + 0.010*"model" + 0.009*"dataset" + '
  '0.009*"approximate" + 0.008*"distance"'),
 (5,
  '0.013*"model" + 0.012*"matrix" + 0.010*"c

In [18]:
from gensim.models import CoherenceModel
# Compute Coherence Score ('c_v' measure) for the trained LDA model
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.44753326934014004
