In [1]:
import pandas as pd  #import panda in jupitar notebook
papers = pd.read_csv('papers.csv') # read the dataset 
papers.head() #Number of rows to select.

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [2]:
# Remove the columns
papers = papers.drop(columns=['id', 'title', 'abstract', 
                              'event_type', 'pdf_name', 'year'], axis=1)  #drop the coulm 
# sample only 100 papers
papers = papers.sample(10)
# Print out the first rows of papers
papers.head()

Unnamed: 0,paper_text
1334,Maximum Likelihood and the Information\nBottle...
1299,Half-Lives of EigenFlows for Spectral Clusteri...
5762,Optimal Learning for Multi-pass Stochastic Gra...
948,Sparsity of data representation of optimal ker...
3402,Brain covariance selection: better individual\...


In [3]:

import re # Load the regular expression library

papers['paper_text_processed'] = papers['paper_text'].map(lambda x: re.sub('[,\.!?]', '', x))# Remove punctuation
papers['paper_text_processed'] = papers['paper_text_processed'].map(lambda x: x.lower())
papers['paper_text_processed'].head() # Print out the first rows of papers

1334    maximum likelihood and the information\nbottle...
1299    half-lives of eigenflows for spectral clusteri...
5762    optimal learning for multi-pass stochastic gra...
948     sparsity of data representation of optimal ker...
3402    brain covariance selection: better individual\...
Name: paper_text_processed, dtype: object

In [9]:
import gensim 
from gensim.utils import simple_preprocess #Convert a document into a list of tokens
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data = papers.paper_text_processed.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1][0][:10])



['maximum', 'likelihood', 'and', 'the', 'information', 'bottleneck', 'noam', 'slonim', 'yair', 'weiss']


In [10]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [15]:
# NLTK Stop words
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [16]:
import spacy
# Remove Stop Wordsa
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
print(data_lemmatized[:1])

[['likelihood', 'information_bottleneck', 'noam', 'yair', 'weiss', 'school', 'computer', 'science', 'engineering', 'abstract', 'information_bottleneck', 'ib', 'formulation', 'clustering', 'problem', 'joint_distribution', 'method', 'define', 'partition', 'value', 'likelihood', 'mixture', 'model', 'approach', 'clustering', 'problem', 'paper', 'method', 'mapping', 'ml', 'problem', 'mixture_model', 'mapping', 'problem', 'fact', 'input', 'distribution', 'sample', 'size', 'problem', 'case', 'define', 'log', 'likelihood', 'vice_versa', 'value', 'fixed_point', 'transformation', 'result', 'case', 'algorithm', 'problem', 'solution', 'introduction', 'analysis', 'set', 'object', 'partition', 'score', 'function', 'tishby', 'approach', 'problem', 'approach', 'joint_distribution', 'representation', 'information', 'discussion', 'mutual_information', 'variable', 'representation', 'information', 'mutual_information', 'trade', 'quantity', 'representation', 'representation', 'quality', 'cluster', 'fractio

In [17]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 7), (4, 1), (5, 2), (6, 3), (7, 9), (8, 1), (9, 4), (10, 4), (11, 1), (12, 1), (13, 1), (14, 1), (15, 10), (16, 7), (17, 5), (18, 1), (19, 2), (20, 10), (21, 1), (22, 2), (23, 1), (24, 1), (25, 2), (26, 2), (27, 3), (28, 2), (29, 1), (30, 1), (31, 2), (32, 1), (33, 3), (34, 1), (35, 1), (36, 2), (37, 1), (38, 2), (39, 4), (40, 1), (41, 7), (42, 1), (43, 1), (44, 2), (45, 2), (46, 1), (47, 3), (48, 6), (49, 4), (50, 16), (51, 1), (52, 1), (53, 5), (54, 1), (55, 1), (56, 2), (57, 1), (58, 7), (59, 1), (60, 6), (61, 1), (62, 3), (63, 3), (64, 1), (65, 2), (66, 3), (67, 3), (68, 1), (69, 3), (70, 10), (71, 1), (72, 3), (73, 1), (74, 2), (75, 1), (76, 2), (77, 2), (78, 3), (79, 3), (80, 5), (81, 1), (82, 2), (83, 1), (84, 5), (85, 24), (86, 1), (87, 2), (88, 1), (89, 1), (90, 2), (91, 4), (92, 4), (93, 1), (94, 4), (95, 1), (96, 1), (97, 2), (98, 2), (99, 2), (100, 1), (101, 7), (102, 3), (103, 1), (104, 2), (105, 1), (106, 1), (107, 2), (108, 1), (109, 1), (11

In [19]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [20]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.019*"eigenvector" + 0.019*"probability" + 0.018*"edge" + 0.016*"cluster" '
  '+ 0.015*"matrix" + 0.013*"function" + 0.012*"cost_function" + '
  '0.012*"result" + 0.012*"sensitivity" + 0.011*"number"'),
 (1,
  '0.017*"ctw" + 0.017*"stimulus" + 0.012*"model" + 0.010*"datum" + '
  '0.010*"estimate" + 0.009*"result" + 0.009*"time" + 0.009*"memory" + '
  '0.009*"assumption" + 0.008*"number"'),
 (2,
  '0.001*"ctw" + 0.001*"probability" + 0.001*"datum" + 0.001*"model" + '
  '0.001*"matrix" + 0.001*"number" + 0.001*"eigenvector" + 0.001*"cluster" + '
  '0.001*"edge" + 0.001*"problem"'),
 (3,
  '0.001*"channel" + 0.001*"model" + 0.001*"datum" + 0.001*"probability" + '
  '0.001*"problem" + 0.001*"cell" + 0.001*"assumption" + 0.001*"cluster" + '
  '0.001*"function" + 0.001*"number"'),
 (4,
  '0.021*"channel" + 0.020*"model" + 0.018*"brain" + 0.017*"datum" + '
  '0.014*"community" + 0.014*"graph" + 0.013*"network" + 0.011*"estimation" + '
  '0.011*"structure" + 0.009*"problem"'),
 (5,
  

In [21]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.39210503030051375
