In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Import all the necessary libraries.

In [None]:
#import packages
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Import nltk and build two lookup sets from its corpora:
#   stop_words - English stop words, filtered out by remove_stopwords()
#   words      - nltk's English word list, used by remove_non_english()
# Sets are used so that the per-token membership tests are fast.
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = set(nltk.corpus.words.words())

# Gensim packages
import gensim
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Configure logging so that only messages of level ERROR and above are shown
import logging
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Silence deprecation warnings raised by the libraries
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

Let's load the data file and have a look at its structure.

In [None]:
# Read the NIPS papers table and preview its first five rows
nips_papers = pd.read_csv(filepath_or_buffer='../input/nips-papers-1987-2019-updated/papers.csv')
nips_papers.head()

We want to see how many papers are in the data...

In [None]:
# (number of rows, number of columns) of the papers table
nips_papers.shape

and how the documents look.

In [None]:
# Collect the full text of every paper into a plain Python list and show one sample
text = nips_papers['full_text'].values.tolist()
text[1]

Cleaning the documents is necessary, as they contain a lot of symbols, mathematical notation, abbreviations and other words that will not give any useful information to the model.

In [None]:
# Clean the text with gensim's simple_preprocess(): it lowercases each document and
# splits it into word tokens, which drops punctuation and other stray characters
def sent_to_words(sentences):
    """Lazily yield one list of tokens per input document.

    Every item is passed through str() first, so non-string entries are
    tokenized from their string form.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_doc in sentences:
        yield tokenize(str(raw_doc), deacc=True)  # deacc=True strips accent marks

# Materialize the token lists of all papers
text_words = [tokens for tokens in sent_to_words(text)]

Bigrams are a good way to make the terms more compact: word pairs that frequently occur together are merged into a single term, which gives meaning to words that can't stand on their own.

In [None]:
# Learn which adjacent word pairs should be merged into a single term.
# Pairs seen fewer than 5 times are ignored; a higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(sentences=text_words, threshold=100, min_count=5)

# Wrap the trained model in a Phraser, a faster way to apply the bigrams to a document
bigram_mod = gensim.models.phrases.Phraser(bigram)

Let's define the final text cleaning functions to use...

In [None]:
# Define functions for stopwords, bigrams, lemmatization and remove non english words
def remove_stopwords(texts):
    """Re-tokenize every document and drop the tokens found in ``stop_words``.

    Returns a list with one list of kept tokens per input document.
    """
    cleaned = []
    for doc in texts:
        kept = []
        # str(doc) lets both raw strings and token lists go through
        # simple_preprocess, which splits the text back into tokens.
        for word in simple_preprocess(str(doc)):
            if word not in stop_words:
                kept.append(word)
        cleaned.append(kept)
    return cleaned

def make_bigrams(texts):
    """Apply the trained ``bigram_mod`` phraser to each tokenized document."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'VERB', 'ADV', 'ADJ')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of token lists, one per document
    allowed_postags : collection of coarse part-of-speech tags to keep
        (see https://spacy.io/api/annotation). The default is an immutable
        tuple so it cannot be modified by accident between calls.

    Returns
    -------
    list of lists with the lemmas of the kept tokens, one list per document

    Relies on the notebook-level ``nlp`` spaCy pipeline being loaded before
    the function is called.
    """
    texts_out = []
    for sent in texts:
        # spaCy works on raw text, so the tokens are joined back into a string
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def remove_non_english(texts):
    """Keep only the tokens whose lowercase form is in the ``words`` vocabulary."""
    filtered = []
    for doc in texts:
        filtered.append([token for token in doc if token.lower() in words])
    return filtered

and run them!

In [None]:
%%time

# Remove Stop Words
data_words_nostops = remove_stopwords(text_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'VERB', 'ADJ'])

#Remove non english words
english_text=remove_non_english(data_lemmatized)

# Form Bigrams
data_words_bigrams = make_bigrams(english_text)

This is the final form of a document inside our corpus. It looks much neater: the strange symbols are gone and only meaningful words remain.

In [None]:
# Show the cleaned token list of the document at index 1
print(data_words_bigrams[1])

We need to have an idea of how many unique words are included in the corpus and how frequent each one is.

In [None]:
# Build the term-frequency dictionary: term -> number of occurrences in the whole corpus
freq_dict = {}
# The loop variable is deliberately not called `text`: a top-level loop would
# rebind the notebook-level `text` list that holds the raw papers.
for doc_tokens in data_words_bigrams:
    for word in doc_tokens:
        freq_dict[word] = freq_dict.get(word, 0) + 1

print("There are",len(freq_dict),"unique words used in the whole set of papers")

# Print every term with its frequency, least frequent first
for key, value in sorted(freq_dict.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

We decide to remove the least frequent terms — those that occur 25 times or fewer in the whole corpus — and keep about 25% of the total terms. That share should retain the most important information.

In [None]:
# Filter out rare terms using the corpus-wide counts stored in freq_dict
def remove_low_freq_terms(texts, f):
    """Return the documents with every term that occurs ``f`` times or fewer removed."""
    kept_docs = []
    for doc in texts:
        kept_docs.append([term for term in doc if freq_dict[term] > f])
    return kept_docs

# Keep only the terms that occur more than 25 times in the corpus
texts_final = remove_low_freq_terms(texts=data_words_bigrams, f=25)

The next step is to finalize the corpus, build the dictionary of the corpus and convert it to Bag Of Words format.

In [None]:
%%time

# Create Corpus
texts=texts_final

# Create Dictionary
id2word = Dictionary(texts)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

Now we're ready to implement the LDA model through a function that will test the performance of the model for different numbers of topics.

In [None]:
def compute_coherence_values(dictionary, corpus, texts, topic_count):
    """
    Train one LDA model per requested number of topics and score each of them.

    Parameters:
    ----------
    dictionary : Gensim dictionary, used both to train the models and to compute coherence
    corpus : Gensim corpus
    texts : List of input texts
    topic_count : Iterable with the numbers of topics to try

    Returns:
    -------
    model_list : List of LDA topic models, in the order given by topic_count
    perplexity_values : Value of log_perplexity(corpus) for each model
    coherence_values : c_v coherence value for each model
    """
    perplexity_values = []
    coherence_values = []
    model_list = []
    for num_topics in topic_count:
        # Train with the dictionary that was passed in, not the notebook-level
        # id2word, so the function does not depend on hidden global state.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                num_topics=num_topics,
                                                id2word=dictionary,
                                                chunksize=300,
                                                random_state=100,
                                                update_every=1,
                                                alpha='auto',
                                                eta='auto',
                                                passes=40,
                                                per_word_topics=True)
        model_list.append(model)

        # log_perplexity returns the per-word likelihood bound, not the
        # perplexity itself: a higher value (closer to zero) means a better fit.
        perplexity_score = model.log_perplexity(corpus)
        perplexity_values.append(perplexity_score)

        # c_v topic coherence: the higher the better
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = coherencemodel.get_coherence()
        coherence_values.append(coherence_score)

        print('num_topics:', num_topics, 'Perplexity:', perplexity_score, 'Coherence:', coherence_score )

    return model_list, perplexity_values, coherence_values

In [None]:
%%time

model_list, perplexity_values, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, topic_count = [10,16,20])

The model with the highest scores is the one with 20 topics.
These are the top 10 keywords for each of the 20 topics.

In [None]:
# Topic distribution of every document under the last trained model
doc_lda = model_list[-1][corpus]
# Print the top keywords of each topic of that model
pprint(model_list[-1].print_topics())

Finally, we visualize the topics using the pyLDAvis tool.

In [None]:
# Render the interactive pyLDAvis view of the last trained model inside the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=model_list[-1],
    corpus=corpus,
    dictionary=id2word,
)
vis