---
# NLP Analysis of Scientific Abstracts

####  US National Library of Medicine National Institutes of Health 
#### The National Center for Biotechnology Information 
#### PubMed

---
#### Introduction  
The PubMed database is managed by the National Library of Medicine (National Institutes of Health). The database contains citations to over 29 million biomedical papers and books and is typically the first resource for biological, biomedical and health research.   
The aim of this exercise is to explore a few techniques for modeling topics and to compare those techniques using coherence measures.  

In [1]:
# Import standard tools
import numpy as np
import pandas as pd
import glob
from pprint import pprint
import re

In [22]:
# Import for NLP tools
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser

# Filesystem path to the Mallet launcher; it is handed to gensim's LdaMallet
# wrapper when the Mallet topic model is built further down.
mallet_path = '/usr/share/Mallet/bin/mallet'

# spaCy for lemmatization
import spacy

# Import NLTK for stop words
from nltk.corpus import stopwords

In [3]:
# Import visualization tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Import train / test
from sklearn.model_selection import train_test_split

In [5]:
# Import custom module to get abstracts
from lib.get_abstracts import get_abstracts

In [6]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

---
#### Get Abstracts   
The get_abstracts function passes a search term to the PubMed database through an API call. A CSV file of abstracts is returned.

In [7]:
# #Get abstracts and create csv of abstracts
# # API key obtained from NCBI

# terms = ['gene therapy', 'chemotherapy', 'infection control', \
#          'precision medicine', 'orthopedics', 'allograph', 'nosocomial', \
#          'pediatric', 'geriatrics', 'imaging']

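# # NOTE: never commit real credentials. Supply your own e-mail address and
# # NCBI API key (e.g. from environment variables or an untracked config file).
# for i in terms:
#     get_abstracts(i, 100, 'YOUR_EMAIL_ADDRESS',
#                   'YOUR_NCBI_API_KEY')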


---
#### Stop words   
In addition to the standard NLTK stop word list, words that were found not to add to the topic model (because they may be common to all medical topics) are added to the list.

In [8]:
# Corpus-specific words that carry no topical signal in medical abstracts
other_words = ['from', 'subject', 'patient', 'patients', 'group', 'sub', 'sup']
# NLTK's English stop words followed by the extra words above
stop_words = stopwords.words('english') + other_words

In [9]:
# Build one dataframe from every csv file in ./data.
# The files carry no header row, so the column names are supplied here.
# header=None is the supported way to say "no header row"; the legacy
# header=-1 spelling raises a ValueError on current pandas releases.
# The paths are sorted so the row order does not depend on the order in
# which the filesystem happens to list the files.
df = pd.concat([pd.read_csv(f, header=None, names=['pmid', 'term',
                        'abstract']) for f in sorted(glob.glob('./data/*.csv'))])
# Drop rows with any missing field (e.g. records without an abstract)
df = df.dropna()

In [10]:
# Hold out part of the abstracts as a test set. A fixed seed keeps the split,
# and therefore every model trained on it, reproducible between runs (the same
# seed value is used for the LDA model further down).
df_train, df_test = train_test_split(df, random_state=100)

In [11]:
# Training abstracts as a plain Python list
data = df_train['abstract'].tolist()

In [12]:
# Tokenize sentences to words
def sent_to_words(sentences):
    """Lazily tokenize each text with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of texts; every item is passed through ``str()``
            before tokenizing, so non-string items are tolerated.

    Yields:
        The token list produced by ``simple_preprocess(..., deacc=True)`` for
        each input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for text in sentences:
        yield tokenize(str(text), deacc=True)

# Materialize the generator: one token list per training abstract
sent_words = [*sent_to_words(data)]


In [13]:
# Remove stop words.
# Every document in sent_words is already a list of tokens, so filter those
# tokens directly instead of converting the list back to a string and
# tokenizing it a second time. A set makes each membership test O(1) rather
# than a scan of the whole stop-word list.
stop_word_set = set(stop_words)
data_stop = [[word for word in doc if word not in stop_word_set]
             for doc in sent_words]


# Verify that the stop words are removed.
# a = [item for sublist in data_stop for item in sublist]
# a.count('patients')

In [97]:
# Phrase detection - min_count and threshold are hyperparameters.
# Bigram model first, together with its frozen Phraser counterpart ...
bigram = Phrases(data_stop, min_count=5, threshold=50)
bigram_mod = Phraser(bigram)

# ... then the trigram model, trained on the bigram-transformed documents
# with the default Phrases settings.
trigram = Phrases(bigram[data_stop])
trigram_mod = Phraser(trigram)

In [67]:
# Apply the bigram phraser to every stop-word-filtered document
data_words_bigrams = [bigram_mod[tokens] for tokens in data_stop]

In [98]:
# Apply the bigram phraser and then the trigram phraser to every document
data_words_trigrams = [
    trigram_mod[bigram_mod[tokens]] for tokens in data_stop
]
# The same two-step transformation expressed through the Phrases models
trigram_words = trigram[bigram[data_stop]]

In [106]:
# for phrase, score in trigram.export_phrases(bigram[data_stop]):
#     print(phrase)
# Display the object: it is a lazy gensim TransformedCorpus, so the notebook
# shows only its repr; iterate over it to see the actual token lists.
trigram_words

<gensim.interfaces.TransformedCorpus at 0x7f1b369587f0>

In [None]:
# spaCy English pipeline, loaded by its package name. The 'en' shortcut link
# only exists in spaCy 2.x and fails on spaCy 3+, whereas the package name
# works on both.
# Install the model once with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [None]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``.

    Args:
        texts: Iterable of documents, each an iterable of token strings.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between
            calls; a list is still accepted.

    Returns:
        A list with one list of lemma strings per input document, in input
        order. Tokens whose ``pos_`` is not in ``allowed_postags`` are dropped.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent))
         if token.pos_ in allowed_postags]
        for sent in texts
    ]

In [None]:
# Lemmatize the phrase-joined documents, keeping nouns, adjectives, verbs
# and adverbs only
data_lemmatized = lemmatization(
    data_words_trigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)


In [None]:
# a = [item for sublist in data_lemmatized for item in sublist]
# a.count('patients')

In [None]:
# Dictionary built from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from
texts = data_lemmatized

# Bag-of-words corpus: doc2bow is applied to every document in turn
corpus = list(map(id2word.doc2bow, texts))


In [None]:
# Train gensim's own LDA implementation with 10 topics and a fixed seed
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Train a second 10-topic model on the same corpus through the Mallet wrapper
# NOTE(review): gensim.models.wrappers is not shipped with gensim 4.x -
# confirm the installed gensim version before running this statement.
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
)

In [None]:
# Pretty-print the topics reported by the Mallet model
pprint(ldamallet.print_topics())

In [None]:
# Apply the gensim LDA model to the training corpus
# (presumably yields per-document topic information - verify before use)
doc_lda = lda_model[corpus]

In [None]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()
# Prepare the visualization of the gensim LDA model over the training corpus
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Last expression of the cell, so the notebook displays it
vis

In [None]:
# Compute Perplexity
# gensim's log_perplexity() returns the per-word likelihood bound, not the
# perplexity itself. For the bound, higher (closer to zero) is better; the
# perplexity is 2 ** (-bound), and for that value lower is better. Both are
# printed so that neither is mistaken for the other.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score (c_npmi measure) on the lemmatized training texts
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_npmi')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)