# NLP Analysis of Scientific Abstracts
---
####  US National Library of Medicine National Institutes of Health 
#### The National Center for Biotechnology Information 
#### PubMed

In [1]:
# Import standard tools
import numpy as np
import pandas as pd
import glob
import re

In [2]:
# Import for NLP tools
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Filesystem path to the MALLET executable (nothing is imported here);
# it is handed to gensim's LdaMallet wrapper when that model is built.
mallet_path = '/usr/share/Mallet/bin/mallet'

# spaCy for lemmatization
import spacy

# Import NLTK for stop words
from nltk.corpus import stopwords

In [3]:
# Import visualization tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Import train / test
from sklearn.model_selection import train_test_split

In [5]:
# Import custom module to get abstracts
from lib.get_abstracts import get_abstracts

In [6]:
# Keep notebook output readable: drop every DeprecationWarning raised from here on
import warnings

warnings.filterwarnings(
    action="ignore",
    category=DeprecationWarning,
)

In [7]:
# Get abstracts and create a CSV of abstracts for each search term.
# The contact email and the NCBI API key are read from the environment so
# that credentials are never stored in the notebook; any key that was
# previously committed here should be revoked.
# import os
# NCBI_EMAIL = os.environ['NCBI_EMAIL']
# NCBI_API_KEY = os.environ['NCBI_API_KEY']  # API key obtained from NCBI

# get_abstracts('gene therapy', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('chemotherapy', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('infection control', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('precision medicine', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('orthopedics', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('allograph', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('nosocomial', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('pediatric', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('geriatrics', 100, NCBI_EMAIL, NCBI_API_KEY)

# get_abstracts('imaging', 100, NCBI_EMAIL, NCBI_API_KEY)

In [8]:
# Corpus-specific words to filter out in addition to NLTK's English stop words
other_words = ['from', 'subject', 'patient', 'group', 'sub', 'sup']

# Final stop-word list: NLTK's English list followed by the extras above
stop_words = stopwords.words('english') + other_words

In [9]:
# Import Dataset
# Every CSV under ./data is read without a header row and given the column
# names below. header=None is the supported spelling of "no header row";
# current pandas rejects negative header values such as -1.
# Paths are sorted because glob returns them in arbitrary order, which would
# otherwise make the row order differ between machines and runs.
csv_paths = sorted(glob.glob('./data/*.csv'))
if not csv_paths:
    # Fail with an explicit message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("No CSV files found matching './data/*.csv'")

df = pd.concat([pd.read_csv(path, header=None, names=['pmid', 'term', 'abstract'])
                for path in csv_paths])

# Discard rows with any missing field
df = df.dropna()

In [10]:
# Split into train and test frames. The seed is fixed (same value as the LDA
# model's random_state) so a Restart & Run All reproduces the same split.
df_train, df_test = train_test_split(df, random_state=100)

In [11]:
# Pull the training abstracts out of the frame as a plain Python list
data = df_train["abstract"].tolist()

In [12]:
# Tokenize sentences to words
def sent_to_words(sentences):
    """Lazily turn each item of ``sentences`` into a list of word tokens.

    Each item is coerced with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. This is a
    generator: one token list is yielded per input item, in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Materialize the generator: one token list per training abstract
data_words = [tokens for tokens in sent_to_words(data)]

In [13]:
# Phrase detection models - threshold is a hyperparameter.
# Bigrams are learned from the raw token lists; the frozen Phraser is the
# lightweight object used later to transform documents.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=1)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigrams are learned from the corpus after bigram merging has been applied
trigram = gensim.models.Phrases(bigram[data_words], threshold=1)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [25]:
# Remove stop words
# data_words already holds lists of preprocessed tokens, so the tokens are
# filtered directly rather than turning each list into its str() repr and
# tokenizing that text a second time. A set gives constant-time membership
# checks instead of scanning the stop-word list for every token.
stop_word_set = set(stop_words)
data_words_stops = [[word for word in doc if word not in stop_word_set]
                    for doc in data_words]


In [15]:
# Apply the frozen bigram model to every stop-word-filtered document
data_words_bigrams = [
    bigram_mod[tokens]
    for tokens in data_words_stops
]

In [16]:
# Apply the bigram model, then the trigram model, to every filtered document
data_words_trigrams = [
    trigram_mod[bigram_doc]
    for bigram_doc in (bigram_mod[tokens] for tokens in data_words_stops)
]

In [17]:
# spaCy English pipeline.
# The bare 'en' shortcut link is not available in spaCy 3+, so the full
# package name is tried first; installations that only have the legacy
# shortcut link fall back to it.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')

In [18]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document in ``texts`` (a sequence of token strings) is joined with
    single spaces and run through the module-level spaCy pipeline ``nlp``.
    For every resulting token whose ``pos_`` is in ``allowed_postags`` the
    ``lemma_`` is kept.

    Returns a list containing one list of lemma strings per input document.
    """
    def _lemmas_of(tokens):
        # Re-assemble the document so spaCy can tag it as running text
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas_of(tokens) for tokens in texts]

In [19]:
# Lemmatize the bigram-merged documents, keeping only these part-of-speech tags
kept_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=kept_postags)

In [20]:
# The lemmatized documents are the texts the model is built from
texts = data_lemmatized

# Dictionary built from all lemmatized documents
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))


In [21]:
# Build LDA model (gensim's own implementation); settings gathered in one place
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

# Second model on the same corpus and dictionary through the MALLET wrapper
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
)

In [22]:
# Index the trained gensim LDA model with the bag-of-words training corpus
doc_lda = lda_model[corpus]

In [23]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Prepare the visualization for the gensim LDA model (not the MALLET one)
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)

# Bare last expression so the notebook displays it
vis

In [24]:
# Compute Perplexity
# Note: log_perplexity returns gensim's per-word likelihood bound (a negative
# number; perplexity itself is 2 ** (-bound)). It is evaluated here on the
# training corpus, so it is not a held-out score.
perplexity_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', perplexity_bound)

# Compute Coherence Score ('c_v' measure over the lemmatized training texts)
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.353122123495774

Coherence Score:  0.3527643155174926
