# NLP Analysis of Scientific Abstracts
---
####  US National Library of Medicine National Institutes of Health 
#### The National Center for Biotechnology Information 
#### PubMed

In [1]:
# Import custom module for calling abstracts from a key word or phrase
from lib.get_abstracts import get_abstracts

In [2]:
# Import standard analysis tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Import tools for NLP 
import scipy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim

In [4]:
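# Get abstracts and create csv of abstracts
# Requires an NCBI account e-mail and API key. Read them from the environment
# (e.g. NCBI_EMAIL / NCBI_API_KEY) instead of hardcoding credentials here.
#  get_abstracts('gene therapy', 1000, os.environ['NCBI_EMAIL'],
#                os.environ['NCBI_API_KEY'])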

In [12]:
# Process abstracts
# The CSV has no header row, so the column names are supplied explicitly.
# header=None is the supported way to say "no header row"; the previous
# header=-1 spelling is rejected by current pandas releases.
df = pd.read_csv('./data/gene therapy.csv', header=None,
                 names=['pmid', 'term', 'abstract'])
# Drop every row that has a missing value in any column.
df = df.dropna()

# Plain list of abstract strings, one entry per remaining row.
abstracts = list(df.abstract)

# Make tf-idf matrix (English stop words removed by the vectorizer)
vectorizer = TfidfVectorizer(stop_words='english')
abstracts_tfidf = vectorizer.fit_transform(abstracts)

In [26]:
# Experiment: prepare input for guidedlda (the loader calls are still
# commented out below).
import guidedlda
from scipy import sparse
# NOTE(review): `abstracts` is the list of raw abstract strings, so this wraps
# the text itself into a single-row matrix (the recorded output shape is
# (1, 98)) rather than a numeric document-term matrix — presumably a
# document-term matrix was intended; confirm before using `xx`.
xx = sparse.coo_matrix(abstracts)
# X = guidedlda.datasets.load_data(xx)
# vocab = guidedlda.datasets.load_vocab(abstracts)
# word2id = dict((v, idx) for idx, v in enumerate(vocab))

# Display the shape of the matrix built above.
xx.shape

(1, 98)

In [13]:
# Make the word list: the vocabulary learned by the fitted tf-idf vectorizer.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# Number of topics
ntopics=10

# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    """Project a document-by-topic solution back onto the vocabulary.

    Parameters
    ----------
    tfidf : scipy sparse matrix or numpy array, shape (n_documents, n_words)
        Document-term weights.
    solution : numpy array, shape (n_documents, n_topics)
        Per-document topic/component scores from a fitted model.
    wordlist : sequence of length n_words
        Labels used as the index of the returned frame.

    Returns
    -------
    pandas.DataFrame, shape (n_words, n_topics)
        Loading of each word on each topic/component.
    """
    # Loading scores for each word on each topic/component.
    # Use an explicit matrix product: `*` only means matrix multiplication for
    # the scipy.sparse matrix class and is element-wise for plain arrays.
    words_by_topic = tfidf.T.dot(solution)

    # Linking the loadings to the words in an easy-to-read way.
    components = pd.DataFrame(words_by_topic, index=wordlist)

    return components
# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    """Return the highest-loading words of every topic as "word loading" strings.

    Parameters
    ----------
    components : pandas.DataFrame, shape (n_words, n_topics)
        Word loadings, indexed by word, one column per topic.
    n_top_words : int
        Maximum number of words to keep per topic.

    Returns
    -------
    pandas.Series
        Object-dtype Series of strings such as ``"gene 0.42"``. The index holds
        the topic's column position, repeated once per word kept, with words
        in descending order of loading inside each topic.
    """
    topic_ids = []
    labels = []
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        sortedwords = components.iloc[:, column].sort_values(ascending=False)
        # Choose the N highest loadings (fewer if the vocabulary is smaller).
        chosen = sortedwords.iloc[:n_top_words]
        # Combine word and rounded loading into a string. The values are
        # collected positionally: assigning a word-indexed Series into the
        # topic-indexed result would align on labels and produce NaN.
        scores = chosen.round(2).map(str)
        for word, score in zip(chosen.index, scores):
            labels.append("{} {}".format(word, score))
        topic_ids.extend([column] * len(chosen))
    # Explicit object dtype: the result only ever holds strings.
    return pd.Series(labels, index=topic_ids, dtype=object)

# Number of words to look at for each topic (passed to top_words as its
# n_top_words argument in the model cells).
n_top_words = 10

In [14]:
# Latent semantic analysis: truncated SVD of the tf-idf matrix followed by
# normalisation of each document's component vector.
# random_state is fixed because the SVD solver is randomized by default;
# without it the LSA topics change between runs, unlike the seeded LDA and
# NMF models.
svd = TruncatedSVD(n_components=ntopics, random_state=0)
lsa = make_pipeline(svd, Normalizer(copy=False))
abstracts_lsa = lsa.fit_transform(abstracts_tfidf)

# Word loadings per LSA component.
components_lsa = word_topic(abstracts_tfidf, abstracts_lsa, terms)

# Table that collects the top words of each method, one column per method.
topwords = pd.DataFrame()
topwords['LSA'] = top_words(components_lsa, n_top_words)

In [8]:
# NOTE(review): LDA is formulated for term counts; it is fitted on tf-idf
# weights here, exactly as before — consider a count matrix instead.
lda = LDA(n_components=ntopics, # Number of topics; replaces `n_topics`, which scikit-learn removed in 0.21
          doc_topic_prior=None, # None -> library default prior (1 / n_components)
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0 # Fixed seed so the topics are reproducible
         )

# Document-by-topic weights for every abstract.
abstracts_lda = lda.fit_transform(abstracts_tfidf)

# Word loadings per LDA topic.
components_lda = word_topic(abstracts_tfidf, abstracts_lda, terms)

topwords['LDA'] = top_words(components_lda, n_top_words)



In [19]:
# Inspect the values stored in the sparse tf-idf matrix (its `.data` array).
abstracts_tfidf.data

array([0.15816771, 0.15816771, 0.15816771, ..., 0.1973093 , 0.1973093 ,
       0.1973093 ])

In [9]:
# Non-negative matrix factorisation of the tf-idf matrix.
# The former `alpha=0.0` argument is omitted: zero regularisation is the
# library default, and the `alpha` keyword no longer exists in
# scikit-learn >= 1.2, so passing it would raise a TypeError there.
nmf = NMF(init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics,
          random_state=0, # Fixed seed so the factorisation is reproducible
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
# Document-by-component weights for every abstract.
abstracts_nmf = nmf.fit_transform(abstracts_tfidf)

# Word loadings per NMF component.
components_nmf = word_topic(abstracts_tfidf, abstracts_nmf, terms)

topwords['NNMF'] = top_words(components_nmf, n_top_words)

In [10]:
# for topic in range(ntopics):
#     print('Topic {}:'.format(topic))
#     print(topwords.loc[topic])

In [11]:
# Interactive topic browser for the fitted scikit-learn LDA model.
import pyLDAvis.sklearn

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# NOTE(review): R presumably caps the number of terms listed per topic and
# mds='tsne' selects the 2-D layout method — confirm against pyLDAvis docs.
panel = pyLDAvis.sklearn.prepare(
    lda,
    abstracts_tfidf,
    vectorizer,
    R=10,
    mds='tsne',
)
panel