**the first solution**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Read Data
Import packages needed:

In [2]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
 df=pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Pubmed5k.xlsx")

In [4]:
df.head(10)

Unnamed: 0,ArticleID,Title,Abstract
0,34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...
1,34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...
2,34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...
3,34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...
4,34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...
5,34153979,Testing the feasibility and acceptability of a...,"Despite growing numbers in the USA, immigrant ..."
6,34153980,Lupus nephritis and ANCA-associated vasculitis...,Historically the treatment of lupus nephritis ...
7,34153982,Precision medicine in diabetic nephropathy and...,Progressive chronic kidney disease (CKD) in in...
8,34153983,Precision medicine in immunoglobulin A nephrop...,Immunoglobulin A nephropathy (IgAN) is the mos...
9,34153984,Precision medicine in transplantation and hemo...,"In kidney transplantation, precision medicine ..."


# Data Cleaning

In [None]:
# Convert to list
data = df.Abstract.values.tolist()
# Remove Emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Remove new line characters
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data = [re.sub(r"\'", "", sent) for sent in data]
pprint(data[:1])

['Coordination variability (CV) is commonly analyzed to understand dynamical '
 'qualities of human locomotion. The purpose of this study was to develop '
 'guidelines for the number of trials required to inform the calculation of a '
 'stable mean lower limb CV during overground locomotion. Three-dimensional '
 'lower limb kinematics were captured for 10 recreational runners performing '
 '20 trials each of preferred and fixed speed walking and running. Stance '
 'phase CV was calculated for 9 segment and joint couplings using a modified '
 'vector coding technique. The number of trials required to achieve a CV mean '
 'within 10% of 20 strides average was determined for each coupling and '
 'individual. The statistical outputs of mode (walking vs running) and speed '
 '(preferred vs fixed) were compared when informed by differing numbers of '
 'trials. A minimum of 11 trials were required for stable mean stance phase '
 'CV. With fewer than 11 trials, CV was underestimated and led to

## **Tokenize**

removing punctuations and unnecessary characters altogether.

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['coordination', 'variability', 'cv', 'is', 'commonly', 'analyzed', 'to', 'understand', 'dynamical', 'qualities', 'of', 'human', 'locomotion', 'the', 'purpose', 'of', 'this', 'study', 'was', 'to', 'develop', 'guidelines', 'for', 'the', 'number', 'of', 'trials', 'required', 'to', 'inform', 'the', 'calculation', 'of', 'stable', 'mean', 'lower', 'limb', 'cv', 'during', 'overground', 'locomotion', 'three', 'dimensional', 'lower', 'limb', 'kinematics', 'were', 'captured', 'for', 'recreational', 'runners', 'performing', 'trials', 'each', 'of', 'preferred', 'and', 'fixed', 'speed', 'walking', 'and', 'running', 'stance', 'phase', 'cv', 'was', 'calculated', 'for', 'segment', 'and', 'joint', 'couplings', 'using', 'modified', 'vector', 'coding', 'technique', 'the', 'number', 'of', 'trials', 'required', 'to', 'achieve', 'cv', 'mean', 'within', 'of', 'strides', 'average', 'was', 'determined', 'for', 'each', 'coupling', 'and', 'individual', 'the', 'statistical', 'outputs', 'of', 'mode', 'walking', 

## **Stemming**

Stemming is the process of reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words known as a lemma.

The advantage of this is, we get to reduce the total number of unique words in the dictionary. As a result, the number of columns in the document-word matrix (created by CountVectorizer in the next step) will be denser with lesser columns. You can expect better topics to be generated in the end.

In [None]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

In [None]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
# Initialize spacy ‘en’ model, keeping only tagger component (for efficiency)
# Run in terminal: python -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN','VERB']) #select noun and verb
print(data_lemmatized[:2])

['coordination analyze understand quality locomotion purpose study develop guideline number trial require inform calculation mean limb locomotion limb kinematic capture runner perform trial fix speed walk run stance phase cv calculate segment coupling use modify vector coding technique number trial require achieve cv mean stride determine coupling individual output mode walk run speed prefer fix compare inform differ number trial minimum trial require stance phase cv trial underestimate lead oversight difference mode speed overground locomotion cv research population use vector coding approach should use trial researcher should consequence number trial study finding', 'scenario knee valgus alteration knee lead increase risk injury weakness hip musculature hip abduction extension hext rotation may contribute increase dkv leg landing task focus question decrease hip strength associate increase dkv leg land task athlete summary finding study include control trial cohort study case control

## **Create the Document-Word matrix**
The LDA topic model algorithm requires a document word matrix as the main input.

In [None]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,
                             stop_words='english',             
                             lowercase=True,                   
                             token_pattern='[a-zA-Z0-9]{3,}')
data_vectorized= vectorizer.fit_transform(data_lemmatized)

# **Build LDA model with sklearn**
Everything is ready to build a Latent Dirichlet Allocation (LDA) model.  initialise one and call fit_transform() to build the LDA model.


In [None]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,               # Number of topics
                                      max_iter=10,               
                                      learning_method='online',   
                                      random_state=100,          
                                      batch_size=128,            
                                      evaluate_every = -1,
                                      n_jobs = -1 )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=100)


# **Diagnose model performance with perplexity and log-likelihood**
A model with higher log-likelihood and lower perplexity (exp(-1. * log-likelihood per word)) is considered to be good.

In [None]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -2532962.209688689
Perplexity:  891.9139147288679
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 20,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


# Use GridSearch to determine the best LDA model.

In [None]:

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1
             , perp_tol=0.1, random_state=None,
             topic_word_prior=None,  verbose=0),
       param_grid={'n_topics': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

GridSearchCV(error_score='raise',
             estimator=LatentDirichletAllocation(learning_method=None,
                                                 n_jobs=1),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_topics': [10, 15, 20, 25, 30]},
             return_train_score='warn')

In [None]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -536916.1699748768
Model Perplexity:  882.4865287858445


# **Dominant topic**
To classify a document as belonging to a particular topic, a logical approach is to see which topic has the highest contribution to that document and assign it. In the table below, I’ve greened out all major topics in a document and assigned the most dominant topic in its own column.

In [None]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ['Doc' + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
 color = 'green' if val > .1 else 'black'
 return 'color: {col}'.format(col=color)
def make_bold(val):
 weight = 700 if val > .1 else 400
 return 'font-weight: {weight}'.format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.0,0.0,0.0,0.0,0.0,0.49,0.0,0.19,0.31,0.0,5
Doc1,0.0,0.51,0.03,0.03,0.0,0.09,0.0,0.0,0.17,0.18,1
Doc2,0.38,0.29,0.03,0.0,0.0,0.0,0.0,0.04,0.25,0.0,0
Doc3,0.17,0.2,0.0,0.0,0.0,0.0,0.0,0.59,0.04,0.0,7
Doc4,0.41,0.08,0.0,0.15,0.0,0.05,0.1,0.0,0.17,0.04,0
Doc5,0.0,0.0,0.0,0.71,0.0,0.0,0.0,0.25,0.03,0.0,3
Doc6,0.0,0.1,0.0,0.0,0.08,0.14,0.25,0.23,0.21,0.0,6
Doc7,0.22,0.58,0.0,0.0,0.08,0.0,0.0,0.0,0.07,0.05,1
Doc8,0.0,0.6,0.0,0.0,0.03,0.04,0.22,0.0,0.1,0.0,1
Doc9,0.0,0.24,0.0,0.0,0.06,0.15,0.31,0.15,0.05,0.04,6


In [None]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head(10)



Unnamed: 0,aberration,ability,ablation,abnormality,absence,absorb,absorption,abstract,abundance,abuse,...,world,worsen,wound,write,year,yeast,yield,youth,zinc,zone
Topic0,0.100003,0.100031,0.1,4.306879,0.100045,0.100115,0.100006,0.100011,12.456439,34.664892,...,0.103175,0.100055,0.100012,0.10002,729.084596,0.100002,0.100026,46.557674,0.100005,0.100876
Topic1,0.100074,0.10008,25.030779,61.536946,42.558398,0.1,0.100016,0.100013,0.100007,0.100005,...,49.455407,22.222549,0.100034,0.100006,726.01716,0.100001,0.100026,0.100004,0.100018,0.100077
Topic2,0.100052,27.703989,0.100002,0.100006,37.610596,0.100118,26.028535,0.100001,0.10009,0.100001,...,31.818891,0.100018,0.101211,0.100005,25.145201,0.202167,48.581538,0.100001,0.103426,81.182856
Topic3,0.1,34.916283,0.100013,0.100013,9.391433,0.100005,0.100067,0.100029,0.100002,0.10004,...,30.07756,22.133074,0.100022,3.676073,187.093249,0.1,4.790737,37.882307,0.100002,0.100028
Topic4,0.100038,34.864609,0.10002,0.10004,24.829374,11.301761,28.22112,0.100001,0.100016,0.10001,...,0.100013,0.100007,21.110793,0.100001,0.100071,0.100017,4.41361,0.100001,0.100041,5.874948
Topic5,0.100088,68.907213,0.100011,0.10003,21.794768,0.100012,0.10006,0.100007,0.102929,0.100009,...,0.117252,0.100026,56.558743,0.100004,17.320097,0.100019,49.190092,0.100012,32.77749,14.984561
Topic6,20.198582,44.603762,0.100003,0.100024,0.100041,0.100027,0.100017,0.100022,41.115925,0.100011,...,16.666652,0.100008,0.100005,0.100003,5.540493,0.100018,0.378256,0.100003,0.100064,0.100016
Topic7,0.100008,0.100033,0.124243,0.100026,3.999338,0.100002,0.100304,0.100013,0.100006,0.100002,...,0.108926,11.773208,9.356205,0.100013,236.792392,0.100009,0.100016,0.100011,0.100002,0.100023
Topic8,0.100022,25.385876,0.100007,0.100014,0.271541,0.100123,0.100066,26.847331,0.100009,0.100015,...,43.712458,0.100031,0.100024,45.632217,12.658556,0.100006,0.100047,0.10005,0.100013,0.100011
Topic9,0.100009,0.100034,0.100002,0.100027,0.100016,0.100171,0.100046,0.100006,45.929767,0.100021,...,0.100031,0.100013,0.10002,0.100001,0.100034,58.988173,52.815776,0.100005,0.100098,0.10002


## **Get the top 15 keywords each topic:**

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords



Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,study,age,use,child,year,associate,factor,effect,level,depression,risk,adolescent,association,analysis,disorder
Topic 1,patient,case,risk,disease,year,study,mortality,infection,diagnosis,include,cause,follow,rate,report,treatment
Topic 2,specie,use,water,study,base,region,distribution,provide,area,surface,result,include,material,property,time
Topic 3,health,study,intervention,care,use,participant,community,service,measure,population,pandemic,experience,quality,report,survey
Topic 4,cell,expression,protein,tumor,tissue,cancer,induce,level,increase,rat,effect,mouse,study,role,mechanism
Topic 5,use,model,method,test,base,result,performance,time,datum,measurement,detection,develop,image,accuracy,network
Topic 6,gene,pathway,student,protein,identify,virus,target,study,interaction,use,sequence,disease,analysis,mechanism,drug
Topic 7,patient,group,treatment,cancer,study,outcome,use,control,compare,trial,fatigue,day,month,therapy,pain
Topic 8,study,review,research,use,include,datum,identify,article,process,management,health,role,provide,base,search
Topic 9,effect,increase,exposure,food,concentration,study,change,activity,temperature,level,growth,decrease,result,plant,control


# 10 topics we infered into the dataframe.

In [None]:
Topics = ["study / child / association","patient / disease / diagnosis"," area / surface / property ","health / intervention / participant","cancer/mechanism", 
          "model / performance / detection", "gene / protein / interaction", "patient / treatment /cancer ", "study/ research / datum /management ", "exposure /concentration / temperature"]
df_topic_keywords["Topics"]=Topics
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Topics
Topic 0,study,age,use,child,year,associate,factor,effect,level,depression,risk,adolescent,association,analysis,disorder,study / child / association
Topic 1,patient,case,risk,disease,year,study,mortality,infection,diagnosis,include,cause,follow,rate,report,treatment,patient / disease / diagnosis
Topic 2,specie,use,water,study,base,region,distribution,provide,area,surface,result,include,material,property,time,area / surface / property
Topic 3,health,study,intervention,care,use,participant,community,service,measure,population,pandemic,experience,quality,report,survey,health / intervention / participant
Topic 4,cell,expression,protein,tumor,tissue,cancer,induce,level,increase,rat,effect,mouse,study,role,mechanism,cancer/mechanism
Topic 5,use,model,method,test,base,result,performance,time,datum,measurement,detection,develop,image,accuracy,network,model / performance / detection
Topic 6,gene,pathway,student,protein,identify,virus,target,study,interaction,use,sequence,disease,analysis,mechanism,drug,gene / protein / interaction
Topic 7,patient,group,treatment,cancer,study,outcome,use,control,compare,trial,fatigue,day,month,therapy,pain,patient / treatment /cancer
Topic 8,study,review,research,use,include,datum,identify,article,process,management,health,role,provide,base,search,study/ research / datum /management
Topic 9,effect,increase,exposure,food,concentration,study,change,activity,temperature,level,growth,decrease,result,plant,control,exposure /concentration / temperature


# Predict Topics using LDA model

In [None]:
# Define function to predict topic for a given text document.
nlp = spacy.load('en', disable=['parser', 'ner'])
def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization
# Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))
# Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Step 3: Vectorize transform
    mytext_4 = vectorizer.transform(mytext_3)
# Step 4: LDA Transform
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
    
    # Step 5: Infer Topic
    infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), -1]
    
    #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
    return infer_topic, topic, topic_probability_scores
# Predict the topic
mytext = ["Very Useful in diabetes age 30. I need control sugar. thanks Good deal"]
infer_topic, topic, prob_scores = predict_topic(text = mytext)
print(topic)
print(infer_topic)

['increase', 'exposure', 'food', 'concentration', 'study', 'change', 'activity', 'temperature', 'level', 'growth', 'decrease', 'result', 'plant']
exposure /concentration / temperature


In [None]:
def apply_predict_topic(text):
 text = [text]
 infer_topic, topic, prob_scores = predict_topic(text = text)
 return(infer_topic)
df["Topic_key_word"]= df['Abstract'].apply(apply_predict_topic)
df.head()

Unnamed: 0,ArticleID,Title,Abstract,Topic_key_word
0,34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...,model / performance / detection
1,34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...,patient / disease / diagnosis
2,34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...,study / child / association
3,34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...,patient / treatment /cancer
4,34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...,study / child / association
