In [1]:
# !pip list | grep gensim
!pip install --upgrade gensim

Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.8.3)


# Assignment

1 LDA: Latent Dirichlet allocation: using sklearn and Gensim

2 NNMF: non negative matrix factorization 

In [224]:
# !pip install nltk
# !pip install spacy
#!pip install gensim
# !pip install pyLDAvis



In [40]:
import pandas as pd
import numpy as np
import os
import sklearn
import sys
np.set_printoptions(threshold=sys.maxsize)
from sklearn.model_selection import train_test_split
import re
from pprint import pprint
from glob import glob
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.nmf import Nmf
# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from pathlib import Path
import glob
from bs4 import BeautifulSoup
import operator

In [3]:
path = "./sample_50/"

In [4]:
def get_abstract(soup):
    if soup.find("abstract") is None:
        return "NaN"
    
    return soup.find('abstract').text

In [5]:
def get_sample_data(path):
    pathlist = Path(path).glob('**/*.xml')
    abstracts = []
    for path in pathlist:
        # because path is object not string
        path_in_str = str(path)
        soup = BeautifulSoup(open('./'+ path_in_str, 'r'))
        abstracts.append(get_abstract(soup))
        # todo: get full articles?

    return np.array(abstracts)


In [6]:
X = get_sample_data(path)

In [7]:
X.shape

(78,)

In [8]:
type(X)

numpy.ndarray

# Assignment1: LDA: Latent Dirichlet allocation

In [9]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# data cleaning 

# Remove Emails
# data = [re.sub('\S*@\S*\s?', '', sent) for each in data]

# Remove new line characters
data = [re.sub('\s+', ' ', each) for each in X]

# # Remove distracting single quotes
# data = [re.sub("\'", "", sent) for each in data]

## Using Sklearn

Preprocessing include:

- tokenization
  - remove stop words

Tokenize the words

In [42]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
cv = CountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english')
df = cv.fit_transform(data)

Apply LDA

In [158]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model1 = LatentDirichletAllocation(n_components = 15, random_state = 42) # n_components = topics
lda_model1.fit(df)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=15, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [None]:
# def get_lda_topics(model, num_topics):
#     word_dict = {};
#     for i in range(num_topics):
#         words = model.show_topic(i, topn = 20)
#         word_dict['Topic #' + '{:02d}'.format(i+1)] = [i[0] for i in words]
#     return pd.DataFrame(word_dict)

# get_lda_topics(lda_model1, 15)

In [192]:
for index, topic in enumerate(lda_model1.components_):
    print(f'Top 15 words for Topic #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

Top 15 words for Topic #0
['national', 'individuals', 'alaska', 'reach', 'use', 'self', 'results', 'participants', 'people', 'study', 'data', 'intervention', 'inuit', 'care', 'health']


Top 15 words for Topic #1
['greenland', 'work', 'design', 'algorithms', 'develop', 'outcomes', 'northern', 'care', 'nunavut', 'research', 'using', 'committee', 'colonoscopy', 'health', 'ethics']


Top 15 words for Topic #2
['obesity', 'level', 'support', 'use', 'related', 'efficacy', 'groups', 'used', 'online', 'data', 'patients', 'care', 'partners', 'based', 'clinical']


Top 15 words for Topic #3
['implementing', 'surveillance', 'areas', 'change', 'year', 'improved', 'early', 'control', 'drinking', 'sources', 'laboratories', 'public', 'health', 'water', 'disease']


Top 15 words for Topic #4
['alaska', 'incidence', 'impact', 'reporting', 'post', 'children', 'case', 'database', 'cases', 'analysis', 'score', 'median', 'pre', 'studies', 'period']


Top 15 words for Topic #5
['studies', 'outcomes', 'inte

## Using Gensim

Preprocessing include:
- tokenization
- remove stop words
- bigram data
- lemmatization


Tokenization: tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.     

In [31]:
def tokenize_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(tokenize_to_words(data))
data_words

[['background',
  'most',
  'studies',
  'have',
  'suggested',
  'that',
  'elevated',
  'body',
  'mass',
  'index',
  'bmi',
  'was',
  'associated',
  'with',
  'the',
  'risk',
  'of',
  'death',
  'from',
  'all',
  'cause',
  'and',
  'from',
  'specific',
  'causes',
  'however',
  'there',
  'was',
  'little',
  'evidence',
  'illustrating',
  'the',
  'effect',
  'of',
  'bmi',
  'on',
  'the',
  'mortality',
  'in',
  'elderly',
  'hypertensive',
  'patients',
  'in',
  'chinese',
  'population',
  'methods',
  'the',
  'information',
  'of',
  'hypertensive',
  'patients',
  'at',
  'baseline',
  'not',
  'less',
  'than',
  'years',
  'were',
  'from',
  'xinzhuang',
  'town',
  'in',
  'minhang',
  'district',
  'of',
  'shanghai',
  'was',
  'extracted',
  'from',
  'the',
  'electronic',
  'health',
  'record',
  'ehr',
  'system',
  'all',
  'study',
  'participants',
  'were',
  'divided',
  'into',
  'eight',
  'categories',
  'of',
  'baseline',
  'bmi',
  'with',
 

In [32]:
data_words[:1]

[['background',
  'most',
  'studies',
  'have',
  'suggested',
  'that',
  'elevated',
  'body',
  'mass',
  'index',
  'bmi',
  'was',
  'associated',
  'with',
  'the',
  'risk',
  'of',
  'death',
  'from',
  'all',
  'cause',
  'and',
  'from',
  'specific',
  'causes',
  'however',
  'there',
  'was',
  'little',
  'evidence',
  'illustrating',
  'the',
  'effect',
  'of',
  'bmi',
  'on',
  'the',
  'mortality',
  'in',
  'elderly',
  'hypertensive',
  'patients',
  'in',
  'chinese',
  'population',
  'methods',
  'the',
  'information',
  'of',
  'hypertensive',
  'patients',
  'at',
  'baseline',
  'not',
  'less',
  'than',
  'years',
  'were',
  'from',
  'xinzhuang',
  'town',
  'in',
  'minhang',
  'district',
  'of',
  'shanghai',
  'was',
  'extracted',
  'from',
  'the',
  'electronic',
  'health',
  'record',
  'ehr',
  'system',
  'all',
  'study',
  'participants',
  'were',
  'divided',
  'into',
  'eight',
  'categories',
  'of',
  'baseline',
  'bmi',
  'with',
 

Creating Bigram and Trigram Models (2/3 words that are frequently appear together)

In [33]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold, fewer phrases to form
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])


['background', 'most', 'studies', 'have', 'suggested', 'that', 'elevated', 'body_mass_index', 'bmi', 'was', 'associated', 'with', 'the', 'risk', 'of', 'death', 'from', 'all', 'cause', 'and', 'from', 'specific', 'causes', 'however', 'there', 'was', 'little', 'evidence', 'illustrating', 'the', 'effect', 'of', 'bmi', 'on', 'the', 'mortality', 'in', 'elderly', 'hypertensive', 'patients', 'in', 'chinese', 'population', 'methods', 'the', 'information', 'of', 'hypertensive', 'patients', 'at', 'baseline', 'not', 'less', 'than', 'years', 'were', 'from', 'xinzhuang', 'town', 'in', 'minhang', 'district', 'of', 'shanghai', 'was', 'extracted', 'from', 'the', 'electronic', 'health', 'record', 'ehr', 'system', 'all', 'study', 'participants', 'were', 'divided', 'into', 'eight', 'categories', 'of', 'baseline', 'bmi', 'with', 'cut', 'points', 'at', 'and', 'kg', 'relative', 'hazard', 'ratio', 'of', 'death', 'from', 'all', 'cause', 'cardiovascular', 'and', 'non', 'cardiovascular', 'cause', 'by', 'baseline

Remove Stopwords, Make Bigrams and Lemmatize


In [34]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# !python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

In [35]:
# Define functions for stopwords, bigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [36]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['study', 'suggest', 'elevated', 'index', 'bmi', 'associate', 'risk', 'death', 'cause', 'specific', 'cause', 'however', 'little', 'evidence', 'illustrate', 'effect', 'bmi', 'mortality', 'elderly', 'hypertensive', 'patient', 'chinese', 'population', 'method', 'information', 'hypertensive', 'patient', 'baseline', 'less', 'year', 'extract', 'electronic', 'health', 'record', 'ehr', 'system', 'study', 'participant', 'divide', 'category', 'baseline', 'cut', 'point', 'relative', 'hazard', 'ratio', 'death', 'cause', 'cardiovascular', 'cardiovascular', 'group', 'calculate', 'standardized', 'sex', 'age', 'smoking', 'drink', 'systolic', 'blood', 'pressure', 'history', 'cardiovascular', 'disorder', 'serum', 'diabete', 'drug', 'treatment', 'result', 'follow', 'median', 'year', 'death', 'occur', 'underweight', 'bmi', 'significantly', 'increase', 'mortality', 'cause', 'mortality', 'mortality', 'cause', 'specific', 'analysis', 'underweight', 'associate', 'significantly', 'disorder', 'result', 'total'

Create the Dictionary and Corpus needed for Topic Modeling

two main inputs to the LDA topic model using GenSim library are the dictionary(id2word) and the corpus.

eg: for the single doc, produce: a mapping of (word_id, word_frequency), (0,1) 1st word in the doc appear once

In [37]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])
print(corpus[1:2])


[[(0, 2), (1, 1), (2, 3), (3, 2), (4, 1), (5, 3), (6, 1), (7, 4), (8, 1), (9, 8), (10, 2), (11, 1), (12, 4), (13, 1), (14, 3), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 3), (32, 1), (33, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 7), (42, 1), (43, 1), (44, 1), (45, 3), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 2), (57, 2), (58, 2), (59, 4), (60, 1), (61, 1), (62, 3), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 3), (69, 2)]]
[[(25, 1), (28, 2), (33, 2), (40, 1), (62, 1), (63, 1), (70, 1), (71, 1), (72, 1), (73, 4), (74, 1), (75, 8), (76, 1), (77, 1), (78, 5), (79, 3), (80, 2), (81, 2), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 2), (91, 1), (92, 1), (93, 1), (94, 4), (95, 1), (96, 1), (97, 9), (98, 1), (99, 2), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (

##### id-word mapping:

In [170]:
id2word[0]

'age'

##### Item-frequency

In [172]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]][:10]

[[('age', 2),
  ('analysis', 1),
  ('associate', 3),
  ('baseline', 2),
  ('blood', 1),
  ('bmi', 3),
  ('calculate', 1),
  ('cardiovascular', 4),
  ('category', 1),
  ('cause', 8),
  ('chinese', 2),
  ('cut', 1),
  ('death', 4),
  ('diabete', 1),
  ('disorder', 3),
  ('divide', 1),
  ('drink', 1),
  ('drug', 1),
  ('effect', 1),
  ('ehr', 1),
  ('elderly', 2),
  ('electronic', 1),
  ('elevated', 1),
  ('evidence', 1),
  ('extract', 1),
  ('follow', 1),
  ('group', 1),
  ('hazard', 1),
  ('health', 1),
  ('history', 1),
  ('however', 1),
  ('hypertensive', 3),
  ('illustrate', 1),
  ('increase', 2),
  ('index', 1),
  ('influence', 1),
  ('information', 1),
  ('less', 1),
  ('little', 1),
  ('median', 1),
  ('method', 1),
  ('mortality', 7),
  ('obesity', 1),
  ('occur', 1),
  ('participant', 1),
  ('patient', 3),
  ('point', 1),
  ('population', 1),
  ('pressure', 1),
  ('ratio', 1),
  ('record', 1),
  ('relative', 1),
  ('result', 2),
  ('reveal', 1),
  ('risk', 1),
  ('serum', 1),
  

Building the Topic Model

In [101]:
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [105]:
def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20)
        word_dict['Topic #' + '{:02d}'.format(i+1)] = [i[0] for i in words]
    return pd.DataFrame(word_dict)

# Print the Keyword in the 15 topics
get_lda_topics(lda_model2, 15)

Unnamed: 0,Topic #01,Topic #02,Topic #03,Topic #04,Topic #05,Topic #06,Topic #07,Topic #08,Topic #09,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14,Topic #15
0,moodmanager,hospital,health,patient,old,health,disease,patient,clinic,referral,care,intervention,case,partner,trial
1,clonotype,patient,screening,study,exercise,disease,live,practice,capacity,study,health,health,report,medium,image
2,clinical,study,inuit,risk,participant,system,dementia,time,organizational,health,continuity,participant,phone,type,abnormal
3,cancer,gwg,term,child,training,care,stage,risk,change,patient,control,information,aht,outbreak,classification
4,use,score,include,care,plan,prep,use,emergency,poag,work,patient,program,mobile,infection,study
5,significantly,strobe,system,contact,activelifestyle,community,health,cohort,eligible,cancer,use,group,adherence,report,normal
6,great,report,colonoscopy,staff,study,provide,result,year,ehr,electronic,result,woman,use,scale,performance
7,outcome,pre,different,family,disorder,work,study,admission,process,participant,age,study,year,disaster,design
8,adherence,use,need,program,people,quality,cause,include,method,recruit,study,reach,time,social,quality
9,level,guideline,care,obesity,balance,information,group,implementation,gene,population,rate,high,database,sub,large


In [104]:
# # Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

Evaluation Metric: Compute Model Perplexity and Coherence Score


In [28]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model2, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.342669600000525

Coherence Score:  0.3601887589954983


Visualize the topics-keywords

In [29]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

# Assignment 2: Non Negative Matrix Facotrization



### Using GenSim

how to find the best number of topics using the cohenrence score?


In [39]:

# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(5, 75 + 1, 5))

# Run the nmf model and calculate the coherence score
# for each number of topics
coherence_scores = []

for num in topic_nums:
    nmf = Nmf(
        corpus=corpus,
        num_topics=num,
        id2word=id2word,
        chunksize=2000,
        passes=5,
        kappa=.1,
        minimum_probability=0.01,
        w_max_iter=300,
        w_stop_condition=0.0001,
        h_max_iter=100,
        h_stop_condition=0.001,
        eval_every=10,
        normalize=True,
        random_state=42
    )
    
    # Run the coherence model to get the score
    cm = CoherenceModel(
        model=nmf,
        texts=texts,
        dictionary=id2word,
        coherence='c_v'
    )
    
    coherence_scores.append(round(cm.get_coherence(), 5))


# Get the number of topics with the highest coherence score
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = sorted(scores, key=operator.itemgetter(1), reverse=True)[0][0]

print(best_num_topics)

65


Using Sklearn
- tokenization
  - remove stop words


In [43]:
#vectorizer = CountVectorizer(analyzer='word' max_features=5000)
#vectorizer = CountVectorizer(analyzer='word', stop_words = 'english')
vectorizer = CountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english')
x_counts = vectorizer.fit_transform(data)

In [44]:
transformer = TfidfTransformer(smooth_idf=False);
x_tfidf = transformer.fit_transform(x_counts);

In [45]:
from sklearn.preprocessing import normalize
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [46]:
#obtain a NMF model.
num_topics=best_num_topics
from sklearn.decomposition import NMF
nmf_model = NMF(n_components=num_topics, init='nndsvd')

In [47]:
#fit the model
nmf_model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=65, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [48]:
def get_nmf_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {};
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-20 - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic #' + '{:02d}'.format(i+1)] = words;
    
    return pd.DataFrame(word_dict);

In [49]:
get_nmf_topics(nmf_model, 15)

Unnamed: 0,Topic #01,Topic #02,Topic #03,Topic #04,Topic #05,Topic #06,Topic #07,Topic #08,Topic #09,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14,Topic #15,Topic #16,Topic #17,Topic #18,Topic #19,Topic #20,Topic #21,Topic #22,Topic #23,Topic #24,Topic #25,Topic #26,Topic #27,Topic #28,Topic #29,Topic #30,Topic #31,Topic #32,Topic #33,Topic #34,Topic #35,Topic #36,Topic #37,Topic #38,Topic #39,Topic #40,Topic #41,Topic #42,Topic #43,Topic #44,Topic #45,Topic #46,Topic #47,Topic #48,Topic #49,Topic #50,Topic #51,Topic #52,Topic #53,Topic #54,Topic #55,Topic #56,Topic #57,Topic #58,Topic #59,Topic #60,Topic #61,Topic #62,Topic #63,Topic #64,Topic #65
0,,health,partners,food,practices,human,cancer,clinic,aor,gwas,colonoscopy,development,quality,living,case,mobile,cancer,outbreaks,older,media,ci,water,susceptibility,far,screening,ethics,terms,native,greater,cancer,referrals,laboratories,literacy,rate,telemedicine,abnormal,care,remote,program,clinics,women,cause,parents,web,willingness,diabetes,risk,health,family,care,inuit,period,clinicians,hospitals,emerging,prevalence,hiv,depressive,staff,database,seeking,disease,intervention,governance,greenland
1,younger,care,efficacy,nunavut,time,infectious,ci,medical,disorder,complex,algorithms,families,colonoscopy,diseases,cases,adherence,breast,social,balance,scales,greenland,sources,clinical,programs,cancer,committee,concepts,alaska,adherence,services,recruitment,public,items,testing,provider,normal,provider,presence,obesity,organizational,pregnancy,mortality,blood,profiles,rp,type,gene,program,scale,seer,health,score,excessive,intervention,assessment,veterans,approval,12,contacts,incidence,information,predictive,reach,capacity,nurses
2,emerging,patients,self,community,implementation,levels,95,attend,condition,studies,variables,challenges,improvement,100,reporting,reported,ppv,media,strength,post,children,disease,drug,paper,organizational,northern,communication,people,self,patients,electronic,strategic,measure,adherence,chronic,classification,model,technology,online,ehr,pregnant,cardiovascular,possible,symptoms,literacy,environmental,association,demonstrate,hiv,health,countries,studies,recommended,guidelines,trade,compared,ci,symptoms,patient,alaska,health,risk,lifestyle,support,work
3,effect,ci,partner,organizations,clinicians,diseases,raised,free,anxiety,formation,linear,experiencing,american,examined,reports,hiv,99,influence,training,types,language,drinking,significantly,populations,preventive,ethical,definitions,care,relative,ci,days,efficiency,development,control,providers,trial,end,community,activity,capacity,alcohol,bmi,increase,stage,patients,pregnancy,confidence,improvements,care,comparable,systems,pre,pregnancy,treatments,threat,groups,canadian,14,transmission,children,internet,improves,tailored,clinical,healthcare
4,effective,cancer,regular,action,resources,control,women,course,presence,successfully,reports,combat,involves,regions,vs,use,icd,infections,participants,associated,treatment,surveillance,level,diseases,delivery,health,organizations,based,produced,utilization,travel,plan,content,higher,indigenous,classified,support,patient,african,eligible,intervention,specific,public,stages,risk,infants,obesity,key,testing,reflect,canada,median,tracking,detailing,outbreaks,increased,participants,month,distinct,cases,radio,genetic,monitor,systems,areas
5,effectiveness,use,occasions,led,scale,proven,features,school,associations,require,specificity,emotional,initiatives,prevalence,non,urban,sensitivity,mortality,people,radio,oral,behavioural,collected,region,interviews,develop,various,health,weeks,95,enrolled,year,new,associated,patient,sensitivity,influence,caregivers,physical,change,measures,disorders,health,sd,cognitive,environment,001,findings,management,medicare,national,impact,problem,academic,risks,diagnosis,95,administered,interactions,source,aor,robust,online,level,remote
6,effects,study,types,development,increase,highlighting,primary,telemedicine,71,consortium,sensitivity,establishing,readily,ethnic,days,clinic,incident,providing,plans,newspapers,use,progress,guided,isolated,services,hospital,planning,community,web,stage,participants,health,groups,use,care,versus,completed,rp,american,processes,prenatal,death,consent,based,perception,monthly,odds,results,integrated,individuals,greenland,post,additional,brief,points,medication,service,telephone,individuals,multi,chinese,different,participate,australian,acute
7,efficacy,95,low,public,tobacco,consequences,platelets,created,substance,open,using,parent,detection,region,accuracy,access,algorithms,term,plan,different,treated,areas,manner,arctic,populations,experiences,awareness,core,scores,younger,referral,pressures,phase,frequent,populations,study,interviews,northern,reducing,resource,drinking,ci,conclude,symptom,low,early,associated,functionality,clinics,linked,living,58,interviews,treatment,response,values,knowledge,intervention,78,death,newspapers,goal,participants,little,observation
8,efficiency,disease,respectively,government,positively,bring,diagnosis,funding,77,frequently,classify,academic,variability,sectional,assessed,willingness,data,infectious,strategies,anxiety,time,improved,consistent,care,model,share,different,operational,depressive,age,paper,financial,measures,registered,patients,aim,communities,feasibility,change,assessing,consumption,baseline,readily,model,decision,development,studies,focuses,rapid,diagnosed,alaska,analysis,child,acute,direct,bmi,10,participants,patients,years,surveys,classification,computer,evidence,reform
9,efficient,intervention,men,political,predicted,pathogen,63,cultural,62,sizes,claims,behavioral,affects,standards,state,currently,tumor,infection,behavior,setting,communication,change,samples,presented,schedule,commitment,area,owners,significantly,mailed,20,formation,testing,previous,elements,assess,used,sessions,comparison,achieving,performance,non,samples,mean,depression,mother,conventional,foundations,service,comprehensive,data,articles,patients,usually,critical,clinical,08,post,study,infants,socioeconomic,order,delivered,current,promotion


# Todo



# Reference

[1]:https://www.youtube.com/watch?v=T05t-SqKArY

[1] LDA: https://www.youtube.com/watch?v=T05t-SqKArY

[2] https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

[3] https://medium.com/analytics-vidhya/topic-modelling-using-latent-dirichlet-allocation-in-scikit-learn-7daf770406c4

[4] https://towardsdatascience.com/topic-modeling-articles-with-nmf-8c6b2a227a45