In [1]:
# !pip list | grep gensim
!pip install --upgrade gensim

Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.8.3)


# Assignment

1 LDA: Latent Dirichlet allocation: using sklearn and Gensim

2 NNMF: non negative matrix factorization 

In [224]:
# !pip install nltk
# !pip install spacy
#!pip install gensim
# !pip install pyLDAvis



In [40]:
import pandas as pd
import numpy as np
import os
import sklearn
import sys
np.set_printoptions(threshold=sys.maxsize)
from sklearn.model_selection import train_test_split
import re
from pprint import pprint
from glob import glob
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.nmf import Nmf
# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from pathlib import Path
import glob
from bs4 import BeautifulSoup
import operator

In [51]:
#path = "./sample_50/"
path = "./sample_1000/"

In [4]:
def get_abstract(soup):
    if soup.find("abstract") is None:
        return "NaN"
    
    return soup.find('abstract').text

In [5]:
def get_sample_data(path):
    pathlist = Path(path).glob('**/*.xml')
    abstracts = []
    for path in pathlist:
        # because path is object not string
        path_in_str = str(path)
        soup = BeautifulSoup(open('./'+ path_in_str, 'r'))
        abstracts.append(get_abstract(soup))
        # todo: get full articles?

    return np.array(abstracts)


In [52]:
X = get_sample_data(path)

In [53]:
X.shape

(1001,)

In [8]:
type(X)

numpy.ndarray

# Assignment1: LDA: Latent Dirichlet allocation

In [9]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
# data cleaning 

# Remove Emails
# data = [re.sub('\S*@\S*\s?', '', sent) for each in data]

# Remove new line characters
data = [re.sub('\s+', ' ', each) for each in X]

# # Remove distracting single quotes
# data = [re.sub("\'", "", sent) for each in data]

## Using Sklearn

Preprocessing include:

- tokenization
  - remove stop words

Tokenize the words

In [55]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
cv = CountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english')
df = cv.fit_transform(data)

Apply LDA

In [56]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model1 = LatentDirichletAllocation(n_components = 15, random_state = 42) # n_components = topics
lda_model1.fit(df)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=15, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [None]:
# def get_lda_topics(model, num_topics):
#     word_dict = {};
#     for i in range(num_topics):
#         words = model.show_topic(i, topn = 20)
#         word_dict['Topic #' + '{:02d}'.format(i+1)] = [i[0] for i in words]
#     return pd.DataFrame(word_dict)

# get_lda_topics(lda_model1, 15)

In [81]:
for index, topic in enumerate(lda_model1.components_):
    print(f'Top 15 words for Topic #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    #print('\n')

Top 15 words for Topic #0
['associated', 'compared', 'years', 'treatment', '12', 'group', 'age', 'results', 'reported', 'participants', 'study', 'risk', 'ci', '95', 'patients']
Top 15 words for Topic #1
['factors', 'using', 'model', 'genetic', 'analysis', 'patient', 'data', 'treatment', 'disease', 'methods', 'results', 'clinical', 'cancer', 'risk', 'patients']
Top 15 words for Topic #2
['used', 'new', 'education', 'media', 'methods', 'social', 'students', 'internet', 'use', 'study', 'medical', 'based', 'research', 'information', 'health']
Top 15 words for Topic #3
['outcomes', 'performance', 'pediatric', 'physical', 'network', 'based', 'improvement', 'results', 'primary', 'patients', 'social', 'patient', 'quality', 'health', 'care']
Top 15 words for Topic #4
['ehr', 'used', 'diabetes', 'mortality', 'information', '95', 'methods', 'clinical', 'medical', 'study', 'using', 'results', 'data', 'patient', 'patients']
Top 15 words for Topic #5
['costs', 'medical', 'results', 'interventions', 

## Using Gensim

Preprocessing include:
- tokenization
- remove stop words
- bigram data
- lemmatization


Tokenization: tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.     

In [58]:
def tokenize_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(tokenize_to_words(data))
data_words

[['objective',
  'common',
  'chronic',
  'diseases',
  'such',
  'as',
  'hypertension',
  'are',
  'costly',
  'and',
  'difficult',
  'to',
  'manage',
  'our',
  'ultimate',
  'goal',
  'is',
  'to',
  'use',
  'data',
  'from',
  'electronic',
  'health',
  'records',
  'to',
  'predict',
  'the',
  'risk',
  'and',
  'timing',
  'of',
  'deterioration',
  'in',
  'hypertension',
  'control',
  'towards',
  'this',
  'goal',
  'this',
  'work',
  'predicts',
  'the',
  'transition',
  'points',
  'at',
  'which',
  'hypertension',
  'is',
  'brought',
  'into',
  'as',
  'well',
  'as',
  'pushed',
  'out',
  'of',
  'control',
  'method',
  'in',
  'cohort',
  'of',
  'patients',
  'with',
  'hypertension',
  'enrolled',
  'in',
  'chronic',
  'disease',
  'management',
  'program',
  'at',
  'the',
  'vanderbilt',
  'university',
  'medical',
  'center',
  'patients',
  'are',
  'modeled',
  'as',
  'an',
  'array',
  'of',
  'features',
  'derived',
  'from',
  'the',
  'clinic

In [59]:
data_words[:1]

[['objective',
  'common',
  'chronic',
  'diseases',
  'such',
  'as',
  'hypertension',
  'are',
  'costly',
  'and',
  'difficult',
  'to',
  'manage',
  'our',
  'ultimate',
  'goal',
  'is',
  'to',
  'use',
  'data',
  'from',
  'electronic',
  'health',
  'records',
  'to',
  'predict',
  'the',
  'risk',
  'and',
  'timing',
  'of',
  'deterioration',
  'in',
  'hypertension',
  'control',
  'towards',
  'this',
  'goal',
  'this',
  'work',
  'predicts',
  'the',
  'transition',
  'points',
  'at',
  'which',
  'hypertension',
  'is',
  'brought',
  'into',
  'as',
  'well',
  'as',
  'pushed',
  'out',
  'of',
  'control',
  'method',
  'in',
  'cohort',
  'of',
  'patients',
  'with',
  'hypertension',
  'enrolled',
  'in',
  'chronic',
  'disease',
  'management',
  'program',
  'at',
  'the',
  'vanderbilt',
  'university',
  'medical',
  'center',
  'patients',
  'are',
  'modeled',
  'as',
  'an',
  'array',
  'of',
  'features',
  'derived',
  'from',
  'the',
  'clinic

Creating Bigram and Trigram Models (2/3 words that are frequently appear together)

In [60]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold, fewer phrases to form
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])


['objective', 'common', 'chronic_diseases', 'such', 'as', 'hypertension', 'are', 'costly', 'and', 'difficult', 'to', 'manage', 'our', 'ultimate', 'goal', 'is', 'to', 'use', 'data', 'from', 'electronic', 'health', 'records', 'to', 'predict', 'the', 'risk', 'and', 'timing', 'of', 'deterioration', 'in', 'hypertension', 'control', 'towards', 'this', 'goal', 'this', 'work', 'predicts', 'the', 'transition', 'points', 'at', 'which', 'hypertension', 'is', 'brought', 'into', 'as', 'well', 'as', 'pushed', 'out', 'of', 'control', 'method', 'in', 'cohort', 'of', 'patients', 'with', 'hypertension', 'enrolled', 'in', 'chronic', 'disease', 'management', 'program', 'at', 'the', 'vanderbilt', 'university', 'medical', 'center', 'patients', 'are', 'modeled', 'as', 'an', 'array', 'of', 'features', 'derived', 'from', 'the', 'clinical', 'domain', 'over', 'time', 'which', 'are', 'distilled', 'into', 'core', 'set', 'using', 'an', 'information', 'gain', 'criteria', 'regarding', 'their', 'predictive', 'performa

Remove Stopwords, Make Bigrams and Lemmatize


In [61]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# !python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

In [62]:
# Define functions for stopwords, bigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [63]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['objective', 'common', 'hypertension', 'costly', 'difficult', 'manage', 'ultimate', 'goal', 'electronic', 'health', 'record', 'predict', 'risk', 'timing', 'deterioration', 'hypertension', 'control', 'goal', 'work', 'predict', 'transition', 'point', 'hypertension', 'bring', 'well', 'push', 'control', 'cohort', 'patient', 'hypertension', 'enrol', 'chronic', 'program', 'patient', 'model', 'array', 'feature', 'derive', 'clinical', 'domain', 'time', 'distil', 'core', 'set', 'use', 'information', 'gain', 'criterion', 'regard', 'predictive', 'performance', 'model', 'transition', 'point', 'prediction', 'compute', 'use', 'classifier', 'result', 'predictive', 'feature', 'transition', 'hypertension', 'control', 'status', 'include', 'hypertension', 'assessment', 'pattern', 'comorbid', 'diagnosis', 'procedure', 'medication', 'history', 'final', 'model', 'achieve', 'study', 'achieve', 'accurate', 'prediction', 'transition', 'point', 'hypertension', 'control', 'status', 'important', 'first', 'step'

Create the Dictionary and Corpus needed for Topic Modeling

two main inputs to the LDA topic model using GenSim library are the dictionary(id2word) and the corpus.

eg: for the single doc, produce: a mapping of (word_id, word_frequency), (0,1) 1st word in the doc appear once

In [64]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])
print(corpus[1:2])


[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 4), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 3), (30, 1), (31, 1), (32, 8), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 3), (40, 1), (41, 2), (42, 1), (43, 1), (44, 1), (45, 1), (46, 3), (47, 2), (48, 2), (49, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 2), (59, 1), (60, 1), (61, 1), (62, 1), (63, 4), (64, 1), (65, 2), (66, 1), (67, 1)]]
[[(15, 1), (20, 1), (27, 1), (30, 2), (35, 6), (41, 6), (54, 1), (55, 1), (56, 1), (60, 2), (68, 2), (69, 1), (70, 1), (71, 1), (72, 1), (73, 2), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 2), (80, 2), (81, 1), (82, 6), (83, 1), (84, 1), (85, 1), (86, 2), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101,

##### id-word mapping:

In [65]:
id2word[0]

'accurate'

##### Item-frequency

In [66]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]][:10]

[[('accurate', 1),
  ('achieve', 2),
  ('array', 1),
  ('assessment', 1),
  ('bring', 1),
  ('chronic', 1),
  ('classifier', 1),
  ('clinical', 1),
  ('cohort', 1),
  ('common', 1),
  ('comorbid', 1),
  ('compute', 1),
  ('control', 4),
  ('core', 1),
  ('costly', 1),
  ('criterion', 1),
  ('derive', 1),
  ('deterioration', 1),
  ('develop', 1),
  ('diagnosis', 1),
  ('difficult', 1),
  ('distil', 1),
  ('domain', 1),
  ('electronic', 1),
  ('enrol', 1),
  ('feature', 2),
  ('final', 1),
  ('first', 1),
  ('gain', 1),
  ('goal', 3),
  ('health', 1),
  ('history', 1),
  ('hypertension', 8),
  ('important', 1),
  ('include', 1),
  ('information', 1),
  ('manage', 1),
  ('management', 1),
  ('medication', 1),
  ('model', 3),
  ('objective', 1),
  ('patient', 2),
  ('pattern', 1),
  ('performance', 1),
  ('personalize', 1),
  ('plan', 1),
  ('point', 3),
  ('predict', 2),
  ('prediction', 2),
  ('predictive', 2),
  ('procedure', 1),
  ('program', 1),
  ('push', 1),
  ('record', 1),
  ('reg

Building the Topic Model

In [67]:
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [68]:
def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20)
        word_dict['Topic #' + '{:02d}'.format(i+1)] = [i[0] for i in words]
    return pd.DataFrame(word_dict)

# Print the Keyword in the 15 topics
get_lda_topics(lda_model2, 15)

Unnamed: 0,Topic #01,Topic #02,Topic #03,Topic #04,Topic #05,Topic #06,Topic #07,Topic #08,Topic #09,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14,Topic #15
0,hospital,obesity,request,monitoring,care,child,use,virtual,consultation,cancer,patient,health,outbreak,confidence,intervention
1,share,note,participant,emergency,health,parent,datum,stay,service,event,risk,information,tweet,dementia,group
2,item,overweight,online,monitor,population,pediatric,study,oral,specialist,rural,year,diverse,caregiver,detector,trial
3,together,positive_predictive,program,sensor,quality,substance,patient,prevalent,case,response,age,research,twitter,everyday,participant
4,food,antibiotic,internet,device,level,family,result,epidemic,submit,injury,rate,manner,release,utilise,adherence
5,cross,maternal,student,save,disease,theory,method,grade,patient,epidemiology,compare,expedite,influenza,audio,study
6,seem,weight,website,can,increase,motivation,clinical,email,access,dialysis,woman,support,infectious,thematically,control
7,personnel,obese,group,map,treatment,exacerbation,base,clinically,referral,breast,high,need,city,assistive,receive
8,african,extract,week,video,high,readmission,model,ill,specialty,resident,increase,public,engagement,independence,self
9,adequate,ed,study,trigger,may,onset,thus,useful,pcps,survival,report,resource,keyword,carer,month


In [104]:
# # Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

Evaluation Metric: Compute Model Perplexity and Coherence Score


In [70]:
# Compute Perplexity
print('\nPerplexity: ', lda_model2.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model2, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.78955017249181

Coherence Score:  0.44727961203275424


Visualize the topics-keywords

In [72]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model2, corpus, id2word)
vis

# Assignment 2: Non Negative Matrix Facotrization



### Using GenSim

how to find the best number of topics using the cohenrence score?


In [73]:

# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(5, 75 + 1, 5))

# Run the nmf model and calculate the coherence score
# for each number of topics
coherence_scores = []

for num in topic_nums:
    nmf = Nmf(
        corpus=corpus,
        num_topics=num,
        id2word=id2word,
        chunksize=2000,
        passes=5,
        kappa=.1,
        minimum_probability=0.01,
        w_max_iter=300,
        w_stop_condition=0.0001,
        h_max_iter=100,
        h_stop_condition=0.001,
        eval_every=10,
        normalize=True,
        random_state=42
    )
    
    # Run the coherence model to get the score
    cm = CoherenceModel(
        model=nmf,
        texts=texts,
        dictionary=id2word,
        coherence='c_v'
    )
    
    coherence_scores.append(round(cm.get_coherence(), 5))


# Get the number of topics with the highest coherence score
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = sorted(scores, key=operator.itemgetter(1), reverse=True)[0][0]

print(best_num_topics)

10


Using Sklearn
- tokenization
  - remove stop words


In [74]:
#vectorizer = CountVectorizer(analyzer='word' max_features=5000)
#vectorizer = CountVectorizer(analyzer='word', stop_words = 'english')
vectorizer = CountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english')
x_counts = vectorizer.fit_transform(data)

In [75]:
transformer = TfidfTransformer(smooth_idf=False);
x_tfidf = transformer.fit_transform(x_counts);

In [76]:
from sklearn.preprocessing import normalize
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [77]:
#obtain a NMF model.
num_topics=best_num_topics
from sklearn.decomposition import NMF
nmf_model = NMF(n_components=num_topics, init='nndsvd')

In [78]:
#fit the model
nmf_model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=10, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [79]:
def get_nmf_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {};
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-20 - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic #' + '{:02d}'.format(i+1)] = words;
    
    return pd.DataFrame(word_dict);

In [80]:
get_nmf_topics(nmf_model, 15)

Unnamed: 0,Topic #01,Topic #02,Topic #03,Topic #04,Topic #05,Topic #06,Topic #07,Topic #08,Topic #09,Topic #10
0,,health,supplemental,pandemics,certificates,patients,cancer,hiv,pain,scan
1,χ2,care,digital,evolutionary,pandemic,ci,tobacco,intervention,overlap,identifiers
2,dying,information,text,know,surveillance,95,screening,participants,constructs,fda
3,drugs,data,content,reviews,traditional,risk,cervical,group,exists,device
4,dsc,research,available,needs,useful,diabetes,prevention,self,psychological,rules
5,duke,patient,messaging,theories,resources,pregnancy,women,use,summary,devices
6,duplicate,public,message,adolescents,limited,patient,breast,trial,research,hospitals
7,duration,quality,messages,economic,influenza,associated,prostate,alcohol,combined,burden
8,dutch,clinical,weight,information,prediction,age,residents,online,single,unique
9,duty,systems,adherence,communities,research,years,rural,adherence,influence,medical


# Todo

- more preprocessing:

lower cases the text
expands out contractions
stems the text
removes punctuation, stop words, numbers, single characters and words with extra spaces (artifact from expanding out contractions)

# Reference

[1]:https://www.youtube.com/watch?v=T05t-SqKArY

[1] LDA: https://www.youtube.com/watch?v=T05t-SqKArY

[2] https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

[3] https://medium.com/analytics-vidhya/topic-modelling-using-latent-dirichlet-allocation-in-scikit-learn-7daf770406c4

[4] https://towardsdatascience.com/topic-modeling-articles-with-nmf-8c6b2a227a45