In [1]:
import nltk

#Create a sample corpus

d1 = 'Symptoms of diabetes include frequent urination, increased thirst, and increased hunger'
d2 = 'Simple sugars are called monosaccharides and include glucose also known as dextrose, fructose and galactose'
d3 = 'Prevention and treatment of diabetes involve maintaining a healthy diet, regular physical exercise, a normal body weight, and avoiding use of tobacco'
d4 = 'Insulin and some oral medications can cause low blood sugar'

corpora = [d1, d2, d3, d4]


In [2]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

lemma = WordNetLemmatizer()

In [11]:
#Clean the data by removing stopwords, punctuation, and lemmatizing the words
def clean(doc):
    sdoc = " ".join([i for i in doc.lower().split() if i not in set(stopwords.words('english'))])
    pdoc = " ".join([ch for ch in sdoc.split() if ch not in set(set(string.punctuation))])
    cleaned = " ".join([lemma.lemmatize(w) for w in pdoc.split()])
    return cleaned
    
cleaned_corpora = [clean(doc).split() for doc in corpora]
print(cleaned_corpora)

[['symptom', 'diabetes', 'include', 'frequent', 'urination,', 'increased', 'thirst,', 'increased', 'hunger'], ['simple', 'sugar', 'called', 'monosaccharide', 'include', 'glucose', 'also', 'known', 'dextrose,', 'fructose', 'galactose'], ['prevention', 'treatment', 'diabetes', 'involve', 'maintaining', 'healthy', 'diet,', 'regular', 'physical', 'exercise,', 'normal', 'body', 'weight,', 'avoiding', 'use', 'tobacco'], ['insulin', 'oral', 'medication', 'cause', 'low', 'blood', 'sugar']]


In [15]:
import gensim
from gensim import corpora

#Create term document matrix using gensim
dictionary = corpora.Dictionary(cleaned_corpora)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned_corpora]

print(doc_term_matrix)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1)], [(3, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(0, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)], [(17, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]]


In [16]:
#Run LDA using gensim LdaModel class
#Hyperparameter k specified as num_topics

Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes = 30)
print(ldamodel.print_topics(num_topics = 3, num_words = 3))

[(0, '0.080*"increased" + 0.046*"include" + 0.046*"sugar"'), (1, '0.033*"treatment" + 0.033*"body" + 0.033*"maintaining"'), (2, '0.026*"sugar" + 0.026*"include" + 0.026*"diabetes"')]
