In [1]:
#------------------------------------ Topic Modelling ---------------------------#
# Five short sample documents that form the toy corpus used throughout this notebook.
doc1, doc2, doc3, doc4, doc5 = (
    "Sugar is bad to consume. My sister likes to have sugar, but not my father.",
    "My father spends a lot of time driving my sister around to dance practice.",
    "Doctors suggest that driving may cause increased stress and blood pressure.",
    "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.",
    "Health experts say that Sugar is not good for your lifestyle.",
)

In [2]:
# Gather the sample documents into a single corpus list.
doc_complete = [doc1, doc2, doc3, doc4, doc5]

import string

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Shared resources used by clean():
stop = {*stopwords.words('english')}  # English stopwords to drop
exclude = {*string.punctuation}       # punctuation characters to delete
lemma = WordNetLemmatizer()           # lemmatizer applied to each remaining word
def clean(doc):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps: lowercase and split on whitespace, drop English stopwords,
    delete punctuation characters, then lemmatize each remaining word.

    Stopwords are matched after trimming punctuation from the edges of each
    token, so a stopword glued to punctuation (e.g. "not," or "it.") is
    removed instead of slipping past the filter and surviving as a bare
    word once the punctuation is deleted.

    Args:
        doc: Raw document text (a string).

    Returns:
        The cleaned document as one string of lemmas joined by single spaces.
    """
    tokens = doc.lower().split()
    # Only the comparison uses the trimmed form; the token itself is kept
    # intact so the punctuation-removal step below behaves as before.
    stop_free = " ".join(
        token for token in tokens
        if token.strip(string.punctuation) not in stop
    )
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# Clean every document, then split each cleaned string into its list of tokens.
doc_clean = [cleaned.split() for cleaned in map(clean, doc_complete)]
print(doc_clean)

[['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father'], ['father', 'spends', 'lot', 'time', 'driving', 'sister', 'around', 'dance', 'practice'], ['doctor', 'suggest', 'driving', 'may', 'cause', 'increased', 'stress', 'blood', 'pressure'], ['sometimes', 'feel', 'pressure', 'perform', 'well', 'school', 'father', 'never', 'seems', 'drive', 'sister', 'better'], ['health', 'expert', 'say', 'sugar', 'good', 'lifestyle']]


In [4]:
# Importing Gensim
import gensim
from gensim import corpora

In [5]:
# Build the term dictionary of the corpus: every unique token is assigned an integer id.
dictionary = corpora.Dictionary(doc_clean)

# Preview only the first 11 (id, token) pairs.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

##--Not applicable here but, for larger datasets if filtering is required---#
#dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

0 bad
1 consume
2 father
3 like
4 sister
5 sugar
6 around
7 dance
8 driving
9 lot
10 practice


In [6]:
# Convert the list of tokenised documents (corpus) into a document-term matrix:
# one bag-of-words per document, each a list of (token_id, count) pairs.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
doc_term_matrix_4 = doc_term_matrix[4]

# Walk the entries of document 4 itself. Looping over range(len(doc_term_matrix))
# would use the number of documents as the bound, which silently drops terms when
# the document has more distinct terms than there are documents and raises
# IndexError when it has fewer.
for token_id, frequency in doc_term_matrix_4:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     frequency))

Word 5 ("sugar") appears 1 time.
Word 30 ("expert") appears 1 time.
Word 31 ("good") appears 1 time.
Word 32 ("health") appears 1 time.
Word 33 ("lifestyle") appears 1 time.


In [7]:
#--- TF-IDF---#
from pprint import pprint

from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus, then re-weight the corpus with it.
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

# Preview only the first re-weighted document (nothing is printed for an empty corpus).
first_tfidf_doc = next(iter(corpus_tfidf), None)
if first_tfidf_doc is not None:
    pprint(first_tfidf_doc)

[(0, 0.4715096067582428),
 (1, 0.4715096067582428),
 (2, 0.14965422842541531),
 (3, 0.4715096067582428),
 (4, 0.14965422842541531),
 (5, 0.5368829444421276)]


In [8]:
# Short alias for gensim's LDA model class, so a model can be built with Lda(...).
Lda = gensim.models.LdaModel

In [23]:
# Running and Training LDA model on the document term matrix.
# LDA training is randomised: without a fixed seed every execution of this cell
# produces different topics, so the printed results cannot be reproduced.
# A fixed random_state makes Restart & Run All yield the same topics each time.
LDA_RANDOM_STATE = 42
ldamodel = Lda(
    doc_term_matrix,
    num_topics=3,
    id2word=dictionary,
    passes=2,
    random_state=LDA_RANDOM_STATE,
)
# Show the three highest-weighted words of each of the three topics.
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.066*"pressure" + 0.063*"driving" + 0.063*"blood"'), (1, '0.075*"sugar" + 0.075*"expert" + 0.075*"lifestyle"'), (2, '0.084*"sister" + 0.084*"father" + 0.059*"sugar"')]


In [22]:
# List every topic of the trained model (-1 asks for all topics) with its weighted words.
topic_template = 'Topic: {} \nWords: {}'
for topic_entry in ldamodel.print_topics(-1):
    # Each entry is an (index, description) pair, unpacked straight into the template.
    print(topic_template.format(*topic_entry))

Topic: 0 
Words: 0.064*"father" + 0.064*"time" + 0.064*"sister" + 0.064*"driving" + 0.064*"practice" + 0.064*"around" + 0.064*"spends" + 0.063*"dance" + 0.063*"lot" + 0.017*"sugar"
Topic: 1 
Words: 0.063*"sugar" + 0.062*"lifestyle" + 0.062*"expert" + 0.062*"health" + 0.062*"good" + 0.062*"say" + 0.029*"pressure" + 0.027*"seems" + 0.027*"feel" + 0.026*"father"
Topic: 2 
Words: 0.064*"sugar" + 0.059*"sister" + 0.058*"father" + 0.057*"pressure" + 0.037*"like" + 0.037*"consume" + 0.037*"bad" + 0.034*"stress" + 0.034*"driving" + 0.034*"may"


In [24]:
#-- Running LDA using TF-IDF -- #
# Same model family as above, trained on the TF-IDF weighted corpus with the
# multi-process implementation. random_state pins the random initialisation so
# the topics do not change on every execution of this cell.
# NOTE(review): with several workers, scheduling may still cause small run-to-run
# differences on larger corpora — confirm if exact reproducibility matters.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=3,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.047*"doctor" + 0.047*"stress" + 0.047*"suggest" + 0.047*"increased" + 0.047*"may" + 0.047*"cause" + 0.046*"blood" + 0.037*"pressure" + 0.037*"driving" + 0.023*"sugar"
Topic: 1 Word: 0.051*"sugar" + 0.046*"bad" + 0.046*"like" + 0.046*"consume" + 0.038*"well" + 0.038*"sometimes" + 0.038*"feel" + 0.038*"never" + 0.038*"seems" + 0.038*"drive"
Topic: 2 Word: 0.045*"good" + 0.045*"health" + 0.045*"say" + 0.045*"expert" + 0.045*"lifestyle" + 0.043*"time" + 0.043*"around" + 0.042*"spends" + 0.042*"dance" + 0.042*"practice"
