In [1]:
#Latent Dirichlet Allocation (LDA): LDA assumes documents are produced from a mixture of topics. Those topics then generate words
#based on their probability distribution. Given a dataset of documents, LDA backtracks and tries to figure out what topics
#would create those documents in the first place.
#LDA converts the document-term matrix into two lower-dimensional matrices, M1 and M2:
#  - M1 is a document-topic matrix with dimensions (N, K),
#  - M2 is a topic-term matrix with dimensions (K, M),
#where N is the number of documents, K is the number of topics, and M is the vocabulary size.
#For every topic, two probabilities P1 and P2 are calculated:
#P1 = p(topic t | document d): the proportion of words in document d that are currently assigned to topic t.
#P2 = p(word w | topic t): the proportion of assignments to topic t, over all documents, that come from word w.
#After many iterations, a steady state is achieved where the document topic and topic term distributions are fairly good. 
#This is the convergence point of LDA.

In [2]:
#Alpha and Beta hyperparameters: alpha represents document-topic density and beta represents topic-word density.
#The higher the value of alpha, the more topics each document is composed of; the lower the value of alpha, the fewer topics it contains.
#Likewise, the higher the value of beta, the more words from the corpus each topic is composed of; with a lower value of beta,
#topics are composed of fewer words.

In [None]:
#A common reason for representing documents as vectors is that we want to determine the similarity between pairs of documents,
#or the similarity between a specific document and a set of other documents (such as a user query vs. indexed documents).

In [13]:
# Toy corpus used throughout the notebook: nine short text documents,
# one string per document.
raw_corpus = [
    "Human machine interface for lab abc computer applications human",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]
#raw_corpus1 = open('C:/Users/lohitaksh/Desktop/R/HealthNews/Dentistry00501.txt', 'r')
#raw_corpus2 = open('C:/Users/lohitaksh/Desktop/R/HealthNews/Dentistry00502.txt', 'r')
#raw_corpus3 = open('C:/Users/lohitaksh/Desktop/R/HealthNews/Dentistry00503.txt', 'r')
#raw_corpus = open('C:/Users/lohit/Desktop/course/KDD/HealthNews/Dentistry00504.txt', 'r')


In [14]:
 #we first use this tiny corpus to define a 2-dimensional LSI space:

In [15]:
from collections import defaultdict

# Small hand-written stopword set; these words are dropped during tokenisation.
stoplist = set('for a of the and to in'.split(' '))

# Tokenise every document: lowercase it, split on whitespace, discard stopwords.
texts = [
    [tok for tok in doc.lower().split() if tok not in stoplist]
    for doc in raw_corpus
]

# Tally how many times each token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only tokens seen at least twice corpus-wide (i.e. drop tokens that occur once).
processed_corpus = [
    [tok for tok in tokens if frequency[tok] >= 2]
    for tokens in texts
]
processed_corpus

[['human', 'interface', 'computer', 'human'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [16]:
import os

from gensim import corpora, models, similarities

# Directory holding the serialized dictionary (den.dict) and Matrix Market corpus (den.mm).
# Defaults to the location originally hard-coded in this cell; set the LDA_DATA_DIR
# environment variable to load the files from somewhere else without editing the notebook.
DATA_DIR = os.environ.get('LDA_DATA_DIR', 'C:/Users/lohit/AppData/Local/Temp')

# NOTE(review): `dictionary` is reassigned from `processed_corpus` in a later cell, and
# `corpus` is not used by any other cell visible here — confirm this load is still needed.
dictionary = corpora.Dictionary.load(DATA_DIR + '/den.dict')
corpus = corpora.MmCorpus(DATA_DIR + '/den.mm') # comes from the first tutorial, "From strings to vectors"
print(corpus)


MmCorpus(9 documents, 35 features, 51 non-zero entries)


In [None]:
#All the text documents combined are known as the corpus. To run any mathematical model on a text corpus, it is good practice
#to convert it into a matrix representation. The LDA model looks for repeating term patterns in the entire document-term (DT) matrix.
#Python provides many great libraries for text mining; “gensim” is one such clean and beautiful library for handling
#text data. It is scalable, robust and efficient. The following code shows how to convert a corpus into a document-term matrix.

In [17]:
# gensim provides the Dictionary (token <-> integer id mapping) used below.
import gensim
from gensim import corpora

# Build the term dictionary of our corpus: every unique term is assigned an integer index.
dictionary = corpora.Dictionary(processed_corpus)

# Turn each tokenised document into its bag-of-words form using the dictionary above;
# as the displayed result shows, each document becomes a list of (term id, count) pairs.
doc_term_matrix = list(map(dictionary.doc2bow, processed_corpus))
doc_term_matrix

[[(0, 2), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(1, 1), (4, 1), (5, 1), (8, 1)],
 [(0, 1), (5, 2), (8, 1)],
 [(4, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(3, 1), (10, 1), (11, 1)]]

In [None]:
#We see that the model has come up with probabilities associated with the words. Each line is a topic with individual
#topic terms and weights. Topic1 can be termed as science, Topic2 as education, Topic3 as family, and so on.


In [None]:
#The next step is to create an object for the LDA model and train it on the document-term matrix. The training also requires a few
#parameters as input, which are explained in the above section. The gensim module allows both LDA model estimation from a training
#corpus and inference of topic distribution on new, unseen documents.

In [12]:
# Short alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Run and train the LDA model on the document-term matrix.
#   num_topics=3    -> number of topics the model is asked to fit
#   id2word         -> dictionary mapping term ids back to words (used when topics are printed)
#   passes=50       -> number of training passes over the corpus
#   random_state=42 -> fixed seed; LDA training is stochastic, so without it every
#                      re-run of this cell can produce different topics
# NOTE(review): this cell's saved execution count (12) is lower than those of the cells that
# build `doc_term_matrix` (13-17), and the saved topic output further down lists words that do
# not occur in the toy corpus — the stored outputs appear to come from an earlier run on
# different data. Restart the kernel and run all cells to refresh them.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50, random_state=42)
ldamodel

<gensim.models.ldamodel.LdaModel at 0x2052e573898>

In [26]:
# Request up to 5 topics with their 5 highest-weighted words each and print the result.
# The saved output shows a list of (topic id, 'weight*"word" + ...') pairs.
# NOTE(review): the model is built with num_topics=3 and the saved output has three entries,
# so the request for 5 appears to be capped at the model's topic count — confirm in gensim docs.
topic_summaries = ldamodel.print_topics(num_topics=5, num_words=5)
print(topic_summaries)


[(0, '0.058*"that" + 0.043*"is" + 0.031*"mucins" + 0.028*"on" + 0.028*"salivary"'), (1, '0.013*"thought" + 0.013*"more" + 0.013*"them" + 0.013*"saliva" + 0.013*"cavities"'), (2, '0.013*"thought" + 0.013*"more" + 0.013*"them" + 0.013*"saliva" + 0.013*"against"')]


In [None]:
#Each line is a topic with individual topic terms and weights. Topic1 can be termed as Bad Health, and Topic3 can be termed
#as Family.