# Latent Dirichlet Allocation for Topic Modeling

<ul>
    <li>LDA assumes documents are produced from a mixture of topics.</li> 
    <li>Those topics then generate words based on their probability distribution.</li> 
    <li>Given a dataset of documents, LDA backtracks and tries to figure out what topics would create those documents in the first place.</li>

In [1]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
# Importing Gensim
import gensim
from gensim import corpora

In [2]:
import os
import glob
import pandas as pd
df = pd.DataFrame()
df = pd.read_csv("../../data/light/test_light.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,content
0,0,1,spin relates subatomic particles,spin relates subatomic particles
1,1,2,simplest explanation string theory,simplest explanation string theory
2,2,3,lie theory representations particle physics,lie theory representations particle physics
3,3,7,determinism,determinism
4,4,9,hamilton principle,hamilton principle


# LDA on title of Questions

In [17]:
documents = df.title
documents.head()

0               spin relates subatomic particles
1             simplest explanation string theory
2    lie theory representations particle physics
3                                    determinism
4                             hamilton principle
Name: title, dtype: object

In [18]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

In [19]:
def clean(doc):
    
    print(doc)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [20]:
documents_clean = [clean(doc).split() for doc in documents]

spin relates subatomic particles
simplest explanation string theory
lie theory representations particle physics
determinism
hamilton principle
sound produced
experiment disprove string theory
sky change color sky blue day red sunrise set black night
energy particle collisions calculated
monte carlo
leaning banking turning bicycle
velocity object electromagnetic field
difference measurement interaction quantum mechanics
calculate average speed
lay explanation special theory relativity
show coriolis effect irrelevant whirl vortex sink bathtub
magnets energy repel
check einstein equations correspondence real world
impressions topological field theories mathematics
capacitive screen sensing
magnets spin positioned precisely
lhc circular long
polarised materials change colour stress
intuitive explanation gouy phase
proton therapy cancer treatment
physicists solutions yang baxter equation
mnemonics remember properties materials
neutrons repel
quantum entanglement mediated interaction
squeeze

AttributeError: 'float' object has no attribute 'lower'

In [None]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(documents_clean)

In [None]:
# unique words
len(dictionary)

In [None]:
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents_clean]

In [None]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

In [None]:
ldamodel.save("lda_title_content")

In [None]:
print(ldamodel.print_topics(num_topics=3, num_words=3))

# Function for predicting tags Using Trained LDA

In [None]:
# load model
import gensim
LDA = gensim.models.ldamodel.LdaModel

In [None]:
lda = LDA.load("lda_title_content")

In [None]:
import pandas as pd
df = pd.read_csv("../../data/light/test_light.csv")
df.head()

In [None]:

def getTopicForQuery (question):
    import numpy
    # clean the question
    question_clean = clean(question).split()
    ques_vec = []
    ques_vec = dictionary.doc2bow(question_clean)

    topic_vec = []
    topic_vec = lda[ques_vec]

    word_count_array = numpy.empty((len(topic_vec), 2), dtype = numpy.object)
    for i in range(len(topic_vec)):
        word_count_array[i, 0] = topic_vec[i][0]
        word_count_array[i, 1] = topic_vec[i][1]

    idx = numpy.argsort(word_count_array[:, 1])
    idx = idx[::-1]
    word_count_array = word_count_array[idx]

    final = []
    final = lda.print_topic(word_count_array[0, 0], 3)
    
    tags = [x.split("*")[1].replace("\"",'').strip() for x in final.split("+") ]
    
    return " ".join(tags)

In [None]:
results = []
resultDF1 = pd.DataFrame(columns = ["id","tags"])

tags1 = []
counter=1
for ques in df.content[0:50]:
#     tags1.append(getTopicForQuery(ques))
    print(ques)
    print(getTopicForQuery(ques))
    if(counter%10000 == 0):
        print(str(counter)+" processed")
    counter = counter + 1
    
# results
# resultDF1["id"]=df.id
# resultDF1["tags"]=tags1
print(resultDF.head())

In [None]:
resultDF.id = df.id

In [None]:
resultDF.to_csv("results.csv", index=False)