In [117]:
import xml.etree.ElementTree as ET
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

In [118]:
Posts_tree = ET.parse('Data_Dump/Posts.xml')
Posts_root = Posts_tree.getroot()
len(Posts_root)

7347

In [120]:
df = pd.DataFrame(columns = ['Title', 'Body'])

In [122]:
posts = []
for child in Posts_root:
    posts.append(child.attrib)

In [123]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [124]:
for i, post in enumerate(posts):
    if 'Title' in post:
        df = df.append({'Title': post['Title'], 'Body': remove_html_tags(post['Body'])}, ignore_index=True)

In [125]:
df.head(6)

Unnamed: 0,Title,Body
0,Why would anyone accept an answer?,I'm looking at the questions proposed during t...
1,What should our FAQ contain?,One of the big 7 questions.\n
2,What should our domain name be?,\nPossible Duplicate:\nWrite an Elevator Pitch...
3,What should our logo and site design look like?,One of the big 7 questions.\n\n\nOne suggestio...
4,Who should the moderators be?,\n Possible Duplicate:\n Moderator Pro Tem A...
5,How do we promote our site?,One of the big 7 questions.\n\n\nOne sugestion...


In [127]:
## tokenize, stem and lemmatize

for i, j in df.iterrows(): 
    title_tokens_stemmed = [ps.stem(word) for word in word_tokenize(j["Title"])]
    title_tokens_stemmed_lemmetized = [lemmatizer.lemmatize(word) for word in title_tokens_stemmed]
    title_tokens_stemmed_lemmetized = ' '.join(title_tokens_stemmed_lemmetized)
    df.at[i, "Title"] = title_tokens_stemmed_lemmetized
    
    body_tokens_stemmed = [ps.stem(word) for word in word_tokenize(j["Body"])]
    body_tokens_stemmed_lemmetized = [lemmatizer.lemmatize(word) for word in body_tokens_stemmed]
    body_tokens_stemmed_lemmetized = ' '.join(body_tokens_stemmed_lemmetized)
    df.at[i, "Body"] = body_tokens_stemmed_lemmetized


In [128]:
df.head(5)

Unnamed: 0,Title,Body
0,whi would anyon accept an answer ?,I 'm look at the question propos dure the area...
1,what should our faq contain ?,one of the big 7 question .
2,what should our domain name be ?,possibl duplic : write an elev pitch / taglin ...
3,what should our logo and site design look like ?,one of the big 7 question . one suggest per an...
4,who should the moder be ?,possibl duplic : moder pro tem announc In abou...


In [129]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')

In [130]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis


def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print('Topic {}: {}'.format(topic_idx, ' | '.join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]])))
        
number_topics = 10
number_words = 10


# fit the LDA model on all of the question titles for the data
post_titles = count_vectorizer.fit_transform(df['Title'])
lda_titles = LDA(n_components=number_topics)
lda_titles.fit(post_titles)

# fit the LDA model on all of the question bodies for the data
post_body = count_vectorizer.fit_transform(df['Title'])
lda_body = LDA(n_components=number_topics)
lda_body.fit(post_body)

LatentDirichletAllocation()

In [131]:
# print the top topics for titles
print("Post Titles: Top topics")
print()
print_topics(lda_titles, count_vectorizer, number_words)

Post Titles: Top topics

Topic 0: question | softwar | ask | engin | tag | programm | ban | whi | design | recommend
Topic 1: question | close | duplic | commun | edit | develop | thi | site | good | wiki
Topic 2: close | reason | question | tag | point | programm | need | ask | list | add
Topic 3: question | whi | thi | close | answer | wa | delet | doe | accept | ha
Topic 4: question | ask | help | best | bug | practic | ad | migrat | promot | site
Topic 5: stack | moder | overflow | programm | reput | user | exchang | elect | thi | blog
Topic 6: tag | request | comment | merg | flag | synonym | edit | question | post | old
Topic 7: question | thi | code | programmers | opinion | ask | base | site | new | allow
Topic 8: question | programm | ask | topic | program | languag | subject | thi | faq | site
Topic 9: career | tag | advic | review | blacklist | sourc | open | develop | edit | ask


In [132]:
# print the top topics for post contents
print("Post Content: Top topics")
print_topics(lda_body, count_vectorizer, number_words)

Post Content: Top topics
Topic 0: question | ask | thi | topic | good | whi | subject | close | wrong | discus
Topic 1: question | whi | close | wa | thi | programm | stackoverflow | doe | ban | programmers
Topic 2: question | close | whi | softwar | vote | thi | answer | duplic | delet | wa
Topic 3: question | thi | stack | overflow | programm | site | improv | exchang | base | lock
Topic 4: question | programm | ask | edit | blog | post | place | program | review | better
Topic 5: moder | commun | delet | comment | whi | answer | elect | ad | wa | post
Topic 6: answer | site | question | close | thi | chang | accept | post | specif | new
Topic 7: request | tag | merg | synonym | flag | badg | faq | edit | link | comment
Topic 8: tag | question | career | answer | best | wiki | link | develop | point | merg
Topic 9: question | help | user | page | close | new | whi | construct | reason | answer


In [133]:
# display the topic mappings with top words for 'titles'
visual_lda_titles = sklearn_lda.prepare(lda_titles,post_titles, count_vectorizer)
pyLDAvis.display(visual_lda_titles)

In [134]:
# display the topic mappings with top words for 'post contents'
visual_lda_body = sklearn_lda.prepare(lda_body,post_body, count_vectorizer)
pyLDAvis.display(visual_lda_body)