In [14]:
import xml.etree.ElementTree as ET
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

In [15]:
Posts_tree = ET.parse('Data_Dump/Posts.xml')
Posts_root = Posts_tree.getroot()
len(Posts_root)

7347

In [16]:
df = pd.DataFrame(columns = ['Title', 'Body'])

In [17]:
posts = []
for child in Posts_root:
    posts.append(child.attrib)

In [18]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [19]:
for i, post in enumerate(posts):
    if 'Title' in post:
        df = df.append({'Title': post['Title'], 'Body': remove_html_tags(post['Body'])}, ignore_index=True)

In [20]:
df.head(6)

Unnamed: 0,Title,Body
0,Why would anyone accept an answer?,I'm looking at the questions proposed during t...
1,What should our FAQ contain?,One of the big 7 questions.\n
2,What should our domain name be?,\nPossible Duplicate:\nWrite an Elevator Pitch...
3,What should our logo and site design look like?,One of the big 7 questions.\n\n\nOne suggestio...
4,Who should the moderators be?,\n Possible Duplicate:\n Moderator Pro Tem A...
5,How do we promote our site?,One of the big 7 questions.\n\n\nOne sugestion...


In [21]:
# tokenize, stem and lemmatize
import re 
for i, j in df.iterrows(): 
    title_sans_punc = re.sub(r'[^\w\s]', '', j['Title']) 
    title_tokens_stemmed = [ps.stem(word) for word in word_tokenize(title_sans_punc)]
    title_tokens_stemmed_lemmetized = [lemmatizer.lemmatize(word) for word in title_tokens_stemmed]
    title_tokens_stemmed_lemmetized = ' '.join(title_tokens_stemmed_lemmetized)
    df.at[i, "Title"] = title_tokens_stemmed_lemmetized
    
    body_sans_punc = re.sub(r'[^\w\s]', '', j['Body']) 
    body_tokens_stemmed = [ps.stem(word) for word in word_tokenize(body_sans_punc)]
    body_tokens_stemmed_lemmetized = [lemmatizer.lemmatize(word) for word in body_tokens_stemmed]
    body_tokens_stemmed_lemmetized = ' '.join(body_tokens_stemmed_lemmetized)
    df.at[i, "Body"] = body_tokens_stemmed_lemmetized


In [22]:
df.head(5)

Unnamed: 0,Title,Body
0,whi would anyon accept an answer,Im look at the question propos dure the area 5...
1,what should our faq contain,one of the big 7 question
2,what should our domain name be,possibl duplic write an elev pitch taglin note...
3,what should our logo and site design look like,one of the big 7 question one suggest per answ...
4,who should the moder be,possibl duplic moder pro tem announc In about ...


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [24]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis


def print_topics(model, tfidf_vectorizer, n_top_words):
    words = tfidf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print('Topic {}: {}'.format(topic_idx, ' | '.join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]])))
        
number_topics = 10
number_words = 10


# fit the LDA model on all of the question titles for the data
post_titles = tfidf_vectorizer.fit_transform(df['Title'])
lda_titles = LDA(n_components=number_topics)
lda_titles.fit(post_titles)

# fit the LDA model on all of the question bodies for the data
post_body = tfidf_vectorizer.fit_transform(df['Title'])
lda_body = LDA(n_components=number_topics)
lda_body.fit(post_body)

LatentDirichletAllocation()

In [25]:
# print the top topics for titles
print("Post Titles: Top topics")
print()
print_topics(lda_titles, tfidf_vectorizer, number_words)

Post Titles: Top topics

Topic 0: question | whi | wa | close | reopen | thi | answer | delet | downvot | comment
Topic 1: question | ask | subject | offtop | career | close | help | site | need | thi
Topic 2: question | thi | duplic | programm | wrong | softwar | whi | base | flag | engin
Topic 3: question | overflow | stack | migrat | lock | book | histor | close | recommend | kind
Topic 4: review | tag | question | custom | ask | close | workplac | current | number | thi
Topic 5: question | whi | close | answer | did | thi | bad | mani | programm | site
Topic 6: question | stackoverflow | thi | hold | avoid | edit | differ | programm | whi | site
Topic 7: commun | question | tag | blog | wiki | vote | bounti | pse | discus | promot
Topic 8: tag | synonym | merg | request | question | remov | thi | relat | ontop | account
Topic 9: question | answer | link | programmer | code | moder | best | anoth | dont | post


In [26]:
# print the top topics for post contents
print("Post Content: Top topics")
print_topics(lda_body, tfidf_vectorizer, number_words)

Post Content: Top topics
Topic 0: discus | question | cleanup | tag | plea | review | delet | remov | answer | upvot
Topic 1: question | request | stack | reopen | close | synonym | overflow | tag | new | merg
Topic 2: softwar | ask | question | engin | thi | blog | site | best | specif | old
Topic 3: question | thi | programm | accept | wrong | answer | improv | ask | differ | help
Topic 4: question | topic | base | opinion | ask | thi | programm | whi | faq | reason
Topic 5: duplic | question | delet | comment | tag | moder | answer | whi | lock | possibl
Topic 6: question | whi | migrat | close | ontop | offtop | tag | subject | career | thi
Topic 7: ban | tag | question | programm | badg | site | multipl | add | ask | reason
Topic 8: close | question | whi | vote | wa | thi | answer | commun | did | mani
Topic 9: user | vote | limit | valu | badg | question | construct | thi | close | review


In [27]:
# display the topic mappings with top words for 'titles'
visual_lda_titles = sklearn_lda.prepare(lda_titles,post_titles, tfidf_vectorizer)
pyLDAvis.display(visual_lda_titles)

In [28]:
# display the topic mappings with top words for 'post contents'
visual_lda_body = sklearn_lda.prepare(lda_body,post_body, tfidf_vectorizer)
pyLDAvis.display(visual_lda_body)