## LDA Topic Modeling in Python
The tutorial can be found [here](https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html)

In [76]:
from nltk.corpus import twitter_samples, TwitterCorpusReader
import sys
sys.path.append("../bhtsa")
from process_twt import preprocess, get_stopwords, get_slang_dict

# List the files shipped with NLTK's twitter_samples corpus and find its root
fileIds = twitter_samples.fileids()
root = twitter_samples.root
# Open the first listed file (the sample printed below shows ":(" tweets,
# i.e. the negative set)
negReader = TwitterCorpusReader(root, fileIds[0])
# Keep only the 'text' field of every tweet document
negTwt = [tweet['text'] for tweet in negReader.docs()]
# Preview the first ten raw tweets
for sample in negTwt[:10]:
    print(sample)

hopeless for tmr :(
Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
@Hegelbon That heart sliding into the waste basket. :(
“@ketchBurning: I hate Japanese call him "bani" :( :(”

Me too
Dang starting next week I have "work" :(
oh god, my babies' faces :( https://t.co/9fcwGvaki0
@RileyMcDonough make me smile :((
@f0ggstar @stuartthull work neighbour on motors. Asked why and he said hates the updates on search :( http://t.co/XvmTUikWln
why?:("@tahuodyy: sialan:( https://t.co/Hv1i0xcrL2"
Athabasca glacier was there in #1948 :-( #athabasca #glacier #jasper #jaspernationalpark #alberta #explorealberta #… http://t.co/dZZdqmf7Cz


In [8]:
# Run every raw tweet through the project's preprocess() helper.
# Judging by the printed sample, it lower-cases the text and replaces
# mentions / links with the AT_USER / URL placeholders.
processed_twt = [preprocess(raw_tweet) for raw_tweet in negTwt]
# Preview the first ten cleaned tweets
for cleaned_tweet in processed_twt[:10]:
    print(cleaned_tweet)

hopeless for tmr :(
everything in the kids section of ikea is so cute. shame i'm nearly 19 in 2 months :(
AT_USER that heart sliding into the waste basket. :(
“AT_USER i hate japanese call him "bani" :( :(” me too
dang starting next week i have "work" :(
oh god, my babies' faces :( URL
AT_USER make me smile :((
AT_USER AT_USER work neighbour on motors. asked why and he said hates the updates on search :( URL
why?:("AT_USER sialan:( URL
athabasca glacier was there in 1948 :-( athabasca glacier jasper jaspernationalpark alberta explorealberta … URL


In [79]:
# Drop stop words and expand slang, producing one token list per tweet
import re
stop_words = get_stopwords()
slang_dict = get_slang_dict()
doc_set_removed = []
for tweet_text in processed_twt:
    kept_tokens = []
    for raw_token in tweet_text.split():
        # trim surrounding quotes, question marks, commas and full stops
        token = raw_token.strip('\'"?,.')
        # word-like = starts with a letter and is purely alphanumeric;
        # note this is decided on the token *before* any slang expansion
        is_word_like = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", token) is not None
        # replace a known slang term with its slang_dict entry
        slang_key = token.strip()
        if slang_key in slang_dict:
            token = slang_dict[slang_key]
        # keep the (possibly expanded) token unless it is a stop word
        # or the original token was not word-like
        if token not in stop_words and is_word_like:
            kept_tokens.append(token)
    doc_set_removed.append(kept_tokens)
print(doc_set_removed[1])

[u'kids', u'section', u'ikea', u'cute', u'shame', u'nearly', u'months']


In [80]:
# tokenization
from nltk.tokenize import RegexpTokenizer
# split on runs of word characters (letters, digits, underscore)
tokenizer = RegexpTokenizer(r'\w+')
# Re-join each filtered tweet and tokenize it again; presumably this splits
# slang expansions that contain more than one word -- TODO confirm.
# NOTE: only the first 10 tweets are carried forward to the later cells.
# The previous `tokens = doc_set.append(...)` was dropped: list.append()
# returns None, so `tokens` never held any tokens.
doc_set = [tokenizer.tokenize(' '.join(twt)) for twt in doc_set_removed[:10]]
print(doc_set[1])

[u'kids', u'section', u'ikea', u'cute', u'shame', u'nearly', u'months']


In [81]:
# Porter-stem every token of every document
# (e.g. 'kids' -> 'kid', 'nearly' -> 'nearli' in the sample printed below)
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
texts = [[p_stemmer.stem(word) for word in doc_tokens] for doc_tokens in doc_set]
print(texts[1])

[u'kid', u'section', u'ikea', u'cute', u'shame', u'nearli', u'month']


In [82]:
# Build the gensim dictionary (vocabulary) from the stemmed documents
from gensim import corpora, models
dictionary = corpora.Dictionary(texts)
# Encode each document as a bag of words: a list of (token_id, count) pairs
corpus = [dictionary.doc2bow(stemmed_doc) for stemmed_doc in texts]
print(corpus[0])

[(0, 1), (1, 1)]


In [83]:
# apply lda model
import gensim
# LDA training is stochastic; pin the seed so that "Restart & Run All"
# reproduces the same topics instead of a different set on every run.
LDA_RANDOM_SEED = 42
# 5 topics, 100 passes over the bag-of-words corpus built above
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=100,
    random_state=LDA_RANDOM_SEED,
)

In [85]:
# Show all five topics as (topic_id, 'weight*"word" + ...') pairs,
# limited to ten words per topic
ldamodel.print_topics(
    num_words=10,
    num_topics=5,
)

[(0,
  u'0.029*"smile" + 0.029*"tomorrow" + 0.029*"hopeless" + 0.029*"hate" + 0.029*"week" + 0.029*"dang" + 0.029*"start" + 0.029*"oh" + 0.029*"god" + 0.029*"babi"'),
 (1,
  u'0.131*"glacier" + 0.131*"athabasca" + 0.071*"jaspernationalpark" + 0.071*"explorealberta" + 0.071*"jasper" + 0.071*"alberta" + 0.071*"hopeless" + 0.071*"tomorrow" + 0.012*"smile" + 0.012*"hate"'),
 (2,
  u'0.067*"kid" + 0.067*"nearli" + 0.067*"shame" + 0.067*"month" + 0.067*"cute" + 0.067*"section" + 0.067*"ikea" + 0.067*"start" + 0.067*"week" + 0.067*"dang"'),
 (3,
  u'0.081*"basket" + 0.081*"heart" + 0.081*"slide" + 0.081*"wast" + 0.081*"call" + 0.081*"japanes" + 0.081*"bani" + 0.081*"hate" + 0.014*"smile" + 0.014*"hopeless"'),
 (4,
  u'0.081*"hate" + 0.081*"neighbour" + 0.081*"motor" + 0.081*"search" + 0.081*"updat" + 0.081*"babi" + 0.081*"oh" + 0.081*"god" + 0.014*"smile" + 0.014*"tomorrow"')]