## LDA Topic Modeling in Python
The tutorial can be found [here](https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html)

In [2]:
from nltk.corpus import twitter_samples, TwitterCorpusReader
import sys
sys.path.append("../bhtsa")
from process_twt import preprocess, get_stopwords, get_slang_dict

fileIds = twitter_samples.fileids()
root = twitter_samples.root
# read some negative tweet data from corpus
negReader = TwitterCorpusReader(root, fileIds[0])
negTwt = []
for tweet in negReader.docs():
    negTwt.append((tweet['text']))
# take a look at some of the tweets
for twt in negTwt[:10]:
    print twt

hopeless for tmr :(
Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
@Hegelbon That heart sliding into the waste basket. :(
“@ketchBurning: I hate Japanese call him "bani" :( :(”

Me too
Dang starting next week I have "work" :(
oh god, my babies' faces :( https://t.co/9fcwGvaki0
@RileyMcDonough make me smile :((
@f0ggstar @stuartthull work neighbour on motors. Asked why and he said hates the updates on search :( http://t.co/XvmTUikWln
why?:("@tahuodyy: sialan:( https://t.co/Hv1i0xcrL2"
Athabasca glacier was there in #1948 :-( #athabasca #glacier #jasper #jaspernationalpark #alberta #explorealberta #… http://t.co/dZZdqmf7Cz


In [3]:
# preprocess them and show them again
import re
processed_twt = [preprocess(twt) for twt in negTwt]
p_twt = []
for twt in processed_twt:
    twt = re.sub('\'m', 'm', twt)
    twt = re.sub('\'t', 't', twt)
    twt = re.sub('\'s', '', twt)
    p_twt.append(twt)
for twt in p_twt[:10]:
    print twt

hopeless for tmr :(
everything in the kids section of ikea is so cute. shame im nearly 19 in 2 months :(
AT_USER that heart sliding into the waste basket. :(
“AT_USER i hate japanese call him "bani" :( :(” me too
dang starting next week i have "work" :(
oh god, my babies' faces :( URL
AT_USER make me smile :((
AT_USER AT_USER work neighbour on motors. asked why and he said hates the updates on search :( URL
why?:("AT_USER sialan:( URL
athabasca glacier was there in 1948 :-( athabasca glacier jasper jaspernationalpark alberta explorealberta … URL


In [4]:
# remove stop words and transfer slangs
import re
stop_words = get_stopwords()
slang_dict = get_slang_dict()
doc_set_removed = []
for twt in p_twt:
    removed = []
    words = twt.split()
    for w in words:
        # strip punctuation
        w = w.strip('\'"?,.')
        # check if the word stats with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        # ignore if it is a stop word
        if w.strip() in slang_dict:
            w = slang_dict[w.strip()]
        if w in stop_words or val is None:
            continue
        else:
            removed.append(w)
    doc_set_removed.append(removed)
print doc_set_removed[1]

[u'kids', u'section', u'ikea', u'cute', u'shame', 'Instant Message', u'nearly', u'months']


In [5]:
# tokenization
doc_set = []
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
for twt in doc_set_removed:
    tokens = doc_set.append(tokenizer.tokenize(' '.join(twt)))
print doc_set[1]

[u'kids', u'section', u'ikea', u'cute', u'shame', u'Instant', u'Message', u'nearly', u'months']


In [6]:
# stemming
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
texts = []
for txt in doc_set:
    texts.append([p_stemmer.stem(i) for i in txt])
print texts[1]

[u'kid', u'section', u'ikea', u'cute', u'shame', u'Instant', u'Messag', u'nearli', u'month']


In [7]:
# construct document matrix
from gensim import corpora, models
dictionary = corpora.Dictionary(texts)
# convert to bag of words
corpus = [dictionary.doc2bow(text) for text in texts]
print corpus[0]

[(0, 1), (1, 1)]


In [8]:
# apply lda model, this might take a long time
import gensim
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=100)

In [9]:
ldamodel.print_topics(num_topics=10, num_words=10)

[(0,
  u'0.035*"day" + 0.020*"hate" + 0.020*"time" + 0.018*"yeah" + 0.015*"mean" + 0.012*"final" + 0.010*"a" + 0.009*"pictur" + 0.008*"holiday" + 0.008*"buy"'),
 (1,
  u'0.039*"wish" + 0.037*"sleep" + 0.028*"sad" + 0.018*"cant" + 0.016*"applic" + 0.013*"day" + 0.013*"not" + 0.011*"birthday" + 0.010*"mayb" + 0.008*"tonight"'),
 (2,
  u'0.043*"didnt" + 0.015*"ye" + 0.013*"heart" + 0.011*"twitter" + 0.010*"wait" + 0.009*"thesi" + 0.008*"true" + 0.008*"hour" + 0.008*"singl" + 0.008*"car"'),
 (3,
  u'0.096*"pleas" + 0.076*"follow" + 0.037*"love" + 0.033*"thank" + 0.015*"miss" + 0.013*"cute" + 0.011*"ice" + 0.011*"believ" + 0.010*"justin" + 0.010*"feel"'),
 (4,
  u'0.100*"t" + 0.098*"don" + 0.026*"kik" + 0.020*"guy" + 0.015*"know" + 0.015*"I" + 0.011*"rain" + 0.009*"nice" + 0.008*"bye" + 0.008*"money"'),
 (5,
  u'0.130*"Messag" + 0.130*"Instant" + 0.022*"okay" + 0.019*"miss" + 0.015*"babi" + 0.012*"hope" + 0.012*"peopl" + 0.010*"sad" + 0.008*"time" + 0.007*"hair"'),
 (6,
  u'0.036*"snapchat"