## LDA Topic Modeling in Python
The tutorial can be found [here](https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html)

In [1]:
from nltk.corpus import twitter_samples, TwitterCorpusReader
import sys
sys.path.append("../bhtsa")
from process_twt import preprocess, get_stopwords, get_slang_dict

fileIds = twitter_samples.fileids()
root = twitter_samples.root
# read some negative tweet data from corpus
negReader = TwitterCorpusReader(root, fileIds[0])
negTwt = []
for tweet in negReader.docs():
    negTwt.append((tweet['text']))
# take a look at some of the tweets
for twt in negTwt[:10]:
    print twt

hopeless for tmr :(
Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
@Hegelbon That heart sliding into the waste basket. :(
“@ketchBurning: I hate Japanese call him "bani" :( :(”

Me too
Dang starting next week I have "work" :(
oh god, my babies' faces :( https://t.co/9fcwGvaki0
@RileyMcDonough make me smile :((
@f0ggstar @stuartthull work neighbour on motors. Asked why and he said hates the updates on search :( http://t.co/XvmTUikWln
why?:("@tahuodyy: sialan:( https://t.co/Hv1i0xcrL2"
Athabasca glacier was there in #1948 :-( #athabasca #glacier #jasper #jaspernationalpark #alberta #explorealberta #… http://t.co/dZZdqmf7Cz


In [2]:
# preprocess them and show them again
import re
processed_twt = [preprocess(twt) for twt in negTwt]
p_twt = []
for twt in processed_twt:
    twt = re.sub('\'m', 'm', twt)
    twt = re.sub('\'t', 't', twt)
    twt = re.sub('\'s', '', twt)
    p_twt.append(twt)
for twt in p_twt[:10]:
    print twt

hopeless for tmr :(
everything in the kids section of ikea is so cute. shame im nearly 19 in 2 months :(
AT_USER that heart sliding into the waste basket. :(
“AT_USER i hate japanese call him "bani" :( :(” me too
dang starting next week i have "work" :(
oh god, my babies' faces :( URL
AT_USER make me smile :((
AT_USER AT_USER work neighbour on motors. asked why and he said hates the updates on search :( URL
why?:("AT_USER sialan:( URL
athabasca glacier was there in 1948 :-( athabasca glacier jasper jaspernationalpark alberta explorealberta … URL


In [3]:
# remove stop words and transfer slangs
import re
stop_words = get_stopwords()
slang_dict = get_slang_dict()
doc_set_removed = []
for twt in p_twt:
    removed = []
    words = twt.split()
    for w in words:
        # strip punctuation
        w = w.strip('\'"?,.')
        # check if the word stats with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        # ignore if it is a stop word
        if w.strip() in slang_dict:
            w = slang_dict[w.strip()]
        if w in stop_words or val is None:
            continue
        else:
            removed.append(w)
    doc_set_removed.append(removed)
print doc_set_removed[1]

[u'kids', u'section', u'ikea', u'cute', u'shame', 'Instant Message', u'nearly', u'months']


In [4]:
# tokenization
doc_set = []
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
for twt in doc_set_removed:
    tokens = doc_set.append(tokenizer.tokenize(' '.join(twt)))
print doc_set[1]

[u'kids', u'section', u'ikea', u'cute', u'shame', u'Instant', u'Message', u'nearly', u'months']


In [5]:
# stemming
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
texts = []
for txt in doc_set:
    texts.append([p_stemmer.stem(i) for i in txt])
print texts[1]

[u'kid', u'section', u'ikea', u'cute', u'shame', u'Instant', u'Messag', u'nearli', u'month']


In [6]:
# construct document matrix
from gensim import corpora, models
dictionary = corpora.Dictionary(texts)
# convert to bag of words
corpus = [dictionary.doc2bow(text) for text in texts]
print corpus[0]

[(0, 1), (1, 1)]




In [7]:
# apply lda model, this might take a long time
import gensim
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word=dictionary, passes=100)

In [8]:
ldamodel.print_topics(num_topics=50, num_words=10)

[(0,
  u'0.081*"friend" + 0.048*"nice" + 0.045*"left" + 0.033*"money" + 0.033*"hello" + 0.030*"name" + 0.027*"that" + 0.024*"sa" + 0.023*"disappoint" + 0.020*"bet"'),
 (1,
  u'0.146*"wish" + 0.058*"cute" + 0.038*"weather" + 0.035*"cri" + 0.028*"thesi" + 0.017*"beauti" + 0.014*"be" + 0.013*"probabl" + 0.011*"horribl" + 0.010*"one"'),
 (2,
  u'0.155*"didnt" + 0.059*"ice" + 0.045*"cream" + 0.041*"phone" + 0.037*"gone" + 0.026*"anymor" + 0.023*"pretti" + 0.021*"fix" + 0.021*"terribl" + 0.018*"time"'),
 (3,
  u'0.147*"look" + 0.096*"fuck" + 0.087*"tire" + 0.041*"make" + 0.029*"read" + 0.026*"text" + 0.024*"tonight" + 0.022*"say" + 0.021*"avail" + 0.020*"book"'),
 (4,
  u'0.079*"meet" + 0.069*"head" + 0.047*"god" + 0.037*"holiday" + 0.029*"post" + 0.025*"take" + 0.024*"crazi" + 0.023*"fan" + 0.021*"enjoy" + 0.018*"my"'),
 (5,
  u'0.154*"peopl" + 0.041*"die" + 0.036*"job" + 0.035*"sound" + 0.028*"news" + 0.024*"link" + 0.022*"world" + 0.018*"forget" + 0.017*"joke" + 0.017*"hand"'),
 (6,
  u'0

In [9]:
import pyLDAvis.gensim
vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis_data)