In [101]:
# Execute the shared talktools.py script from the sibling course directory in
# this notebook's namespace (presumably presentation helpers; source not shown here).
%run ../00_AdvancedPythonConcepts/talktools.py

# Topic Modelling

<font color="grey">Python for Data Science (AY250, UC Berkeley 2016&mdash;2018, J. Bloom)</font>

What recent topics are people tweeting about near Berkeley, CA, in tweets that contain the word "Bears"?

We can use Latent Dirichlet Allocation (LDA; http://ai.stanford.edu/~ang/papers/jair03-lda.pdf) to help us find themes.

LDA is "a generative probabilistic model for collections of discrete dataset such as text corpora. It is also a topic model that is used for discovering abstract topics from a collection of documents."

<img src="http://mcburton.net/blog/joy-of-tm/images/image02.png">

- http://scikit-learn.org/stable/modules/decomposition.html#latentdirichletallocation


### Get the IDs of tweets sent from near Berkeley, CA that mention "Bears"

In [None]:
import pandas as pd
import tweepy
import csv

import json

# Read the Twitter API credentials from a local JSON file so that no secret is
# hard-coded in the notebook. The context manager closes the file handle as
# soon as the JSON has been parsed (the bare open() call leaked it).
with open(".cred.json", "r") as cred_file:
    cred = json.load(cred_file)

# The four OAuth values used to authenticate below; a missing key raises
# KeyError here rather than surfacing later as an authentication failure.
consumer_key = cred["consumer_key"]
consumer_secret = cred["consumer_secret"]
access_token = cred["access_token"]
access_secret = cred["access_secret"]

In [None]:
# Authenticate with the application (consumer) credentials, then attach the
# user's access token to the same handler.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# API client configured to wait (with a notification) when rate-limited and to
# retry up to 3 times, 5 seconds apart, on the listed HTTP status codes.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
    retry_count=3,
    retry_delay=5,
    retry_errors={401, 404, 500, 503},
)

In [None]:
# Maximum number of tweet IDs to collect.
n = 2500
ids = []

# Search for "bears" within 100 km of the given latitude/longitude, restricted
# to tweets newer than the given tweet ID.
a = tweepy.Cursor(api.search, q='bears', geocode="37.8716,-122.2727,100km",since_id=874829847523414016)

# Let the cursor enforce the limit: items(n) yields at most n results. The
# previous countdown tested `n < 0` only after appending, so it collected
# n + 1 IDs (2501 instead of 2500).
for count, t in enumerate(a.items(n), start=1):
    ids.append(t.id)
    remaining = n - count
    # Progress indicator: print the number of IDs still to fetch every 100 tweets.
    if remaining % 100 == 0:
        print(remaining, end="...")

In [None]:
# Number of tweet IDs gathered into `ids`.
len(ids)

In [None]:
import pandas as pd

# Put the collected IDs in a single-column frame and persist it without the
# row index, so the CSV contains just a "bears_ids" header and one ID per line.
df = pd.DataFrame({"bears_ids": ids})
df.to_csv("bears.csv", index=False)

In [None]:
# Show the first lines of the CSV just written, as a quick sanity check.
!head bears.csv

### Get the body of text from those tweets

In [None]:
# This takes a while!
# Run get_tweets.py in the notebook namespace; it is expected to define
# retrieve_tweets (its source is not shown here).
%run get_tweets.py
# NOTE(review): presumably reads the tweet IDs from bears.csv and writes the
# fetched tweets to tweet_bears.csv -- confirm against get_tweets.py.
retrieve_tweets("bears.csv","tweet_bears.csv")

In [None]:
#!pip install tweet-preprocessor
#!conda install --channel mpi4py mpich mpi4py -y

In [None]:
#!pip install pyLDAvis
#!pip install gensim

In [None]:
import pandas as pd
import preprocessor as p

# Load only the tweet text and ID columns, using the ID as the row index.
df = pd.read_csv(
    "tweet_bears.csv",
    usecols=["text", "id"],
    index_col=["id"],
)

In [None]:
# Number of rows loaded from tweet_bears.csv.
print(len(df))

In [None]:
# Preview the first 20 rows of the loaded tweets.
df.head(20)

In [None]:
import re

# Configure the tweet preprocessor with the URL, emoji, smiley, hashtag and
# @-mention options; these are the option set p.clean() applies below.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.HASHTAG, p.OPT.MENTION)

# Match the retweet marker only as a standalone token. A plain
# str.replace("RT", "") also deleted the letters from ordinary upper-case
# words (e.g. "START" -> "STA", "PARTY" -> "PAY").
_RETWEET_MARKER = re.compile(r"\bRT\b")


def _drop_retweet_marker_and_colons(text):
    """Return ``text`` without standalone "RT" markers and without colons."""
    return _RETWEET_MARKER.sub("", text).replace(":", "")


# Cleaned text is stored next to the raw text so both remain available.
df["clean"] = df["text"].apply(p.clean).apply(_drop_retweet_marker_and_colons)

In [None]:
# Display the cleaned tweet text.
df["clean"]

## Generate the LDA topics

In [None]:
# Run make_corpus.py in the notebook namespace; it is expected to define
# make_corpus, which the next cell calls (its source is not shown here).
%run make_corpus.py

In [None]:
# NOTE(review): make_corpus comes from make_corpus.py (not shown). Presumably
# it builds a gensim dictionary and bag-of-words corpus from the cleaned tweets
# and writes them to bears.dict and bears.mm, which are loaded below -- confirm.
make_corpus(df["clean"].values, outdictfile='bears.dict',mmfile='bears.mm')

In [None]:
from gensim import corpora, models, similarities

In [None]:
# Keyword arguments forwarded verbatim to gensim's LdaModel. LDA training is
# stochastic; random_state fixes the model's random number generator so that
# re-running the notebook on the same corpus is repeatable.
lda_params = {'num_topics': 10, 'passes': 25, 'alpha': 0.001, 'random_state': 42}

# Load the corpus and Dictionary
corpus = corpora.MmCorpus("bears.mm")
dictionary = corpora.Dictionary.load("bears.dict")

print("Running LDA with: %s  " % lda_params)
# Unpack the dict so that the parameters printed above are exactly the ones
# the model is trained with.
lda = models.LdaModel(corpus, id2word=dictionary, **lda_params)

# Persist the trained model so later cells can reload it without retraining.
lda.save("bears.lda")

In [None]:
# Reload the saved model from disk and show its topics.
lda = models.LdaModel.load("bears.lda")
lda.print_topics()

## Visualize

In [None]:
import pyLDAvis

# Newer pyLDAvis releases renamed the gensim adapter module from
# pyLDAvis.gensim to pyLDAvis.gensim_models; try the new name first and fall
# back to the old one so the cell runs with either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Build the visualization data from the trained model, the corpus it was
# trained on and the matching dictionary, then render it inline.
# (The variable name `debate_data` is kept unchanged for compatibility.)
debate_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(debate_data)