In [2]:
# Run the shared talktools.py script from ../00_AdvancedPythonConcepts.
# NOTE(review): what it sets up is not visible from this notebook — confirm.
%run ../00_AdvancedPythonConcepts/talktools.py

# Topic Modelling

<font color="grey">Python for Data Science (AY250, UC Berkeley 2016, J. Bloom)</font>

What topics were tweeted about during the 3rd Presidential Debate? We can use latent Dirichlet allocation (LDA) to help us find the themes.



a) Get Twitter data from the 3rd Debate (Oct 19, 2016) via https://github.com/chrisalbon/third_2016_presidential_debate_twitter

In [None]:
# this takes awhile!
%run get_tweets.py
retrieve_tweets("clean_data.csv","tweet_debate.csv")

b) Clean the text and tokenize it, similar to what we saw with NLTK last week

In [None]:
# One-time environment setup commands, kept commented out. Run them manually
# if the imports in the cells below (preprocessor, gensim, pyLDAvis) fail.
# NOTE(review): the conda/sudo lines look like a machine-specific workaround —
# confirm they are still needed before running them.
#!pip install tweet-preprocessor
#conda install --channel mpi4py mpich mpi4py
#sudo mkdir -p /opt
#sudo ln -s ~/anaconda /opt/anaconda1anaconda2anaconda3
#!pip install pyLDAvis
#!pip install gensim

In [1]:
import pandas as pd
import preprocessor as p

# Load only the tweet text from the CSV, indexed by tweet id.
df = pd.read_csv(
    "tweet_debate.csv",
    usecols=["text", "id"],
    index_col=["id"]
)

In [2]:
# Number of tweets loaded (one row per tweet).
print(df.shape[0])

7441


In [5]:
# Preview the first few rows to sanity-check the load.
df.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
788826951897186304,Complaint Filed Against Clinton’s Campaign Fol...
788826954837307392,"These reasons are not just for Millennials. ""W..."
788826954707374080,Un Resumen De Lo Atroz E Irresponsable Que Ser...
788826955361640448,@Talkmaster Here's something I think u'll enjo...
788826956435451904,'How to Commit Voter Fraud on a Massive Scale'...


In [6]:
# Add a "clean" column holding the tokenized form of each tweet's text.
# With no options set, tokenize() is expected to replace every entity type it
# recognises with a placeholder token — TODO confirm against the library docs.
# Uncomment the next line to restrict that to URLs, emoji, mentions and smileys.
#p.set_options(p.OPT.URL, p.OPT.EMOJI,p.OPT.MENTION,p.OPT.SMILEY)
df["clean"] = df["text"].map(p.tokenize)

In [9]:
# following http://nbviewer.jupyter.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=2&lambda=0.57&term=
# Run the local make_corpus.py script; the next cell calls make_corpus(), which
# is expected to be defined by it — confirm.
%run make_corpus.py

In [10]:
# Build the corpus from the tokenized tweets. NOTE(review): presumably this is
# what writes debate3.mm and debate3.dict, which the LDA cell loads — confirm.
make_corpus(df["clean"].values)

c) Run LDA on the corpus

In [12]:
from gensim import corpora, models, similarities

In [14]:
# LDA hyperparameters: number of topics, training passes over the corpus, and
# the alpha prior passed to the model.
lda_params      = {'num_topics': 20, 'passes': 20, 'alpha': 0.001}

# Fixed seed so that re-running this cell reproduces the same topics; LDA
# training is randomly initialised and otherwise differs from run to run.
LDA_SEED = 42

# Load the corpus and Dictionary
corpus = corpora.MmCorpus("debate3.mm")
dictionary = corpora.Dictionary.load("debate3.dict")

print("Running LDA with: %s  " % lda_params)
# Unpack lda_params directly so the printed settings and the settings actually
# used cannot drift apart.
lda = models.LdaModel(corpus, id2word=dictionary,
                      random_state=LDA_SEED,
                      **lda_params)
# Persist the trained model so later cells can reload it from disk.
lda.save("debate3.lda")

Running LDA with: {'passes': 20, 'num_topics': 20, 'alpha': 0.001}  


In [17]:
# Reload the model that the training cell saved to debate3.lda and show its
# topics as weighted word lists.
# NOTE(review): how many topics print_topics() shows by default, and in what
# order, may depend on the installed gensim version — confirm.
lda = models.LdaModel.load("debate3.lda")
lda.print_topics()

[(15,
  '0.282*mention + 0.058*clinton + 0.051*url + 0.021*bill + 0.019*trump + 0.018*hillary + 0.013*sexual + 0.012*assault + 0.012*time + 0.012*campaign'),
 (1,
  '0.062*clinton + 0.045*url + 0.038*mention + 0.030*bill + 0.021*hilary + 0.021*women + 0.019*rights + 0.018*hillary + 0.014*hashtag + 0.014*reporter'),
 (10,
  '0.068*url + 0.057*clinton + 0.040*mention + 0.026*trump + 0.020*number + 0.019*wikileaks + 0.017*google + 0.016*media + 0.013*campaign + 0.010*used'),
 (14,
  '0.470*hashtag + 0.050*url + 0.020*clinton + 0.017*mention + 0.013*number + 0.010*trump + 0.010*hillary + 0.006*media + 0.005*says + 0.004*vote'),
 (16,
  '0.068*url + 0.065*clinton + 0.063*mention + 0.045*hillary + 0.032*emails + 0.014*security + 0.012*house + 0.012*white + 0.012*voter + 0.012*russia'),
 (9,
  '0.061*url + 0.057*clinton + 0.028*mention + 0.022*trump + 0.019*hillary + 0.015*court + 0.015*number + 0.013*supreme + 0.013*everyone + 0.012*obama'),
 (8,
  '0.061*url + 0.040*clinton + 0.031*trump + 

In [18]:
import pyLDAvis

# pyLDAvis 3.3 renamed the gensim adapter module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs on either side of the rename.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Convert the fitted model, corpus and dictionary into pyLDAvis's input
# structure, then render the interactive topic browser inline.
debate_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(debate_data)