In [19]:
%run ../00_AdvancedPythonConcepts/talktools.py

# Topic Modelling

<font color="grey">Python for Data Science (AY250, UC Berkeley 2016, J. Bloom)</font>

What were the topics tweeted about during the 3rd Presidential Debate? We can use LDA to help us find themes.



a) Get Twitter data from the 3rd Debate (Oct 19, 2016) via https://github.com/chrisalbon/third_2016_presidential_debate_twitter

In [None]:
# this takes awhile!
%run get_tweets.py
retrieve_tweets("clean_data.csv","tweet_debate.csv")

b) Clean the text and tokenize it similar to what we saw with NLTK last week

In [None]:
#!pip install tweet-preprocessor
#conda install --channel mpi4py mpich mpi4py
#sudo mkdir -p /opt
#sudo ln -s ~/anaconda /opt/anaconda1anaconda2anaconda3
#!pip install pyLDAvis
#!pip install gensim

In [20]:
import pandas as pd
import preprocessor as p
df = pd.read_csv("tweet_debate.csv",usecols=["text","id"],index_col=["id"])

In [21]:
print(len(df))

7441


In [25]:
df.head(20)

Unnamed: 0_level_0,text,clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1
788826951897186304,Complaint Filed Against Clinton’s Campaign Fol...,Complaint Filed Against Clinton’s Campaign Fol...
788826954837307392,"These reasons are not just for Millennials. ""W...","These reasons are not just for Millennials. ""W..."
788826954707374080,Un Resumen De Lo Atroz E Irresponsable Que Ser...,Un Resumen De Lo Atroz E Irresponsable Que Ser...
788826955361640448,@Talkmaster Here's something I think u'll enjo...,$MENTION$ Here's something I think u'll enjoy!...
788826956435451904,'How to Commit Voter Fraud on a Massive Scale'...,'How to Commit Voter Fraud on a Massive Scale'...
788826959241289728,#Security#Character Hillary Clinton's security...,$HASHTAG$$HASHTAG$ Hillary Clinton's security ...
788826958402428928,@KimwithpanacheG @GideonHenry I'm all in for T...,$MENTION$ $MENTION$ I'm all in for Trump! Here...
788826960499703809,McAuliffe considered as potential Clinton runn...,McAuliffe considered as potential Clinton runn...
788826961460158464,"Let's ban the phrase ""edges Trump"" https://t.c...","Let's ban the phrase ""edges Trump"" $URL$"
788826961883914240,Lin-Manuel Miranda and Renée Elise Goldsberry ...,Lin-Manuel Miranda and Renée Elise Goldsberry ...


In [23]:
#p.set_options(p.OPT.URL, p.OPT.EMOJI,p.OPT.MENTION,p.OPT.SMILEY)
df["clean"] = df["text"].apply(p.tokenize)

In [24]:
df["clean"].head()

id
788826951897186304    Complaint Filed Against Clinton’s Campaign Fol...
788826954837307392    These reasons are not just for Millennials. "W...
788826954707374080    Un Resumen De Lo Atroz E Irresponsable Que Ser...
788826955361640448    $MENTION$ Here's something I think u'll enjoy!...
788826956435451904    'How to Commit Voter Fraud on a Massive Scale'...
Name: clean, dtype: object

In [32]:
# following http://nbviewer.jupyter.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=2&lambda=0.57&term=
%run make_corpus.py

In [33]:
make_corpus(df["clean"].values)

c) Run LDA on the corpus

In [34]:
from gensim import corpora, models, similarities

In [35]:
lda_params      = {'num_topics': 20, 'passes': 20, 'alpha': 0.001}

# Load the corpus and Dictionary
corpus = corpora.MmCorpus("debate3.mm")
dictionary = corpora.Dictionary.load("debate3.dict")

print("Running LDA with: %s  " % lda_params)
lda = models.LdaModel(corpus, id2word=dictionary,
                      num_topics=lda_params['num_topics'],
                      passes=lda_params['passes'],
                      alpha = lda_params['alpha'])
lda.save("debate3.lda")

Running LDA with: {'passes': 20, 'num_topics': 20, 'alpha': 0.001}  


In [36]:
lda = models.LdaModel.load("debate3.lda")
lda.print_topics()

[(11,
  '0.074*clinton + 0.054*trump + 0.052*hillary + 0.049*video + 0.042*keefe + 0.037*election + 0.030*rigged + 0.029*que + 0.026*number + 0.021*las'),
 (18,
  '0.156*number + 0.082*trump + 0.071*clinton + 0.032*watch + 0.024*poll + 0.017*isis + 0.017*syria + 0.014*defeating + 0.012*hillary + 0.012*white'),
 (7,
  '0.058*clinton + 0.052*trump + 0.025*hillary + 0.022*question + 0.021*says + 0.018*better + 0.018*every + 0.017*answer + 0.015*another + 0.015*cnn'),
 (8,
  '0.055*trump + 0.053*clinton + 0.030*number + 0.025*hillary + 0.022*news + 0.014*women + 0.014*smiley + 0.013*chance + 0.013*never + 0.012*fraud'),
 (19,
  '0.104*obama + 0.093*clinton + 0.059*court + 0.056*supreme + 0.048*promised + 0.046*scalia + 0.046*spot + 0.042*trump + 0.031*women + 0.029*campaign'),
 (5,
  '0.072*clinton + 0.058*hillary + 0.020*twitter + 0.019*breaking + 0.017*conservative + 0.017*vote + 0.016*war + 0.015*emails + 0.015*vogue + 0.014*una'),
 (6,
  '0.070*clinton + 0.052*hillary + 0.026*smiley + 

In [37]:
import pyLDAvis.gensim

debate_data =  pyLDAvis.gensim.prepare(lda,corpus, dictionary)
pyLDAvis.display(debate_data)