# pyLDAvis Topic Modeling

In [1]:
# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle 

#sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [4]:
import warnings
warnings.filterwarnings('ignore')

  and should_run_async(code)


In [2]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [5]:
# Load the pre-processed tweets saved by an earlier step.
# NOTE: pickle.load can execute arbitrary code, so only open pickles this project produced itself.
with open('../data_files/processed_tweets.pickle', 'rb') as pickle_fh:
    tweets = pickle.load(pickle_fh)

In [6]:
tweets.head()

Unnamed: 0,text,processed
0,#IslamKills Are you trying to say that there w...,islam kill try say terrorist attack europe ref...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trump apologize attack little
2,RT @ltapoll: Who was/is the best president of ...,well president past retweet
3,RT @jww372: I don't have to guess your religio...,guess religion christmas aftermath
4,RT @Shareblue: Pence and his lawyers decided w...,pence lawyer decide official email public can see


In [7]:
# Keep only the cleaned text column; this is what the vectorizer consumes.
proc_tweets = tweets['processed']

In [8]:
# Bag-of-words vectorizer: tokens are runs of two or more lowercase letters,
# each term is recorded as present/absent per tweet (binary=True), and
# scikit-learn's built-in English stop-word list is dropped.
vectorizer = CountVectorizer(
    stop_words='english',
    binary=True,
    token_pattern=r"\b[a-z][a-z]+\b",
)

In [9]:
# Learn the vocabulary from the processed tweets and build the document-term matrix in one call.
dtm_tf = vectorizer.fit_transform(proc_tweets)
# Shape is (number of tweets, vocabulary size).
print(dtm_tf.shape)

(203482, 77793)


## 4 Topics

In [10]:
%%time

# Fit a 4-topic LDA model on the document-term matrix; random_state=42 fixes the random seed.
# %%time reports how long the fit takes.
lda_4 = LatentDirichletAllocation(n_components=4, random_state=42)
lda_4.fit(dtm_tf)

CPU times: user 6min 2s, sys: 895 ms, total: 6min 3s
Wall time: 6min 3s


LatentDirichletAllocation(n_components=4, random_state=42)

In [11]:
# Build the pyLDAvis visualization for the 4-topic model; as the cell's last expression it is displayed inline.
pyLDAvis.sklearn.prepare(lda_4, dtm_tf, vectorizer)

**Topic 1: 2016 Election**
* Topics around the 2016 US Presidential Election 
* Main entities: Trump, Clinton, Obama
* Additional keywords: president, campaign, election, maga, email, fbi

**Topics 2 and 4: General Twitter topics**
* These two topics capture more general Twitter discussion
* Black Lives Matter and associated themes (police violence, the Oscars and its lack of diversity)
* Some German topics: tweets about Angela Merkel and Germany's stance on Brexit
* ISIS and the 2016 Brussels bombings 
* Thanksgiving and Christmas holidays

**Topic 3: Non-election conservative/Christian Twitter topics**
* Tweets about the Patriot Journalist Network — [a conservative Christian 'news' network known for spamming](http://www.slate.com/blogs/future_tense/2017/10/17/twitter_has_labeled_a_conservative_group_s_automated_tweets_as_spam.html)
* Ted Cruz
* Tea Party

## 6 Topics

In [12]:
%%time

# Fit a 6-topic LDA model on the same document-term matrix, using the same fixed seed as the other runs.
lda_6 = LatentDirichletAllocation(n_components=6, random_state=42)
lda_6.fit(dtm_tf)

CPU times: user 4min 57s, sys: 393 ms, total: 4min 57s
Wall time: 4min 57s


LatentDirichletAllocation(n_components=6, random_state=42)

In [13]:
# Build the pyLDAvis visualization for the 6-topic model.
pyLDAvis.sklearn.prepare(lda_6, dtm_tf, vectorizer)

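**Topic 1:**
* TODO: summarize this topic

**Topic 2:**
* TODO: summarize this topic

**Topic 3:**
* TODO: summarize this topic

**Topic 4:**
* TODO: summarize this topic

**Topic 5:**
* TODO: summarize this topic

**Topic 6:**
* TODO: summarize this topic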


In [24]:
# Per-tweet topic distribution: one row per tweet, one column per topic.
# lda_6 is already fitted above, so transform() is enough. fit_transform()
# would repeat the multi-minute fit with the same seed before transforming.
lda_doc_matrix = lda_6.transform(dtm_tf)

In [25]:
lda_doc_matrix

array([[0.01675371, 0.01673653, 0.01674448, 0.01678994, 0.01674582,
        0.91622953],
       [0.02795393, 0.02812606, 0.02782292, 0.86003962, 0.0280738 ,
        0.02798367],
       [0.0421523 , 0.04166824, 0.04206904, 0.04210744, 0.78980418,
        0.0421988 ],
       ...,
       [0.88001883, 0.02402086, 0.02383027, 0.02402325, 0.02413116,
        0.02397562],
       [0.0242457 , 0.02398096, 0.21105365, 0.69263077, 0.02391857,
        0.02417035],
       [0.42841329, 0.01870377, 0.49710718, 0.01858861, 0.01863595,
        0.0185512 ]])

In [27]:
lda_doc_matrix.shape

(203482, 6)

In [42]:
# Wrap the document-topic array in a DataFrame with one named column per topic.
doc_matrix = pd.DataFrame(
    data=lda_doc_matrix,
    columns=['one', 'two', 'three', 'four', 'five', 'six'],
)

In [43]:
doc_matrix.head()

Unnamed: 0,one,two,three,four,five,six
0,0.016754,0.016737,0.016744,0.01679,0.016746,0.91623
1,0.027954,0.028126,0.027823,0.86004,0.028074,0.027984
2,0.042152,0.041668,0.042069,0.042107,0.789804,0.042199
3,0.831839,0.034092,0.033443,0.033333,0.033442,0.033851
4,0.023893,0.023874,0.023912,0.880432,0.023982,0.023907


In [44]:
# Label each tweet with its highest-weight topic.
# Restrict idxmax to the six topic columns so the cell can be re-run safely:
# once the string columns 'label' / 'tweet' exist, a frame-wide idxmax would
# compare strings with floats instead of comparing topic weights.
doc_matrix['label'] = doc_matrix[['one', 'two', 'three', 'four', 'five', 'six']].idxmax(axis=1)

In [45]:
doc_matrix.head()

Unnamed: 0,one,two,three,four,five,six,label
0,0.016754,0.016737,0.016744,0.01679,0.016746,0.91623,six
1,0.027954,0.028126,0.027823,0.86004,0.028074,0.027984,four
2,0.042152,0.041668,0.042069,0.042107,0.789804,0.042199,five
3,0.831839,0.034092,0.033443,0.033333,0.033442,0.033851,one
4,0.023893,0.023874,0.023912,0.880432,0.023982,0.023907,four


In [46]:
# Attach the original tweet text by position, not by index label.
# Row i of doc_matrix was derived from row i of tweets (tweets -> proc_tweets
# -> dtm_tf -> lda_doc_matrix). Assigning the raw values avoids pandas index
# alignment inserting NaN or pairing the wrong rows if tweets does not carry
# a default 0..n-1 index.
doc_matrix['tweet'] = tweets['text'].to_numpy()

In [48]:
# Reorder the columns for presentation: tweet text and label first, then the six topic weights.
doc_matrix_df = doc_matrix.loc[
    :, ['tweet', 'label', 'one', 'two', 'three', 'four', 'five', 'six']
]

In [49]:
doc_matrix_df

Unnamed: 0,tweet,label,one,two,three,four,five,six
0,#IslamKills Are you trying to say that there w...,six,0.016754,0.016737,0.016744,0.016790,0.016746,0.916230
1,"Clinton: Trump should’ve apologized more, atta...",four,0.027954,0.028126,0.027823,0.860040,0.028074,0.027984
2,RT @ltapoll: Who was/is the best president of ...,five,0.042152,0.041668,0.042069,0.042107,0.789804,0.042199
3,RT @jww372: I don't have to guess your religio...,one,0.831839,0.034092,0.033443,0.033333,0.033442,0.033851
4,RT @Shareblue: Pence and his lawyers decided w...,four,0.023893,0.023874,0.023912,0.880432,0.023982,0.023907
...,...,...,...,...,...,...,...,...
203477,"RT @AndreaChalupa: In intel circles, the story...",four,0.314942,0.011947,0.011943,0.637246,0.011955,0.011966
203478,RT @KansasCityDNews: Tonganoxie police: Middle...,two,0.016694,0.572151,0.016829,0.016724,0.016769,0.360834
203479,RT @signsinyork: Getting the right #company lo...,one,0.880019,0.024021,0.023830,0.024023,0.024131,0.023976
203480,The Latest: Obama affirms continuity of ties w...,four,0.024246,0.023981,0.211054,0.692631,0.023919,0.024170


In [50]:
# with open('../data_files/doc_matrix_df.pickle', 'wb') as to_write:
#    pickle.dump(doc_matrix_df, to_write)

## 10 Topics

In [14]:
%%time

# Fit a 10-topic LDA model on the same document-term matrix, using the same fixed seed as the other runs.
lda_10 = LatentDirichletAllocation(n_components=10, random_state=42)
lda_10.fit(dtm_tf)

CPU times: user 4min 38s, sys: 407 ms, total: 4min 38s
Wall time: 4min 38s


LatentDirichletAllocation(random_state=42)

In [15]:
# Build the pyLDAvis visualization for the 10-topic model.
pyLDAvis.sklearn.prepare(lda_10, dtm_tf, vectorizer)

**TODO:** add notes about this model.

## 13 Topics

In [16]:
%%time

# Fit a 13-topic LDA model on the same document-term matrix, using the same fixed seed as the other runs.
lda_13 = LatentDirichletAllocation(n_components=13, random_state=42)
lda_13.fit(dtm_tf)

CPU times: user 4min 28s, sys: 375 ms, total: 4min 28s
Wall time: 4min 28s


LatentDirichletAllocation(n_components=13, random_state=42)

In [17]:
# Build the pyLDAvis visualization for the 13-topic model.
pyLDAvis.sklearn.prepare(lda_13, dtm_tf, vectorizer)

## 16 Topics

In [20]:
%%time

# Fit a 16-topic LDA model on the same document-term matrix, using the same fixed seed as the other runs.
lda_16 = LatentDirichletAllocation(n_components=16, random_state=42)
lda_16.fit(dtm_tf)

CPU times: user 4min 29s, sys: 446 ms, total: 4min 29s
Wall time: 4min 29s


LatentDirichletAllocation(n_components=16, random_state=42)

In [21]:
# Build the pyLDAvis visualization for the 16-topic model.
pyLDAvis.sklearn.prepare(lda_16, dtm_tf, vectorizer)