In [76]:
from __future__ import print_function

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from langdetect import detect
from langdetect import detect_langs

In [2]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option('display.max_columns', None)

In [3]:
# Load the cleaned per-candidate tweet files, one DataFrame per candidate.
# The files are read in the same order as the names on the left-hand side.
buttigieg, warren, bernie, biden, amy, bloomberg, trump = (
    pd.read_csv(csv_path)
    for csv_path in (
        'buttigieg_cleaned.csv',
        'warren_cleaned.csv',
        'bernie_cleaned.csv',
        'biden_cleaned.csv',
        'klobuchar_cleaned.csv',
        'bloomberg_cleaned.csv',
        'trump_cleaned.csv',
    )
)

In [4]:
# Pool the 'tweet' column of every candidate frame into one flat list,
# keeping the candidate order buttigieg -> trump.
all_tweets = [
    tweet
    for candidate_df in (buttigieg, warren, bernie, biden, amy, bloomberg, trump)
    for tweet in candidate_df['tweet']
]

In [5]:
# Sanity check: total number of tweets pooled across all candidate files.
len(all_tweets)

90824

In [93]:
# Build the custom stop-word list: NLTK's English stop words plus a
# hand-picked list of candidate names/handles and other terms to exclude
# from the topic model.
nltk.download('stopwords')

# Kept as a plain list (not a set): it is extended below and later passed
# as `stop_words=` to the vectorizer.
stop_words = stopwords.words("english")

political_stopwords = ['mikebloomberg', 'ewarren', 'senwarren', 'berniesanders', 'sensanders', 'joebiden', 'mike', 'michael', 'cream', 'pony',
                       'petebuttigieg', 'sentinasmith', 'sen', 'amyklobuchar', 'peter', 'donaldtrump', 'realdonaldtrump', 'oscar', 'oscars',
                       'madoff', 'taupin', 'joseph', 'i', 'says', 'yang', 'steyer', 'tom', 'andrew', 'elizabeth', 'joe', 'donald', 'medal',
                      'song', 'elton', 'john', 'rocketman', 'mitt', 'romney', 'ponzi', 'kobe', 'news', 'wine', 'meatball', 'sexually', 'original',
                      'best', 'diane', 'palestinian', 'every', 'alexander', 'rush', 'limbaugh', 'vindman', 'said', 'say', 'mary', 'winstead',
                      'people', 'like', 'think', 'one', 'would', 'get', 'know', 'want', 'going', 'also', 'even', 'voters', 'votes', 'new', 'need',
                      'year', 'take', 'good', 'tonight', 'see', 'still', 'back', 'big', 'night', 'day', 'love', 'first', 'last', 'could', 'time',
                      'thing', 'city', 'town', 'hall', 'care', 'right', 'long', 'well', 'country', 'make', 'kamala', 'harris', 'dog', 'faced',
                      'soldier', 'bro', 'judy', 'super', 'mini', 'superbowl']

# Append the custom terms in one call instead of an index-based loop.
stop_words.extend(political_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Robert/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [94]:
# Raw term-count (bag-of-words) document-term matrix over all tweets.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',  # purely alphabetic tokens, 3+ letters
    ngram_range=(1, 2),                 # unigrams and bigrams
    min_df=10,                          # int: keep terms found in at least 10 tweets
    max_df=0.5,                         # float: drop terms found in over half the tweets
)
dtm_tf = tf_vectorizer.fit_transform(all_tweets)
print(dtm_tf.shape)

(90824, 17335)


In [95]:
# TF-IDF counterpart of the count matrix: the TfidfVectorizer is configured
# with exactly the parameters of `tf_vectorizer` and fitted on the same tweets.
tfidf_vectorizer = TfidfVectorizer(
    **tf_vectorizer.get_params()
)
dtm_tfidf = tfidf_vectorizer.fit_transform(all_tweets)
print(dtm_tfidf.shape)



(90824, 17335)


In [96]:
# Fit a 7-topic LDA model on the raw term counts; random_state is fixed so
# the fit is repeatable.
# NOTE(review): 7 presumably mirrors the seven candidate files loaded above - confirm.
lda_tf = LatentDirichletAllocation(
    n_components=7,
    random_state=0,
)
tf_fit = lda_tf.fit(dtm_tf)
tf_fit

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [107]:
# Log-likelihood of the count matrix under the count-based model: higher is better.
print(
    "Log Likelihood: ",
    lda_tf.score(dtm_tf),
)

# Perplexity = exp(-1. * log-likelihood per word): lower is better.
print(
    "Perplexity: ",
    lda_tf.perplexity(dtm_tf),
)

Log Likelihood:  -9794002.038919557
Perplexity:  3899.3578245044473


In [97]:
# Fit a second 7-topic LDA model, this time on the TF-IDF weighted matrix,
# using the same seed as the count-based model.
# NOTE(review): LDA is formulated over word counts; treat the TF-IDF fit as
# exploratory and confirm it is intended before relying on its topics.
lda_tfidf = LatentDirichletAllocation(
    n_components=7,
    random_state=0,
)
tfidf_fit = lda_tfidf.fit(dtm_tfidf)
tfidf_fit

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [105]:
# Log-likelihood of the TF-IDF matrix under the TF-IDF-based model: higher is better.
# NOTE(review): computed on a different matrix than the count-based model's
# score, so the two sets of numbers are likely not directly comparable - confirm.
print(
    "Log Likelihood: ",
    lda_tfidf.score(dtm_tfidf),
)

# Perplexity = exp(-1. * log-likelihood per word): lower is better.
print(
    "Perplexity: ",
    lda_tfidf.perplexity(dtm_tfidf),
)

Log Likelihood:  -2873028.5040659616
Perplexity:  15716.662230788423
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'batch', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 10, 'mean_change_tol': 0.001, 'n_components': 7, 'n_jobs': None, 'perp_tol': 0.1, 'random_state': 0, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [101]:
# Interactive pyLDAvis view of the count-based LDA model (default layout).
tf_model = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
)
tf_model

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [100]:
# Interactive pyLDAvis view of the TF-IDF-based LDA model (default layout).
tfidf_model = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
)
tfidf_model

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [99]:
# Top model: count-based LDA visualised with the 'mmds' layout option.
# NOTE(review): the name `mmds_model` is also assigned by the TF-IDF 'mmds'
# cell, so it refers to whichever of the two cells ran last.
mmds_model = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
    mds='mmds',
)
mmds_model

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [109]:
# TF-IDF-based LDA visualised with the 'mmds' layout option.
# NOTE(review): this rebinds `mmds_model`, which the count-based 'mmds' cell
# also assigns; the name refers to whichever of the two cells ran last.
mmds_model = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
    mds='mmds',
)
mmds_model

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [98]:
# Also a top model: count-based LDA visualised with the 'tsne' layout option.
# NOTE(review): the name `tsne_model` is also assigned by the TF-IDF 'tsne'
# cell, so it refers to whichever of the two cells ran last.
tsne_model = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
    mds='tsne',
)
tsne_model

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [108]:
# TF-IDF-based LDA visualised with the 'tsne' layout option.
# NOTE(review): this rebinds `tsne_model`, which the count-based 'tsne' cell
# also assigns; the name refers to whichever of the two cells ran last.
tsne_model = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
    mds='tsne',
)
tsne_model

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
