# Playground for Topic Modeling slides - Joe Biden Tweets
- Stephen W. Thomas
- Used for MMA 865; MMAI 891; Exec Ed

In [6]:
import datetime

# Stamp the run so a reader can tell when the outputs below were produced.
run_timestamp = datetime.datetime.now()
print(run_timestamp)

2022-03-01 08:14:07.735002


In [7]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns 

import re
import string
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from bs4 import BeautifulSoup

import os

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Read in the Data

In [18]:
# Tweet archive hosted in the course's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/stepthom/NLP_course/main/data/JoeBidenTweets.csv"
df = pd.read_csv(DATA_URL)

# Column dtypes / non-null counts, then a peek at the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4694 entries, 0 to 4693
Data columns (total 7 columns):
id           4694 non-null float64
username     4694 non-null object
timestamp    4694 non-null object
link         4694 non-null object
tweet        4694 non-null object
retweets     4694 non-null int64
likes        4694 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 256.8+ KB


Unnamed: 0,id,username,timestamp,link,tweet,retweets,likes
0,1.27883e+18,JoeBiden,7/3/2020 5:00,https://twitter.com/JoeBiden/status/1278833330...,Wear a mask. pic. twitter.com/HBDMNA4ary,18191,92047
1,1.27881e+18,JoeBiden,7/3/2020 3:25,https://twitter.com/JoeBiden/status/1278809602...,Let me be clear: There’s no victory to be cele...,10644,47486
2,1.27878e+18,JoeBiden,7/3/2020 1:15,https://twitter.com/JoeBiden/status/1278776708...,This is a job Donald Trump is entirely unfit f...,15149,51443
3,1.27875e+18,JoeBiden,7/2/2020 23:18,https://twitter.com/JoeBiden/status/1278747263...,"“Mr. President, it's too much.”pic.twitter.com...",44466,137553
4,1.27873e+18,JoeBiden,7/2/2020 22:23,https://twitter.com/JoeBiden/status/1278733658...,This is a shameful move. Now is the time for e...,6357,23150


In [23]:
# Drop the tweet metadata columns, leaving only `timestamp` and `tweet`.
# errors='ignore' keeps the cell idempotent: without it, running the cell a
# second time raises KeyError because the columns have already been removed.
df = df.drop(columns=['id', 'username', 'link', 'retweets', 'likes'], errors='ignore')

# Text Preprocessing

In [24]:
from nltk.corpus import stopwords 
from bs4 import BeautifulSoup
import unidecode

import warnings

# NOTE: a module-level `with warnings.catch_warnings(): filterwarnings(...)`
# block used to sit here. It had no effect -- the filter is discarded as soon
# as the `with` block exits -- so it has been removed. preprocess() installs
# the same filter around the BeautifulSoup call, which is where it matters.

# Combined English + Spanish stop-word list, held in a set for fast lookups.
stop_words = set(stopwords.words('english') + stopwords.words('spanish'))

# Lemmatizer instance (the lemmatization step in preprocess() is currently
# commented out, so this is not used by the active pipeline).
lemmer = WordNetLemmatizer()

def preprocess(x):
    """Normalize one raw tweet into a space-separated string of content words.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup.
      2. Lower-case the text.
      3. Drop stop words, testing each token while its inner punctuation and
         accents are still intact.
      4. Transliterate the surviving tokens to ASCII with unidecode, then
         delete every character that is neither a word character nor
         whitespace.
      5. Drop stop words again on the fully cleaned tokens.

    Step 3 is the fix over the previous version, which only filtered stop
    words *after* punctuation and accents had been removed: any stop-list
    entry that itself contains an apostrophe or an accented letter could then
    never match the cleaned text. Transliteration now also happens before the
    punctuation strip, so punctuation produced by transliteration is removed
    as well.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces (may be the empty string).
    """
    # BeautifulSoup emits many DeprecationWarnings here; silence them only for
    # the duration of the parse.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        x = BeautifulSoup(x, "lxml").get_text()

    x = x.lower()

    kept = []
    for raw_token in x.split():
        # Trim non-word characters from both ends only, so inner apostrophes
        # and accented letters survive for the stop-word comparison.
        bare = re.sub(r'^\W+|\W+$', '', raw_token)
        # Check both the original and the ASCII form: the former catches
        # accented stop words, the latter catches typographic apostrophes.
        if bare in stop_words or unidecode.unidecode(bare) in stop_words:
            continue

        # Transliterate first, then strip punctuation.
        ascii_token = unidecode.unidecode(raw_token)
        cleaned = re.sub(r'[^\w\s]', '', ascii_token)

        # Optional step, currently disabled: remove numbers.
        #cleaned = re.sub(r'\d+', '', cleaned)

        # split() copes with whitespace introduced by transliteration and
        # discards tokens that were reduced to nothing.
        kept.extend(w for w in cleaned.split() if w not in stop_words)

    # Optional step, currently disabled: lemmatize.
    #kept = [lemmer.lemmatize(w) for w in kept]

    return ' '.join(kept)

# Run every raw tweet through preprocess() and keep the result next to it.
df['tweet_clean'] = df['tweet'].map(preprocess)
df.head()

Unnamed: 0,timestamp,tweet,tweet_clean,tweet_clean_filter
0,7/3/2020 5:00,Wear a mask. pic. twitter.com/HBDMNA4ary,wear mask pic twittercomhbdmna4ary,
1,7/3/2020 3:25,Let me be clear: There’s no victory to be cele...,let clear theres victory celebrated still near...,be no be were is not
2,7/3/2020 1:15,This is a job Donald Trump is entirely unfit f...,job donald trump entirely unfit forpictwitterc...,this is donald trump is
3,7/2/2020 23:18,"“Mr. President, it's too much.”pic.twitter.com...",mr president muchpictwittercomweebjipkgj,president its
4,7/2/2020 22:23,This is a shameful move. Now is the time for e...,shameful move time employers empathy families ...,this is is time for have for its time for pres...


## Topic Modeling with LDA

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF document-term matrix over 1- to 3-grams.
# - ngram_range is given as a tuple: newer scikit-learn releases validate
#   constructor parameters and reject a list here.
# - min_df=0.05 keeps only terms present in at least 5% of the tweets and
#   max_df=0.5 drops terms present in more than half of them, so the
#   resulting vocabulary is small.
# NOTE(review): scikit-learn's own LDA examples feed raw term counts
# (CountVectorizer) rather than TF-IDF weights -- confirm TF-IDF is intended.
vectorizer = TfidfVectorizer(max_df=0.5,
                             min_df=0.05,
                             max_features=300,
                             ngram_range=(1, 3))

vectorizer = vectorizer.fit(df['tweet_clean'])

dtm = vectorizer.transform(df['tweet_clean'])

# get_feature_names() was deprecated in scikit-learn 1.0 and later removed;
# use its replacement when available and fall back on older installations.
# Kept as a plain list because later code indexes it by column position.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Set copy of the vocabulary so the per-word membership test below does not
# scan a list for every word of every tweet.
feature_name_set = set(feature_names)


def filter_words(x):
    """Return `x` with only the words found in the vectorizer's vocabulary.

    `x` is split on whitespace, so only single-word (unigram) vocabulary
    entries can match; multi-word n-gram features are never reproduced here.
    """
    return ' '.join([w for w in x.split() if w in feature_name_set])

# Create a new column, which is the same as tweet_clean, but only keeps the
# words from the vectorizer's vocabulary
df['tweet_clean_filter'] = df['tweet_clean'].apply(filter_words)
df.head()

Unnamed: 0,timestamp,tweet,tweet_clean,tweet_clean_filter
0,7/3/2020 5:00,Wear a mask. pic. twitter.com/HBDMNA4ary,wear mask pic twittercomhbdmna4ary,
1,7/3/2020 3:25,Let me be clear: There’s no victory to be cele...,let clear theres victory celebrated still near...,
2,7/3/2020 1:15,This is a job Donald Trump is entirely unfit f...,job donald trump entirely unfit forpictwitterc...,donald trump
3,7/2/2020 23:18,"“Mr. President, it's too much.”pic.twitter.com...",mr president muchpictwittercomweebjipkgj,president
4,7/2/2020 22:23,This is a shameful move. Now is the time for e...,shameful move time employers empathy families ...,time time president trump get


In [26]:
from sklearn.decomposition import LatentDirichletAllocation



# Fit a 30-topic LDA model on the document-term matrix; the seed is fixed so
# repeated runs give the same topics.
lda_model = LatentDirichletAllocation(
    n_components=30,
    #doc_topic_prior=alpha,
    #topic_word_prior=beta,
    max_iter=200,
    learning_method='batch',
    random_state=123,
    n_jobs=2,
    verbose=0,
)
lda_output = lda_model.fit(dtm)

# Fit diagnostics, both computed on the training matrix.
# Log-likelihood: higher is better.
ll = lda_model.score(dtm)
# Perplexity = exp(-1. * log-likelihood per word): lower is better.
perp = lda_model.perplexity(dtm)

# theta: document-topic matrix (one row per tweet, one column per topic).
# beta:  topic-term matrix (components_).
theta = pd.DataFrame(lda_model.transform(dtm))
beta = pd.DataFrame(lda_model.components_)

# ---- Topic summary ----
no_top_words = 10

# Total topic mass across all documents.
weight = theta.sum(axis=0)
# Number of documents in which the topic's share exceeds 50% / 10%.
support50 = theta.gt(0.5).sum(axis=0)
support10 = theta.gt(0.1).sum(axis=0)

# For each topic, the `no_top_words` highest-weighted terms, strongest first.
termss = list()
for topic_id, topic in enumerate(lda_model.components_):
    top_idx = np.argsort(topic)[::-1][:no_top_words]
    terms = " ".join(feature_names[i] for i in top_idx)
    termss.append(terms)

topic_summary = pd.DataFrame({
    'TopicID': range(len(termss)),
    "Support50": support50,
    "Support10": support10,
    "Weight": weight,
    "Terms": termss,
})

print(topic_summary)



    TopicID  Support50  Support10      Weight  \
0         0        128        336  214.984897   
1         1         61        262  158.569902   
2         2        198        549  323.610483   
3         3        106        182  170.400497   
4         4         58        326  166.835626   
5         5         56        339  171.391695   
6         6        130        300  193.322127   
7         7          1          6   82.882537   
8         8         42        200  143.195407   
9         9        164        255  185.048114   
10       10         53        404  177.550548   
11       11          0          0   80.750227   
12       12         47        282  155.175387   
13       13         91        164  144.851825   
14       14         74        361  184.212644   
15       15         40        324  166.014882   
16       16         81        311  183.385425   
17       17         57        278  159.050056   
18       18          0          0   80.750227   
19       19         

# Visualize Topics with pyLDAvis

In [27]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
% time pyLDAvis.sklearn.prepare(lda_model, dtm, vectorizer, mds="tsne")

KeyboardInterrupt: 