In [1]:
import pickle
import re
import string
import pandas as pd
import numpy as np
import json
import copy 
from datetime import datetime, timedelta

import GetOldTweets3 as got

from pymongo import MongoClient
import pymongo
from pymongo.errors import BulkWriteError
from nltk.corpus import wordnet

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text 
from sklearn.manifold import TSNE
import umap
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import gensim, spacy, logging, warnings
import en_core_web_sm
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import Word2Vec
from gensim.models.nmf import Nmf

from matplotlib import pyplot as plt

[nltk_data] Downloading package punkt to /Users/samir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/samir/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Preprocessing

In [2]:
with open('basic_cleaned_df.pkl', 'rb') as handle:
    df = pickle.load(handle)

In [3]:
df.dtypes

date         object
username     object
to           object
replies      object
retweets     object
favorites    object
text         object
mentions     object
hashtags     object
dtype: object

In [4]:
df.replies = df.replies.astype(int)

In [5]:
df.retweets = df.retweets.astype(int)

In [6]:
df.favorites = df.favorites.astype(int)

In [7]:
df.date = pd.to_datetime(df.date)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84615 entries, 0 to 84614
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       84615 non-null  datetime64[ns]
 1   username   84615 non-null  object        
 2   to         84615 non-null  object        
 3   replies    84615 non-null  int64         
 4   retweets   84615 non-null  int64         
 5   favorites  84615 non-null  int64         
 6   text       84615 non-null  object        
 7   mentions   84615 non-null  object        
 8   hashtags   84615 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 5.8+ MB


In [9]:
#taking our URLs
urls = lambda x: re.sub(r'http\S+', '' ,x)

#taking out capitalization and digits
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)

#removing punctuation
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

df.text = df.text.map(urls).map(alphanumeric).map(punc_lower)

In [10]:
X = df.text.drop_duplicates()
data_list = [x for x in X]

In [12]:
#lemmatizer
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

data = []
for sentence in data_list:
    data.append([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

In [13]:
#final step of getting data ready for a vectorizer
final = []
for sentence in data:
    final.append(' '.join(sentence))

In [14]:
#adding custom stop words for this use case
addl_stop_words = (['coronavirus','corona virus', 'covid', 'covid-19', 'covid 19', 'corona',
                   'virus', 'new', 'case','cases', 'deaths', 'total', 'people', 'confirmed', 'novel',
                   'outbreak', 'pandemic', 'epidemic', 'death', 'like', 'just', 'news', 'rt', 'increasingly',
                   'illness', 'infection', 'infected', 'diagnosed', 'reports', "breaking", 'reported', 'dead'
                  ,'looks', 'know', 'big', 'type', 'make', 'unveil', 'experts', 'say', 'says', 'said', 
                    'grows', 'growing','day', 'days', "foxnews",'week','patient', 'hospital', 'number', 
                    'sick', 'doctor', 'next', 'health', 'first', 'even', 'press', 'youtube', 'fact', 
                    'likely', 'global', 'disease', 'thing', 'really','world', 'man', 'also', 'month', 
                    'job', 'many', 'time', 'way', 'get', 'think', 'need', 'home', 'go', 'may', 'going', 'would',
                    'live', 'see', 'update', 'far', 'last', 'year', 'back', 'much', 'medical', 'one', 'via',
                    'could', 'maybe', 'details', 'today', 'three', 'ninth', 'epoch', 'epoch times', 'download', 'app'
                   ,'pron', 'daily', 'updates', 'coverage', 'fox', 'virtual', 'hall', 'programming', 'alert',
                   'coronavirusoutbreak', 'confirm','due','die', 'gon', 'na', 'gonna', 'wan', 'wanna', 'come', 'take'
                   , 'kill'])


custom_stop_words = stopwords.words('english') + addl_stop_words

In [15]:
#function taken from lecture slides to help display the topics and the top words per topic
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

## Topics surrounding quarantine and gov't shutdown

**Before we get into months, I wanted to look at the discussion around the gov't shutdown in particular to see what the topics are there**

In [16]:
quarantine = (df['text'].str.contains("quarantine"))
shutdown = (df['text'].str.contains("shutdown"))
lockdown = (df['text'].str.contains("lockdown"))
shut_down = (df['text'].str.contains("shut down"))
lock_down = (df['text'].str.contains("lock down"))
shelter1 = (df['text'].str.contains("shelter in-place"))
shelter2 = (df['text'].str.contains("shelter inplace"))
shelter3 = (df['text'].str.contains("shelter in place"))
social_distancing = (df['text'].str.contains("social distancing"))

mask = quarantine | shutdown | lockdown | shut_down | lock_down | shelter1 | shelter2 | shelter3 | social_distancing

quarantine_df = df[mask]


In [17]:
X_new = quarantine_df.text

In [18]:
#adding custom stop words for this use case
quarantine_words = ['lockdown', 'shutdown', 'shut down', 'lock down', 'quarantine', 'shelter', 'in-place', 'social distancing', 'social', 'distancing']

quarantine_stop_words = custom_stop_words + quarantine_words

In [19]:
qte_nmf_tfidf = TfidfVectorizer(stop_words=quarantine_stop_words,ngram_range=(1,3), min_df=5, binary=True)
qte_tfidf_nmf_doc_word = qte_nmf_tfidf.fit_transform(X_new)
qte_nmf_model = NMF(15)
qte_tfidf_nmf_doc_topic = qte_nmf_model.fit_transform(qte_tfidf_nmf_doc_word)
display_topics(qte_nmf_model, qte_nmf_tfidf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
shut transportation, transportation wuhan, shut transportation wuhan, transportation, authorities limit, travel million, travel million residents, limit travel, limit travel million, authorities limit travel

Topic  1
april, guidelines, extends, trump, guidelines april, extends guidelines, extends guidelines april, trump extends, trump extends guidelines, peak

Topic  2
authoritarian orders, persist authoritarian, persist authoritarian orders, governors persist, governors persist authoritarian, barr governors, barr governors persist, ag barr governors, ag, ag barr

Topic  3
disney, furlough, co, walt disney, disney co furlough, disney co, co furlough workers, co furlough, walt, furlough workers

Topic  4
mental, practices, asked stay, practices help, collective mental uncertain, psychologically practices, psychologically, easy psychologically practices, easy psychologically, asked stay fight

Topic  5
italian towns, towns, italian, italian towns fears, towns fears, fears, tow

**The shutdown topics are interesting and Tiger King made an appearance. But nothing too juicy here**

# Seeing how topics changed over time with TF-IDF and NMF

In [20]:
dec_df = df[df.date.apply(lambda x: x.month==12)]
jan_df = df[df.date.apply(lambda x: x.month==1)]
feb_df = df[df.date.apply(lambda x: x.month==2)]
mar_df = df[df.date.apply(lambda x: x.month==3)]
apr_df = df[df.date.apply(lambda x: x.month==4)]
may_df = df[df.date.apply(lambda x: x.month==5)]

## Dec Topics:

In [21]:
data = dec_df.text
dec_nmf_tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), binary=True)
dec_tfidf_nmf_doc_word = dec_nmf_tfidf.fit_transform(data)
dec_nmf_model = NMF(10)
dec_tfidf_nmf_doc_topic = dec_nmf_model.fit_transform(dec_tfidf_nmf_doc_word)
display_topics(dec_nmf_model, dec_nmf_tfidf.get_feature_names(), 10)


Topic  0
qatar, cov qatar, mers cov qatar, syndrome mers cov, syndrome mers, respiratory syndrome mers, mers cov, cov, middle east respiratory, middle east

Topic  1
dromedary, dromedary camels, camels, imported, local, camels prospective, dromedary camels prospective, local arabian dromedary, genomic study, imported african local

Topic  2
kingdom saudi arabia, kingdom, kingdom saudi, saudi, saudi arabia, arabia, mers cov kingdom, cov kingdom, cov kingdom saudi, syndrome mers

Topic  3
likes, retweets, mentions, reach, likes retweets, twitter mentions, twitter, reach likes retweets, twitter mentions mention, mention reach

Topic  4
exacto, amen, real, molecular mechanism, molecular mechanism antibody, mechanism antibody dependent, mechanism antibody, mechanism, antibody, molecular

Topic  5
feline, feline infectious, feline infectious peritonitis, infectious peritonitis, peritonitis, infectious, cats, fip, caused, infectious peritonitis fip

Topic  6
biggest fans, biggest, thank, fan

  'stop_words.' % sorted(inconsistent))


## Jan Topics

In [22]:
data = jan_df.text.drop_duplicates()
jan_nmf_tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), binary=True)
jan_tfidf_nmf_doc_word = jan_nmf_tfidf.fit_transform(data)
jan_nmf_model = NMF(10)
jan_tfidf_nmf_doc_topic = jan_nmf_model.fit_transform(jan_tfidf_nmf_doc_word)
display_topics(jan_nmf_model, jan_nmf_tfidf.get_feature_names(), 10)



  'stop_words.' % sorted(inconsistent))



Topic  0
china, cause china, china wuhan, china pneumonia, wuhan china, china worse, worse, mystery china, strain, cause

Topic  1
mystery caused, wuhan pneumonia mystery, pneumonia mystery caused, pneumonia mystery, mystery, wuhan pneumonia, caused, pneumonia, wuhan, bbc

Topic  2
shut transportation, shut transportation wuhan, authorities limit, limit travel, residents shut transportation, residents shut, travel million residents, travel million, authorities limit travel, transportation wuhan amid

Topic  3
chinese city, city, city wuhan, chinese city wuhan, central, chinese, central chinese city, central chinese, authorities central chinese, authorities central

Topic  4
chinese report, report, illnesses, chinese report illnesses, report illnesses, chinese, xpress, chinese report wuhan, report wuhan, china chinese report

Topic  5
hacked, hacked phone hits, hacked phone, bezos hacked phone, hits us, phone hits, phone hits us, bezos hacked, jeff bezos hacked, bezos

Topic  6
wuhan, 

## Feb Topics 

In [23]:
data = feb_df.text.drop_duplicates()
feb_nmf_tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), binary=True)
feb_tfidf_nmf_doc_word = feb_nmf_tfidf.fit_transform(data)
feb_nmf_model = NMF(10)
feb_tfidf_nmf_doc_topic = feb_nmf_model.fit_transform(feb_tfidf_nmf_doc_word)
display_topics(feb_nmf_model, feb_nmf_tfidf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
sars flu, gt sars, gt sars flu, sars, gt, flu, flu china, sars flu china, china trump, trump

Topic  1
italy, northern italy, italy least, northern, least, ansa, italy italy, italy ansa, monselice, old

Topic  2
hubei, china hubei, hubei province, province, china hubei province, china, feb, hubei province epicentre, province epicentre, epicentre

Topic  3
mobile, mobile congress, congress, fears, canceled, cancelled, concerns, congress cancelled, mobile congress cancelled, mobile congress canceled

Topic  4
us, cdc, americans, spread, flown, americans flown, advice, cdc advice, flown cdc, americans flown cdc

Topic  5
china, outside, outside china, philippines, philippines outside, philippines outside china, korea, south, south korea, related outside china

Topic  6
test, kits, states, test kits, kits sent, flawed, kits sent states, sent states, sent, test kits sent

Topic  7
got, bitch got, bitch, shit got, shit, nigga, wtf, bro, got dat, nigga got

Topic  8
bay, bay area, a

## March Topics

In [24]:
data = mar_df.text.drop_duplicates()
mar_nmf_tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), binary=True)
mar_tfidf_nmf_doc_word = mar_nmf_tfidf.fit_transform(data)
mar_nmf_model = NMF(10)
mar_tfidf_nmf_doc_topic = mar_nmf_model.fit_transform(mar_tfidf_nmf_doc_word)
display_topics(mar_nmf_model, mar_nmf_tfidf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
weinstein, harvey weinstein, harvey, weinstein tests positive, weinstein tests, harvey weinstein tests, tests positive, tests, prison, positive

Topic  1
distancing, social distancing, social, guidelines, april, distancing guidelines, social distancing guidelines, extends, guidelines april, distancing guidelines april

Topic  2
fails, fails move, fails move forward, move forward, move, bill, forward, senate fails, senate fails move, move forward phase

Topic  3
trump, us, president, response, china, got, help, stop, crisis, americans

Topic  4
positive, tests, tests positive, tested, son, atiku, tested positive, son tests, son tests positive, atiku son

Topic  5
diffie, joe diffie, joe, country, dies, complications, star joe diffie, star joe, diffie dies, star

Topic  6
john prine, prine, john, critical condition, condition, critical, john prine critical, prine critical, prine critical condition, critical condition symptoms

Topic  7
president, president trump, someone cough,

## April Topics

In [25]:
data = apr_df.text.drop_duplicates()
apr_nmf_tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), binary=True)
apr_tfidf_nmf_doc_word = apr_nmf_tfidf.fit_transform(data)
apr_nmf_model = NMF(10)
apr_tfidf_nmf_doc_topic = apr_nmf_model.fit_transform(apr_tfidf_nmf_doc_word)
display_topics(apr_nmf_model, apr_nmf_tfidf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
wave, second wave, warns, cdc director, second, director, cdc director warns, director warns, cdc, warns second

Topic  1
us, china, help, vaccine, test, spread, lockdown, crisis, state, states

Topic  2
white, task, white house, house, task force, force, briefing, task force briefing, force briefing, house task force

Topic  3
senate, passes, billion, senate passes, relief, bill, small, passes billion, senate passes billion, package

Topic  4
contract, dem, backtracks, consultant trump, contract awarded dem, awarded dem, dem consultant trump, awarded dem consultant, contract awarded, trump asap

Topic  5
navy, carrier, aircraft carrier, aircraft, captain, alarm, raised alarm, raised, captain raised, captain raised alarm

Topic  6
trump, immigration, immigration suspension, suspension, trump immigration, trump immigration suspension, suspend immigration, suspend, trump suspend, trump suspend immigration

Topic  7
possibly worse winter, possibly worse, worse winter, cdc chief,

## May Topics

In [26]:
data = may_df.text.drop_duplicates()
may_nmf_tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), binary=True)
may_tfidf_nmf_doc_word = may_nmf_tfidf.fit_transform(data)
may_nmf_model = NMF(10)
may_tfidf_nmf_doc_topic = may_nmf_model.fit_transform(may_tfidf_nmf_doc_word)
display_topics(may_nmf_model, may_nmf_tfidf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
deregulations amid, deregulations, order aiming hundreds, announces executive, executive order aiming, announces executive order, aiming hundreds, order aiming, aiming, executive order

Topic  1
cost largest, bill estimated, estimated cost largest, bill estimated cost, estimated cost, cost largest stimulus, estimated, largest stimulus, largest stimulus package, package yet

Topic  2
democrats, relief, trillion, house democrats, bill, house, trillion relief, democrats trillion, house democrats trillion, democrats trillion relief

Topic  3
trump, fauci, response, dr, president, china, testing, positive, us, numbers

Topic  4
misreading data, misreading, egregious misreading data, egregious misreading, egregious, data survey, misreading data survey, cnn accused egregious, cnn accused, accused egregious misreading

Topic  5
illinois, companies involved testing, companies involved, stake companies, stake companies involved, governor family, governor family firm, illinois governor 

**Results:**
Not too much additional data here. It does show when certain topics are the most popular though. The next stop is to create a plot showing how topics changed over time