In [1]:
import pickle
import re
import string
import pandas as pd
import numpy as np
import json
import copy 

import GetOldTweets3 as got

from pymongo import MongoClient
import pymongo
from pymongo.errors import BulkWriteError

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import gensim, spacy, logging, warnings
import en_core_web_sm
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import Word2Vec
from gensim.models.nmf import Nmf

from matplotlib import pyplot as plt

[nltk_data] Downloading package punkt to /Users/samir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/samir/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Testing the GetOldTweets API

In [49]:
# Ask GetOldTweets3 for at most 1000 'coronavirus' tweets, since 2020-05-09
# and until 2020-05-10, within 50mi of New York.
tweetCriteria = (
    got.manager.TweetCriteria()
    .setQuerySearch('coronavirus')
    .setSince("2020-05-09")
    .setUntil("2020-05-10")
    .setMaxTweets(1000)
    .setNear('New York')
    .setWithin('50mi')
)
tweet = got.manager.TweetManager.getTweets(tweetCriteria)
len(tweet)

1000

In [50]:
# Keep only the text of tweets that were retweeted more than once.
tweet_list = [status.text for status in tweet if status.retweets > 1]

len(tweet_list)

235

In [71]:
# Coerce every collected tweet text to str.
tweet_list_str = list(map(str, tweet_list))

In [79]:
# Wrap the tweet strings in a one-column DataFrame, name that column 'stuff',
# and display it as the cell output.
df = pd.DataFrame(tweet_list_str)
df.columns = ['stuff']
df.stuff

0      Jill & Erykah stay on long enough, they might ...
1      President Trump announces the federal governme...
2      They're calling it "Covid toe": painful red or...
3      Cuomo: New coronavirus testing sites to open i...
4      Trump, Who Called The Pandemic A Hoax, Is Now ...
                             ...                        
230    Health care workers are risking their lives — ...
231    Paxos cofounder explains why Wall Street's plu...
232    NEW: President Trump announces the federal gov...
233    NEW: President Trump announces the federal gov...
234    Running on empty: Coronavirus has changed the ...
Name: stuff, Length: 235, dtype: object

In [76]:
# Bag-of-words vectorizer configured with the built-in 'english' stop-word list.
cv1 = CountVectorizer(stop_words='english')

In [80]:
# Fit the vectorizer on the tweet text, then preview the first rows of the
# document-term matrix with one column per vocabulary term.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
df_cv1 = cv1.fit_transform(df.stuff)
pd.DataFrame(df_cv1.toarray(), columns=cv1.get_feature_names()).head()

Unnamed: 0,000,01221,020,04,05,06,08,09,0p0vm5o,10,...,yorkers,young,youthwithyouep16,youtu,youtube,zero,zone,zulu,𝙊𝙣𝙚,𝙩𝙝𝙞𝙧𝙙
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Uploading the tweets to MongoDB

In [2]:
#connecting to MongoDB
# MongoClient() is called without arguments, so the driver's default connection
# settings apply; the database names are listed as the cell output.
client = MongoClient()
client.list_database_names()

['admin', 'books', 'config', 'events', 'local', 'outings']

In [3]:
# Use the existing `admin` database (no new database is created here).
# NOTE(review): a dedicated database for the tweets may be a better home than
# `admin` — confirm before changing, since later cells read from this handle.
db = client.admin

# Handle to the `tweets` collection inside the admin database
tweets = db.tweets


In [8]:
#this is a list of csvs containing all the tweets I scraped using the API in command line
csvs = [
    'coronavirus_tweets_dec_01_to_jan_12.csv',
    'coronavirus_tweets_jan_22.csv',
    'coronavirus_tweets_feb_02.csv',
    'coronavirus_tweets_feb_12.csv',
    'coronavirus_tweets_feb_21.csv',
    'coronavirus_tweets_march_12.csv',
    'coronavirus_tweets_march_22.csv',
    'coronavirus_tweets_march_29.csv',
    'coronavirus_tweets_april_02.csv',
    'coronavirus_tweets_april_21.csv',
    'coronavirus_tweets_may_12.csv',
    'coronavirus_tweets_may_19.csv',
]

# Turn each CSV into a list of string-valued records and bulk-insert it into
# the `tweets` collection. The upload stays best-effort (a failing file does
# not stop the loop), but failures are now reported instead of being dropped:
# the original `exc.details` was a bare expression inside the loop and
# produced no output at all.
for csv_path in csvs:
    list_of_dicts = pd.read_csv(csv_path).astype(str).to_dict(orient='records')
    if not list_of_dicts:
        # insert_many() rejects an empty document list, so skip empty files.
        print('Skipping %s: no rows to insert' % csv_path)
        continue
    try:
        tweets.insert_many(list_of_dicts)
    except BulkWriteError as exc:
        details = exc.details or {}
        print('Bulk insert failed for %s: %s of %s documents inserted, %s write errors'
              % (csv_path,
                 details.get('nInserted'),
                 len(list_of_dicts),
                 len(details.get('writeErrors', []))))

In [4]:
#checking to see if all the tweets made it into the database
tweets.count_documents({})

84615

Success!

**Next Steps:**
1. Tokenize data
1. Clean data
1. Lemmatize
1. Countvectorizer or TF-IDF
1. Topic Modeling with LSA or NMF

# Downloading Data from MongoDB

In [213]:
# Read every tweet document back from MongoDB, leaving out the _id,
# permalink and geo fields.
excluded_fields = {'_id': 0, 'permalink': 0, 'geo': 0}
cursor = tweets.find({}, excluded_fields)
tweet_list = list(cursor)
len(tweet_list)

84615

In [214]:
df = pd.DataFrame(tweet_list)

In [215]:
df.head()

Unnamed: 0,date,username,to,replies,retweets,favorites,text,mentions,hashtags,id
0,2020-01-12 23:55:37,jrbchunklight,statnews,1,0,0,Watch this space. Wuhan is a coronavirus like ...,,,1216509066132049920
1,2020-01-12 23:45:28,coronavirus_RD,CorinaLantigua,1,0,2,Todos mis tweets lo son.,,,1216506512958525440
2,2020-01-12 23:43:54,marcosarellano,,1,0,0,China's mystery 'coronavirus' isn't currently ...,,,1216506117741662211
3,2020-01-12 23:38:30,ImkenmacMaclean,,1,0,0,China's mystery 'coronavirus' isn't currently ...,,,1216504758569140225
4,2020-01-12 23:35:09,poandpo,,0,0,0,"1 dead, 41 diagnosed with coronavirus-related ...",,#Health,1216503916818522112


In [8]:
# Drop the tweet id column. Rebinding to a new frame with errors='ignore' keeps
# this cell idempotent: the in-place version raised KeyError when re-run,
# because 'id' was already gone.
df = df.drop(columns='id', errors='ignore')

In [15]:
df.to_pickle('uncleaned_full_df.pkl')

# Preprocessing

In [9]:
def urls(x):
    """Return ``x`` with every ``http...`` run of non-whitespace characters removed."""
    return re.sub(r'http\S+', '', x)


def alphanumeric(x):
    """Return ``x`` with every word that contains a digit replaced by one space."""
    # Raw string: in a plain literal '\w' and '\d' are invalid escape sequences,
    # which newer Python versions warn about.
    return re.sub(r'\w*\d\w*', ' ', x)


def punc_lower(x):
    """Return ``x`` lower-cased, with each ``string.punctuation`` character replaced by a space."""
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

# Run the cleaners over the tweet text in order: URLs first, then words
# containing digits, then punctuation and case.
for cleaner in (urls, alphanumeric, punc_lower):
    df.text = df.text.map(cleaner)

In [18]:
df.to_pickle('basic_cleaned_df.pkl')

In [2]:
# Reload the cleaned DataFrame that the previous cell wrote to disk.
# NOTE(review): pickle.load can execute code embedded in the file — only load
# pickles produced by this notebook.
with open('basic_cleaned_df.pkl', 'rb') as handle:
    df = pickle.load(handle)

In [None]:
# Remove repeated tweet texts and materialise the survivors as a plain list.
X = df.text.drop_duplicates()
data_list = X.tolist()

In [219]:
#new_df = df.mask(df.text.duplicated(keep='first'),)
#new_df = new_df.dropna()
#new_df

In [148]:
#adding custom stop words for this use case
# These are appended to NLTK's English stop-word list and passed to the
# vectorizers below via `stop_words=custom_stop_words`.
# NOTE(review): entries containing a space or hyphen ('corona virus',
# 'covid-19', 'covid 19', 'epoch times') presumably never match a single token
# produced by the vectorizer, and may be related to the "stop_words" warning
# printed when the vectorizers are fitted — confirm.
addl_stop_words = (['coronavirus','corona virus', 'covid', 'covid-19', 'covid 19', 'corona',
                   'virus', 'new', 'case','cases', 'deaths', 'total', 'people', 'confirmed', 'novel',
                   'outbreak', 'pandemic', 'epidemic', 'death', 'like', 'just', 'news', 'rt', 'increasingly',
                   'illness', 'infection', 'infected', 'diagnosed', 'reports', "breaking", 'reported', 'dead'
                  ,'looks', 'know', 'big', 'type', 'make', 'unveil', 'experts', 'say', 'says', 'said', 
                    'grows', 'growing','day', 'days', "foxnews",'week','patient', 'hospital', 'number', 
                    'sick', 'doctor', 'next', 'health', 'first', 'even', 'press', 'youtube', 'fact', 
                    'likely', 'global', 'disease', 'thing', 'really','world', 'man', 'also', 'month', 
                    'job', 'many', 'time', 'way', 'get', 'think', 'need', 'home', 'go', 'may', 'going', 'would',
                    'live', 'see', 'update', 'far', 'last', 'year', 'back', 'much', 'medical', 'one', 'via',
                    'could', 'maybe', 'details', 'today', 'three', 'ninth', 'epoch', 'epoch times', 'download', 'app'
                   ,'pron', 'daily', 'updates', 'coverage', 'fox', 'virtual', 'hall', 'programming', 'alert',
                   'coronavirusoutbreak', 'confirm','due','die', 'gon', 'na', 'gonna', 'wan', 'wanna', 'come', 'take'
                   , 'kill'])

# Full stop-word list: NLTK English stop words followed by the custom additions.
custom_stop_words = stopwords.words('english') + addl_stop_words

The below function is taken from Selva Prabhakaran's post on Machine Learning Plus which can be found here: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

In [14]:
#lemmatization
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, V -> verb,
    R -> adverb. N and every other tag fall back to noun.
    """
    # pos_tag returns [(token, tag)]; take the tag of the only token.
    penn_tag = nltk.pos_tag([word])[0][1]
    initial = penn_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag class are treated as nouns.
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

# Tokenize each tweet and lemmatize every token using the POS looked up for
# that token, producing one list of lemmas per tweet.
data = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token))
     for token in nltk.word_tokenize(tweet_text)]
    for tweet_text in data_list
]

In [15]:
# Re-join each tweet's lemmas into one space-separated string, the input
# format the vectorizers expect.
final = [' '.join(lemmas) for lemmas in data]

## CountVectorizer

### CountVectorizer + LSA

In [68]:
# Binary bag-of-words over 1- to 3-grams with the custom stop words, using
# min_df=5 and max_df=0.90 to trim rare and near-universal terms.
cv1 = CountVectorizer(stop_words=custom_stop_words,ngram_range=(1,3),min_df = 5, max_df=0.90,binary=True)
X_cv = cv1.fit_transform(final)
# Read the shape straight off the sparse matrix: calling .toarray() first would
# allocate the full dense documents-by-terms array just to measure it.
X_cv.shape

  'stop_words.' % sorted(inconsistent))


(75008, 27873)

In [69]:
#vector_df = pd.DataFrame(X_cv.toarray(), index=df.date, columns=cv1.get_feature_names())

In [70]:
# LSA on the count matrix: project onto `topic_amount` components and show the
# total share of variance they explain.
topic_amount = 10
# Seeded so the SVD (randomized by default) gives the same topics on every run.
lsa = TruncatedSVD(n_components=topic_amount, random_state=42)
doc_topic = lsa.fit_transform(X_cv)
lsa.explained_variance_ratio_.sum()

0.041132508265173014

In [73]:
#function taken from lecture slides to help display the topics and the top words per topic
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print a heading and the highest-weighted terms for each topic.

    model: fitted object exposing ``components_`` (one row of term weights per topic).
    feature_names: sequence mapping a column index to its term.
    no_top_words: how many of the heaviest terms to print per topic.
    topic_names: optional labels indexed by topic; a missing or empty label
        falls back to the numeric topic heading.
    """
    for ix, weights in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest = weights.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[i] for i in heaviest))

In [74]:
display_topics(lsa, cv1.get_feature_names(), 10)


Topic  0
china, trump, test, wuhan, spread, state, take, amp, report, country

Topic  1
trump, test, president, american, president trump, response, state, take, positive, house

Topic  2
test, positive, test positive, state, report, work, kit, test kit, result, cdc

Topic  3
trump, test, china, positive, test positive, president, president trump, wuhan, donald trump, donald

Topic  4
spread, stop, trump, stop spread, test, prevent, state, prevent spread, slow, cdc

Topic  5
state, report, united, united state, house, reopen, bill, white, white house, official

Topic  6
wuhan, report, chinese, cause, pneumonia, die, city, wuhan china, sars, infect

Topic  7
take, state, united, united state, seriously, care, chinese, take care, action, take seriously

Topic  8
report, house, take, bill, white, white house, package, stimulus, spread, democrat

Topic  9
wuhan, bill, house, help, business, work, relief, pneumonia, package, cause


### CountVectorizer + NMF

In [76]:
# Binary 1- to 3-gram counts factorised into 10 NMF topics.
cv_nmf = CountVectorizer(stop_words = custom_stop_words,ngram_range=(1,3),min_df = 5, max_df=.9,binary=True)
cv_nmf_doc_word = cv_nmf.fit_transform(final)
# Seeded so the factorisation, and therefore the topic order, is repeatable.
nmf_model = NMF(n_components=10, random_state=42)
cv_nmf_doc_topic = nmf_model.fit_transform(cv_nmf_doc_word)
display_topics(nmf_model, cv_nmf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
trump, president, president trump, response, american, donald, donald trump, administration, trump administration, briefing

Topic  1
china, sars, flu, travel, wuhan china, outside, gt, outside china, sars flu, country

Topic  2
test, positive, test positive, kit, negative, test kit, result, weinstein, cdc, harvey

Topic  3
amp, work, help, die, come, want, well, life, use, stay

Topic  4
spread, stop, stop spread, prevent, country, prevent spread, slow, fear, cdc, slow spread

Topic  5
state, united, united state, reopen, order, governor, official, country, government, york

Topic  6
wuhan, chinese, pneumonia, cause, city, wuhan china, sars, wuhan pneumonia, authority, million

Topic  7
take, care, seriously, action, take care, take seriously, look, measure, away, drug

Topic  8
report, italy, hubei, province, china report, break, bring, toll, italy report, hubei province

Topic  9
house, bill, white, white house, package, stimulus, democrat, relief, senate, dems


In [None]:
# NOTE(review): `doc_topic` is the LSA document-topic matrix from the previous
# section; this NMF section produces `cv_nmf_doc_topic` — confirm which one was
# meant to be displayed here.
doc_topic

## TF-IDF

### TF-IDF + LSA

In [85]:
# TF-IDF over 1- to 3-grams with the custom stop words; min_df is 10 here,
# unlike the count vectorizers above which use 5.
tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), min_df = 10, max_df=.9, binary=True)

In [86]:
# Fit the TF-IDF vectorizer on the lemmatized tweets and keep the resulting
# document-term matrix.
X_tfidf = tfidf.fit_transform(final)
#tfidf_df = pd.DataFrame(X_tfidf.toarray(), index=df.date, columns=tfidf.get_feature_names())

  'stop_words.' % sorted(inconsistent))


In [87]:
# LSA on the TF-IDF matrix with 10 components; seeded so the SVD (randomized
# by default) and the pickled model are reproducible across runs.
lsa_tfidf = TruncatedSVD(n_components=10, random_state=42)
tfidf_lsa_doc_topic = lsa_tfidf.fit_transform(X_tfidf)
lsa_tfidf.explained_variance_ratio_.sum()

0.019439552496429528

In [89]:
display_topics(lsa_tfidf, tfidf.get_feature_names(), 10)


Topic  0
china, trump, test, wuhan, spread, report, take, state, die, come

Topic  1
china, wuhan, report, pneumonia, wuhan china, china report, cause, sars, chinese, outside china

Topic  2
test, positive, test positive, china, report, wuhan, pneumonia, weinstein, harvey, harvey weinstein

Topic  3
trump, china, president, president trump, response, donald, donald trump, american, administration, test

Topic  4
report, state, italy, china report, bring, italy report, county, report bring, united, united state

Topic  5
na, gon, gon na, report, die, fuck, come, wan na, wan, work

Topic  6
spread, na, gon, gon na, china, die, stop, trump, bad, stop spread

Topic  7
die, wuhan, chinese, spread, cause, lockdown, pneumonia, infect, american, flu

Topic  8
wuhan, spread, na, gon, gon na, chinese, trump, come, well, quarantine

Topic  9
state, house, united, united state, take, na, gon, gon na, white, white house


In [48]:
# Persist the fitted TF-IDF LSA model to disk with the highest pickle protocol.
with open('lsa_tfidf_model.pkl', 'wb') as handle:
    pickle.dump(lsa_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

### TF-IDF + NMF

In [149]:
# TF-IDF (1- to 3-grams, custom stop words) factorised into `topic_amount`
# NMF topics.
topic_amount = 15
nmf_tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), min_df = 5, max_df=.9, binary=True)
tfidf_nmf_doc_word = nmf_tfidf.fit_transform(final)
# Seeded: the hand-written topic labels assigned further down are tied to the
# order of these components, so the factorisation must be repeatable.
nmf_model_2 = NMF(n_components=topic_amount, random_state=42)
tfidf_nmf_doc_topic = nmf_model_2.fit_transform(tfidf_nmf_doc_word)
display_topics(nmf_model_2, nmf_tfidf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
well, help, want, life, amp, right, stay, look, still, please

Topic  1
china, outside, outside china, china report, wuhan china, travel, sars, hubei, flu, pneumonia

Topic  2
test, positive, test positive, weinstein, harvey, harvey weinstein, negative, weinstein test, weinstein test positive, harvey weinstein test

Topic  3
trump, president, response, president trump, donald, donald trump, american, administration, trump administration, trump response

Topic  4
report, italy, china report, bring, italy report, report bring, break, hubei, county, bring report

Topic  5
spread, stop, stop spread, prevent, prevent spread, country, slow, slow spread, cdc, official

Topic  6
state, united, united state, reopen, official, york, county, washington, cdc, governor

Topic  7
wuhan, chinese, cause, pneumonia, wuhan china, city, wuhan pneumonia, sars, mystery, pneumonia wuhan

Topic  8
fuck, shit, bitch, hate, bro, give, everything, fuck shit, damn, miss

Topic  9
cancel, fear, mobile, 

In [49]:
# Persist the fitted TF-IDF NMF model to disk with the highest pickle protocol.
with open('nmf_tfidf_model.pkl', 'wb') as handle:
    pickle.dump(nmf_model_2, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

**This is it! This TF-IDF + NMF model gives the best topics I have found yet. I'm going to save the resulting doc_topic matrix and use it for visualization.**

In [154]:
# Human-readable labels for the NMF components, written after reading the top
# tweets of each topic; list position corresponds to the component index.
Topics = [
    'General',
    'Initial Stories',
    'People testing positive',
    'Trumps Response to Covid-19',
    'Italy Covid-19 Outbreak',
    'Stopping the spread',
    'United States Covid-19 outbreak',
    'Coronavirus growing in China',
    'Anger',
    'Covid-19 cancellations',
    'Working from home',
    'Cruise and Quarantines',
    'Wearing a mask',
    'White House Briefings',
    'Second Wave Warnings',
]

# Generic column names component_1 .. component_<topic_amount>.
column_titles = ['component_' + str(i) for i in range(1, topic_amount + 1)]

In [207]:
#saving the doc-topic matrix for later use
# Rows are indexed by the de-duplicated tweet text (X), one column per topic.
save = pd.DataFrame(tfidf_nmf_doc_topic, index=X,columns=Topics)
# Attach each tweet's date by position. X keeps the row labels of `df` for the
# tweets that survived de-duplication, so those labels select the matching
# dates. The previous `new_df.date` relied on a frame that is only defined in
# commented-out code, and assigning a Series would align on `save`'s text
# index and leave every date empty.
save['date'] = df.loc[X.index, 'date'].values

with open('nmf_tfidf_doc_topic.pkl', 'wb') as handle:
    pickle.dump(save, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

**Thoughts so far:**
* TFIDF seems better than CV
* I have topics now but the TFIDF topics are basically just the biggest news headlines so far. This isn't super helpful because we probably could have guessed these just by briefly following the news
* I feel like the custom processed NMF model is actually the best for public perception and the TFIDF topics are just the top news stories
* I have to find a business use case or a question to answer soon because otherwise this analysis is directionless
* As of now, I'm thinking I should focus on the following: how has public perception of the government shutdown changed over time?
* I could also take out the news stories by taking out retweets or even by taking out tweets with many retweets



**Next Steps:**
1. Try Corex to form cluster around government shutdown?
1. Scattertext to generate some hypotheses that I can then test in data?
1. Show how sentiment on topics changed over time
1. distribution of topics over time?
1. What are people complaining about more?
1. use retweets to subset data