In [1]:
import copy
import functools
import json
import pickle
import re
import string

import numpy as np
import pandas as pd

import GetOldTweets3 as got

from pymongo import MongoClient
import pymongo
from pymongo.errors import BulkWriteError

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import gensim, spacy, logging, warnings
import en_core_web_sm
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import Word2Vec
from gensim.models.nmf import Nmf

from matplotlib import pyplot as plt

[nltk_data] Downloading package punkt to /Users/samir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/samir/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Uploading the tweets to MongoDB

In [12]:
# Connect to MongoDB using MongoClient's default connection settings (no host/port is passed)
client = MongoClient()
# List the databases that already exist on this server
client.list_database_names()

['admin', 'books', 'config', 'events', 'local', 'outings']

In [13]:
# Use the server's existing `admin` database for the tweets (no new database is created here)
db = client.admin

# Handle to the `tweets` collection inside the admin database
tweets = db.tweets


In [14]:
# CSV files (under data/) containing all the tweets I scraped using the API from the command line
csvs = ['coronavirus_tweets_dec_01_to_jan_12.csv', 'coronavirus_tweets_jan_22.csv', 
       'coronavirus_tweets_feb_02.csv', 'coronavirus_tweets_feb_12.csv', 'coronavirus_tweets_feb_21.csv',
        'coronavirus_tweets_march_12.csv', 'coronavirus_tweets_march_22.csv', 'coronavirus_tweets_march_29.csv',
        'coronavirus_tweets_april_02.csv' ,'coronavirus_tweets_april_21.csv', 'coronavirus_tweets_may_12.csv',
       'coronavirus_tweets_may_19.csv']

# Turn each CSV into a list of row dictionaries (every value cast to str) and bulk-insert it into MongoDB
for url in csvs:
    list_of_dicts = pd.read_csv('data/' + url).astype(str).to_dict(orient='records')
    try:
        tweets.insert_many(list_of_dicts)
    except BulkWriteError as exc:
        # Best effort: keep loading the remaining files, but report the failure.
        # A bare `exc.details` expression inside the loop displays nothing, which
        # previously hid every bulk-write error.
        details = exc.details
        print('%s: inserted %s documents, %d write errors'
              % (url, details.get('nInserted'), len(details.get('writeErrors', []))))

In [15]:
# Count every document in the collection (empty filter) to check that all the tweets made it into the database
tweets.count_documents({})

84615

Success!

# Downloading Data from MongoDB

In [17]:
# Pull every tweet document back out of MongoDB, leaving out the _id, permalink and geo fields
excluded_fields = {'_id': 0, 'permalink': 0, 'geo': 0}
cursor = tweets.find({}, excluded_fields)
tweet_list = [document for document in cursor]

In [18]:
# One row per tweet document, one column per field returned by the query
df = pd.DataFrame(tweet_list)

In [19]:
# Preview the first rows to sanity-check the columns that came back
df.head()

Unnamed: 0,date,username,to,replies,retweets,favorites,text,mentions,hashtags,id
0,2020-01-12 23:55:37,jrbchunklight,statnews,1,0,0,Watch this space. Wuhan is a coronavirus like ...,,,1216509066132049920
1,2020-01-12 23:45:28,coronavirus_RD,CorinaLantigua,1,0,2,Todos mis tweets lo son.,,,1216506512958525440
2,2020-01-12 23:43:54,marcosarellano,,1,0,0,China's mystery 'coronavirus' isn't currently ...,,,1216506117741662211
3,2020-01-12 23:38:30,ImkenmacMaclean,,1,0,0,China's mystery 'coronavirus' isn't currently ...,,,1216504758569140225
4,2020-01-12 23:35:09,poandpo,,0,0,0,"1 dead, 41 diagnosed with coronavirus-related ...",,#Health,1216503916818522112


In [44]:
# Parse the date column into pandas datetimes
df['date'] = pd.to_datetime(df['date'])

In [20]:
# Drop the tweet id column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: the in-place version raised
# KeyError the second time because 'id' was already gone.
df = df.drop(columns='id', errors='ignore')

In [21]:
#df.to_pickle('uncleaned_full_df.pkl')

# Preprocessing

In [22]:
# Patterns are compiled once and written as raw strings so that \w, \d and \S
# reach the regex engine without triggering invalid-escape warnings.
_URL_RE = re.compile(r'http\S+')
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))


def urls(x):
    """Take out URLs: every run of non-whitespace characters starting at 'http'."""
    return _URL_RE.sub('', x)


def alphanumeric(x):
    """Replace every token that contains at least one digit with a single space."""
    return _DIGIT_TOKEN_RE.sub(' ', x)


def punc_lower(x):
    """Lowercase the text and replace each punctuation character with a space."""
    return _PUNCT_RE.sub(' ', x.lower())

# Apply the three cleaners in order: strip URLs, drop digit-bearing tokens, then lowercase and de-punctuate
df['text'] = df['text'].map(urls).map(alphanumeric).map(punc_lower)

In [23]:
#df.to_pickle('basic_cleaned_df.pkl')

In [24]:
# Replace the in-memory `df` with the frame stored in basic_cleaned_df.pkl.
# NOTE(review): the df.to_pickle('basic_cleaned_df.pkl') call in the notebook is commented
# out, so this presumably reads a file written during an earlier session - confirm it is current.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('basic_cleaned_df.pkl', 'rb') as handle:
    df = pickle.load(handle)

In [25]:
# Keep only the first occurrence of each distinct tweet text, then materialize it as a plain list
X = df['text'].drop_duplicates()
data_list = list(X)

In [26]:
# Adding custom stop words for this use case: terms that show up across the whole
# coronavirus corpus, so they would not help separate one topic from another.
# NOTE(review): the multi-word / hyphenated entries ('corona virus', 'covid-19', 'covid 19',
# 'epoch times') presumably never match, since the vectorizers compare stop words against
# single tokens - confirm; this may also be what the stop_words warning in the outputs is about.
addl_stop_words = (['coronavirus','corona virus', 'covid', 'covid-19', 'covid 19', 'corona',
                   'virus', 'new', 'case','cases', 'deaths', 'total', 'people', 'confirmed', 'novel',
                   'outbreak', 'pandemic', 'epidemic', 'death', 'like', 'just', 'news', 'rt', 'increasingly',
                   'illness', 'infection', 'infected', 'diagnosed', 'reports', "breaking", 'reported', 'dead'
                  ,'looks', 'know', 'big', 'type', 'make', 'unveil', 'experts', 'say', 'says', 'said', 
                    'grows', 'growing','day', 'days', "foxnews",'week','patient', 'hospital', 'number', 
                    'sick', 'doctor', 'next', 'health', 'first', 'even', 'press', 'youtube', 'fact', 
                    'likely', 'global', 'disease', 'thing', 'really','world', 'man', 'also', 'month', 
                    'job', 'many', 'time', 'way', 'get', 'think', 'need', 'home', 'go', 'may', 'going', 'would',
                    'live', 'see', 'update', 'far', 'last', 'year', 'back', 'much', 'medical', 'one', 'via',
                    'could', 'maybe', 'details', 'today', 'three', 'ninth', 'epoch', 'epoch times', 'download', 'app'
                   ,'pron', 'daily', 'updates', 'coverage', 'fox', 'virtual', 'hall', 'programming', 'alert',
                   'coronavirusoutbreak', 'confirm','due','die', 'gon', 'na', 'gonna', 'wan', 'wanna', 'come', 'take'
                   , 'kill'])

# Full stop-word list handed to the vectorizers: NLTK's English list plus the custom additions
custom_stop_words = stopwords.words('english') + addl_stop_words

The below function is taken from Selva Prabhakaran's post on Machine Learning Plus which can be found here: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

In [27]:
#lemmatization
@functools.lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Map a word's POS tag to the WordNet constant that lemmatize() accepts.

    The word is tagged on its own (as a one-element list) with nltk.pos_tag and
    the first letter of the resulting tag selects the WordNet part of speech.
    Any tag that does not start with J, N, V or R falls back to noun.

    Results are memoized: the tag depends only on the word itself, and the same
    tokens recur constantly across the corpus, so caching avoids re-running the
    tagger for every occurrence.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

# Tokenize each tweet and lemmatize every token using its own part of speech;
# `data` ends up as one list of lemmas per tweet, in the same order as data_list.
data = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token))
     for token in nltk.word_tokenize(sentence)]
    for sentence in data_list
]

In [28]:
# Final step before vectorizing: glue each tweet's lemmas back into one space-separated string
final = [' '.join(tokens) for tokens in data]

## Testing CountVectorizer

### CountVectorizer + LSA

In [29]:
# Binary bag-of-words over 1- to 3-grams; terms must appear in at least 5 tweets and in at most 90% of them
cv1 = CountVectorizer(stop_words=custom_stop_words,ngram_range=(1,3),min_df = 5, max_df=0.90,binary=True)
X_cv = cv1.fit_transform(final)
# Read the shape straight off the sparse matrix: calling .toarray() first would
# allocate the full dense documents x terms array just to look at its dimensions.
X_cv.shape

  'stop_words.' % sorted(inconsistent))


(75008, 27387)

In [69]:
#vector_df = pd.DataFrame(X_cv.toarray(), index=df.date, columns=cv1.get_feature_names())

In [30]:
# LSA: reduce the document-term matrix to `topic_amount` components with truncated SVD
topic_amount = 10
# Fixed seed so the randomized SVD solver gives the same components on every run
lsa = TruncatedSVD(topic_amount, random_state=42)
doc_topic = lsa.fit_transform(X_cv)
# Total share of variance captured by the retained components
sum(lsa.explained_variance_ratio_)

0.041367494639639235

In [31]:
#function taken from Metis lecture slides to help display the topics and the top words per topic
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print a heading and the highest-weighted terms for every topic of a fitted model.

    model         -- object exposing `components_`, one row of term weights per topic
    feature_names -- term names, indexable by the column positions of `components_`
    no_top_words  -- how many of the top-weighted terms to list per topic
    topic_names   -- optional labels, one per topic; a missing or empty label
                     falls back to printing the topic's number
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)
        # argsort is ascending, so walk it backwards to get the largest weights first
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[pos] for pos in top_positions]
        print(", ".join(top_terms))

In [32]:
# Show the 10 highest-weighted terms of each LSA component.
# NOTE: get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out().
display_topics(lsa, cv1.get_feature_names(), 10)


Topic  0
china, trump, test, wuhan, spread, state, report, amp, country, president

Topic  1
trump, test, president, american, president trump, state, response, positive, test positive, house

Topic  2
test, positive, test positive, state, report, work, kit, test kit, result, cdc

Topic  3
trump, test, china, positive, test positive, wuhan, president trump, president, donald trump, donald

Topic  4
spread, stop, trump, test, stop spread, prevent, prevent spread, positive, test positive, slow

Topic  5
state, report, united, united state, spread, house, reopen, official, white, white house

Topic  6
wuhan, report, chinese, cause, pneumonia, city, wuhan china, trump, quarantine, sars

Topic  7
report, house, bill, white, white house, spread, package, stimulus, relief, democrat

Topic  8
wuhan, house, bill, white, white house, democrat, package, stimulus, relief, chinese

Topic  9
amp, american, country, house, realdonaldtrump, watch, social, infect, white, distance


### CountVectorizer + NMF

In [33]:
# Binary bag-of-words over 1- to 3-grams (min_df=5, max_df=0.9), factorized with NMF
cv_nmf = CountVectorizer(stop_words = custom_stop_words,ngram_range=(1,3),min_df = 5, max_df=.9,binary=True)
cv_nmf_doc_word = cv_nmf.fit_transform(final)
# 10 topics; fixed seed so the factorization (and the topic order) is reproducible between runs
nmf_model = NMF(10, random_state=42)
cv_nmf_doc_topic = nmf_model.fit_transform(cv_nmf_doc_word)
display_topics(nmf_model, cv_nmf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
china, sars, flu, travel, wuhan china, outside, gt, outside china, country, sars flu

Topic  1
trump, president, president trump, response, american, donald, donald trump, administration, trump administration, briefing

Topic  2
test, positive, test positive, kit, negative, test kit, result, cdc, weinstein, harvey

Topic  3
work, help, want, well, life, use, stay, country, still, crisis

Topic  4
spread, stop, stop spread, prevent, prevent spread, country, slow, cdc, fear, slow spread

Topic  5
state, united, united state, reopen, order, governor, official, country, york, government

Topic  6
wuhan, chinese, pneumonia, cause, city, wuhan china, sars, wuhan pneumonia, authority, million

Topic  7
report, italy, hubei, province, china report, break, bring, toll, italy report, hubei province

Topic  8
house, bill, white, white house, package, stimulus, democrat, relief, senate, dems

Topic  9
amp, realdonaldtrump, response, watch, dr, lie, usa, article, vaccine, support


## TF-IDF

### TF-IDF + LSA

In [34]:
# TF-IDF over 1- to 3-grams; terms must appear in at least 10 tweets and in at most 90% of them.
# binary=True is passed, so repeated occurrences within one tweet are presumably not counted extra - confirm.
tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), min_df = 10, max_df=.9, binary=True)

In [35]:
# Fit the TF-IDF vocabulary on the lemmatized tweets and build the document-term matrix
X_tfidf = tfidf.fit_transform(final)
#tfidf_df = pd.DataFrame(X_tfidf.toarray(), index=df.date, columns=tfidf.get_feature_names())

  'stop_words.' % sorted(inconsistent))


In [36]:
# LSA on the TF-IDF matrix: 10 components, with a fixed seed so the randomized
# SVD solver gives the same components on every run
lsa_tfidf = TruncatedSVD(10, random_state=42)
tfidf_lsa_doc_topic = lsa_tfidf.fit_transform(X_tfidf)
# Total share of variance captured by the retained components
sum(lsa_tfidf.explained_variance_ratio_)

0.01913866259133547

In [37]:
# Show the 10 highest-weighted terms of each TF-IDF LSA component.
# NOTE: get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out().
display_topics(lsa_tfidf, tfidf.get_feature_names(), 10)


Topic  0
china, trump, test, wuhan, spread, report, state, work, positive, country

Topic  1
china, wuhan, report, pneumonia, wuhan china, china report, chinese, cause, sars, outside china

Topic  2
test, positive, test positive, china, report, wuhan, pneumonia, weinstein, harvey, harvey weinstein

Topic  3
trump, china, president, president trump, response, donald, donald trump, test, administration, american

Topic  4
report, italy, state, china report, bring, italy report, county, break, report bring, toll

Topic  5
spread, state, stop, country, stop spread, united, united state, help, prevent, prevent spread

Topic  6
wuhan, chinese, pneumonia, cause, wuhan china, sars, city, wuhan pneumonia, flu, official

Topic  7
state, united, cancel, united state, quarantine, amid, china, late, fear, business

Topic  8
bad, state, flu, good, vaccine, well, country, united, united state, quarantine

Topic  9
fuck, house, white, white house, state, mask, quarantine, want, watch, face


In [38]:
#with open('lsa_tfidf_model.pkl', 'wb') as handle:
    #pickle.dump(lsa_tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

### TF-IDF + NMF

In [39]:
# TF-IDF + NMF with 15 topics
topic_amount = 15
# Despite its name, `nmf_tfidf` is the TF-IDF vectorizer; the NMF model itself is `nmf_model_2`
nmf_tfidf = TfidfVectorizer(stop_words=custom_stop_words,ngram_range=(1,3), min_df = 5, max_df=.9, binary=True)
tfidf_nmf_doc_word = nmf_tfidf.fit_transform(final)
# Fixed seed: the topic labels written by hand further down are matched to topics by
# position, so the factorization has to come out the same on every run.
# NOTE(review): re-check those labels against the topics this seed produces.
nmf_model_2 = NMF(topic_amount, random_state=42)
tfidf_nmf_doc_topic = nmf_model_2.fit_transform(tfidf_nmf_doc_word)
display_topics(nmf_model_2, nmf_tfidf.get_feature_names(), 10)

  'stop_words.' % sorted(inconsistent))



Topic  0
well, help, want, life, amp, right, stay, look, still, please

Topic  1
china, outside, outside china, china report, wuhan china, travel, sars, hubei, flu, pneumonia

Topic  2
test, positive, test positive, weinstein, harvey, harvey weinstein, negative, weinstein test, weinstein test positive, harvey weinstein test

Topic  3
trump, president, response, president trump, donald, donald trump, american, administration, trump administration, trump response

Topic  4
report, italy, china report, bring, italy report, report bring, break, hubei, county, bring report

Topic  5
spread, stop, stop spread, prevent, prevent spread, country, slow, slow spread, cdc, official

Topic  6
fuck, shit, bitch, hate, bro, give, everything, fuck shit, damn, miss

Topic  7
wuhan, chinese, cause, pneumonia, wuhan china, city, wuhan pneumonia, sars, mystery, pneumonia wuhan

Topic  8
state, united, united state, reopen, official, york, county, washington, cdc, governor

Topic  9
cancel, fear, mobile, 

In [49]:
#with open('nmf_tfidf_model.pkl', 'wb') as handle:
    #pickle.dump(nmf_model_2, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

**This is it! This TF-IDF + NMF model produces the best topics I have found yet. I'm going to save the resulting doc_topic matrix and use it for visualization.**

In [41]:
# Created these topic names after seeing the top tweets for each one.
# The list is used as the column labels of the TF-IDF + NMF doc-topic matrix, so the
# i-th name labels the i-th topic: 15 names for the 15 topics, in topic order.
Topics = ['General', 'Initial Stories', 'People testing positive', 'Trumps Response to Covid-19', 'Italy Covid-19 Outbreak'
          ,'Stopping the spread','Anger','Coronavirus growing in China','United States Covid-19 outbreak', 
          'Covid-19 cancellations', 'Second Wave Warnings', 'Cruise and Quarantines','Wearing a mask', 
          'White House Briefings', 'Working from home']


In [45]:
# Saving the doc-topic matrix for later use: one row per de-duplicated tweet
# (indexed by its cleaned text) and one column per named topic
save = pd.DataFrame(tfidf_nmf_doc_topic, index=X,columns=Topics)
# `save` is indexed by tweet text while `df.date` is indexed by row label, so
# `save['date'] = df.date` would align on labels that never match and fill the
# column with NaT. X keeps the original row labels of the tweets it retained, so
# look the dates up by those labels and assign them by position instead.
save['date'] = df.loc[X.index, 'date'].values

#with open('nmf_tfidf_doc_topic.pkl', 'wb') as handle:
    #pickle.dump(save, handle, protocol=pickle.HIGHEST_PROTOCOL)
    