<a href="https://colab.research.google.com/github/slp22/data-engineering-project/blob/main/engineering_monkeypox_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Data Engineering | Pipeline

# Monkeypox Tweets

## Imports

In [59]:
import json
import logging
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import os, shutil, itertools
import pandas as pd
import pathlib as Path
import pickle
import PIL
import random
import seaborn as sns
import sklearn as sk
import warnings
import zipfile


import nltk
import re
import en_core_web_sm
import string
pd.set_option('display.max_colwidth', None)
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from wordcloud import WordCloud, STOPWORDS


from sqlite3 import connect
import time
from datetime import datetime
from dateutil.parser import parse
from dateutil.relativedelta import *
from dateutil.easter import *
from dateutil.rrule import *
from dateutil.parser import *
from datetime import *


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Google Drive

In [60]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

# https://colab.research.google.com/notebooks/snippets/sheets.ipynb#scrollTo=JiJVCmu3dhFa

# authorize access 
from google.colab import auth
auth.authenticate_user()

# read in from Google Sheets

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Pyspark



In [61]:
# # https://towardsdatascience.com/pyspark-on-google-colab-101-d31830b238be
# # https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [62]:
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

In [63]:
!tar -xf spark-3.0.0-bin-hadoop3.2.tgz

In [64]:
!pip install -q findspark

In [65]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"

In [66]:
import findspark
findspark.init()

In [67]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, col, lower
from pyspark.sql.types import StructType,StructField, StringType, IntegerType


In [None]:
! pip install geopandas

In [191]:
pip install clean-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting clean-text
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting ftfy<7.0,>=6.0
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.5 MB/s 
[?25hCollecting emoji<2.0.0,>=1.0.0
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 9.8 MB/s 
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=eab30cdac1476652688061ea7cf293746db2ee617e4ee0d302710a0ac770b0e7
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: ftfy, emoji, clean-text
Successfully installed clean-text-0.6.0 emoji-1.7.0 ftfy-6.1.1


In [193]:
from cleantext import clean

## 1 | Pipeline Design


* **Business Problem:** Can we build a dashboard to monitor top trending topics on Twitter about monkeypox?
* **Data source:** [Kaggle Tweets on Monkeypox ](https://www.kaggle.com/datasets/thakurnirmalya/monkeypox2022tweets)
* **Impact Hypothesis:** 

## 2 | Data Ingestion

#### 2.1  [Twitter Dataset on the 2022 MonkeyPox Outbreak](https://www.kaggle.com/datasets/thakurnirmalya/monkeypox2022tweets) 
* Dataset is list of TweetIDs

#### 2.2  [Twitter Hydrating](https://towardsdatascience.com/learn-how-to-easily-hydrate-tweets-a0f393ed340e#:~:text=Hydrating%20Tweets) with [DocNow Hydrator](https://github.com/DocNow/hydrator/releases)
* Processed on local machine
* Save to Google Sheets

#### 2.3  Import [hydrated tweets](https://drive.google.com/drive/folders/1NbddxuSF3v5YuOgjvA1G4WgfPUlKfiul?usp=sharing) from GoogleDrive to Colab
* Six Google Sheets of data

## 3 | Exploratory Data Analysis

### 3.1  Explore one set: `tweets` (n = 12,656) 

In [69]:
w = ['TweetIDs_Part1', 'TweetIDs_Part2', 'TweetIDs_Part3', 'TweetIDs_Part4', 'TweetIDs_Part5', 'TweetIDs_Part6']
tweets = pd.DataFrame.from_records(gc.open(w[0]).sheet1.get_all_values())

In [194]:
tweets.head(2)

Unnamed: 0,date,user_screen_name,text,tweet_url,user_location,hashtags
5817,2022-01-02,JoelPau68848306,@JohannaSzabo1 @postblocksyndro @igfbss @GarethW84521928 @NSWHealth @Dom_Perrottet @BradHazzard Hopefully i catch the monkey pox and covid at the same time ! I’m sooooo scared !,https://twitter.com/JoelPau68848306/status/1527363844259332096,,
5820,2022-01-02,cosborne687,@CandiceBergenMP You are the problem . Did you write this during the monkey pox discovery? You are the problem.,https://twitter.com/cosborne687/status/1527372079254777860,"Nipissing, Ontario",


In [71]:
tweets.columns = tweets.iloc[0]
tweets = tweets.drop(index=tweets.index[0])

In [72]:
tweets.head(2)

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
1,,Wed May 18 21:49:25 +0000 2022,,,,1,1527043704967528453,theofficepirate,1.5270433568781555e+18,140472501.0,...,36791,6088,255,,Yates,Jyates5,36441,,,False
2,,Fri May 20 20:43:44 +0000 2022,,,,0,1527751952448344065,,,,...,134,553,3,"Chicago, IL",Patrick,LeftistHank,10782,,,False


In [73]:
cols_list = list(tweets.columns)
cols_list

['coordinates',
 'created_at',
 'hashtags',
 'media',
 'urls',
 'favorite_count',
 'id',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'lang',
 'place',
 'possibly_sensitive',
 'quote_id',
 'retweet_count',
 'retweet_id',
 'retweet_screen_name',
 'source',
 'text',
 'tweet_url',
 'user_created_at',
 'user_id',
 'user_default_profile_image',
 'user_description',
 'user_favourites_count',
 'user_followers_count',
 'user_friends_count',
 'user_listed_count',
 'user_location',
 'user_name',
 'user_screen_name',
 'user_statuses_count',
 'user_time_zone',
 'user_urls',
 'user_verified']

In [74]:
tweets['hashtags']
tweets['hashtags'].nunique()

528

In [75]:
tweets['possibly_sensitive'][0:2]

1    
2    
Name: possibly_sensitive, dtype: object

In [76]:
tweets['text']

1                                                                                                                                                                                                                    @theofficepirate You bro remember them talking about monkey pox lol
2                                                                                                                                                                                                                                          oh monkey POX? I thought you said monkey POGS
3                                                                                                                                                                                                                          If I get monkey pox y’all gotta bring me the juiciest bananas
4        Great, the initial Monkey pox symptoms read like how I generally feel. People in both England and Scotland have been quarantined.\n\nInitial symptom

In [77]:
tweets['tweet_url'][0:2]

1        https://twitter.com/Jyates5/status/1527043704967528453
2    https://twitter.com/LeftistHank/status/1527751952448344065
Name: tweet_url, dtype: object

In [78]:
tweets['lang'].nunique() #40

40

In [79]:
tweets['lang'].unique()

array(['en', 'ja', 'fr', 'de', 'pl', 'nl', 'qme', 'und', 'da', 'in', 'ta',
       'pt', 'es', 'et', 'ar', 'ru', 'tl', 'el', 'zh', 'qht', 'fi', 'zxx',
       'cy', 'it', 'art', 'tr', 'ht', 'qst', 'ko', 'sr', 'iw', 'ml', 'ro',
       'bn', 'sv', 'hi', 'th', 'ca', 'lv', 'lang'], dtype=object)

In [80]:
print('English entries:', (tweets[tweets["lang"] == 'en'].count())['lang'])

English entries: 12137


In [81]:
tweets = tweets[tweets['lang'] == 'en']
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12137 entries, 1 to 12656
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   coordinates                 12137 non-null  object
 1   created_at                  12137 non-null  object
 2   hashtags                    12137 non-null  object
 3   media                       12137 non-null  object
 4   urls                        12137 non-null  object
 5   favorite_count              12137 non-null  object
 6   id                          12137 non-null  object
 7   in_reply_to_screen_name     12137 non-null  object
 8   in_reply_to_status_id       12137 non-null  object
 9   in_reply_to_user_id         12137 non-null  object
 10  lang                        12137 non-null  object
 11  place                       12137 non-null  object
 12  possibly_sensitive          12137 non-null  object
 13  quote_id                    12137 non-null  ob

In [82]:
tweets['user_created_at']

1        Fri Apr 01 00:29:41 +0000 2011
2        Fri Apr 01 13:14:03 +0000 2022
3        Fri Apr 01 15:12:02 +0000 2011
4        Fri Apr 01 16:27:26 +0000 2011
5        Fri Apr 01 17:41:17 +0000 2016
                      ...              
12652    Wed Sep 30 18:20:38 +0000 2015
12653    Wed Sep 30 19:12:03 +0000 2020
12654    Wed Sep 30 22:42:35 +0000 2020
12655    Wed Sep 30 23:06:12 +0000 2009
12656    Wed Sep 30 23:38:29 +0000 2009
Name: user_created_at, Length: 12137, dtype: object

In [83]:
tweets['date'] = pd.to_datetime(tweets['user_created_at'], 
                                  format='%a %b %d %H:%M:%S +0000 %Y', 
                                  errors='coerce').dt.date

In [84]:
tweets[['date']]

Unnamed: 0,date
1,2011-04-01
2,2022-04-01
3,2011-04-01
4,2011-04-01
5,2016-04-01
...,...
12652,2015-09-30
12653,2020-09-30
12654,2020-09-30
12655,2009-09-30


In [85]:
tweets['user_id']

1                  275288972
2        1509881738302001155
3                  275573209
4                  275604178
5         715957219972530180
                ...         
12652             3826370843
12653    1311383290688004097
12654    1311436211429543943
12655               78734566
12656               78741475
Name: user_id, Length: 12137, dtype: object

In [86]:
tweets['user_location']
tweets['user_location'].nunique()

4319

In [87]:
tweets['user_screen_name']

1                Jyates5
2            LeftistHank
3         MyNameIsRickyM
4           Just_sue_now
5             JackPaceSr
              ...       
12652    theoceanlawyers
12653          RolexCola
12654            DeeKno_
12655         ChipFranks
12656          moni_lisa
Name: user_screen_name, Length: 12137, dtype: object

In [88]:
tweets = tweets[['date',
                 'user_screen_name',
                 'text',
                 'tweet_url',
                 'user_location',
                 'hashtags']]
tweets.head(2)

Unnamed: 0,date,user_screen_name,text,tweet_url,user_location,hashtags
1,2011-04-01,Jyates5,@theofficepirate You bro remember them talking about monkey pox lol,https://twitter.com/Jyates5/status/1527043704967528453,,
2,2022-04-01,LeftistHank,oh monkey POX? I thought you said monkey POGS,https://twitter.com/LeftistHank/status/1527751952448344065,"Chicago, IL",


In [89]:
tweets = tweets.sort_values('date')
tweets.head(2)

Unnamed: 0,date,user_screen_name,text,tweet_url,user_location,hashtags
464,2006-12-15,tash,Sorry I can't come to work today I've got monkey pox. Still not the most ridiculous reason I've heard for not coming to work. My favourite one remains my bunny's got the shits - on xmas day. https://t.co/RGjnlyCYEo,https://twitter.com/tash/status/1525438720287166464,Earth,
2501,2007-01-08,BBCNews,Several monkeybox cases have been found in the UK - but how dangerous is the virus?\n\nhttps://t.co/Y8FG3DVyaE,https://twitter.com/BBCNews/status/1527739373772578816,London,


In [90]:
# https://stackoverflow.com/questions/22898824/filtering-pandas-dataframes-on-dates
# https://stackoverflow.com/questions/5619489/troubleshooting-descriptor-date-requires-a-datetime-datetime-object-but-rec

tweets = tweets[(tweets['date'] > date(2022,1,1))] 
tweets

Unnamed: 0,date,user_screen_name,text,tweet_url,user_location,hashtags
5817,2022-01-02,JoelPau68848306,@JohannaSzabo1 @postblocksyndro @igfbss @GarethW84521928 @NSWHealth @Dom_Perrottet @BradHazzard Hopefully i catch the monkey pox and covid at the same time ! I’m sooooo scared !,https://twitter.com/JoelPau68848306/status/1527363844259332096,,
5820,2022-01-02,cosborne687,@CandiceBergenMP You are the problem . Did you write this during the monkey pox discovery? You are the problem.,https://twitter.com/cosborne687/status/1527372079254777860,"Nipissing, Ontario",
5821,2022-01-02,localpirate7,@FoxNews Tf is monkey pox. Just stop it,https://twitter.com/localpirate7/status/1527058536856887297,,
5818,2022-01-02,Charles10151978,So then when are we locking down and shutting a global economy for Monkey Pox ? Watch this space … Vaccines incoming 🙄🙄,https://twitter.com/Charles10151978/status/1526581077497757701,,
5816,2022-01-02,ORnBNBucksCrew,Ayo we’re going to need to change the name “monkey pox” because even I can see how this could be a racist thing,https://twitter.com/ORnBNBucksCrew/status/1527058222657445888,🇺🇸,
...,...,...,...,...,...,...
1372,2022-05-20,USAF_Brat66,@hrkbenowen Don’t anyone be scared and jump to any conclusions. Monkey pox is a cousin of small pox and is NOT high contagious so there is no reason to be alarmed. There will be no reason for our Govt to do lock downs or force things we don’t want to do They may try but don’t believe them!,https://twitter.com/USAF_Brat66/status/1528100897951203329,"Tennessee, USA",
1366,2022-05-20,iwillnotsubmit1,"WW3 is on the horizon, Covid-19, and Monkey Pox about to be released into the world we'll be lucky if any humans survive? Nice work Joe.\n#LetsGoBrandon",https://twitter.com/iwillnotsubmit1/status/1528118077166174209,"New Jersey, USA",LetsGoBrandon
1367,2022-05-20,TwinomujuniDis5,@cmyeaton @dylanbgeorge @mlipsitch @rebeccajk13 Hello Dr am Dismas from UGANDA teach me about new disease monkey pox,https://twitter.com/TwinomujuniDis5/status/1527781932678926336,,
1368,2022-05-20,TwinomujuniDis5,@DrTomFrieden Hey doctor teach me about new disease monkey pox,https://twitter.com/TwinomujuniDis5/status/1527781548585541632,,


In [91]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1248 entries, 5817 to 1369
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              1248 non-null   object
 1   user_screen_name  1248 non-null   object
 2   text              1248 non-null   object
 3   tweet_url         1248 non-null   object
 4   user_location     1248 non-null   object
 5   hashtags          1248 non-null   object
dtypes: object(6)
memory usage: 68.2+ KB


In [92]:
tweets.reset_index(drop=True)

Unnamed: 0,date,user_screen_name,text,tweet_url,user_location,hashtags
0,2022-01-02,JoelPau68848306,@JohannaSzabo1 @postblocksyndro @igfbss @GarethW84521928 @NSWHealth @Dom_Perrottet @BradHazzard Hopefully i catch the monkey pox and covid at the same time ! I’m sooooo scared !,https://twitter.com/JoelPau68848306/status/1527363844259332096,,
1,2022-01-02,cosborne687,@CandiceBergenMP You are the problem . Did you write this during the monkey pox discovery? You are the problem.,https://twitter.com/cosborne687/status/1527372079254777860,"Nipissing, Ontario",
2,2022-01-02,localpirate7,@FoxNews Tf is monkey pox. Just stop it,https://twitter.com/localpirate7/status/1527058536856887297,,
3,2022-01-02,Charles10151978,So then when are we locking down and shutting a global economy for Monkey Pox ? Watch this space … Vaccines incoming 🙄🙄,https://twitter.com/Charles10151978/status/1526581077497757701,,
4,2022-01-02,ORnBNBucksCrew,Ayo we’re going to need to change the name “monkey pox” because even I can see how this could be a racist thing,https://twitter.com/ORnBNBucksCrew/status/1527058222657445888,🇺🇸,
...,...,...,...,...,...,...
1243,2022-05-20,USAF_Brat66,@hrkbenowen Don’t anyone be scared and jump to any conclusions. Monkey pox is a cousin of small pox and is NOT high contagious so there is no reason to be alarmed. There will be no reason for our Govt to do lock downs or force things we don’t want to do They may try but don’t believe them!,https://twitter.com/USAF_Brat66/status/1528100897951203329,"Tennessee, USA",
1244,2022-05-20,iwillnotsubmit1,"WW3 is on the horizon, Covid-19, and Monkey Pox about to be released into the world we'll be lucky if any humans survive? Nice work Joe.\n#LetsGoBrandon",https://twitter.com/iwillnotsubmit1/status/1528118077166174209,"New Jersey, USA",LetsGoBrandon
1245,2022-05-20,TwinomujuniDis5,@cmyeaton @dylanbgeorge @mlipsitch @rebeccajk13 Hello Dr am Dismas from UGANDA teach me about new disease monkey pox,https://twitter.com/TwinomujuniDis5/status/1527781932678926336,,
1246,2022-05-20,TwinomujuniDis5,@DrTomFrieden Hey doctor teach me about new disease monkey pox,https://twitter.com/TwinomujuniDis5/status/1527781548585541632,,


In [163]:
tweets.to_csv('/content/drive/MyDrive/tweets_eda_clean.csv')

### 3.2  Import rest of tweet data: `df` (n = 127,940)

In [94]:
w = ['TweetIDs_Part1', 'TweetIDs_Part2', 'TweetIDs_Part3', 'TweetIDs_Part4', 'TweetIDs_Part5', 'TweetIDs_Part6']

df_1 = pd.DataFrame.from_records(gc.open(w[0]).sheet1.get_all_values())
df_2 = pd.DataFrame.from_records(gc.open(w[1]).sheet1.get_all_values())
df_3 = pd.DataFrame.from_records(gc.open(w[2]).sheet1.get_all_values())
df_4 = pd.DataFrame.from_records(gc.open(w[3]).sheet1.get_all_values())
df_5 = pd.DataFrame.from_records(gc.open(w[4]).sheet1.get_all_values())
df_6 = pd.DataFrame.from_records(gc.open(w[5]).sheet1.get_all_values())


In [95]:
df_6.tail(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
127939,,Sat Jul 23 00:00:23 +0000 2022,,,https://twitter.com/i/broadcasts/1OdKrBzXyMQKX,0,1550631877781835776,,,,...,1818,942,34,Los Angeles,(((Luke Ford))),lukeford,61434,,http://www.lukeford.net,False
127940,,Sat Jul 23 00:00:07 +0000 2022,,,http://crweworld.com/article/world/2448896/2-children-diagnosed-with-monkeypox-in-us,0,1550631810970750976,,,,...,1051,2047,42,"Las Vegas, NV",Crwe World,CrweWorld,1482340,,http://crweworld.com,False


In [96]:
dfs = [df_1, df_2, df_3, df_4, df_5, df_6]

for d in dfs: 
  d.columns = d.iloc[0]
  d = d.drop(index=d.index[0],
               axis=0,
               inplace=True)

In [97]:
df_2.head(3)

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
1,,Thu May 26 20:31:06 +0000 2022,,,,0,1529923099424178191,dustinbennett76,1.5299214221344973e+18,43171823.0,...,182,501,3,,Work in Progress,Kenny_Swift,8117,,,False
2,,Sun May 22 22:20:07 +0000 2022,,,,0,1528500983893983233,OnlineAlison,1.5284842269699236e+18,24115438.0,...,1,36,0,,Saniye,biirSaniye,238,,,False
3,,Thu May 26 22:54:22 +0000 2022,,,,1,1529959154059730946,,,,...,2297,1311,8,"Johannesburg, South Africa",Lebogang,lebza_mtwana,70755,,,False


In [98]:
df = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6])
df.tail()

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
127936,,Sat Jul 23 00:00:17 +0000 2022,,,https://www.cdc.gov/poxvirus/monkeypox/transmission.html https://twitter.com/NBCNews/status/1550620740235530240,1,1550631852393828352,,,,...,529,972,21,,Wendy,wmzraz,41861,,,False
127937,,Sat Jul 23 00:00:00 +0000 2022,,,https://endpts.com/bavarian-nordics-monkeypox-vaccine-wins-positive-opinion-from-ema-committee/,4,1550631781396729856,,,,...,24498,40,530,Global,Endpoints News,endpts,44967,,http://endpts.com,False
127938,,Sat Jul 23 00:00:18 +0000 2022,Monkeypox HealthierJC,https://twitter.com/HealthierJC/status/1550631856336506886/photo/1,,0,1550631856336506886,,,,...,2330,1582,32,"Jersey City, NJ",Healthier JC,HealthierJC,8091,,http://healthierjc.com,False
127939,,Sat Jul 23 00:00:23 +0000 2022,,,https://twitter.com/i/broadcasts/1OdKrBzXyMQKX,0,1550631877781835776,,,,...,1818,942,34,Los Angeles,(((Luke Ford))),lukeford,61434,,http://www.lukeford.net,False
127940,,Sat Jul 23 00:00:07 +0000 2022,,,http://crweworld.com/article/world/2448896/2-children-diagnosed-with-monkeypox-in-us,0,1550631810970750976,,,,...,1051,2047,42,"Las Vegas, NV",Crwe World,CrweWorld,1482340,,http://crweworld.com,False


In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229181 entries, 1 to 127940
Data columns (total 35 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   coordinates                 229181 non-null  object
 1   created_at                  229181 non-null  object
 2   hashtags                    229181 non-null  object
 3   media                       229181 non-null  object
 4   urls                        229181 non-null  object
 5   favorite_count              229181 non-null  object
 6   id                          229181 non-null  object
 7   in_reply_to_screen_name     229181 non-null  object
 8   in_reply_to_status_id       229181 non-null  object
 9   in_reply_to_user_id         229181 non-null  object
 10  lang                        229181 non-null  object
 11  place                       229181 non-null  object
 12  possibly_sensitive          229181 non-null  object
 13  quote_id                    2

In [100]:
df.to_csv('/content/drive/MyDrive/tweets_raw.csv')

### 3.3  Clean rest of tweet data

In [101]:
df.tail(2)

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
127939,,Sat Jul 23 00:00:23 +0000 2022,,,https://twitter.com/i/broadcasts/1OdKrBzXyMQKX,0,1550631877781835776,,,,...,1818,942,34,Los Angeles,(((Luke Ford))),lukeford,61434,,http://www.lukeford.net,False
127940,,Sat Jul 23 00:00:07 +0000 2022,,,http://crweworld.com/article/world/2448896/2-children-diagnosed-with-monkeypox-in-us,0,1550631810970750976,,,,...,1051,2047,42,"Las Vegas, NV",Crwe World,CrweWorld,1482340,,http://crweworld.com,False


In [102]:
print('English entries:', (df[df["lang"] == 'en'].count())['lang'])

English entries: 210812


In [103]:
df = df[(df['lang'] == 'en')]
df['lang'].unique()

array(['en'], dtype=object)

In [104]:
df['date'] = pd.to_datetime(df['user_created_at'],
                            format='%a %b %d %H:%M:%S +0000 %Y', 
                            errors='coerce').dt.date
df[['date']][:2]

Unnamed: 0,date
1,2011-04-01
2,2022-04-01


In [180]:
df = df[(df['date'] > date(2022,1,1))] 
df.head(2)

Unnamed: 0,date,user_screen_name,text,tweet_url,user_location,hashtags
2,2022-04-01,LeftistHank,oh monkey POX? I thought you said monkey POGS,https://twitter.com/LeftistHank/status/1527751952448344065,"Chicago, IL",
6,2022-04-01,YUCCAYAWN,lemme be clear ... if you like your monkey pox im afraid of where my life is going,https://twitter.com/YUCCAYAWN/status/1527396267633963019,,


In [183]:
print(df['user_location'].value_counts())
print('\n', 'Num unique:', df['user_location'].nunique())

                                15161
Sullivans Island                 1981
United States                     464
Binance Smart Chain               225
Pennsylvania                      168
                                ...  
Soon to be miles away.              1
Lagos Nigeria 🇳🇬                    1
Orkney, South Africa                1
God                                 1
Port Elizabeth, South Africa        1
Name: user_location, Length: 3797, dtype: int64

 Num unique: 3797


In [None]:
print(df['user_screen_name'].value_counts())
print('\n', 'Num unique:', df['user_screen_name'].nunique())

In [None]:
df = df[['date',
         'user_screen_name',
         'text',
         'tweet_url',
         'user_location',
         'hashtags']]
df.head(2)

In [None]:
# remove numbers, punctuation, and capital letters
alphanumeric = lambda x: re.sub('\w*\d\w*',' ', str(x))
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
                          
df['user_location'] = df.user_location.map(alphanumeric).map(punc_lower)
df.tail(20)

In [227]:
# remove emojis
df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

In [None]:
df.reset_index(drop=True)

In [None]:
duplicateRows = df[df.duplicated()]
duplicateRows

In [111]:
df.drop_duplicates(subset=None, inplace=True)

### 3.4  Corpus: `tweet_data` (n = 28,281)

In [None]:
df.info()

In [164]:
# Save corpus
df.to_pickle('/content/tweets.pkl')
df.to_csv(r'/content/tweets.csv', index=False)

# 4 | Storage

### 4.1  Create SQL database: `monkeypox.db`

#### helper functions

In [114]:
# https://towardsdatascience.com/have-a-sql-interview-coming-up-ace-it-using-google-colab-6d3c0ffb29dc

def pd_to_sqlDB(input_df: pd.DataFrame,
                table_name: str,
                db_name: str = 'default.db') -> None:

    # # Setup local logging
    # logging.basicConfig(level=logging.INFO,
    #                     format='%(asctime)s %(levelname)s: %(message)s',
    #                     datefmt='%Y-%m-%d %H:%M:%S')

    # Find columns in the dataframe
    cols = input_df.columns
    cols_string = ','.join(cols)
    val_wildcard_string = ','.join(['?'] * len(cols))

    # Connect to a DB file if it exists, else create a new file
    con = sqlite3.connect(db_name)
    cur = con.cursor()
    # logging.info(f'SQL DB {db_name} created')

    # Create table
    sql_string = f"""CREATE TABLE {table_name} ({cols_string});"""
    cur.execute(sql_string)
    # logging.info(f'SQL Table {table_name} created with {len(cols)} columns')

    # Upload df
    rows_to_upload = input_df.to_dict(orient='split')['data']
    sql_string = f"""INSERT INTO {table_name} ({cols_string}) VALUES ({val_wildcard_string});"""    
    cur.executemany(sql_string, rows_to_upload)
    # logging.info(f'{len(rows_to_upload)} rows uploaded to {table_name}')
  
    # Commit the changes and close the connection
    con.commit()
    con.close()

In [115]:
#  https://towardsdatascience.com/have-a-sql-interview-coming-up-ace-it-using-google-colab-6d3c0ffb29dc

def sql_query_to_pd(sql_query_string: str, db_name: str ='mpox.db') -> pd.DataFrame:
    
    # Connect to the SQL DB
    con = sqlite3.connect(db_name)

    # Execute the SQL query
    cursor = con.execute(sql_query_string)

    # Fetch the data and column names
    result_data = cursor.fetchall()
    cols = [description[0] for description in cursor.description]

    # Close the connection
    con.close()

    # Return as df
    return pd.DataFrame(result_data, columns=cols)

###  4.2  Set up database

In [None]:
# https://towardsdatascience.com/have-a-sql-interview-coming-up-ace-it-using-google-colab-6d3c0ffb29dc

# Read csv as df
input_df = pd.read_csv('/content/tweets.csv')

# Upload df to as a SQL table
pd_to_sqlDB(input_df,
            table_name='tweets',
            db_name='monkeypox.db')



### 4.3  Save tweets to database: `corpus`

In [None]:
sql_query_string = """
    SELECT *
    FROM tweets
"""
corpus = sql_query_to_pd(sql_query_string, db_name='monkeypox.db')
corpus

In [168]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28281 entries, 0 to 28280
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              28281 non-null  object
 1   user_screen_name  28281 non-null  object
 2   text              28281 non-null  object
 3   tweet_url         28281 non-null  object
 4   user_location     13117 non-null  object
 5   hashtags          2825 non-null   object
dtypes: object(6)
memory usage: 1.3+ MB


### 4.4  Count locations: `location`

In [217]:
sql_query_string = """
  SELECT user_location, COUNT(user_location) AS count
  FROM tweets
  GROUP BY user_location
  ORDER BY
    COUNT(user_location) DESC
"""


In [218]:
location = sql_query_to_pd(sql_query_string, db_name='monkeypox.db')

In [204]:
location.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3796 entries, 0 to 3795
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   user_location  3795 non-null   object
 1   count          3796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.4+ KB


In [221]:
location.tail(10)

Unnamed: 0,user_location,count
3786,j edgar hoover building,1
3787,magadmcrypto,1
3788,,1
3789,,1
3790,,1
3791,,1
3792,,1
3793,your local strawberry patch,1
3794,,1
3795,none,0


In [222]:
sql_query_string = """
  SELECT user_location, COUNT(user_location) AS count
  FROM tweets
  GROUP BY user_location
  ORDER BY
    COUNT(user_location) DESC
"""


In [223]:
location = sql_query_to_pd(sql_query_string, db_name='monkeypox.db')

In [224]:
location.tail(10)

Unnamed: 0,user_location,count
3786,🚫 J. Edgar Hoover Building,1
3787,🚫MAGA🚫DM🚫CRYPTO😷💉x3,1
3788,🛌 🛋️ 🚽🛀,1
3789,🤔,1
3790,🤡🌍🇨🇦,1
3791,🤡🪐,1
3792,🦞,1
3793,🧺Your Local Strawberry Patch,1
3794,🪐,1
3795,,0


# 5 | Processing

#### 5.1  Word Count (PySpark)
* groupby, count, agg, 
* google = pyspark word frequency


In [123]:
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [124]:
spark_df = spark.createDataFrame(corpus)
spark_df.printSchema()


root
 |-- date: string (nullable = true)
 |-- user_screen_name: string (nullable = true)
 |-- text: string (nullable = true)
 |-- tweet_url: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- hashtags: string (nullable = true)



In [125]:
df_schema = StructType([StructField("date", IntegerType(), True)])
                        # StructField("tweet", StringType(), True)])

In [126]:
spark_df = spark_df.withColumn('text', 
                               explode(split(lower(col('text')), '\s')))

In [None]:
(spark_df.groupBy('text')
  .count()
  .orderBy('count', ascending=False)
  .show(50))

In [128]:
spark_df.groupBy('text')


<pyspark.sql.group.GroupedData at 0x7fdb9f20cad0>

In [129]:
spark_df = spark_df.withColumn('user_location', 
                               explode(split(lower(col('user_location')), '\s')))

In [None]:
(spark_df.groupBy('user_location')
  .count()
  .orderBy('count', ascending=False)
  .show(150))

In [131]:
# https://simplemaps.com/data/world-cities

### 5.2 Topic Modeling

In [132]:
terms = corpus['text']
terms.shape

(28281,)

In [133]:
# custom stop words 
stopwords = nltk.corpus.stopwords.words('english')

custom_words = ['monkey',
             'pox',
             'monkeypox',
             'people',
             'covid',
             'amp',
             '19',
             '2022',
             'another',
             'co', 'https', 'cases', 'case', 'new', 'first', 'via', 'confirmed', 'us',
             'going', 'one', 'know',
             'get', 'like', 'go', 'got', 'time', 'think', 'want',
             'health', 'declares', 'outbreak', 'world', 'public', 'declared', 'news', 'says',
            'total', 'take', 'virus', 'already', 'getting', 'sure',
            'spread', 'disease', 'next', 'would', 'need']
stopwords.extend(custom_words)


In [134]:
tf_vectorizer = TfidfVectorizer(stop_words=stopwords, 
                                min_df=0.01, 
                                max_df=.95)
tf_vectorizer

TfidfVectorizer(max_df=0.95, min_df=0.01,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [135]:
# document-term matrix with TF-IDF
tf_doc_term_mtx = tf_vectorizer.fit_transform(terms)
type(tf_doc_term_mtx)

scipy.sparse.csr.csr_matrix

In [136]:
tf_doc_term_df = pd.DataFrame(tf_doc_term_mtx.toarray(), 
                              columns=tf_vectorizer.get_feature_names_out())
tf_doc_term_df.head(2)

Unnamed: 0,aids,also,anyone,around,back,biden,cdc,children,cnn,come,...,trying,two,vaccine,vaccines,vax,way,well,year,years,yet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
# # double check that domain specfic words were omitted 
print('monkey' in tf_vectorizer.get_feature_names_out())
print('pox' in tf_vectorizer.get_feature_names_out())
print('monkeypox' in tf_vectorizer.get_feature_names_out())
print('covid' in tf_vectorizer.get_feature_names_out())

# who and what are mentioned in the corpus? not cdc, trump mentioned 
print('rash' in tf_vectorizer.get_feature_names_out())
print('fever' in tf_vectorizer.get_feature_names_out())
print('cdc:', 'cdc' in tf_vectorizer.get_feature_names_out())
print('cdcgov' in tf_vectorizer.get_feature_names_out())
print('centers for disease control and prevention' in tf_vectorizer.get_feature_names_out())
print('walensky' in tf_vectorizer.get_feature_names_out())
print('biden:', 'biden' in tf_vectorizer.get_feature_names_out())



False
False
False
False
False
False
cdc: True
False
False
False
biden: True


In [138]:
# V     visible variables     doc_term             input (corpus matrix)
# W     weights               doc_topic            feature set
# H     hidden variables      topic_term           coefficients

In [139]:
V = tf_doc_term_mtx
V.shape


(28281, 83)

In [140]:
# W matrix = feature set & weights

nmf = NMF(n_components=3, init=None)
W = nmf.fit_transform(V).round(3)
print(type(W))
W.shape

<class 'numpy.ndarray'>


(28281, 3)

In [None]:
# H matrix = hidden variables & coefficients 

H = pd.DataFrame(nmf.components_.round(2),
                 index = ['c1', 
                          'c2',
                          'c3'] #,
#                           'c4']#,, 
#                           'c5']
                 ,
                 columns = tf_vectorizer.get_feature_names_out())
print('H.shape:',  H.shape)
H.T.style.background_gradient(cmap='Blues')


In [142]:
# function to display topics
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))


In [None]:
display_topics(nmf, tf_vectorizer.get_feature_names_out(), 10)

### 5.3  Word Cloud

In [None]:
# wordcloud
# https://www.geeksforgeeks.org/generating-word-cloud-python/
comment_words = ''
stopwords = set(stopwords)
 
for val in terms:
    val = str(val)
    tokens = val.split()     
    for i in range(len(tokens)):
        tokens[i] = tokens[i].lower()
     
    comment_words += " ".join(tokens)+" "
 
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)
 
# plot the WordCloud                       
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.title('Monkeypox Tweets 2022')
plt.savefig("monkeypox-tweets-word-cloud.jpeg");

### 5.4  Case Counts by State

In [145]:
# import sheet with state lattitude and longitude 
# data source: https://developers.google.com/public-data/docs/canonical/states_csv

# open google spreadsheet
worksheet = gc.open('USA-State-Coordinates').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()

states = pd.DataFrame.from_records(rows)

states.columns = states.iloc[0]
states.drop([0], inplace=True)
states.drop(['state'], axis=1, inplace=True)
states.sort_values(by=['name'], inplace=True)
states = states.rename(columns={'latitude': 'lat', 'longitude': 'lon', 'name': 'state'})

states.tail(2)

Unnamed: 0,lat,lon,state
51,43.78444,-88.787868,Wisconsin
52,43.075968,-107.290284,Wyoming


In [146]:
states.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52 entries, 1 to 52
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lat     52 non-null     object
 1   lon     52 non-null     object
 2   state   52 non-null     object
dtypes: object(3)
memory usage: 1.6+ KB


In [147]:
# import sheet with US case count by state 
# data source: https://www.cdc.gov/poxvirus/monkeypox/response/2022/us-map.html

worksheet = gc.open('2022-US-mpx-cases-by-state').sheet1
rows = worksheet.get_all_values()
cases = pd.DataFrame.from_records(rows)

cases.columns = cases.iloc[0]
cases.drop([0], inplace=True)
cases.drop(['AsOf', 'Case Range'], axis=1, inplace=True)
cases.sort_values(by=['Location'], inplace=True)

cases.tail(2)

Unnamed: 0,Location,Cases
51,Wisconsin,56
52,Wyoming,1


In [148]:
# US case count by state (long + lat)
map_data = pd.concat([states, cases], axis=1)
map_data = map_data[['Location', 'Cases', 'lat', 'lon' ]]
map_data = map_data.rename(columns={'Location': 'state', 'Cases':'cases'})

map_data.tail(2)
# map_data.info()

Unnamed: 0,state,cases,lat,lon
51,Wisconsin,56,43.78444,-88.787868
52,Wyoming,1,43.075968,-107.290284


In [None]:
map_data = map_data.astype({'cases':'int'})

map_data.sort_values(by=['cases'], ascending=False)
# map_data.sort_values(by=['latitude'], ascending=False)



In [150]:
map_data.head(2)

Unnamed: 0,state,cases,lat,lon
1,Alabama,53,32.318231,-86.902298
2,Alaska,3,63.588753,-154.493062


In [151]:
map_data.to_csv('map_data.csv')  

### 5.5  Tweets by Location

In [206]:
location.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3796 entries, 0 to 3795
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   user_location  3795 non-null   object
 1   count          3796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.4+ KB


In [None]:
location

In [None]:
location.tail(20)

In [None]:
sql_query_string = """
  SELECT user_location, COUNT(user_location) AS count
  FROM tweets
  GROUP BY user_location
  ORDER BY
    COUNT(user_location) DESC
"""


In [None]:
location = sql_query_to_pd(sql_query_string, db_name='monkeypox.db')

# 6 | Deployment

Streamlit App: https://slp22-data-engineering-project-streamlit-mpx-app-ckpzq2.streamlitapp.com/

# 7 | Testing/Robustness

[Python schedule](https://schedule.readthedocs.io/en/stable/examples.html#run-a-job-every-x-minute)