# Capstone Project

In [77]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0) #To display entire text content of a column
import matplotlib.pyplot as plt
import seaborn as sns

# For web scraping
from selenium import webdriver
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions
# from selenium.webdriver.common.by import By
# import time

import twint
import nest_asyncio
import csv

import re
import string
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords 
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rachelchen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rachelchen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Web Scarping: Covid Developments Timeline 

In [78]:
# Initializes the Chrome Driver to access the website
driver = webdriver.Chrome('/Applications/chromedriver')

### Timeline 2021

In [79]:
# Initializes the Chrome Driver to access the URL
driver.get('https://www.ajmc.com/view/a-timeline-of-covid-19-vaccine-developments-in-2021')

# Web scraping covid timeline 2021
links_21 = driver.find_elements_by_tag_name('strong')
timeline_21 = []

for i in range(0,len(links_21)):
    timeline_21.append(links_21[i].text)

# Store output to a dictionary 
date_list = []
text_list = []
date_event_dict = {}
current_date = None

for element in timeline_21:
    if len(element) <= 12:
        current_date = element
        date_event_dict[current_date] = []
    else:
        date_event_dict[current_date].append(element)

In [80]:
pd.set_option("max_rows", None)

# Create a dataframe from dictionary
df_tl_21 = pd.DataFrame(list(date_event_dict.items()),columns = ['date','covid_update']) 
df_tl_21['date'] = df_tl_21['date'].apply(lambda x: x + ', 2021')
df_tl_21.head()

Unnamed: 0,date,covid_update
0,"January 4, 2021","[Operation Warp Speed Initiates Talks With Moderna on Half-Dose Vaccines, UK Begins Distributing AstraZeneca/Oxford Vaccine]"
1,"January 5, 2021","[FDA Advises Against Altering Vaccine Schedules, Moderna to Produce 600 Million Vaccine Doses]"
2,"January 6, 2021","[HHS to Provide $22 Billion to Fund Testing, Vaccine Distribution]"
3,"January 7, 2021","[CDC: COVID-19 Vaccine Benefits Outweigh Allergic Reaction Risk, Study Shows Patients With Heart Failure Should Be Prioritized for Vaccines]"
4,"January 8, 2021","[American Hospital Association Pushes for Faster Vaccine Rollout, Pharmacies Tapped to Distribute Vaccines, Biden Plans to Rapidly Release Most COVID-19 Doses, States Face Significant Rollout Hurdles]"


In [81]:
# clean up the df
df_tl_21['covid_update'] = df_tl_21['covid_update'].apply(lambda s: ', '.join([str(elem) for elem in s]))
df_tl_21

Unnamed: 0,date,covid_update
0,"January 4, 2021","Operation Warp Speed Initiates Talks With Moderna on Half-Dose Vaccines, UK Begins Distributing AstraZeneca/Oxford Vaccine"
1,"January 5, 2021","FDA Advises Against Altering Vaccine Schedules, Moderna to Produce 600 Million Vaccine Doses"
2,"January 6, 2021","HHS to Provide $22 Billion to Fund Testing, Vaccine Distribution"
3,"January 7, 2021","CDC: COVID-19 Vaccine Benefits Outweigh Allergic Reaction Risk, Study Shows Patients With Heart Failure Should Be Prioritized for Vaccines"
4,"January 8, 2021","American Hospital Association Pushes for Faster Vaccine Rollout, Pharmacies Tapped to Distribute Vaccines, Biden Plans to Rapidly Release Most COVID-19 Doses, States Face Significant Rollout Hurdles"
5,"January 11, 2021",Vaccine Doses Go Unused or Are Trashed
6,"January 12, 2021","CDC, HHS Update Vaccine Allocation Guidance"
7,"January 14, 2021","Elderly Los Angeles County Residents Report Confusion, Delays, GoodRx Report Documents Vaccine Deserts"
8,"January 18, 2021",
9,"Reports of, 2021",Racial Disparities in Vaccination Rates


In [82]:
# manually correct some spotted error 
df_tl_21.iloc[8,1] = 'Racial Disparities in Vaccination Rate'
df_tl_21 = df_tl_21.drop([9])
df_tl_21

Unnamed: 0,date,covid_update
0,"January 4, 2021","Operation Warp Speed Initiates Talks With Moderna on Half-Dose Vaccines, UK Begins Distributing AstraZeneca/Oxford Vaccine"
1,"January 5, 2021","FDA Advises Against Altering Vaccine Schedules, Moderna to Produce 600 Million Vaccine Doses"
2,"January 6, 2021","HHS to Provide $22 Billion to Fund Testing, Vaccine Distribution"
3,"January 7, 2021","CDC: COVID-19 Vaccine Benefits Outweigh Allergic Reaction Risk, Study Shows Patients With Heart Failure Should Be Prioritized for Vaccines"
4,"January 8, 2021","American Hospital Association Pushes for Faster Vaccine Rollout, Pharmacies Tapped to Distribute Vaccines, Biden Plans to Rapidly Release Most COVID-19 Doses, States Face Significant Rollout Hurdles"
5,"January 11, 2021",Vaccine Doses Go Unused or Are Trashed
6,"January 12, 2021","CDC, HHS Update Vaccine Allocation Guidance"
7,"January 14, 2021","Elderly Los Angeles County Residents Report Confusion, Delays, GoodRx Report Documents Vaccine Deserts"
8,"January 18, 2021",Racial Disparities in Vaccination Rate
10,"January 19, 2021","California COVID-19 Variant May Be Vaccine Resistant, Pfizer, Moderna, AstraZeneca to Test Vaccines in Adolescents, Incoming CDC Director Walensky to Prioritize Vaccine Rollout"


### Timeline 2020

In [83]:
# Initializes the Chrome Driver to access the URL
driver.get('https://www.ajmc.com/view/a-timeline-of-covid19-developments-in-2020')

In [84]:
# Initializes the Chrome Driver to access the URL
driver.get('https://www.ajmc.com/view/a-timeline-of-covid19-developments-in-2020')

links_20 = driver.find_elements_by_tag_name('strong')
timeline_20 = []

for i in range(0,len(links_20)):
    timeline_20.append(links_20[i].text)
    
df_tl_20 = pd.DataFrame(timeline_20)
df_tl_20 = df_tl_20[0].str.split(' — ', expand = True)
df_tl_20 = df_tl_20.rename(columns={0: 'date', 1: 'covid_update'})
df_tl_20['date'] = df_tl_20['date'].apply(lambda x: x + ', 2020' )
df_tl_20.head()

Unnamed: 0,date,covid_update
0,"January 9, 2020","WHO Announces Mysterious Coronavirus-Related Pneumonia in Wuhan, China"
1,"January 20, 2020",CDC Says 3 US Airports Will Begin Screening for Coronavirus
2,"January 21, 2020",CDC Confirms First US Coronavirus Case
3,"January 21, 2020",Chinese Scientist Confirms COVID-19 Human Transmission
4,"January 23, 2020",Wuhan Now Under Quarantine


In [85]:
# clean up the df
df_tl_20 = pd.DataFrame(timeline_20)
df_tl_20 = df_tl_20[0].str.split(' — ', expand = True)
df_tl_20 = df_tl_20.rename(columns={0: 'date', 1: 'covid_update'})
df_tl_20['date'] = df_tl_20['date'].apply(lambda x: x + ', 2020' )
df_tl_20

Unnamed: 0,date,covid_update
0,"January 9, 2020","WHO Announces Mysterious Coronavirus-Related Pneumonia in Wuhan, China"
1,"January 20, 2020",CDC Says 3 US Airports Will Begin Screening for Coronavirus
2,"January 21, 2020",CDC Confirms First US Coronavirus Case
3,"January 21, 2020",Chinese Scientist Confirms COVID-19 Human Transmission
4,"January 23, 2020",Wuhan Now Under Quarantine
5,"January 31, 2020",WHO Issues Global Health Emergency
6,"February 2, 2020",Global Air Travel Is Restricted
7,"February 3, 2020",US Declares Public Health Emergency
8,"February 10, 2020",China’s COVID-19 Deaths Exceed Those of SARS Crisis
9,"February 25, 2020",CDC Says COVID-19 Is Heading Toward Pandemic Status


In [86]:
# manually correct some spotted error 
df_tl_20.iloc[107,0] = 'November 16, 2020'
df_tl_20.iloc[107,1] = 'Moderna Reveals Vaccine Efficacy Results'
df_tl_20.iloc[114,0] = 'December 11, 2020'
df_tl_20.iloc[114,1] = 'FDA Agrees to EUA for COVID-19 Vaccine From Pfizer, BioNTech'
df_tl_20.iloc[120,0] = 'December 28, 2020'
df_tl_20.iloc[120,1] = 'Novavax Starts Phase 3 Trial of COVID-19 Vaccine'
df_tl_20.iloc[124,0] = 'December 31, 2020'
df_tl_20.iloc[124,1] = 'US Falls Short of Goal to Give 20 Million Vaccinations by Year End'

df_tl_20

Unnamed: 0,date,covid_update
0,"January 9, 2020","WHO Announces Mysterious Coronavirus-Related Pneumonia in Wuhan, China"
1,"January 20, 2020",CDC Says 3 US Airports Will Begin Screening for Coronavirus
2,"January 21, 2020",CDC Confirms First US Coronavirus Case
3,"January 21, 2020",Chinese Scientist Confirms COVID-19 Human Transmission
4,"January 23, 2020",Wuhan Now Under Quarantine
5,"January 31, 2020",WHO Issues Global Health Emergency
6,"February 2, 2020",Global Air Travel Is Restricted
7,"February 3, 2020",US Declares Public Health Emergency
8,"February 10, 2020",China’s COVID-19 Deaths Exceed Those of SARS Crisis
9,"February 25, 2020",CDC Says COVID-19 Is Heading Toward Pandemic Status


In [87]:
df_tl_20 = df_tl_20.drop([23, 115, 121 ])

df_tl_20

Unnamed: 0,date,covid_update
0,"January 9, 2020","WHO Announces Mysterious Coronavirus-Related Pneumonia in Wuhan, China"
1,"January 20, 2020",CDC Says 3 US Airports Will Begin Screening for Coronavirus
2,"January 21, 2020",CDC Confirms First US Coronavirus Case
3,"January 21, 2020",Chinese Scientist Confirms COVID-19 Human Transmission
4,"January 23, 2020",Wuhan Now Under Quarantine
5,"January 31, 2020",WHO Issues Global Health Emergency
6,"February 2, 2020",Global Air Travel Is Restricted
7,"February 3, 2020",US Declares Public Health Emergency
8,"February 10, 2020",China’s COVID-19 Deaths Exceed Those of SARS Crisis
9,"February 25, 2020",CDC Says COVID-19 Is Heading Toward Pandemic Status


In [88]:
# combine events happened in the same day
df_tl_20 = df_tl_20.groupby('date')['covid_update'].apply(', '.join).reset_index()
df_tl_20

Unnamed: 0,date,covid_update
0,"April 16, 2020",“Gating Criteria” Emerge as a Way to Reopen the Economy
1,"April 28, 2020","Young, Poor Avoid Care for COVID-19 Symptoms"
2,"April 29, 2020",NIH Trial Shows Early Promise for Remdesivir
3,"April 8, 2020",Troubles With the COVID-19 Cocktail
4,"August 11, 2020",Trump Administration Reaches Deal With Moderna
5,"August 12, 2020",Severe Obesity Increases Mortality Risk From COVID-19
6,"August 13, 2020",Biden Calls for 3-Month Mask Mandate
7,"August 15, 2020",FDA Approves Saliva Test
8,"August 17, 2020",COVID-19 Now the Third-Leading Cause of Death in the US
9,"August 23, 2020",Convalescent Plasma Is Cleared for Use by FDA


In [89]:
df_tl_20

Unnamed: 0,date,covid_update
0,"April 16, 2020",“Gating Criteria” Emerge as a Way to Reopen the Economy
1,"April 28, 2020","Young, Poor Avoid Care for COVID-19 Symptoms"
2,"April 29, 2020",NIH Trial Shows Early Promise for Remdesivir
3,"April 8, 2020",Troubles With the COVID-19 Cocktail
4,"August 11, 2020",Trump Administration Reaches Deal With Moderna
5,"August 12, 2020",Severe Obesity Increases Mortality Risk From COVID-19
6,"August 13, 2020",Biden Calls for 3-Month Mask Mandate
7,"August 15, 2020",FDA Approves Saliva Test
8,"August 17, 2020",COVID-19 Now the Third-Leading Cause of Death in the US
9,"August 23, 2020",Convalescent Plasma Is Cleared for Use by FDA


### Combine the Timeline 

In [90]:
# concat two df
df_tl = pd.concat([df_tl_20, df_tl_21], ignore_index=True)
df_tl

Unnamed: 0,date,covid_update
0,"April 16, 2020",“Gating Criteria” Emerge as a Way to Reopen the Economy
1,"April 28, 2020","Young, Poor Avoid Care for COVID-19 Symptoms"
2,"April 29, 2020",NIH Trial Shows Early Promise for Remdesivir
3,"April 8, 2020",Troubles With the COVID-19 Cocktail
4,"August 11, 2020",Trump Administration Reaches Deal With Moderna
5,"August 12, 2020",Severe Obesity Increases Mortality Risk From COVID-19
6,"August 13, 2020",Biden Calls for 3-Month Mask Mandate
7,"August 15, 2020",FDA Approves Saliva Test
8,"August 17, 2020",COVID-19 Now the Third-Leading Cause of Death in the US
9,"August 23, 2020",Convalescent Plasma Is Cleared for Use by FDA


In [91]:
# check columns and dtype
df_tl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          171 non-null    object
 1   covid_update  171 non-null    object
dtypes: object(2)
memory usage: 2.8+ KB


In [96]:
# convert object date to numerical date
df_tl['date'] = pd.to_datetime(df_tl['date'])
df_tl = df_tl.sort_values(by = 'date', ignore_index = True)
df_tl.head()

Unnamed: 0,date,covid_update
0,2020-01-09,"WHO Announces Mysterious Coronavirus-Related Pneumonia in Wuhan, China"
1,2020-01-20,CDC Says 3 US Airports Will Begin Screening for Coronavirus
2,2020-01-21,"CDC Confirms First US Coronavirus Case, Chinese Scientist Confirms COVID-19 Human Transmission"
3,2020-01-23,Wuhan Now Under Quarantine
4,2020-01-31,WHO Issues Global Health Emergency


## Major Cities in North America 

In [2]:
coord_ca = pd.read_csv('/Users/rachelchen/Desktop/BrainStation/Capstone/Twitter Sentiment/ca_latlng.csv')
coord_us = pd.read_csv('/Users/rachelchen/Desktop/BrainStation/Capstone/Twitter Sentiment/us_latlng.csv')
coord = pd.concat([coord_ca, coord_us], ignore_index=True)
coord

Unnamed: 0,City,Coordinates
0,Toronto,"43.70011, -79.4163"
1,Ottawa,"45.41117, -75.69812"
2,Montréal,"45.50884, -73.58781"
3,Edmonton,"53.55014, -113.46871"
4,Mississauga,"43.5789, -79.6583"
5,Winnipeg,"49.8844, -97.14704"
6,Vancouver,"49.24966, -123.11934"
7,Hamilton,"43.25011, -79.84963"
8,Calgary,"51.05011, -114.08529"
9,Brampton,"43.68341, -79.76633"


In [3]:
coord_list = coord['Coordinates'].tolist()
geo_list=[]

for i in coord_list:
    i = i + ', 200 km'
    geo_list.append(i)

geo_list

['43.70011, -79.4163, 200 km',
 '45.41117, -75.69812, 200 km',
 '45.50884, -73.58781, 200 km',
 '53.55014, -113.46871, 200 km',
 '43.5789, -79.6583, 200 km',
 '49.8844, -97.14704, 200 km',
 '49.24966, -123.11934, 200 km',
 '43.25011, -79.84963, 200 km',
 '51.05011, -114.08529, 200 km',
 '43.68341, -79.76633, 200 km',
 '49.10635, -122.82509, 200 km',
 '45.56995, -73.692, 200 km',
 '44.6464, -63.57291, 200 km',
 '42.98339, -81.23304, 200 km',
 '43.90012, -78.84957, 200 km',
 '50.36386, -119.34997, 200 km',
 '48.4359, -123.35155, 200 km',
 '42.30008, -83.01654, 200 km',
 '46.81228, -71.21454, 200 km',
 '43.86682, -79.2663, 200 km',
 '40.71427, -74.00597, 200 km',
 '34.05223, -118.24368, 200 km',
 '41.85003, -87.65005, 200 km',
 '29.76328, -95.36327, 200 km',
 '39.95233, -75.16379, 200 km',
 '33.44838, -112.07404, 200 km',
 '29.42412, -98.49363, 200 km',
 '32.71571, -117.16472, 200 km',
 '32.78306, -96.80667, 200 km',
 '40.6501, -73.94958, 200 km',
 '40.68149, -73.83652, 200 km',
 '37.3393

**Source**: 

https://www.geodatos.net/en/coordinates/canada

https://www.geodatos.net/en/coordinates/united-states

In [6]:
open('/Users/rachelchen/Desktop/BrainStation/Capstone/Twitter Sentiment/filtered1.csv', 'a')

<_io.TextIOWrapper name='/Users/rachelchen/Desktop/BrainStation/Capstone/Twitter Sentiment/filtered1.csv' mode='a' encoding='UTF-8'>

In [1]:
# nest_asyncio.apply()

# for geo in geo_list:
#     c = twint.Config()
#     c.Search = '(covid OR coronavirus) AND (vaccination OR vaccine OR vaccinated) AND -filter:verified'
#     c.Lang = 'en'
#     c.Filter_retweets = True
#     c.Lowercase = True
#     c.Since = '2021-04-14'
#     c.Until = '2021-04-25'
#     c.Custom['tweet'] = ['id', 'created_at','user_id', 'username', 'tweet', 'replies_count', 'retweets_count', 'likes_count',
#                          'hashtags', 'geo']
#     c.Geo = geo
#     c.Store_csv = True
#     c.Output = '/Users/rachelchen/Desktop/BrainStation/Capstone/Twitter Sentiment/filtered1.csv'   
#     twint.run.Search(c)

## EDA

In [8]:
# create column name for our output
colnames = ['id', 'datetime','user_id', 'username', 'tweet', 'replies_count', 'retweets_count', 'likes_count',
            'hashtags', 'coordinates']

In [9]:
#df = pd.read_csv('/Users/rachelchen/Desktop/BrainStation/Capstone/Twitter Sentiment/filtered.csv')


df = pd.read_csv('/Users/rachelchen/Desktop/BrainStation/Capstone/Twitter Sentiment/filtered1.csv', names = colnames,
                   header = None)

In [10]:
# take a quick look in to our data 
df.head()

Unnamed: 0,id,datetime,user_id,username,tweet,replies_count,retweets_count,likes_count,hashtags,coordinates
0,1382483708784078848,2021-04-14 19:59:36 EDT,1556939665,xwoman54,How do we know the COVID vaccine won't have long-term side-effects? https://t.co/29tIN8o4J1 via @ConversationEDU Waiting for my second dose of #AstraZeneca,0,0,1,['astrazeneca'],"43.70011,-79.4163,200km"
1,1382483637401190403,2021-04-14 19:59:19 EDT,926509404165255168,marcus13781234,Did the FDA and Pfizer hold the COVID vaccine until after Trump lost re-election? - Emily Posts https://t.co/PEn72g3yjl,0,0,0,[],"43.70011,-79.4163,200km"
2,1382483631034281985,2021-04-14 19:59:17 EDT,178168932,marleersocket,COVID-19 vaccine appointment booked ☑️,0,0,15,[],"43.70011,-79.4163,200km"
3,1382483610662543369,2021-04-14 19:59:12 EDT,1322036867555053568,easyontario,@nationalpost The federal government has spent $8B on COVID vaccine that Canada hasn’t received. This is quite clear another bribery scandal this time with the pharmaceutical giants,0,0,3,[],"43.70011,-79.4163,200km"
4,1382483489870782465,2021-04-14 19:58:43 EDT,1146177193682362368,mare55742414,"Evangelical pastor says he is not a politician, he is a prophet and tells followers not to believe in Covid and not to get vaccinated. Stupid is spreading!!!",0,0,0,[],"43.70011,-79.4163,200km"


In [11]:
df.shape

(386243, 10)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386243 entries, 0 to 386242
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              386243 non-null  int64 
 1   datetime        386243 non-null  object
 2   user_id         386243 non-null  int64 
 3   username        386243 non-null  object
 4   tweet           386243 non-null  object
 5   replies_count   386243 non-null  int64 
 6   retweets_count  386243 non-null  int64 
 7   likes_count     386243 non-null  int64 
 8   hashtags        386243 non-null  object
 9   coordinates     386243 non-null  object
dtypes: int64(5), object(5)
memory usage: 29.5+ MB


In [13]:
df.isna().mean()

id                0.0
datetime          0.0
user_id           0.0
username          0.0
tweet             0.0
replies_count     0.0
retweets_count    0.0
likes_count       0.0
hashtags          0.0
coordinates       0.0
dtype: float64

In [14]:
df.duplicated().any()

True

In [15]:
df.duplicated().T.any()

True

In [16]:
df[df[['id']].duplicated() == True].shape

(240758, 10)

There are more than 16,000 duplicated tweet and we need to drop them.

In [17]:
# remove duplicate tweet
df = df.drop_duplicates(subset ='id', keep = 'first').sort_values('id', ascending= False).reset_index(drop=True)
df.head()

Unnamed: 0,id,datetime,user_id,username,tweet,replies_count,retweets_count,likes_count,hashtags,coordinates
0,1385745271376711685,2021-04-23 19:59:53 EDT,437549659,craig_swenson,"@MarkHigbee @byu_sam @Ch_JesusChrist Too late. At this point we have known long-term effects of COVID-19 weighed against potential unknowns of a vaccine that has passed similar safety protocols as other vaccines, just in a shorter timescale (more a commentary on what is achievable when resources are not limited).",1,0,0,[],"39.95233,-75.16379,200km"
1,1385745252435173377,2021-04-23 19:59:48 EDT,1247580609708359686,tsince1985,"“My Body, My Choice” sure dissolved quickly thanks to COVID-19!!! The UC &amp; CSU University Systems in CA are requiring proof of COVID-19 vaccination to return to in-person learning. Well you had a good run Institutions of “Higher Learning”!!! There’s always Chapter 11 Bankruptcy!!",0,0,0,[],"37.33939,-121.89496,200km"
2,1385745233258770433,2021-04-23 19:59:44 EDT,195878438,mcfarlandclinic,"If you’ve been waiting to get your COVID vaccine, this is your shot! All Iowans 16 and older can get the vaccine now, and McFarland Clinic and others have the vaccine readily available. Learn more and schedule online at https://t.co/p5JVQe8xOh. https://t.co/2OkJVapo4n",0,1,2,[],"29.76328,-95.36327,200km"
3,1385745114107195395,2021-04-23 19:59:15 EDT,3962090297,bonheurchasse,Drove from the suburbs to the north side of Chicago to the south west side of Chicago and back to take my daughter for her covid vaccine. Total trip time 5 hours. Worth every minute. https://t.co/Xff0o3pl9b,1,0,6,[],"41.85003,-87.65005,200km"
4,1385745098357489668,2021-04-23 19:59:12 EDT,135263424,meg_michael_,@DFisman “Life circumstances” means that those most at risk of adverse Covid outcomes also be put at additional “risk” with AZ vaccination. Save it for “them” because “we” can all wait for the “better” vaccines. Just such fucking shit messaging on your part. Awful. Shameful.,1,1,1,[],"45.41117,-75.69812,200km"


In [18]:
print(df.shape)

(145485, 10)


We extracted the data we need from Twitter, let's start data cleaning.

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145485 entries, 0 to 145484
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              145485 non-null  int64 
 1   datetime        145485 non-null  object
 2   user_id         145485 non-null  int64 
 3   username        145485 non-null  object
 4   tweet           145485 non-null  object
 5   replies_count   145485 non-null  int64 
 6   retweets_count  145485 non-null  int64 
 7   likes_count     145485 non-null  int64 
 8   hashtags        145485 non-null  object
 9   coordinates     145485 non-null  object
dtypes: int64(5), object(5)
memory usage: 11.1+ MB


**The `datetime` is now object, let's change it to numerical.**

In [20]:
df['datetime'] = pd.to_datetime(df['datetime'])
df.head()

Unnamed: 0,id,datetime,user_id,username,tweet,replies_count,retweets_count,likes_count,hashtags,coordinates
0,1385745271376711685,2021-04-23 19:59:53-04:00,437549659,craig_swenson,"@MarkHigbee @byu_sam @Ch_JesusChrist Too late. At this point we have known long-term effects of COVID-19 weighed against potential unknowns of a vaccine that has passed similar safety protocols as other vaccines, just in a shorter timescale (more a commentary on what is achievable when resources are not limited).",1,0,0,[],"39.95233,-75.16379,200km"
1,1385745252435173377,2021-04-23 19:59:48-04:00,1247580609708359686,tsince1985,"“My Body, My Choice” sure dissolved quickly thanks to COVID-19!!! The UC &amp; CSU University Systems in CA are requiring proof of COVID-19 vaccination to return to in-person learning. Well you had a good run Institutions of “Higher Learning”!!! There’s always Chapter 11 Bankruptcy!!",0,0,0,[],"37.33939,-121.89496,200km"
2,1385745233258770433,2021-04-23 19:59:44-04:00,195878438,mcfarlandclinic,"If you’ve been waiting to get your COVID vaccine, this is your shot! All Iowans 16 and older can get the vaccine now, and McFarland Clinic and others have the vaccine readily available. Learn more and schedule online at https://t.co/p5JVQe8xOh. https://t.co/2OkJVapo4n",0,1,2,[],"29.76328,-95.36327,200km"
3,1385745114107195395,2021-04-23 19:59:15-04:00,3962090297,bonheurchasse,Drove from the suburbs to the north side of Chicago to the south west side of Chicago and back to take my daughter for her covid vaccine. Total trip time 5 hours. Worth every minute. https://t.co/Xff0o3pl9b,1,0,6,[],"41.85003,-87.65005,200km"
4,1385745098357489668,2021-04-23 19:59:12-04:00,135263424,meg_michael_,@DFisman “Life circumstances” means that those most at risk of adverse Covid outcomes also be put at additional “risk” with AZ vaccination. Save it for “them” because “we” can all wait for the “better” vaccines. Just such fucking shit messaging on your part. Awful. Shameful.,1,1,1,[],"45.41117,-75.69812,200km"


**Let's clean the texts in tweets.**

In [21]:
def clean_text(txt):
    
    '''Replace special character found in a text string with nothing 
    (i.e. it will remove special characters from the string).

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove special characters.

    Returns
    -------
    The same txt string with the specials characters removed, and all text convert to lower case
    '''
    
    txt = txt.lower()
    
    return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())

In [22]:
df['tweet'] = df['tweet'].apply(lambda text: clean_text(text))

In [23]:
df.head()

Unnamed: 0,id,datetime,user_id,username,tweet,replies_count,retweets_count,likes_count,hashtags,coordinates
0,1385745271376711685,2021-04-23 19:59:53-04:00,437549659,craig_swenson,markhigbee byusam chjesuschrist too late at this point we have known longterm effects of covid19 weighed against potential unknowns of a vaccine that has passed similar safety protocols as other vaccines just in a shorter timescale more a commentary on what is achievable when resources are not limited,1,0,0,[],"39.95233,-75.16379,200km"
1,1385745252435173377,2021-04-23 19:59:48-04:00,1247580609708359686,tsince1985,my body my choice sure dissolved quickly thanks to covid19 the uc amp csu university systems in ca are requiring proof of covid19 vaccination to return to inperson learning well you had a good run institutions of higher learning theres always chapter 11 bankruptcy,0,0,0,[],"37.33939,-121.89496,200km"
2,1385745233258770433,2021-04-23 19:59:44-04:00,195878438,mcfarlandclinic,if youve been waiting to get your covid vaccine this is your shot all iowans 16 and older can get the vaccine now and mcfarland clinic and others have the vaccine readily available learn more and schedule online at,0,1,2,[],"29.76328,-95.36327,200km"
3,1385745114107195395,2021-04-23 19:59:15-04:00,3962090297,bonheurchasse,drove from the suburbs to the north side of chicago to the south west side of chicago and back to take my daughter for her covid vaccine total trip time 5 hours worth every minute,1,0,6,[],"41.85003,-87.65005,200km"
4,1385745098357489668,2021-04-23 19:59:12-04:00,135263424,meg_michael_,dfisman life circumstances means that those most at risk of adverse covid outcomes also be put at additional risk with az vaccination save it for them because we can all wait for the better vaccines just such fucking shit messaging on your part awful shameful,1,1,1,[],"45.41117,-75.69812,200km"


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145485 entries, 0 to 145484
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype                    
---  ------          --------------   -----                    
 0   id              145485 non-null  int64                    
 1   datetime        145485 non-null  datetime64[ns, tzlocal()]
 2   user_id         145485 non-null  int64                    
 3   username        145485 non-null  object                   
 4   tweet           145485 non-null  object                   
 5   replies_count   145485 non-null  int64                    
 6   retweets_count  145485 non-null  int64                    
 7   likes_count     145485 non-null  int64                    
 8   hashtags        145485 non-null  object                   
 9   coordinates     145485 non-null  object                   
dtypes: datetime64[ns, tzlocal()](1), int64(5), object(4)
memory usage: 11.1+ MB


Let's add a column for polarity and a column for subjectivity
- Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement. 
- Subjective sentences generally refer to personal opinion, emotion or judgment whereas objective refers to factual information. Subjectivity is also a float which lies in the range of [0,1].

In [25]:
df['polarity'] = df['tweet'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['subjectivity'] = df['tweet'].apply(lambda text: TextBlob(text).sentiment.subjectivity)

In [26]:
df['positive'] = df['tweet'].apply(lambda text: SentimentIntensityAnalyzer().polarity_scores(text)['pos'])
df['neutral'] = df['tweet'].apply(lambda text: SentimentIntensityAnalyzer().polarity_scores(text)['neu'])
df['negative'] = df['tweet'].apply(lambda text: SentimentIntensityAnalyzer().polarity_scores(text)['neg'])

In [27]:
df['sentiment'] = np.select([(df['positive']>df['negative']), 
                             (df['positive']<df['negative']), 
                             (df['positive']==df['negative'])], 
                            ['positive', 'negative','neutral'])

In [28]:
df.head()

Unnamed: 0,id,datetime,user_id,username,tweet,replies_count,retweets_count,likes_count,hashtags,coordinates,polarity,subjectivity,positive,neutral,negative,sentiment
0,1385745271376711685,2021-04-23 19:59:53-04:00,437549659,craig_swenson,markhigbee byusam chjesuschrist too late at this point we have known longterm effects of covid19 weighed against potential unknowns of a vaccine that has passed similar safety protocols as other vaccines just in a shorter timescale more a commentary on what is achievable when resources are not limited,1,0,0,[],"39.95233,-75.16379,200km",0.018452,0.502976,0.139,0.861,0.0,positive
1,1385745252435173377,2021-04-23 19:59:48-04:00,1247580609708359686,tsince1985,my body my choice sure dissolved quickly thanks to covid19 the uc amp csu university systems in ca are requiring proof of covid19 vaccination to return to inperson learning well you had a good run institutions of higher learning theres always chapter 11 bankruptcy,0,0,0,[],"37.33939,-121.89496,200km",0.4125,0.547222,0.207,0.793,0.0,positive
2,1385745233258770433,2021-04-23 19:59:44-04:00,195878438,mcfarlandclinic,if youve been waiting to get your covid vaccine this is your shot all iowans 16 and older can get the vaccine now and mcfarland clinic and others have the vaccine readily available learn more and schedule online at,0,1,2,[],"29.76328,-95.36327,200km",0.355556,0.411111,0.0,1.0,0.0,neutral
3,1385745114107195395,2021-04-23 19:59:15-04:00,3962090297,bonheurchasse,drove from the suburbs to the north side of chicago to the south west side of chicago and back to take my daughter for her covid vaccine total trip time 5 hours worth every minute,1,0,6,[],"41.85003,-87.65005,200km",0.1,0.283333,0.054,0.946,0.0,positive
4,1385745098357489668,2021-04-23 19:59:12-04:00,135263424,meg_michael_,dfisman life circumstances means that those most at risk of adverse covid outcomes also be put at additional risk with az vaccination save it for them because we can all wait for the better vaccines just such fucking shit messaging on your part awful shameful,1,1,1,[],"45.41117,-75.69812,200km",-0.04,0.66,0.101,0.612,0.287,negative


In [29]:
df.to_csv(r'/Users/rachelchen/Desktop/BrainStation/Capstone/Twitter Sentiment/new0425.csv', index = False)

**We have complete our ideal raw data.**