# TWITTER SENTIMENT ANALYSIS ON COVID DATA - WEBSCRAPING

## Importing Packages and Connecting Data

In [12]:
import re
import string
from itertools import islice
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import snscrape.modules.twitter as sntwitter
from scipy.special import softmax
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
#add package
from transformers import AutoModelForSequenceClassification

In [13]:
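# snscrape is the social-network scraping package imported above as `sntwitter`.
# If it is missing, install it into the kernel's environment with:
# %pip install snscrape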

## Web Scraping for COVID-Vaccine-Related Tweets

In [14]:
# Smoke test of the scraper: run a throwaway search and dump the attributes of
# the very first result so the available tweet fields can be inspected.
query = "python"
probe_scraper = sntwitter.TwitterSearchScraper(query)

for tweet in probe_scraper.get_items():
    # Only one tweet is needed to see the schema, so stop right after printing it.
    print(vars(tweet))
    break

{'url': 'https://twitter.com/EvanBoothe25/status/1589354034343055363', 'date': datetime.datetime(2022, 11, 6, 20, 28, 25, tzinfo=datetime.timezone.utc), 'content': '@ShitpostGate Python master race 💪😎🐍', 'renderedContent': '@ShitpostGate Python master race 💪😎🐍', 'id': 1589354034343055363, 'user': User(username='EvanBoothe25', id=1518032591470751744, displayname='Evan Boothe', description='MIT class of 2025, jazz musician, conservative national populist, future Caesar of the American Empire, Hawaiian shirt appreciator, ginger beer aficionado, Okie', rawDescription='MIT class of 2025, jazz musician, conservative national populist, future Caesar of the American Empire, Hawaiian shirt appreciator, ginger beer aficionado, Okie', descriptionUrls=None, verified=False, created=datetime.datetime(2022, 4, 24, 1, 2, 37, tzinfo=datetime.timezone.utc), followersCount=81, friendsCount=113, statusesCount=1904, favouritesCount=19581, listedCount=0, mediaCount=101, location='Cambridge, MA', protected=F

In [15]:

# Web scraping: COVID-vaccine tweets in the window since:2020-12-08 / until:2021-12-8.
tweets = []
limit = 1000  # maximum number of tweets to collect (also reused by the 2021-2022 scrape)

# Search query: must mention "covid vaccine" plus at least one of the listed
# hashtags, English only, with the -availability/-free/-slot/-slots/-trade/-trades
# terms, links and replies filtered out.
query = "covid vaccine -availability -free -slot -slots -trade -trades (#covidvaccine OR #CovidVaccine OR #vaccine OR #covid OR #Covid19Vaccine OR #CoronaVirusVaccine ) lang:en until:2021-12-8 since:2020-12-08 -filter:links -filter:replies"

# islice caps the stream at `limit` items and stops WITHOUT asking the scraper
# for item limit+1, so no extra request is made once the quota is reached
# (the previous len()-check only broke out after one more tweet had been fetched).
scraped_items = islice(sntwitter.TwitterSearchScraper(query).get_items(), limit)

for tweet in tqdm(scraped_items, total=limit):
    # One row per tweet: timestamp, author handle, text, permalink, author location.
    tweets.append([tweet.date, tweet.user.username, tweet.content, tweet.url, tweet.user.location])

vaccine_df_2020_2021 = pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet', 'Tweet_URL', 'Location'])
vaccine_df_2020_2021.head()


  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,Date,User,Tweet,Tweet_URL,Location
0,2021-12-07 23:29:36+00:00,MA_CEAL,#COVIDMythBustingTuesdays! \n\nMyth: Doesn’t t...,https://twitter.com/MA_CEAL/status/14683620841...,"Boston, Massachusetts"
1,2021-12-07 23:08:19+00:00,WebAeneas,#vaccinated another great idea you can sue any...,https://twitter.com/WebAeneas/status/146835672...,
2,2021-12-07 23:02:41+00:00,TheAmirImani,"I don't know if this is a ""3 dose #vaccine"" or...",https://twitter.com/TheAmirImani/status/146835...,Toronto
3,2021-12-07 22:58:56+00:00,michaelwbunner,Can anyone prove that the #covid #vaccine prev...,https://twitter.com/michaelwbunner/status/1468...,"Norristown, PA"
4,2021-12-07 22:58:46+00:00,drsajumathew,About 1600 people die each day from #COVID in ...,https://twitter.com/drsajumathew/status/146835...,"Atlanta, GA"


In [16]:
# Whitespace normalisation over the whole frame, in three stages.

# Stage 1: real newline, carriage-return and tab characters -> one space.
vaccine_df_2020_2021_newline_cleaned = vaccine_df_2020_2021
for control_char in (r'\n', r'\r', r'\t'):
    vaccine_df_2020_2021_newline_cleaned = vaccine_df_2020_2021_newline_cleaned.replace(
        control_char, ' ', regex=True
    )

# Stage 2: the same sequences when they appear as literal text
# (a backslash followed by r, n or t) -> one space.
vaccine_df_2020_2021_newline_cleaned1 = vaccine_df_2020_2021_newline_cleaned
for escaped_sequence in (r'\\r', r'\\n', r'\\t'):
    vaccine_df_2020_2021_newline_cleaned1 = vaccine_df_2020_2021_newline_cleaned1.replace(
        escaped_sequence, ' ', regex=True
    )

# Stage 3: collapse the runs of spaces left behind by the substitutions above.
vaccine_df_2020_2021_white_space = vaccine_df_2020_2021_newline_cleaned1.replace(
    to_replace=r' +', value=' ', regex=True
)
vaccine_df_2020_2021_white_space.head()

Unnamed: 0,Date,User,Tweet,Tweet_URL,Location
0,2021-12-07 23:29:36+00:00,MA_CEAL,#COVIDMythBustingTuesdays! Myth: Doesn’t the #...,https://twitter.com/MA_CEAL/status/14683620841...,"Boston, Massachusetts"
1,2021-12-07 23:08:19+00:00,WebAeneas,#vaccinated another great idea you can sue any...,https://twitter.com/WebAeneas/status/146835672...,
2,2021-12-07 23:02:41+00:00,TheAmirImani,"I don't know if this is a ""3 dose #vaccine"" or...",https://twitter.com/TheAmirImani/status/146835...,Toronto
3,2021-12-07 22:58:56+00:00,michaelwbunner,Can anyone prove that the #covid #vaccine prev...,https://twitter.com/michaelwbunner/status/1468...,"Norristown, PA"
4,2021-12-07 22:58:46+00:00,drsajumathew,About 1600 people die each day from #COVID in ...,https://twitter.com/drsajumathew/status/146835...,"Atlanta, GA"


In [17]:
# Raw text of the tweet at row label 6, before whitespace clean-up.
vaccine_df_2020_2021.loc[6, 'Tweet']

'"Up till now\rthose of us who want to remain in our normal health state have been derisively known as “The Unvaccinated.” We need to change\rthe perception, change the narrative.\nKG Wordsmith Warrior (in part) \nWe choose to be #vaccinefree of #CovidVaccine we are not #unvaccinated'

In [18]:
# The same tweet (row label 6) after whitespace clean-up, for comparison.
vaccine_df_2020_2021_white_space.loc[6, 'Tweet']

'"Up till now those of us who want to remain in our normal health state have been derisively known as “The Unvaccinated.” We need to change the perception, change the narrative. KG Wordsmith Warrior (in part) We choose to be #vaccinefree of #CovidVaccine we are not #unvaccinated'

In [19]:
# Number of rows whose tweet text repeats an earlier row's text.
vaccine_df_2020_2021_white_space['Tweet'].duplicated().sum()

1

In [20]:
# Keep only the first occurrence of each tweet text, then report (rows, columns).
is_repeated_tweet = vaccine_df_2020_2021_white_space.duplicated(subset=['Tweet'])
vaccine_df_clean = vaccine_df_2020_2021_white_space[~is_repeated_tweet]
vaccine_df_clean.shape

(999, 5)

In [21]:
# Persist the de-duplicated 2020-2021 tweets as CSV (row index included).
filepath = Path('data/vaccine_output_2020_2021.csv')
output_dir = filepath.parent
# Create the output directory on first run; do nothing if it already exists.
output_dir.mkdir(exist_ok=True, parents=True)
vaccine_df_clean.to_csv(path_or_buf=filepath)

In [22]:
# Web scraping: COVID-vaccine tweets in the window since:2021-12-08 / until:2022-12-8.
# NOTE: `limit` is not set here; it comes from the earlier 2020-2021 scraping cell,
# so that cell must have been run first.
tweets = []
# Same search terms, hashtags and filters as the 2020-2021 query; only the
# since:/until: dates differ.
query = "covid vaccine -availability -free -slot -slots -trade -trades (#covidvaccine OR #CovidVaccine OR #vaccine OR #covid OR #Covid19Vaccine OR #CoronaVirusVaccine ) lang:en until:2022-12-8 since:2021-12-08 -filter:links -filter:replies"

# islice caps the stream at `limit` items and stops WITHOUT asking the scraper
# for item limit+1, so no extra request is made once the quota is reached
# (the previous len()-check only broke out after one more tweet had been fetched).
scraped_items = islice(sntwitter.TwitterSearchScraper(query).get_items(), limit)

for tweet in tqdm(scraped_items, total=limit):
    # One row per tweet: timestamp, author handle, text, permalink, author location.
    tweets.append([tweet.date, tweet.user.username, tweet.content, tweet.url, tweet.user.location])

vaccine_df_2021_2022 = pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet', 'Tweet_URL', 'Location'])

  0%|          | 0/1000 [00:00<?, ?it/s]

In [23]:
# Whitespace normalisation over the whole frame, in three stages.

# Stage 1: real newline, carriage-return and tab characters -> one space.
vaccine_df_2021_2022_newline_cleaned = vaccine_df_2021_2022
for control_char in (r'\n', r'\r', r'\t'):
    vaccine_df_2021_2022_newline_cleaned = vaccine_df_2021_2022_newline_cleaned.replace(
        control_char, ' ', regex=True
    )

# Stage 2: the same sequences when they appear as literal text
# (a backslash followed by r, n or t) -> one space.
vaccine_df_2021_2022_newline_cleaned1 = vaccine_df_2021_2022_newline_cleaned
for escaped_sequence in (r'\\r', r'\\n', r'\\t'):
    vaccine_df_2021_2022_newline_cleaned1 = vaccine_df_2021_2022_newline_cleaned1.replace(
        escaped_sequence, ' ', regex=True
    )

# Stage 3: collapse the runs of spaces left behind by the substitutions above.
vaccine_df_2021_2022_white_space = vaccine_df_2021_2022_newline_cleaned1.replace(
    to_replace=r' +', value=' ', regex=True
)

# Number of rows whose tweet text repeats an earlier row's text.
vaccine_df_2021_2022_white_space['Tweet'].duplicated().sum()

7

In [24]:
# Keep only the first occurrence of each tweet text, then report (rows, columns).
is_repeated_tweet = vaccine_df_2021_2022_white_space.duplicated(subset=['Tweet'])
vaccine_df_2021_2022_clean = vaccine_df_2021_2022_white_space[~is_repeated_tweet]
vaccine_df_2021_2022_clean.shape

(993, 5)

In [25]:
# Persist the de-duplicated 2021-2022 tweets as CSV (row index included).
filepath = Path('data/vaccine_output_2021_2022.csv')
output_dir = filepath.parent
# Create the output directory on first run; do nothing if it already exists.
output_dir.mkdir(exist_ok=True, parents=True)
vaccine_df_2021_2022_clean.to_csv(path_or_buf=filepath)