# TWITTER SENTIMENT ANALYSIS ON COVID DATA - WEBSCRAPING

## Importing Packages and Connecting Data

In [1]:
import re
import string
from itertools import islice
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import snscrape.modules.twitter as sntwitter
from scipy.special import softmax
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
#add package
from transformers import AutoModelForSequenceClassification

In [2]:
# snscrape is the package used for social network scraping; if it is not installed, run:
# pip install snscrape

## Web Scraping for COVID Vaccine-Related Tweets

In [3]:
# Quick probe: run a throwaway search and print the attributes of the first
# result so the fields available on a scraped tweet object can be inspected.
query = "python"

sample_scraper = sntwitter.TwitterSearchScraper(query)
for tweet in sample_scraper.get_items():
    print(vars(tweet))
    break  # one tweet is enough to see the structure

{'url': 'https://twitter.com/manas_llb/status/1594595754051518466', 'date': datetime.datetime(2022, 11, 21, 7, 37, 8, tzinfo=datetime.timezone.utc), 'content': '@ManasMarthi @ShriramKMurthi My students and colleagues find real hard to switch languages. Mutability was not big concern. How to plumb moving parts, or use a framework was big learning curve. So,starting with python may be okay for a web developer who is going to deal with procedural req/res code', 'renderedContent': '@ManasMarthi @ShriramKMurthi My students and colleagues find real hard to switch languages. Mutability was not big concern. How to plumb moving parts, or use a framework was big learning curve. So,starting with python may be okay for a web developer who is going to deal with procedural req/res code', 'id': 1594595754051518466, 'user': User(username='manas_llb', id=1467328215735693315, displayname='Manas', description='Maxime stultus. Retweets are retweets. Likes are Likes. Endorsements are endorsements. Trying t

In [4]:

# Web scraping, first window: collect at most `limit` tweets matching the
# vaccine query below (operators: lang:en, since:2020-12-08, until:2021-12-8,
# with links, replies and several availability/trading keywords filtered out).
tweets = []
limit = 10000

query = "(covid+vaccine OR Nuvaxovid OR Novavax OR pfizer OR pfizer-biontech OR pfizerbiontech OR moderna OR astrazeneca OR astra+zeneca) -availability -free -slot -slots -trade -trades (#covidvaccine OR #CovidVaccine OR #vaccine OR #covid OR #Covid19Vaccine OR #CoronaVirusVaccine ) lang:en until:2021-12-8 since:2020-12-08 -filter:links -filter:replies"

# islice stops consuming the scraper as soon as `limit` tweets were taken, so
# no additional result is requested only to be discarded by a length check.
scraped_items = islice(sntwitter.TwitterSearchScraper(query).get_items(), limit)

# Append row by row (rather than building the list in one expression) so the
# tweets gathered so far remain in `tweets` if the scrape is interrupted.
for tweet in tqdm(scraped_items, total=limit):
    tweets.append([tweet.date, tweet.user.username, tweet.content, tweet.url, tweet.user.location])

vaccine_df_2020_2021 = pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet', 'Tweet_URL', 'Location'])
vaccine_df_2020_2021.head()


  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Date,User,Tweet,Tweet_URL,Location
0,2021-12-07 23:58:57+00:00,TPedherney,In line to get my booster #vaccine #Booster #P...,https://twitter.com/TPedherney/status/14683694...,"Burlington, ON"
1,2021-12-07 23:32:39+00:00,cwillits30,Officially boosted! First done of Moderna afte...,https://twitter.com/cwillits30/status/14683628...,"Phoenix, AZ"
2,2021-12-07 22:58:56+00:00,michaelwbunner,Can anyone prove that the #covid #vaccine prev...,https://twitter.com/michaelwbunner/status/1468...,"Norristown, PA"
3,2021-12-07 22:08:57+00:00,talking_biscuit,My 6 yo #neurodiverse kiddo just got his 2nd C...,https://twitter.com/talking_biscuit/status/146...,"Sacramento, CA"
4,2021-12-07 22:03:38+00:00,chris_brennan91,Omicron has now been confirmed as not only mor...,https://twitter.com/chris_brennan91/status/146...,"Newcastle Upon Tyne, England"


In [5]:
# Whitespace normalisation, applied frame-wide (not only to the Tweet column).
# Every `replace` returns a new frame, so `vaccine_df_2020_2021` is not modified.

# Step 1: real newline, carriage-return and tab characters become a space.
# One character class does the work of three separate passes.
vaccine_df_2020_2021_newline_cleaned = vaccine_df_2020_2021.replace(r'[\n\r\t]', ' ', regex=True)

# Step 2: the literal two-character sequences backslash+r, backslash+n and
# backslash+t (escape text that ended up inside the strings) become a space.
vaccine_df_2020_2021_newline_cleaned1 = vaccine_df_2020_2021_newline_cleaned.replace(r'\\[rnt]', ' ', regex=True)

# Step 3: collapse the runs of spaces produced above into a single space.
vaccine_df_2020_2021_white_space = vaccine_df_2020_2021_newline_cleaned1.replace(r' +', ' ', regex=True)
vaccine_df_2020_2021_white_space.head()

Unnamed: 0,Date,User,Tweet,Tweet_URL,Location
0,2021-12-07 23:58:57+00:00,TPedherney,In line to get my booster #vaccine #Booster #P...,https://twitter.com/TPedherney/status/14683694...,"Burlington, ON"
1,2021-12-07 23:32:39+00:00,cwillits30,Officially boosted! First done of Moderna afte...,https://twitter.com/cwillits30/status/14683628...,"Phoenix, AZ"
2,2021-12-07 22:58:56+00:00,michaelwbunner,Can anyone prove that the #covid #vaccine prev...,https://twitter.com/michaelwbunner/status/1468...,"Norristown, PA"
3,2021-12-07 22:08:57+00:00,talking_biscuit,My 6 yo #neurodiverse kiddo just got his 2nd C...,https://twitter.com/talking_biscuit/status/146...,"Sacramento, CA"
4,2021-12-07 22:03:38+00:00,chris_brennan91,Omicron has now been confirmed as not only mor...,https://twitter.com/chris_brennan91/status/146...,"Newcastle Upon Tyne, England"


In [6]:
# Spot check: text of the tweet at row label 6 before whitespace cleaning.
vaccine_df_2020_2021.loc[6, 'Tweet']

'Pfizer advertising the Covid vaccine on the radio is compatible to my coke dealer advertising his 8 balls #COVID19 #Pfizer #vaccine'

In [7]:
# Spot check: the same row (label 6) after whitespace cleaning, for comparison.
vaccine_df_2020_2021_white_space.loc[6, 'Tweet']

'Pfizer advertising the Covid vaccine on the radio is compatible to my coke dealer advertising his 8 balls #COVID19 #Pfizer #vaccine'

In [8]:
# Number of rows whose tweet text repeats an earlier row's text.
vaccine_df_2020_2021_white_space['Tweet'].duplicated().sum()

1558

In [9]:
# De-duplicate on tweet text only: the first occurrence of each text is kept,
# later rows with the same text are dropped (other columns are not compared).
vaccine_df_clean = vaccine_df_2020_2021_white_space.drop_duplicates(subset='Tweet', keep='first')
vaccine_df_clean.shape

(8442, 5)

In [10]:
# Write the de-duplicated 2020-2021 tweets to disk. The target folder is
# created first (including missing parents); an existing folder is not an error.
filepath = Path('data/vaccine_output_2020_2021.csv')
filepath.parent.mkdir(exist_ok=True, parents=True)
vaccine_df_clean.to_csv(path_or_buf=filepath)

In [15]:
# Web scraping, second window: same vaccine query as the first scrape, but
# with since:2021-12-08 and until:2022-11-25. The cap on the number of tweets
# is the `limit` value assigned in the earlier scraping cell.
tweets = []
query = "(covid+vaccine OR Nuvaxovid OR Novavax OR pfizer OR pfizer-biontech OR pfizerbiontech OR moderna OR astrazeneca OR astra+zeneca) -availability -free -slot -slots -trade -trades (#covidvaccine OR #CovidVaccine OR #vaccine OR #covid OR #Covid19Vaccine OR #CoronaVirusVaccine ) lang:en until:2022-11-25 since:2021-12-08 -filter:links -filter:replies"

# islice stops consuming the scraper as soon as `limit` tweets were taken, so
# no additional result is requested only to be discarded by a length check.
scraped_items = islice(sntwitter.TwitterSearchScraper(query).get_items(), limit)

# Append row by row so the tweets gathered so far remain in `tweets` if the
# scrape is interrupted part-way through.
for tweet in tqdm(scraped_items, total=limit):
    tweets.append([tweet.date, tweet.user.username, tweet.content, tweet.url, tweet.user.location])

vaccine_df_2021_2022 = pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet', 'Tweet_URL', 'Location'])

  0%|          | 0/10000 [00:00<?, ?it/s]

In [16]:
# Whitespace normalisation, applied frame-wide (not only to the Tweet column).
# Every `replace` returns a new frame, so `vaccine_df_2021_2022` is not modified.

# Step 1: real newline, carriage-return and tab characters become a space.
# One character class does the work of three separate passes.
vaccine_df_2021_2022_newline_cleaned = vaccine_df_2021_2022.replace(r'[\n\r\t]', ' ', regex=True)

# Step 2: the literal two-character sequences backslash+r, backslash+n and
# backslash+t (escape text that ended up inside the strings) become a space.
vaccine_df_2021_2022_newline_cleaned1 = vaccine_df_2021_2022_newline_cleaned.replace(r'\\[rnt]', ' ', regex=True)

# Step 3: collapse the runs of spaces produced above into a single space.
vaccine_df_2021_2022_white_space = vaccine_df_2021_2022_newline_cleaned1.replace(r' +', ' ', regex=True)

# Number of rows whose tweet text repeats an earlier row's text.
vaccine_df_2021_2022_white_space.duplicated(subset=['Tweet']).sum()

279

In [17]:
# De-duplicate on tweet text only: the first occurrence of each text is kept,
# later rows with the same text are dropped (other columns are not compared).
vaccine_df_2021_2022_clean = vaccine_df_2021_2022_white_space.drop_duplicates(subset='Tweet', keep='first')
vaccine_df_2021_2022_clean.shape

(7526, 5)

In [18]:
# Write the de-duplicated 2021-2022 tweets to disk. The target folder is
# created first (including missing parents); an existing folder is not an error.
filepath = Path('data/vaccine_output_2021_2022.csv')
filepath.parent.mkdir(exist_ok=True, parents=True)
vaccine_df_2021_2022_clean.to_csv(path_or_buf=filepath)