## Twitter Dataset

In [None]:
import jsonlines
import json

# Flatten the "data" array of every JSON-lines record into one list of tweets.
with jsonlines.open('../../../../Vaccine-Disinfo-Data/Twitter/vaers_2021_01_01_to_2021_08_01.json') as reader:
    tweets = [tweet for page in reader for tweet in page["data"]]

# Persist the flattened list as a single JSON document, keeping non-ASCII
# characters as-is (ensure_ascii=False) in a UTF-8 encoded file.
with open('./all_tweets.json', 'w', encoding='utf-8') as out_file:
    json.dump(tweets, out_file, ensure_ascii=False)

In [None]:
# Reload the flattened tweets from ./all_tweets.json.
# The file is written as UTF-8 with ensure_ascii=False, so it has to be read
# back as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) would fail on, or corrupt, non-ASCII tweet text.
with open('./all_tweets.json', encoding='utf-8') as f:
    data = json.load(f)

# Show the size and a small sample instead of printing every tweet, which
# would flood the notebook output.
print(f"{len(data)} tweets loaded")
print(data[:3])

### AstraZeneca dictionary

In [None]:
import re

# Keyword dictionary used to select tweets; matching is case-insensitive
# (requested both by the inline (?i) marker and by the IGNORECASE flag).
_KEYWORD_ALTERNATION = r"(?i)(AstraZeneca|Astra Zeneca|AZD1222|COVID|vaccine|immunity|herd immunity|Barrington|focused protection)"
compiled_dictionary = re.compile(_KEYWORD_ALTERNATION, re.IGNORECASE)

In [None]:
import jsonlines
import json
import re

# Collect every expanded URL (together with the text of the tweet it came from)
# from tweets whose text matches the keyword dictionary `compiled_dictionary`.
skipped_tweets = 0  # tweets dropped because an expected field was missing/malformed
with jsonlines.open('drive/MyDrive/vaers_2021_01_01_to_2021_08_01.json') as reader:
    url_list = list()
    for obj in reader:
        for tweet in obj["data"]:
            try:
                twt = tweet["text"]
                if compiled_dictionary.search(twt) is None:
                    continue
                urls = [dict(url=url["expanded_url"], text=twt) for url in tweet["entities"]["urls"]]
            except (KeyError, TypeError):
                # Only data-shape problems are tolerated (no "text", no
                # "entities"/"urls", no "expanded_url", non-string text).
                # Anything else -- e.g. a NameError because the dictionary cell
                # was never run -- must surface instead of silently producing
                # an empty url_list.
                skipped_tweets += 1
                continue
            url_list.extend(urls)

print(f"{len(url_list)} URLs collected, {skipped_tweets} tweets skipped (missing fields)")

# Write the collected records as pretty-printed UTF-8 JSON.
with open('./url_list.json', 'w', encoding='utf-8') as file_obj:
    json.dump(url_list, file_obj, ensure_ascii=False, indent=4)

#### Analysis of the AstraZeneca keyword-filtered dataset

In [6]:
import json

# Load the URL records of the AstraZeneca keyword-filtered dataset.
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform default encoding.
with open('../../data/VAERS_data/URLs for AstraZeneca Keywords/url list for astrazeneca.json', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# In the visible notebook pandas is only imported in a cell further down, so on
# a fresh kernel (Restart & Run All) this cell would raise NameError on `pd`.
# Import it where it is first needed.
import pandas as pd

# Build a one-column frame holding only the "url" field of each record.
df = pd.DataFrame(data, columns=["url"])
df

Unnamed: 0,url
0,https://twitter.com/PressDoYourJob/status/1421...
1,https://t.me/Hyer971/4320
2,https://twitter.com/Drwealth54/status/14216194...
3,https://childrenshealthdefense.org/defender/va...
4,https://t.me/Hyer971/4320
...,...
77561,https://vaers.hhs.gov/esub/index.jsp
77562,https://ift.tt/2WYTD3M
77563,https://ift.tt/2WYTD3M
77564,https://twitter.com/critica18495985/status/134...


In [7]:
# Count how often each distinct URL occurs, most frequent first.
url_occurrences = df.groupby("url").size()
unique_urls = url_occurrences.reset_index(name="count").sort_values(by="count", ascending=False)

# Save the counts next to the source data, then display them.
unique_urls.to_csv(
    "../../data/VAERS_data/URLs for AstraZeneca Keywords/unique_urls.csv",
    index=False,
)
unique_urls

Unnamed: 0,url,count
25183,https://www.openvaers.com/covid-data,1569
23364,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,1298
25678,https://www.thegatewaypundit.com/2021/06/shock...,1227
22894,https://wonder.cdc.gov/vaers.html,1130
22392,https://vaers.hhs.gov/,1123
...,...,...
10203,https://twitter.com/Justin365/status/141326717...,1
10202,https://twitter.com/JustaCountryGi8/status/141...,1
10201,https://twitter.com/JustaCountryGi8/status/141...,1
10200,https://twitter.com/Just_Cartooning/status/140...,1


In [10]:
from urllib.parse import urlparse

def _netloc_of(link):
    """Return the network-location component of ``link`` as parsed by urlparse."""
    return urlparse(link).netloc


# Add the host of every URL as a new column (this mutates unique_urls in place).
unique_urls["netloc"] = unique_urls["url"].apply(_netloc_of)
unique_urls.to_csv(
    "../../data/VAERS_data/URLs for AstraZeneca Keywords/unique_urls_with_netloc.csv",
    index=False,
)
unique_urls.sort_values(by="count", ascending=False)

Unnamed: 0,url,count,netloc
25183,https://www.openvaers.com/covid-data,1569,www.openvaers.com
23364,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,1298,www.cdc.gov
25678,https://www.thegatewaypundit.com/2021/06/shock...,1227,www.thegatewaypundit.com
22894,https://wonder.cdc.gov/vaers.html,1130,wonder.cdc.gov
22392,https://vaers.hhs.gov/,1123,vaers.hhs.gov
...,...,...,...
20554,https://twitter.com/realBryanCowger/status/141...,1,twitter.com
20555,https://twitter.com/realConnieBevan/status/140...,1,twitter.com
20556,https://twitter.com/realDaVincigal/status/1417...,1,twitter.com
20557,https://twitter.com/realDailyWire/status/14157...,1,twitter.com


In [17]:
# Number of distinct URLs per host, largest first.
netloc_sizes = unique_urls[["netloc"]].groupby("netloc").size()
netloc_sizes.reset_index(name="count").sort_values(by="count", ascending=False)

Unnamed: 0,netloc,count
2114,twitter.com,43182
587,childrenshealthdefense.org,653
2967,www.medalerts.org,653
2300,wonder.cdc.gov,586
3556,youtu.be,379
...,...,...
1366,medianism.org,1
1368,medical.nikkeibp.co.jp,1
1369,medicaldialogues.in,1
1371,medicaltrend.org,1


## Analysis of the unfiltered Twitter dataset

In [3]:
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
# Notebook-wide pandas display settings: show up to 1000 rows, every column,
# and use a display width of 0.
_display_options = {
    "display.max_rows": 1000,
    "display.max_columns": None,
    "display.width": 0,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)
# Have IPython display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [11]:
import json

# Load the URL records of the unfiltered dataset.
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform default encoding.
with open('../../data/VAERS_data/url_list.json', encoding='utf-8') as f:
    data = json.load(f)

In [12]:
# Build a one-column frame holding only the "url" field of each record.
url_columns = ["url"]
df = pd.DataFrame(data=data, columns=url_columns)
df

Unnamed: 0,url
0,https://twitter.com/PressDoYourJob/status/1421...
1,https://twitter.com/val_ic/status/142156589791...
2,https://www.google.com/amp/s/mobile.reuters.co...
3,https://www.nebraskamed.com/COVID/does-vaers-l...
4,https://t.me/Hyer971/4320
...,...
240470,https://ift.tt/2WYTD3M
240471,https://ift.tt/2WYTD3M
240472,https://twitter.com/critica18495985/status/134...
240473,https://www.medalerts.org/vaersdb/findfield.ph...


In [13]:
# Count how often each distinct URL occurs, most frequent first.
per_url_size = df.groupby("url").size()
per_url_table = per_url_size.reset_index(name="count")
unique_urls = per_url_table.sort_values(by="count", ascending=False)

# Save the counts, then display them.
unique_urls.to_csv("../../data/VAERS_data/unique_urls.csv", index=False)
unique_urls

Unnamed: 0,url,count
53831,https://wonder.cdc.gov/vaers.html,5380
58043,https://www.openvaers.com/covid-data,5307
3705,https://childrenshealthdefense.org/defender/va...,3773
52938,https://vaers.hhs.gov/,3742
3378,https://childrenshealthdefense.org/defender/cd...,3702
...,...,...
23342,https://twitter.com/LonelyZeta/status/13926695...,1
23343,https://twitter.com/LongLiv04611701/status/141...,1
23344,https://twitter.com/LongLiv04611701/status/141...,1
23347,https://twitter.com/Long_GoldSilver/status/135...,1


In [14]:
from urllib.parse import urlparse

# Derive the host of every URL and store it as a new column
# (this mutates unique_urls in place).
netloc_column = unique_urls["url"].apply(lambda link: urlparse(link).netloc)
unique_urls["netloc"] = netloc_column

unique_urls.to_csv("../../data/VAERS_data/unique_urls_with_netloc.csv", index=False)
unique_urls

Unnamed: 0,url,count,netloc
53831,https://wonder.cdc.gov/vaers.html,5380,wonder.cdc.gov
58043,https://www.openvaers.com/covid-data,5307,www.openvaers.com
3705,https://childrenshealthdefense.org/defender/va...,3773,childrenshealthdefense.org
52938,https://vaers.hhs.gov/,3742,vaers.hhs.gov
3378,https://childrenshealthdefense.org/defender/cd...,3702,childrenshealthdefense.org
...,...,...,...
23342,https://twitter.com/LonelyZeta/status/13926695...,1,twitter.com
23343,https://twitter.com/LongLiv04611701/status/141...,1,twitter.com
23344,https://twitter.com/LongLiv04611701/status/141...,1,twitter.com
23347,https://twitter.com/Long_GoldSilver/status/135...,1,twitter.com


In [16]:
# Number of distinct URLs per host, largest first.
hosts_only = unique_urls[["netloc"]]
host_sizes = hosts_only.groupby("netloc").size()
host_sizes.reset_index(name="count").sort_values(by="count", ascending=False)

Unnamed: 0,netloc,count
2114,twitter.com,43182
587,childrenshealthdefense.org,653
2967,www.medalerts.org,653
2300,wonder.cdc.gov,586
3556,youtu.be,379
...,...,...
1366,medianism.org,1
1368,medical.nikkeibp.co.jp,1
1369,medicaldialogues.in,1
1371,medicaltrend.org,1


In [None]:
import re

from urllib.parse import urlparse

# Drop URLs hosted on a .gov domain.
# The dot is escaped and the pattern is applied to the host only: an unescaped
# ".gov" matched against the whole URL also matches any character followed by
# "gov" anywhere in the address (e.g. a Twitter handle such as "NYGovCuomo"),
# which removed unrelated rows. The trailing \b keeps hosts like
# "example.governor.com" from matching while still covering ".gov",
# ".gov.<cc>" and ".gov:<port>".
compiled = re.compile(r"\.gov\b", flags=re.IGNORECASE)

url_hosts = df["url"].map(lambda url: urlparse(url).netloc)
df_gov_filtered = df[~url_hosts.str.contains(compiled)]