# Twitter VAERS Dataset

## Twitter Text Dataset

In [1]:
import re

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

# pandas display settings used by every table rendered in this notebook.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 0)

# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the processed tweet table and parse nodeTime into pandas datetimes.
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted data.
uptextdf = pd.read_pickle(
    "../../data/Master-Data/Twitter/vaers_processed_v1.pkl.gz"
).assign(nodeTime=lambda frame: pd.to_datetime(frame["nodeTime"]))

In [None]:
"total twitter text dataset"
# Unique-value counts per column, first for the whole table and then per action type.
uptextdf.describe(datetime_is_numeric=True).loc["unique"]

"retweeted tweets"
uptextdf.loc[uptextdf["actionType"].eq("retweeted")].describe(datetime_is_numeric=True).loc["unique"]

"tweeted tweets"
uptextdf.loc[uptextdf["actionType"].eq("tweet")].describe(datetime_is_numeric=True).loc["unique"]

"replied tweets"
uptextdf.loc[uptextdf["actionType"].eq("replied_to")].describe(datetime_is_numeric=True).loc["unique"]

"quoted tweets"
uptextdf.loc[uptextdf["actionType"].eq("quoted")].describe(datetime_is_numeric=True).loc["unique"]

## Twitter URL Dataset

In [3]:
# Load the URL table that accompanies the tweet table (one row per nodeID/url pair, see output of In [4]).
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted data.
upurldf = pd.read_pickle("../../data/Master-Data/Twitter/vaers_processed_v1_urls.pkl.gz")

In [None]:
# Summary statistics of the URL table.
upurldf.describe()

In [None]:
import plotly.express as px

# Left-join URLs onto tweets so tweets without any URL are kept, then bucket by calendar date.
upmerged = pd.merge(uptextdf, upurldf, how="left", on="nodeID", indicator=True)
upmerged["nodeDate"] = upmerged["nodeTime"].dt.date

# Number of distinct values of every column per day, in chronological order.
gptime = (
    upmerged.groupby("nodeDate")
    .nunique()
    .sort_values(by="nodeDate")
    .reset_index()
)
gptime

# Daily count of unique tweets and unique URLs.
fig = (
    px.line(
        gptime.rename(columns={"nodeID": "tweets"}),
        x="nodeDate",
        y=["tweets", "url"],
        width=900,
    )
    .update_layout(title_text="Number of Tweets and URLs over time", legend_title="Legend")
    .update_xaxes(title_text="Date")
    .update_yaxes(title_text="Unique Count")
)
fig.show()

In [4]:
# Drop links whose domain contains "twitter.com".
# regex=False makes this a literal substring test; with the default regex=True the "."
# would match any character (e.g. "twitterXcom").
filteredurldf = upurldf[~upurldf["domain"].str.contains("twitter.com", regex=False)]
filteredurldf

Unnamed: 0,nodeID,url,domain
2,1421621149683920898,https://www.google.com/amp/s/mobile.reuters.co...,google.com
3,1421621149683920898,https://www.google.com/amp/s/mobile.reuters.co...,google.com
4,1421621035154038788,https://t.me/Hyer971/4320,t.me
7,1421619416840785920,https://childrenshealthdefense.org/defender/va...,childrenshealthdefense.org
8,1421618874382946311,http://fb.watch/v/2SY9meKNx/,fb.watch
...,...,...,...
240469,1344938144995618816,https://vaers.hhs.gov,vaers.hhs.gov
240470,1344924211677179904,https://ift.tt/2WYTD3M,ift.tt
240471,1344921371814588417,https://ift.tt/2WYTD3M,ift.tt
240473,1344811013518966784,https://www.medalerts.org/vaersdb/findfield.ph...,medalerts.org


__Expanding shortened URLs__

__all_urls.csv MD5sum: 6f7b1e2b67697737ad1044af584c14ad__

In [86]:
# Print the MD5 checksum of the URL-expansion input file so it can be compared with the value recorded above.
!md5sum all_urls.csv

6f7b1e2b67697737ad1044af584c14ad  all_urls.csv


In [None]:
from threading import Thread

import pandas as pd
import requests
import urllib3

# Requests below are made with verify=False; silence the warning urllib3 emits for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Shared state filled in by the worker threads.
main_url_list = []  # one result record per processed URL
url_set = set()  # URLs that were fetched successfully (used for the progress message)
main_thread_list = []  # every worker thread started by main()

# URLs to expand, sorted alphabetically.
urls = pd.read_csv("./all_urls.csv").sort_values(by="url", ascending=True)
url_list = urls["url"].to_list()


def main():
    """Expand every URL in ``url_list`` and write the results to ``./expanded_urls.csv``.

    The URL list is cut into slices of 1000; each slice is handed to its own
    thread running ``threader``. Once all threads have finished, the records
    collected in ``main_url_list`` are sorted by ``url`` and saved as CSV.
    Any exception is printed instead of being raised.
    """
    try:
        step_size = 1000
        slices = []
        for start in range(0, len(url_list), step_size):
            slices.append(url_list[start : start + step_size])

        # One worker thread per slice.
        for piece in slices:
            worker = Thread(target=threader, args=(piece,))
            main_thread_list.append(worker)
            worker.start()

        # Wait for every worker before reading the shared result list.
        for worker in main_thread_list:
            worker.join()

        expanded_urls = pd.DataFrame(data=main_url_list).sort_values(by="url")
        expanded_urls.to_csv("./expanded_urls.csv", index=False)
        print("url expansion completed")
    except Exception as err:
        print(err)


def threader(chunk):
    """Thread target: expand each URL of ``chunk`` in order via ``get_exp_url``.

    An exception escaping ``get_exp_url`` is printed and ends processing of
    the remaining URLs in this chunk.
    """
    try:
        for link in chunk:
            get_exp_url(link)
    except Exception as err:
        print(err)


def get_exp_url(url):
    """Follow redirects for ``url`` and record the final address in ``main_url_list``.

    On success a record ``{"url", "expanded_url", "active": True}`` is appended,
    the URL is added to ``url_set`` and a progress line is printed. On any
    failure (timeout, connection error, invalid URL, ...) the URL is recorded
    unchanged with ``active`` set to ``False``.

    Args:
        url: The (possibly shortened) URL to resolve.
    """
    try:
        timeout = 10
        # stream=True avoids downloading the body, but the connection is then only
        # released when the response is closed - the context manager guarantees that.
        # NOTE(review): verify=False disables TLS certificate verification.
        with requests.get(url, timeout=timeout, verify=False, stream=True) as response:
            final_url = response.url
        main_url_list.append({"url": url, "expanded_url": final_url, "active": True})
        url_set.add(url)
        print(f"{len(url_set)}/{len(url_list)} done")
    except Exception:
        # Best effort: keep the original URL and flag it as inactive.
        main_url_list.append({"url": url, "expanded_url": url, "active": False})


# Run the URL expansion; main() prints errors instead of raising them.
main()

__expanded_urls.csv MD5sum: 4ee5f5a2bc76482a948478ad0f0484f1__

In [87]:
# Print the MD5 checksum of the expansion output so it can be compared with the value recorded above.
!md5sum expanded_urls.csv

4ee5f5a2bc76482a948478ad0f0484f1  expanded_urls.csv


In [7]:
# Load the url -> expanded_url mapping written by the expansion script and summarise it.
expandedurls = pd.read_csv("./expanded_urls.csv")
expandedurls.describe()

Unnamed: 0,url,expanded_url,active
count,15389,15389,15389
unique,15389,13618,2
top,https://www.youtube.com/watch?v=gPHgRp70H8o,https://www.instagram.com/accounts/login/,True
freq,1,113,15066


__Obtain domains for the expanded URLs__

In [8]:
from urllib.parse import urlparse

# Lower-cased network location (host) of every expanded URL.
# NOTE(review): a leading "www." is kept, so "www.cdc.gov" and "cdc.gov" count as different domains.
expandedurls["expanded_domain"] = expandedurls["expanded_url"].map(
    lambda link: urlparse(link).netloc.lower()
)
expandedurls

Unnamed: 0,url,expanded_url,active,expanded_domain
0,HTTP://BRANDHAARD.COM,https://brandhaard.com/,True,brandhaard.com
1,HTTPS://VAERS.HTIS.GOV,HTTPS://VAERS.HTIS.GOV,False,vaers.htis.gov
2,HTTPS://VAERS.hhs.gov,https://vaers.hhs.gov/,True,vaers.hhs.gov
3,HTTPS://cdc.gov/vaers.html,https://www.cdc.gov/vaers.html,True,www.cdc.gov
4,HTTPS://childrenshealthdefense.org/defender/cd...,https://childrenshealthdefense.org/defender/cd...,True,childrenshealthdefense.org
...,...,...,...,...
15384,https://zpr.io/Pmwxc,https://www.factcheck.org/,True,www.factcheck.org
15385,https://zpr.io/R2cNE,https://theconservativetreehouse.com/blog/2021...,True,theconservativetreehouse.com
15386,https://zpr.io/RLDCS,https://e24.no/norsk-oekonomi/i/dl61AB/hyttepr...,True,e24.no
15387,https://zpr.io/RNkhi,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,True,www.cdc.gov


In [9]:
# Attach the expanded URL / domain to every non-Twitter link.
expandedurldf = filteredurldf.merge(expandedurls, on=["url"], how="left")

# Drop links that resolve to Twitter after expansion.
# regex=False makes this a literal substring test; with the default regex=True the "."
# would match any character. astype(str) turns missing domains into "nan" so they are kept.
finalfilteredurldf = expandedurldf[
    ~expandedurldf["expanded_domain"].astype(str).str.contains("twitter.com", regex=False)
]

"final filtered URL Dataframe statistics"
finalfilteredurldf.describe().loc["unique"]

'final filtered URL Dataframe statistics'

nodeID             143204
url                 15384
domain               2977
expanded_url        13613
active                  2
expanded_domain      2967
Name: unique, dtype: object

In [None]:
"active URLS"
# Links whose expansion request succeeded.
finalfilteredurldf.loc[finalfilteredurldf["active"]].describe()

"inactive URLS"
# Links whose expansion request failed.
finalfilteredurldf.loc[~finalfilteredurldf["active"]].describe()

### Twitter Text-URL Dataset

In [10]:
# Right join: keep one row per filtered URL record, enriched with the tweet it came from.
texturlmerged = pd.merge(uptextdf, finalfilteredurldf, how="right", on="nodeID")
texturlmerged.describe(datetime_is_numeric=True)

Unnamed: 0,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,expanded_url,active,expanded_domain
count,175122,175122.0,175122,175122,175122,175122.0,175122.0,170818.0,175122,175122,175122,175122,175122
unique,4,143204.0,44,93799,,60769.0,87977.0,44647.0,15384,2977,13613,2,2967
top,replied_to,1.4045663886735524e+18,en,RT @RobertKennedyJr: Here's the latest CDC VAE...,,1.0991771436013772e+18,1.401199884997382e+18,337808606.0,https://www.openvaers.com/covid-data,childrenshealthdefense.org,https://openvaers.com/index.php,True,childrenshealthdefense.org
freq,66622,20.0,106507,2510,,1144.0,2522.0,12511.0,6770,38784,8730,173159,44644
mean,,,,,2021-05-19 18:39:22.556737280+00:00,,,,,,,,
min,,,,,2021-01-01 00:59:34+00:00,,,,,,,,
25%,,,,,2021-04-13 17:39:50+00:00,,,,,,,,
50%,,,,,2021-06-01 13:43:52+00:00,,,,,,,,
75%,,,,,2021-07-04 02:19:15+00:00,,,,,,,,
max,,,,,2021-07-31 23:57:49+00:00,,,,,,,,


In [11]:
# Expanded URLs ranked by the number of unique tweets that link to them.
(
    texturlmerged.groupby("expanded_url")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)


# Expanded domains ranked by the number of unique tweets that link to them.
(
    texturlmerged.groupby("expanded_domain")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)

Unnamed: 0,expanded_url,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,active,expanded_domain
0,https://vaers.hhs.gov/,4,5962,33,4612,5957,3800,4809,3570,38,6,1,1
1,https://openvaers.com/index.php,4,5718,28,3323,5706,3702,3211,2292,74,6,1,1
2,https://wonder.cdc.gov/vaers.html,4,5323,27,2445,5303,3977,2379,1819,10,3,1,1
3,https://childrenshealthdefense.org/defender/va...,4,3755,22,932,3690,3378,866,758,5,2,1,1
4,https://childrenshealthdefense.org/defender/cd...,4,3685,21,1081,3603,3149,1041,864,3,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13608,https://ugetube.com/watch/euthanasia-injection...,1,1,1,1,1,1,1,1,1,1,1,1
13609,https://ugetube.com/watch/my-job-required-me-t...,1,1,1,1,1,1,1,1,1,1,1,1
13610,https://ugetube.com/watch/the-david-knight-sho...,1,1,1,1,1,1,1,1,1,1,1,1
13611,https://ugetube.com/watch/the-david-knight-sho...,1,1,1,1,1,1,1,1,1,1,1,1


Unnamed: 0,expanded_domain,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,expanded_url,active
0,childrenshealthdefense.org,4,40421,39,22063,40009,19777,21039,11398,852,20,631,1
1,vaers.hhs.gov,4,12806,38,10501,12797,7733,10447,7614,224,9,141,1
2,openvaers.com,4,8233,29,5010,8220,4695,4898,3466,195,6,91,1
3,wonder.cdc.gov,4,6805,28,3787,6784,4761,3661,2746,559,4,537,1
4,www.thegatewaypundit.com,4,6335,24,3292,6259,5077,3070,2297,162,10,126,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2962,res.mdpi.com,1,1,1,1,1,1,1,1,1,1,1,1
2963,republic.ru,1,1,1,1,1,1,1,1,1,1,1,1
2964,report3-19-21-4-05-21.pn,1,1,1,1,1,1,1,1,1,1,1,1
2965,report.link,1,1,1,1,1,1,1,1,1,1,1,1


## Identifying Tweets that mention vaccines

### Checking if the Tweet text mentions a specific vaccine

In [12]:
# Flag tweets whose text mentions a specific vaccine manufacturer (case-insensitive).
# The patterns deliberately have no capture groups: pandas warns when a pattern passed to
# str.contains has match groups, and case=False alone is enough to ignore case.
vaccinesdf = uptextdf.copy()
vaccinesdf["pfizer"] = vaccinesdf["nodeText"].astype(str).str.contains(r"pfizer|biontech", case=False)
vaccinesdf["astrazeneca"] = vaccinesdf["nodeText"].astype(str).str.contains(r"astrazeneca|astra zeneca", case=False)
vaccinesdf["moderna"] = vaccinesdf["nodeText"].astype(str).str.contains(r"moderna", case=False)
vaccinesdf["nodeDate"] = vaccinesdf["nodeTime"].dt.date
vaccinesdf

  return func(self, *args, **kwargs)


Unnamed: 0,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,pfizer,astrazeneca,moderna,nodeDate
0,retweeted,1421621648843710465,ja,RT @Alzhacker: まずVAERSデータベースの信頼性の低さが指摘されています。そ...,2021-07-31 23:59:48+00:00,379173233,1421328886860705797,823123029748158464,False,False,False,2021-07-31
1,retweeted,1421621633291218944,en,RT @LuvmyCountry52: If anything good comes fro...,2021-07-31 23:59:44+00:00,818315474723282944,1421593064498401280,1060660195989299200,False,False,False,2021-07-31
2,retweeted,1421621557051281410,en,RT @freethought202: From latest VAERS 🇺🇸 : The...,2021-07-31 23:59:26+00:00,1374150364857999360,1421448278613184519,846328208932798464,False,False,False,2021-07-31
3,tweet,1421621542480404481,en,Them: Vaccines are safe!\n\nMe: But VAERS repo...,2021-07-31 23:59:22+00:00,94864837,1421621542480404481,94864837,True,False,False,2021-07-31
4,replied_to,1421621536805449729,en,@Bowen_Thaylin @MandalaDanvers @Politics_1138 ...,2021-07-31 23:59:21+00:00,1269405809001197570,1421621082113589250,151510218,False,False,False,2021-07-31
...,...,...,...,...,...,...,...,...,...,...,...,...
730788,replied_to,1344811013518966784,en,@iamgregk @latimes The VAERS Database of react...,2021-01-01 01:01:45+00:00,92691651,1344805979594997760,46448402,False,False,False,2021-01-01
730789,replied_to,1344810463683461120,en,@latimes Check the VAERS for COVID-19 reaction...,2021-01-01 00:59:34+00:00,92691651,1344673687253893133,16664681,False,False,False,2021-01-01
730790,replied_to,1344809796332097536,en,@Harrrybel @latimes Please take care of yourse...,2021-01-01 00:56:55+00:00,44425430,1344755814356152321,,False,False,False,2021-01-01
730791,replied_to,1344809262091038726,en,@Reuters From #CoronaVaccine Check out #CDC #F...,2021-01-01 00:54:48+00:00,204176104,1344806908323131398,1652541,True,False,True,2021-01-01


In [None]:
# Summary statistics of the tweet table including the vaccine flags.
vaccinesdf.describe()

In [None]:
# Unique counts for every combination of the three vaccine flags, largest tweet count first.
(
    vaccinesdf.groupby(["pfizer", "astrazeneca", "moderna"])
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)

## Identifying Tweets with misleading vaccine texts

### Checking whether the tweets contain misleading info

In [13]:
# Work on a copy so the keyword columns added below do not alter vaccinesdf.
mistextdf = vaccinesdf.copy()
mistextdf

Unnamed: 0,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,pfizer,astrazeneca,moderna,nodeDate
0,retweeted,1421621648843710465,ja,RT @Alzhacker: まずVAERSデータベースの信頼性の低さが指摘されています。そ...,2021-07-31 23:59:48+00:00,379173233,1421328886860705797,823123029748158464,False,False,False,2021-07-31
1,retweeted,1421621633291218944,en,RT @LuvmyCountry52: If anything good comes fro...,2021-07-31 23:59:44+00:00,818315474723282944,1421593064498401280,1060660195989299200,False,False,False,2021-07-31
2,retweeted,1421621557051281410,en,RT @freethought202: From latest VAERS 🇺🇸 : The...,2021-07-31 23:59:26+00:00,1374150364857999360,1421448278613184519,846328208932798464,False,False,False,2021-07-31
3,tweet,1421621542480404481,en,Them: Vaccines are safe!\n\nMe: But VAERS repo...,2021-07-31 23:59:22+00:00,94864837,1421621542480404481,94864837,True,False,False,2021-07-31
4,replied_to,1421621536805449729,en,@Bowen_Thaylin @MandalaDanvers @Politics_1138 ...,2021-07-31 23:59:21+00:00,1269405809001197570,1421621082113589250,151510218,False,False,False,2021-07-31
...,...,...,...,...,...,...,...,...,...,...,...,...
730788,replied_to,1344811013518966784,en,@iamgregk @latimes The VAERS Database of react...,2021-01-01 01:01:45+00:00,92691651,1344805979594997760,46448402,False,False,False,2021-01-01
730789,replied_to,1344810463683461120,en,@latimes Check the VAERS for COVID-19 reaction...,2021-01-01 00:59:34+00:00,92691651,1344673687253893133,16664681,False,False,False,2021-01-01
730790,replied_to,1344809796332097536,en,@Harrrybel @latimes Please take care of yourse...,2021-01-01 00:56:55+00:00,44425430,1344755814356152321,,False,False,False,2021-01-01
730791,replied_to,1344809262091038726,en,@Reuters From #CoronaVaccine Check out #CDC #F...,2021-01-01 00:54:48+00:00,204176104,1344806908323131398,1652541,True,False,True,2021-01-01


__Vaccine Misinfo keywords__

In [14]:
# One keyword per line, no header row.
avaxkeywords = pd.read_csv("../../data/Master-Data/Twitter/avax-keywords.csv",header=None,names=["keyword"])
# Single capture group holding an alternation of all keywords, matched case-insensitively.
# Each keyword is escaped so it is matched as literal text even if it contains regex
# metacharacters. NOTE(review): assumes the CSV holds plain keywords, not regex patterns.
avax_dictionary = re.compile(
    "(" + "|".join(re.escape(keyword) for keyword in avaxkeywords.keyword.to_list()) + ")",
    flags=re.IGNORECASE,
)

In [15]:
# Extract every keyword occurrence from the tweet text: one row per match, indexed by
# (original row label, match number), with the matched text in column 0.
extracteddf = mistextdf["nodeText"].astype(str).str.extractall(avax_dictionary)
extracteddf

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
24,0,unvaccinated
34,0,unvaccinated
190,0,vaxxed
264,0,unvaccinated
462,0,unvaccinated
...,...,...
730462,0,vaxxed
730479,0,vaxxed
730567,0,LearnTheRisk
730781,0,VaxXed


In [16]:
# Attach the matched keyword (original casing, hence the "cs" suffix) to each tweet row.
# Tweets with several matches are repeated once per match; tweets without a match get NaN.
# Built from vaccinesdf - which mistextdf was copied from - so that re-running this cell
# does not fail with a column-overlap error on an already-joined mistextdf.
mistextdf = vaccinesdf.join(
    extracteddf.reset_index()
    .set_index("level_0")[[0]]
    .rename(columns={0: "avax_keywordcs"})
)

In [17]:
def to_lower(word):
    """Return ``word`` lower-cased when it is a string; return anything else unchanged.

    Non-string values (e.g. the NaN placeholders of unmatched rows) pass through as-is.
    """
    return word.lower() if isinstance(word, str) else word

In [18]:
# Case-normalised keyword so that e.g. "VaxXed" and "vaxxed" fall into the same group.
mistextdf["avax_keyword"] = mistextdf["avax_keywordcs"].map(to_lower)
mistextdf

Unnamed: 0,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,pfizer,astrazeneca,moderna,nodeDate,avax_keywordcs,avax_keyword
0,retweeted,1421621648843710465,ja,RT @Alzhacker: まずVAERSデータベースの信頼性の低さが指摘されています。そ...,2021-07-31 23:59:48+00:00,379173233,1421328886860705797,823123029748158464,False,False,False,2021-07-31,,
1,retweeted,1421621633291218944,en,RT @LuvmyCountry52: If anything good comes fro...,2021-07-31 23:59:44+00:00,818315474723282944,1421593064498401280,1060660195989299200,False,False,False,2021-07-31,,
2,retweeted,1421621557051281410,en,RT @freethought202: From latest VAERS 🇺🇸 : The...,2021-07-31 23:59:26+00:00,1374150364857999360,1421448278613184519,846328208932798464,False,False,False,2021-07-31,,
3,tweet,1421621542480404481,en,Them: Vaccines are safe!\n\nMe: But VAERS repo...,2021-07-31 23:59:22+00:00,94864837,1421621542480404481,94864837,True,False,False,2021-07-31,,
4,replied_to,1421621536805449729,en,@Bowen_Thaylin @MandalaDanvers @Politics_1138 ...,2021-07-31 23:59:21+00:00,1269405809001197570,1421621082113589250,151510218,False,False,False,2021-07-31,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730788,replied_to,1344811013518966784,en,@iamgregk @latimes The VAERS Database of react...,2021-01-01 01:01:45+00:00,92691651,1344805979594997760,46448402,False,False,False,2021-01-01,,
730789,replied_to,1344810463683461120,en,@latimes Check the VAERS for COVID-19 reaction...,2021-01-01 00:59:34+00:00,92691651,1344673687253893133,16664681,False,False,False,2021-01-01,,
730790,replied_to,1344809796332097536,en,@Harrrybel @latimes Please take care of yourse...,2021-01-01 00:56:55+00:00,44425430,1344755814356152321,,False,False,False,2021-01-01,,
730791,replied_to,1344809262091038726,en,@Reuters From #CoronaVaccine Check out #CDC #F...,2021-01-01 00:54:48+00:00,204176104,1344806908323131398,1652541,True,False,True,2021-01-01,,


In [30]:
import plotly.express as px

# Unique counts per keyword, most-tweeted keyword first.
vaxgp = (
    mistextdf.loc[mistextdf["avax_keyword"].notna()]
    .groupby("avax_keyword")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)

vaxgp

"Number of Tweets, and authors for vaccine hesitancy keywords"

# Plot only keywords found in more than 5 unique tweets; bar colour encodes unique authors.
fig = (
    px.bar(
        vaxgp[vaxgp["nodeID"] > 5].rename(columns={"nodeID": "tweets", "nodeUserID": "authors"}),
        x="avax_keyword",
        y="tweets",
        color="authors",
    )
    .update_layout(
        legend={"yanchor": "top", "y": 1, "xanchor": "right", "x": 1, "bgcolor": "rgba(0,0,0,0)"},
        legend_title="Legend",
    )
    .update_xaxes(title_text="Vaccine hesitancy keywords")
    .update_yaxes(title_text="Unique Tweets")
)
fig.show()

Unnamed: 0,avax_keyword,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,pfizer,astrazeneca,moderna,nodeDate,avax_keywordcs
0,vaxxed,4,2168,7,1291,2163,1726,1224,954,2,1,2,174,5
1,informedconsent,4,714,6,445,713,425,413,248,2,2,2,153,5
2,unvaccinated,4,696,3,529,694,565,507,437,2,2,2,116,5
3,novaccine,4,416,9,309,416,257,284,204,2,2,2,123,5
4,vaccineinjuries,4,299,3,65,299,274,55,46,2,2,2,57,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,notomandatoryvaccines,1,1,1,1,1,1,1,1,1,1,1,1,1
27,vaccinefraud,1,1,1,1,1,1,1,1,1,1,1,1,1
28,exposebillgates,1,1,1,1,1,1,1,1,1,1,1,1,1
29,covidvaccineispoison,1,1,1,1,1,1,1,1,1,1,1,1,1


'Number of Tweets, and authors for vaccine hesitancy keywords'

In [31]:
import plotly.express as px

# Same keyword breakdown, restricted to tweets that mention Pfizer/BioNTech.
vaxgppf = (
    mistextdf.loc[mistextdf["avax_keyword"].notna() & mistextdf["pfizer"]]
    .groupby("avax_keyword")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)
vaxgppf

"Number of Tweets, and authors for vaccine hesitancy keywords (For Pfizer)"

fig = (
    px.bar(
        vaxgppf.rename(columns={"nodeID": "tweets", "nodeUserID": "authors"}),
        x="avax_keyword",
        y="tweets",
        color="authors",
    )
    .update_layout(legend_title="Legend")
    .update_xaxes(title_text="Vaccine hesitancy keywords")
    .update_yaxes(title_text="Unique Tweets")
)
fig.show()

Unnamed: 0,avax_keyword,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,pfizer,astrazeneca,moderna,nodeDate,avax_keywordcs
0,informedconsent,4,30,2,30,30,19,30,20,1,2,2,26,3
1,vaxxed,4,25,3,25,25,25,25,21,1,1,2,22,3
2,arrestbillgates,3,15,2,14,15,3,12,4,1,1,2,7,1
3,novaccine,3,11,2,11,11,10,11,10,1,2,2,10,2
4,vaccineinjury,4,11,3,10,11,8,9,6,1,2,2,10,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,vaccineinjuries,2,7,1,5,7,7,4,4,1,2,2,5,2
8,antivaccine,2,3,1,3,3,1,2,2,1,1,1,1,1
9,vaccineskill,1,2,1,2,2,2,2,2,1,2,2,2,2
10,exposebillgates,1,1,1,1,1,1,1,1,1,1,1,1,1


'Number of Tweets, and authors for vaccine hesitancy keywords (For Pfizer)'

In [32]:
import plotly.express as px

# Same keyword breakdown, restricted to tweets that mention Moderna.
vaxgpmd = (
    mistextdf.loc[mistextdf["avax_keyword"].notna() & mistextdf["moderna"]]
    .groupby("avax_keyword")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)
vaxgpmd

"Number of Tweets, and authors for vaccine hesitancy keywords (For Moderna)"

fig = (
    px.bar(
        vaxgpmd.rename(columns={"nodeID": "tweets", "nodeUserID": "authors"}),
        x="avax_keyword",
        y="tweets",
        color="authors",
    )
    .update_layout(legend_title="Legend")
    .update_xaxes(title_text="Vaccine hesitancy keywords")
    .update_yaxes(title_text="Unique Tweets")
)
fig.show()

Unnamed: 0,avax_keyword,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,pfizer,astrazeneca,moderna,nodeDate,avax_keywordcs
0,informedconsent,4,18,2,18,18,9,18,10,2,2,1,15,1
1,vaxxed,3,14,2,12,14,14,11,10,2,1,1,11,2
2,arrestbillgates,2,12,2,12,12,2,12,4,1,1,1,5,1
3,vaccineinjury,4,9,3,8,9,7,7,5,2,2,1,8,4
4,novaccine,1,5,1,5,5,4,5,4,1,2,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,learntherisk,1,2,1,2,2,2,2,2,2,1,1,2,2
8,unvaccinated,1,2,1,2,2,2,2,1,1,1,1,2,1
9,vaccineinjuries,1,2,1,2,2,2,2,2,1,1,1,2,2
10,vaccineskill,1,2,2,2,2,2,2,2,2,2,1,2,1


'Number of Tweets, and authors for vaccine hesitancy keywords (For Moderna)'

In [34]:
import plotly.express as px

# Same keyword breakdown, restricted to tweets that mention AstraZeneca.
vaxgpaz = (
    mistextdf.loc[mistextdf["avax_keyword"].notna() & mistextdf["astrazeneca"]]
    .groupby("avax_keyword")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)
vaxgpaz

"Number of Tweets, and authors for vaccine hesitancy keywords (For AstraZeneca)"

fig = (
    px.bar(
        vaxgpaz.rename(columns={"nodeID": "tweets", "nodeUserID": "authors"}),
        x="avax_keyword",
        y="tweets",
        color="authors",
    )
    .update_layout(legend_title="Legend")
    .update_xaxes(title_text="Vaccine hesitancy keywords")
    .update_yaxes(title_text="Unique Tweets")
)
fig.show()

Unnamed: 0,avax_keyword,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,pfizer,astrazeneca,moderna,nodeDate,avax_keywordcs
0,arrestbillgates,3,15,2,14,15,3,12,4,1,1,2,7,1
1,vaccineinjury,2,14,1,4,14,12,3,2,2,1,2,5,2
2,novaccine,3,7,1,7,7,6,7,6,2,1,2,6,1
3,informedconsent,2,5,1,5,5,3,5,4,1,1,1,5,1
4,vaccineinjuries,2,5,1,3,5,5,2,2,1,1,2,3,2
5,vaccineskill,1,2,2,2,2,2,2,2,2,1,2,2,2
6,depopulation,1,1,1,1,1,1,1,1,1,1,1,1,1
7,unvaccinated,1,1,1,1,1,1,1,1,1,1,1,1,1
8,vaccineagenda,1,1,1,1,1,1,1,1,1,1,1,1,1


'Number of Tweets, and authors for vaccine hesitancy keywords (For AstraZeneca)'

In [None]:
# Unique counts among keyword-matching tweets, split by whether each vaccine is mentioned.
(
    mistextdf.loc[mistextdf["avax_keyword"].notna()]
    .groupby("pfizer")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)

(
    mistextdf.loc[mistextdf["avax_keyword"].notna()]
    .groupby("moderna")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)

(
    mistextdf.loc[mistextdf["avax_keyword"].notna()]
    .groupby("astrazeneca")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)

In [None]:
import plotly.graph_objects as go
Vaccines=['Pfizer', 'Moderna', 'AstraZeneca']

# Compute the bar heights from the data instead of hard-coding numbers copied from an
# earlier run, so the chart cannot drift out of sync with mistextdf.
vaccine_flag_columns = ["pfizer", "moderna", "astrazeneca"]  # same order as Vaccines
keyword_tweets = mistextdf[mistextdf["avax_keyword"].notna()]
vaccine_counts = [
    keyword_tweets.loc[keyword_tweets[flag], ["nodeID", "nodeUserID", "avax_keyword"]].nunique()
    for flag in vaccine_flag_columns
]

fig = go.Figure(data=[
    go.Bar(name='Tweets', x=Vaccines, y=[int(count["nodeID"]) for count in vaccine_counts]),
    go.Bar(name='Authors', x=Vaccines, y=[int(count["nodeUserID"]) for count in vaccine_counts]),
    go.Bar(name='Keywords', x=Vaccines, y=[int(count["avax_keyword"]) for count in vaccine_counts])
])


fig = fig.update_layout(barmode='group',
                        title_text="Number of Tweets, authors and vaccine hesitancy keywords for vaccines",
                        legend_title="Legend",height=600)
fig = fig.update_yaxes(title_text='Unique Count')
fig.show()

## Identifying AstraZeneca mentioned URLs

### Checking the Tweets mentioning AstraZeneca

In [35]:
import re

# Case-insensitive pattern with a single capture group holding the matched term.
# NOTE(review): besides AstraZeneca names it also matches generic terms such as
# "COVID" and "vaccine", so a match does not imply AstraZeneca is mentioned - confirm intent.
az_dictionary = re.compile(
    r"(?i)(AstraZeneca|Astra Zeneca|AZD1222|COVID|vaccine|immunity|herd immunity|Barrington|focused protection)",
    re.IGNORECASE,
)

In [36]:
# Add the calendar date of each tweet (taken from nodeTime) and summarise the table.
texturlmerged["nodeDate"] = texturlmerged["nodeTime"].dt.date
texturlmerged.describe()





Unnamed: 0,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,expanded_url,active,expanded_domain,nodeDate
count,175122,175122.0,175122,175122,175122,175122.0,175122.0,170818.0,175122,175122,175122,175122,175122,175122
unique,4,143204.0,44,93799,141828,60769.0,87977.0,44647.0,15384,2977,13613,2,2967,212
top,replied_to,1.4045663886735524e+18,en,RT @RobertKennedyJr: Here's the latest CDC VAE...,2021-06-14 22:28:17+00:00,1.0991771436013772e+18,1.401199884997382e+18,337808606.0,https://www.openvaers.com/covid-data,childrenshealthdefense.org,https://openvaers.com/index.php,True,childrenshealthdefense.org,2021-06-13
freq,66622,20.0,106507,2510,20,1144.0,2522.0,12511.0,6770,38784,8730,173159,44644,3669
first,,,,,2021-01-01 00:59:34+00:00,,,,,,,,,
last,,,,,2021-07-31 23:57:49+00:00,,,,,,,,,


In [37]:
# Extract every az_dictionary match from the tweet text: one row per match, indexed by
# (original row label, match number), with the matched text in column 0.
azextracted = texturlmerged["nodeText"].astype(str).str.extractall(az_dictionary)
azextracted

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
2,0,Vaccine
7,0,covid
9,0,Vaccine
19,0,COVID
19,1,vaccine
...,...,...
175118,2,Vaccine
175119,0,COVID
175119,1,vaccine
175119,2,Vaccine


In [38]:
# Attach each matched term (original casing) to its tweet/URL row; rows with several
# matches are repeated once per match, rows without a match get NaN.
txturldf = texturlmerged.join(
    azextracted.reset_index()
    .set_index("level_0")[[0]]
    .rename(columns={0: "az_keywordcs"})
)
txturldf

Unnamed: 0,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,expanded_url,active,expanded_domain,nodeDate,az_keywordcs
0,replied_to,1421621149683920898,en,@kmbnrb09 @CreamedCornACob @ClapdVikesBurn You...,2021-07-31 23:57:49+00:00,1262800414979883015,1421620549453631490,292779226,https://www.google.com/amp/s/mobile.reuters.co...,google.com,https://www.reuters.com/article/factcheck-vaer...,True,www.reuters.com,2021-07-31,
1,replied_to,1421621149683920898,en,@kmbnrb09 @CreamedCornACob @ClapdVikesBurn You...,2021-07-31 23:57:49+00:00,1262800414979883015,1421620549453631490,292779226,https://www.google.com/amp/s/mobile.reuters.co...,google.com,https://www.reuters.com/article/factcheck-vaer...,True,www.reuters.com,2021-07-31,
2,retweeted,1421621035154038788,ja,RT @seymour172: https://t.co/EOFwVtbgqRより\n日本語...,2021-07-31 23:57:21+00:00,130808380,1421616692644352003,1409050188346322949,https://t.me/Hyer971/4320,t.me,https://t.me/Hyer971/4320,True,t.me,2021-07-31,Vaccine
3,retweeted,1421619416840785920,und,RT @SteredL: @ivanrioufol @JVerier https://t.c...,2021-07-31 23:50:56+00:00,472249581,1421614670851547138,1241796022801760256,https://childrenshealthdefense.org/defender/va...,childrenshealthdefense.org,https://childrenshealthdefense.org/defender/va...,True,childrenshealthdefense.org,2021-07-31,
4,retweeted,1421618874382946311,en,RT @DuskyPat: @RealJoelSmalley Dr Anne McClosk...,2021-07-31 23:48:46+00:00,719820444526841861,1421426957892440071,1417700662662598656,http://fb.watch/v/2SY9meKNx/,fb.watch,https://www.facebook.com/login/?next=https%3A%...,True,www.facebook.com,2021-07-31,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175119,tweet,1344921371814588417,en,READ: Adverse Events reported after COVID-19 v...,2021-01-01 08:20:17+00:00,1245976226570702849,1344921371814588417,1245976226570702849,https://ift.tt/2WYTD3M,ift.tt,https://sharylattkisson.com/2021/01/read-adver...,True,sharylattkisson.com,2021-01-01,COVID
175119,tweet,1344921371814588417,en,READ: Adverse Events reported after COVID-19 v...,2021-01-01 08:20:17+00:00,1245976226570702849,1344921371814588417,1245976226570702849,https://ift.tt/2WYTD3M,ift.tt,https://sharylattkisson.com/2021/01/read-adver...,True,sharylattkisson.com,2021-01-01,vaccine
175119,tweet,1344921371814588417,en,READ: Adverse Events reported after COVID-19 v...,2021-01-01 08:20:17+00:00,1245976226570702849,1344921371814588417,1245976226570702849,https://ift.tt/2WYTD3M,ift.tt,https://sharylattkisson.com/2021/01/read-adver...,True,sharylattkisson.com,2021-01-01,Vaccine
175120,replied_to,1344811013518966784,en,@iamgregk @latimes The VAERS Database of react...,2021-01-01 01:01:45+00:00,92691651,1344805979594997760,46448402,https://www.medalerts.org/vaersdb/findfield.ph...,medalerts.org,https://www.medalerts.org/vaersdb/findfield.ph...,True,www.medalerts.org,2021-01-01,


In [39]:
# Case-normalised term, then summaries of all rows and of the rows that matched a term.
txturldf["az_keyword"] = txturldf["az_keywordcs"].map(to_lower)
txturldf.describe()
txturldf.loc[txturldf["az_keyword"].notna()].describe()





Unnamed: 0,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,expanded_url,active,expanded_domain,nodeDate,az_keywordcs,az_keyword
count,217003,217003.0,217003,217003,217003,217003.0,217003.0,211642.0,217003,217003,217003,217003,217003,217003,99330,99330
unique,4,143204.0,44,93799,141828,60769.0,87977.0,44647.0,15384,2977,13613,2,2967,212,34,10
top,replied_to,1.4127761090458092e+18,en,RT @RobertKennedyJr: Here's the latest CDC VAE...,2021-07-07 14:10:47+00:00,1.3828635230347223e+18,1.401199884997382e+18,337808606.0,https://www.openvaers.com/covid-data,childrenshealthdefense.org,https://openvaers.com/index.php,True,childrenshealthdefense.org,2021-07-17,vaccine,vaccine
freq,84912,55.0,145769,2510,55,2543.0,2522.0,12592.0,8323,47441,10581,214704,53893,4262,28308,55409
first,,,,,2021-01-01 00:59:34+00:00,,,,,,,,,,,
last,,,,,2021-07-31 23:57:49+00:00,,,,,,,,,,,






Unnamed: 0,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,expanded_url,active,expanded_domain,nodeDate,az_keywordcs,az_keyword
count,99330,99330.0,99330,99330,99330,99330.0,99330.0,96757.0,99330,99330,99330,99330,99330,99330,99330,99330
unique,4,46782.0,39,37472,46606,23013.0,35926.0,20410.0,8491,1805,7532,2,1801,212,34,10
top,replied_to,1.4127761090458092e+18,en,RT @heather_parisi: #VAERS by https://t.co/54a...,2021-07-07 14:10:47+00:00,1.3828635230347223e+18,1.4057863932011807e+18,9.429031202243256e+17,https://www.openvaers.com/covid-data,childrenshealthdefense.org,https://vaers.hhs.gov/,True,childrenshealthdefense.org,2021-07-17,vaccine,vaccine
freq,41905,55.0,86356,717,55,2543.0,720.0,1538.0,3653,17753,4373,98408,18932,2227,28308,55409
first,,,,,2021-01-01 00:59:34+00:00,,,,,,,,,,,
last,,,,,2021-07-31 23:57:21+00:00,,,,,,,,,,,


In [45]:
import plotly.express as px

# Unique counts per expanded URL among rows that matched a dictionary term,
# most-tweeted URL first.
azgp = (
    txturldf[txturldf["az_keyword"].notna()]
    .groupby(["expanded_url"])
    .nunique()
    .sort_values(["nodeID"], ascending=False)
    .reset_index()
)
azgp

"Number of Tweets, and authors for URLs (AstraZeneca mentioned)"

# Only URLs shared in more than 200 unique tweets are plotted.
azgp_plotted = azgp[azgp["nodeID"] > 200].rename(columns=dict(nodeID="tweets", nodeUserID="authors"))
fig = px.bar(azgp_plotted, x="expanded_url", y="tweets", color="authors", height=800)
fig = fig.update_layout(legend_title="Legend",legend=dict(
    yanchor="top",
    y=1,
    xanchor="right",
    x=1,
    bgcolor='rgba(0,0,0,0)',
))
# Shorten the tick labels to 70 characters. The tick arrays are built from the plotted
# rows only, instead of from every row of azgp, so the figure carries one label per bar.
fig = fig.update_xaxes(title_text='URLs',
                       tickmode="array",
                       tickvals=list(range(len(azgp_plotted))),
                       ticktext=azgp_plotted['expanded_url'].str.slice(0,70).tolist())
fig = fig.update_yaxes(title_text='Unique Tweets')
fig.show()

Unnamed: 0,expanded_url,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,active,expanded_domain,nodeDate,az_keywordcs,az_keyword
0,https://vaers.hhs.gov/,4,1889,20,1472,1889,1430,1420,1191,29,5,1,1,207,13,6
1,https://openvaers.com/index.php,4,1635,21,1479,1635,809,1464,1033,37,4,1,1,174,12,5
2,https://wonder.cdc.gov/vaers.html,4,1264,21,1038,1264,768,1013,815,7,3,1,1,197,15,7
3,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,4,1250,20,1207,1250,678,1194,918,12,6,1,1,163,7,3
4,https://www.thegatewaypundit.com/2021/06/shock...,3,1226,6,508,1221,1145,433,409,1,1,1,1,19,6,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7527,https://uncutnews.ch/vater-mein-sohn-wurde-von...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7528,https://uncutnews.ch/usa-impfverletzungen-und-...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7529,https://uncutnews.ch/impftodesfaelle-in-den-us...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7530,https://cassandravoices.com/science-environmen...,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2


'Number of Tweets, and authors for URLs (AstraZeneca mentioned)'

In [73]:
import plotly.express as px

# Unique counts per matched term, most-tweeted term first.
azkeygp = (
    txturldf.loc[txturldf["az_keyword"].notna()]
    .groupby("az_keyword")
    .nunique()
    .sort_values(by="nodeID", ascending=False)
    .reset_index()
)
azkeygp

"Number of Tweets, authors, and URLs for AstraZeneca keywords"

# Plot only terms found in more than 7 unique tweets, as grouped bars.
fig = (
    px.bar(
        azkeygp[azkeygp["nodeID"] > 7].rename(columns={"nodeID": "tweets", "nodeUserID": "authors"}),
        x="az_keyword",
        y=["tweets", "authors", "url"],
        barmode="group",
    )
    .update_layout(legend_title="Legend")
    .update_xaxes(title_text="AstraZeneca keywords")
    .update_yaxes(title_text="Unique Count")
)
fig.show()

Unnamed: 0,az_keyword,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,expanded_url,active,expanded_domain,nodeDate,az_keywordcs
0,vaccine,4,35409,34,28932,35290,17839,27814,16248,6709,1452,5960,2,1441,212,6
1,covid,4,28499,36,22925,28413,14362,22223,12837,5571,1308,4933,2,1307,212,10
2,astrazeneca,4,486,10,445,486,205,448,326,174,99,157,2,94,118,4
3,immunity,4,211,3,197,211,139,194,158,119,71,112,2,70,103,3
4,astra zeneca,2,27,2,26,27,20,27,22,17,16,17,2,15,20,4
5,herd immunity,3,23,1,23,23,18,23,21,18,16,18,2,16,18,3
6,barrington,2,8,1,8,8,7,8,7,7,6,7,1,6,6,1
7,azd1222,1,2,2,2,2,2,2,2,2,2,2,1,2,2,1
8,covi̇d,1,2,1,2,2,2,2,2,2,2,2,1,2,2,1
9,vaccıne,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


'Number of Tweets, authors, and URLs for AstraZeneca keywords'

In [79]:
# Rows without an AstraZeneca keyword (az_keyword is missing): count the
# distinct values of every column per expanded URL and rank the URLs by the
# number of unique tweets (nodeID), highest first.
nazgp = txturldf[txturldf.az_keyword.isna()] \
    .groupby(["expanded_url"]).nunique().sort_values(["nodeID"],ascending=False).reset_index()
nazgp

"Number of Tweets, and authors for URLs (AstraZeneca not mentioned)"

import plotly.express as px

# Plot only URLs that appear in more than 500 unique tweets. nazgp is sorted
# by nodeID (descending) with a fresh 0..n-1 index, so the selected rows are
# its leading rows and keep positions 0..len(nazgp_top)-1.
nazgp_top = nazgp[nazgp["nodeID"]>500].rename(columns=dict(nodeID="tweets",nodeUserID="authors"))
fig = px.bar(nazgp_top,
              x="expanded_url",y="tweets",color="authors",height=800)
fig = fig.update_layout(legend_title="Legend",legend=dict(
    yanchor="top",
    y=1,
    xanchor="right",
    x=1,
    bgcolor='rgba(0,0,0,0)',
))
# Label every bar with its URL truncated to 70 characters. The tick arrays are
# built from the plotted rows only; building them from all of nazgp would embed
# one label per URL in the figure even though only a few bars are drawn.
fig = fig.update_xaxes(title_text='URLs',ticktext=nazgp_top['expanded_url'].str.slice(0,70).tolist(),
                       tickmode="array",
                       tickvals=list(range(len(nazgp_top))))
fig = fig.update_yaxes(title_text='Unique Tweets')
fig.show()

Unnamed: 0,expanded_url,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,active,expanded_domain,nodeDate,az_keywordcs,az_keyword
0,https://openvaers.com/index.php,4,4083,27,1844,4072,3074,1757,1374,58,4,1,1,181,0,0
1,https://vaers.hhs.gov/,4,4073,32,3140,4068,2606,3421,2527,22,2,1,1,209,0,0
2,https://wonder.cdc.gov/vaers.html,4,4059,26,1407,4041,3355,1382,1091,6,1,1,1,209,0,0
3,https://childrenshealthdefense.org/defender/va...,4,3416,19,690,3357,3117,640,554,3,1,1,1,41,0,0
4,https://childrenshealthdefense.org/defender/cd...,4,3247,18,801,3172,2855,774,642,1,1,1,1,28,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8053,https://rumble.com/vg4inv-michael-yeadon-full-...,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0
8054,https://rumble.com/vfn3vp-vaers-injuries-dismi...,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0
8055,https://rumble.com/vfhko9-dr.-gold-awaken.html,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0
8056,https://rumble.com/vfgsf1-2-uk-speakers-on-cov...,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0


'Number of Tweets, and authors for URLs (AstraZeneca not mentioned)'

## Identifying URL-sharing user behaviour

In [90]:
import plotly.express as px

# Per nodeDate, for rows whose az_keyword is set: number of distinct values in
# every other column, ordered by date ascending.
az_rows = txturldf[txturldf["az_keyword"].notna()]
azgptime = (
    az_rows
    .groupby(["nodeDate"])
    .nunique()
    .sort_values(["nodeDate"], ascending=True)
    .reset_index()
)
azgptime

"Number of Tweets and URLs over time (AstraZeneca mentioned)"

# Two series over time: unique tweets (nodeID, shown as "tweets") and unique
# values of the url column.
daily_counts = azgptime.rename(columns={"nodeID": "tweets"})
fig = (
    px.line(daily_counts, x="nodeDate", y=["tweets", "url"])
    .update_layout(legend_title="Legend")
    .update_xaxes(title_text="Date")
    .update_yaxes(title_text="Unique Count")
)
fig.show()

Unnamed: 0,nodeDate,actionType,nodeID,nodeLang,nodeText,nodeTime,nodeUserID,parentID,parentUserID,url,domain,expanded_url,active,expanded_domain,az_keywordcs,az_keyword
0,2021-01-01,2,6,2,6,6,6,6,6,5,5,5,1,5,4,2
1,2021-01-02,3,121,2,36,121,107,31,22,17,12,16,2,12,6,2
2,2021-01-03,3,24,2,22,24,20,22,19,15,8,14,1,8,6,2
3,2021-01-04,3,28,6,28,28,19,28,24,20,6,15,1,5,6,2
4,2021-01-05,2,51,8,49,51,25,51,46,24,14,24,2,13,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,2021-07-27,4,331,13,279,331,227,268,234,147,79,138,2,77,8,3
208,2021-07-28,4,254,10,249,253,190,238,218,133,82,125,2,83,9,4
209,2021-07-29,4,415,10,390,411,353,378,356,142,80,131,2,75,10,4
210,2021-07-30,4,551,14,421,545,395,410,367,167,92,149,2,87,8,3


'Number of Tweets and URLs over time (AstraZeneca mentioned)'