# Twitter Dataset Analysis

In [1]:
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 0)
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import jsonlines
import json
import re

with jsonlines.open('../../data/Master-Data/Twitter/vaers_2021_01_01_to_2021_08_01.json') as reader:
    twts_list = list()
    for obj in reader:
        for o in obj["data"]:
            try:
                twt = dict(text=o["text"],
                           created_at=o["created_at"],
                           author_id=o["author_id"],
                           twtid=o["id"],
                           urls=[url.get("expanded_url","") for url in o.get("entities",{}).get("urls",[])])
                twts_list.append(twt)
            except Exception:
                pass

In [12]:
df = pd.DataFrame(twts_list)
df

Unnamed: 0,text,created_at,author_id,twtid,urls
0,RT @Alzhacker: まずVAERSデータベースの信頼性の低さが指摘されています。そ...,2021-07-31T23:59:48.000Z,379173233,1421621648843710465,[]
1,RT @LuvmyCountry52: If anything good comes fro...,2021-07-31T23:59:44.000Z,818315474723282944,1421621633291218944,[]
2,RT @freethought202: From latest VAERS 🇺🇸 : The...,2021-07-31T23:59:26.000Z,1374150364857999360,1421621557051281410,[]
3,Them: Vaccines are safe!\n\nMe: But VAERS repo...,2021-07-31T23:59:22.000Z,94864837,1421621542480404481,[https://twitter.com/PressDoYourJob/status/142...
4,@Bowen_Thaylin @MandalaDanvers @Politics_1138 ...,2021-07-31T23:59:21.000Z,1269405809001197570,1421621536805449729,[]
...,...,...,...,...,...
727143,@iamgregk @latimes The VAERS Database of react...,2021-01-01T01:01:45.000Z,92691651,1344811013518966784,[https://www.medalerts.org/vaersdb/findfield.p...
727144,@latimes Check the VAERS for COVID-19 reaction...,2021-01-01T00:59:34.000Z,92691651,1344810463683461120,[https://www.medalerts.org/vaersdb/findfield.p...
727145,@Harrrybel @latimes Please take care of yourse...,2021-01-01T00:56:55.000Z,44425430,1344809796332097536,[]
727146,@Reuters From #CoronaVaccine Check out #CDC #F...,2021-01-01T00:54:48.000Z,204176104,1344809262091038726,[]


In [13]:
df.describe()

Unnamed: 0,text,created_at,author_id,twtid,urls
count,727148,727148,727148,727148,727148
unique,288261,696894,201744,727148,57898
top,RT @AlexBerenson: 1/ URGENT.\n\n@CDCgov has qu...,2021-07-21T11:59:28.000Z,1033930793892995073,1381217692489760771,[]
freq,4839,6,1273,1,529102


__Total Tweets in the original dataset: 727,148__

In [14]:
df_with_urls = df[df["urls"].apply(lambda x: len(x)>0)].reset_index(drop=True)
df_with_urls

Unnamed: 0,text,created_at,author_id,twtid,urls
0,Them: Vaccines are safe!\n\nMe: But VAERS repo...,2021-07-31T23:59:22.000Z,94864837,1421621542480404481,[https://twitter.com/PressDoYourJob/status/142...
1,RT @enamabula: https://t.co/vVzYZZdfhK\nU SAD ...,2021-07-31T23:58:37.000Z,3072191633,1421621352721788932,[https://twitter.com/val_ic/status/14215658979...
2,@kmbnrb09 @CreamedCornACob @ClapdVikesBurn You...,2021-07-31T23:57:49.000Z,1262800414979883015,1421621149683920898,[https://www.google.com/amp/s/mobile.reuters.c...
3,RT @seymour172: https://t.co/EOFwVtbgqRより\n日本語...,2021-07-31T23:57:21.000Z,130808380,1421621035154038788,[https://t.me/Hyer971/4320]
4,@jonjasniak @pickle_rick6 @coleph14 @BNODesk V...,2021-07-31T23:52:40.000Z,19253536,1421619854981828610,[https://twitter.com/AFTunion/status/142155199...
...,...,...,...,...,...
198041,READ: Adverse Events reported after COVID-19 v...,2021-01-01T08:31:34.000Z,16141591,1344924211677179904,[https://ift.tt/2WYTD3M]
198042,READ: Adverse Events reported after COVID-19 v...,2021-01-01T08:20:17.000Z,1245976226570702849,1344921371814588417,[https://ift.tt/2WYTD3M]
198043,Pathetic! When has there ever been such #Prop...,2021-01-01T04:57:29.000Z,380727130,1344870337071226882,[https://twitter.com/critica18495985/status/13...
198044,@iamgregk @latimes The VAERS Database of react...,2021-01-01T01:01:45.000Z,92691651,1344811013518966784,[https://www.medalerts.org/vaersdb/findfield.p...


__Total Tweets with URLs in the dataset: 198,046__

#### Obtaining URL Feature Vector for all the Tweets

In [188]:
allurldf = df_with_urls["urls"].apply(pd.Series) \
    .merge(df_with_urls, right_index = True, left_index = True) \
    .drop(["urls"], axis = 1) \
    .melt(id_vars = ['text','created_at','author_id','twtid'], value_name = "url") \
    .drop("variable", axis = 1) \
    .dropna(subset=["url"])

In [190]:
allurldf.describe()

Unnamed: 0,text,created_at,author_id,twtid,url
count,240475,240475,240475,240475,240475
unique,132420,195717,74718,198046,60181
top,RT @RobertKennedyJr: Here's the latest CDC VAE...,2021-06-14T02:31:46.000Z,1099177143601377280,1412503332346810371,https://wonder.cdc.gov/vaers.html
freq,2510,12,1141,11,5380


__Total URLs extracted: 240,475__

__Total unique URLs extracted: 60,181__

We need to remove all the URLs which are from Twitter domain

In [191]:
allurldffiltered = allurldf[~allurldf["url"].str.contains("https://twitter.com/")].reset_index(drop=True)
allurldffiltered

Unnamed: 0,text,created_at,author_id,twtid,url
0,@kmbnrb09 @CreamedCornACob @ClapdVikesBurn You...,2021-07-31T23:57:49.000Z,1262800414979883015,1421621149683920898,https://www.google.com/amp/s/mobile.reuters.co...
1,RT @seymour172: https://t.co/EOFwVtbgqRより\n日本語...,2021-07-31T23:57:21.000Z,130808380,1421621035154038788,https://t.me/Hyer971/4320
2,RT @SteredL: @ivanrioufol @JVerier https://t.c...,2021-07-31T23:50:56.000Z,472249581,1421619416840785920,https://childrenshealthdefense.org/defender/va...
3,RT @DuskyPat: @RealJoelSmalley Dr Anne McClosk...,2021-07-31T23:48:46.000Z,719820444526841861,1421618874382946311,http://fb.watch/v/2SY9meKNx/
4,RT @AmericnCnsrvitv: I wonder how much longer ...,2021-07-31T23:43:55.000Z,1100488788,1421617651399331840,https://www.thegatewaypundit.com/2021/07/secon...
...,...,...,...,...,...
155044,@SenRonJohnson https://t.co/xpldhFhjXd\nhttps:...,2021-06-29T07:14:29.000Z,1313652341623582723,1409772243521998861,https://www.theburningplatform.com/2021/06/27/...
155045,@pdellin @InformedNJNurse https://t.co/xpldhFh...,2021-06-29T06:31:53.000Z,1313652341623582723,1409761521723342849,https://www.theburningplatform.com/2021/06/27/...
155046,@InformedNJNurse @7tine76 https://t.co/xpldhFh...,2021-06-29T06:29:30.000Z,1313652341623582723,1409760922868928512,https://www.theburningplatform.com/2021/06/27/...
155047,Aşalamadan sonra uygulanacak kılavuzlar\nhttps...,2021-05-21T08:25:27.000Z,1354210995506458628,1395656975338332161,https://www.cdc.gov/vaccines/covid-19/info-by-...


In [192]:
allurldffiltered.describe()

Unnamed: 0,text,created_at,author_id,twtid,url
count,155049,155049,155049,155049,155049
unique,94071,142172,60845,143551,17001
top,RT @RobertKennedyJr: Here's the latest CDC VAE...,2021-06-29T07:59:06.000Z,1174099964642746368,1409783592234848261,https://wonder.cdc.gov/vaers.html
freq,2510,10,1020,10,5380


__URL Feature Vector for all Tweets__

In [193]:
ftvectorall = allurldffiltered.groupby(["url"]).nunique().sort_values(["twtid"],ascending=False).reset_index()
ftvectorall

ftvectorall.to_csv("./all_url_feature_vector.csv",index=False)

Unnamed: 0,url,text,created_at,author_id,twtid
0,https://wonder.cdc.gov/vaers.html,2338,5244,4006,5264
1,https://www.openvaers.com/covid-data,3146,5242,2930,5253
2,https://childrenshealthdefense.org/defender/va...,947,3706,3379,3771
3,https://vaers.hhs.gov/,2784,3735,2444,3740
4,https://childrenshealthdefense.org/defender/cd...,1098,3620,3157,3702
...,...,...,...,...,...
16996,https://metro.co.uk/2020/12/18/covid-vaccine-v...,1,1,1,1
16997,https://metro.co.uk/2021/03/01/over-60s-to-sta...,1,1,1,1
16998,https://metro.co.uk/2021/04/15/us-records-5800...,1,1,1,1
16999,https://metro.co.uk/2021/06/03/mum-of-three-43...,1,1,1,1


### Filtering the AstraZeneca mentioned Tweets

#### AstraZeneca Dictionary

In [15]:
import re

compiled_dictionary = re.compile(r"(?i)(AstraZeneca|Astra Zeneca|AZD1222|COVID|vaccine|immunity|herd immunity|Barrington|focused protection)" , flags=re.IGNORECASE)

__Getting all the Tweets which mention AZ keywords__

In [16]:
dfresult = df['text'].astype(str).str.extractall(compiled_dictionary)
dfresult_unstacked = dfresult.notnull().groupby(level=0).sum().rename(columns={0:"matched"})
dfmatched_df = df.join(dfresult_unstacked)
dfmatched_df["matched"] = dfmatched_df["matched"].notnull()
dffinal_results = dfmatched_df[dfmatched_df["matched"]].reset_index(drop=True)
dffinal_results

Unnamed: 0,text,created_at,author_id,twtid,urls,matched
0,RT @Alzhacker: まずVAERSデータベースの信頼性の低さが指摘されています。そ...,2021-07-31T23:59:48.000Z,379173233,1421621648843710465,[],True
1,Them: Vaccines are safe!\n\nMe: But VAERS repo...,2021-07-31T23:59:22.000Z,94864837,1421621542480404481,[https://twitter.com/PressDoYourJob/status/142...,True
2,An associate of mine pointed out that all the ...,2021-07-31T23:59:17.000Z,1362793984892919811,1421621521366142980,[],True
3,RT @LaureGonlezamar: Mise à jour du VAERS amér...,2021-07-31T23:58:25.000Z,2972768459,1421621301454770183,[],True
4,@MandalaDanvers @voluntaryasmine @Politics_113...,2021-07-31T23:57:33.000Z,151510218,1421621082113589250,[],True
...,...,...,...,...,...,...
296964,RT @Shay73415949: @justiceforevee @FOX9 @kare1...,2021-01-01T03:02:12.000Z,2731105046,1344841322998366208,[],True
296965,@professorbe3 @IDHW @IDHouseCaucus @Idsenatede...,2021-01-01T02:20:38.000Z,1319285623258148867,1344830863763189762,[],True
296966,RT @Shay73415949: @justiceforevee @FOX9 @kare1...,2021-01-01T01:10:30.000Z,55650343,1344813216476618752,[],True
296967,@latimes Check the VAERS for COVID-19 reaction...,2021-01-01T00:59:34.000Z,92691651,1344810463683461120,[https://www.medalerts.org/vaersdb/findfield.p...,True


In [17]:
dffinal_results.describe()

Unnamed: 0,text,created_at,author_id,twtid,urls,matched
count,296969,296969,296969,296969,296969,296969
unique,109784,289158,115763,296969,24927,1
top,RT @AlexBerenson: 1/ URGENT.\n\n@CDCgov has qu...,2021-07-21T11:39:26.000Z,1378056160645877762,1404120364469395458,[],True
freq,4839,6,738,1,233733,296969


__Total Tweets where AstraZeneca mentioned: 296,969__

In [111]:
result = df_with_urls['text'].astype(str).str.extractall(compiled_dictionary)
result_unstacked = result.notnull().groupby(level=0).sum().rename(columns={0:"matched"})
matched_df = df_with_urls.join(result_unstacked)
matched_df["matched"] = matched_df["matched"].notnull()
final_results = matched_df[matched_df["matched"]].reset_index(drop=True)
final_results

Unnamed: 0,text,created_at,author_id,twtid,urls,matched
0,Them: Vaccines are safe!\n\nMe: But VAERS repo...,2021-07-31T23:59:22.000Z,94864837,1421621542480404481,[https://twitter.com/PressDoYourJob/status/142...,True
1,RT @seymour172: https://t.co/EOFwVtbgqRより\n日本語...,2021-07-31T23:57:21.000Z,130808380,1421621035154038788,[https://t.me/Hyer971/4320],True
2,@NicoleJacksonMD @AmyOxentenkoMD @CDCgov @Futu...,2021-07-31T23:51:02.000Z,945824381103628288,1421619443776430082,[https://twitter.com/Drwealth54/status/1421619...,True
3,"If I were a betting person, Id bet the vaccina...",2021-07-31T23:41:44.000Z,785902212480315392,1421617105363996672,[https://childrenshealthdefense.org/defender/v...,True
4,https://t.co/EOFwVtbgqRより\n日本語文をそのまま。\n👇👇\nThe...,2021-07-31T23:40:06.000Z,1409050188346322949,1421616692644352003,[https://t.me/Hyer971/4320],True
...,...,...,...,...,...,...
63231,📝 Report an adverse reaction to any vaccine: h...,2021-01-01T19:01:41.000Z,1124397824785838080,1345082785522798592,[https://vaers.hhs.gov/esub/index.jsp],True
63232,READ: Adverse Events reported after COVID-19 v...,2021-01-01T08:31:34.000Z,16141591,1344924211677179904,[https://ift.tt/2WYTD3M],True
63233,READ: Adverse Events reported after COVID-19 v...,2021-01-01T08:20:17.000Z,1245976226570702849,1344921371814588417,[https://ift.tt/2WYTD3M],True
63234,Pathetic! When has there ever been such #Prop...,2021-01-01T04:57:29.000Z,380727130,1344870337071226882,[https://twitter.com/critica18495985/status/13...,True


__Total Tweets with URLs where AstraZeneca mentioned: 63,236__

In [112]:
final_results_ = final_results.iloc[:,:-1]

In [130]:
urldf = final_results_["urls"].apply(pd.Series) \
    .merge(final_results_, right_index = True, left_index = True) \
    .drop(["urls"], axis = 1) \
    .melt(id_vars = ['text','created_at','author_id','twtid'], value_name = "url") \
    .drop("variable", axis = 1) \
    .dropna(subset=["url"])

In [131]:
urldf

Unnamed: 0,text,created_at,author_id,twtid,url
0,Them: Vaccines are safe!\n\nMe: But VAERS repo...,2021-07-31T23:59:22.000Z,94864837,1421621542480404481,https://twitter.com/PressDoYourJob/status/1421...
1,RT @seymour172: https://t.co/EOFwVtbgqRより\n日本語...,2021-07-31T23:57:21.000Z,130808380,1421621035154038788,https://t.me/Hyer971/4320
2,@NicoleJacksonMD @AmyOxentenkoMD @CDCgov @Futu...,2021-07-31T23:51:02.000Z,945824381103628288,1421619443776430082,https://twitter.com/Drwealth54/status/14216194...
3,"If I were a betting person, Id bet the vaccina...",2021-07-31T23:41:44.000Z,785902212480315392,1421617105363996672,https://childrenshealthdefense.org/defender/va...
4,https://t.co/EOFwVtbgqRより\n日本語文をそのまま。\n👇👇\nThe...,2021-07-31T23:40:06.000Z,1409050188346322949,1421616692644352003,https://t.me/Hyer971/4320
...,...,...,...,...,...
590759,MULTIPLE studies show #NATURALIMMUNITY as effe...,2021-06-21T11:41:11.000Z,877679100,1406940258000900101,https://twitter.com/TruthWarriorG/status/14069...
602810,@JohnTory Questions? Why do you push UN-APPROV...,2021-05-27T13:33:35.000Z,1099177143601377280,1397908847369637893,https://twitter.com/MattMcInnis6/status/139790...
602965,@JohnTory Information John?\nYou mean PROPAGAN...,2021-05-26T22:26:02.000Z,1099177143601377280,1397680455470796802,https://twitter.com/MattMcInnis6/status/139768...
603024,@JohnTory Information?\nYou've CLEARLY omitted...,2021-05-26T19:33:18.000Z,1099177143601377280,1397636985284669443,https://twitter.com/MattMcInnis6/status/139763...


__Total URLs extracted: 77,556__

In [132]:
urldf.describe()

Unnamed: 0,text,created_at,author_id,twtid,url
count,77566,77566,77566,77566,77566
unique,51431,62959,29163,63236,26337
top,RT @heather_parisi: #VAERS by https://t.co/54a...,2021-04-27T22:38:27.000Z,2904376551,1397908847369637893,https://www.openvaers.com/covid-data
freq,717,10,755,10,1569


__Total unique URLs extracted: 26,337__

We need to remove all the URLs which are from Twitter domain

In [134]:
urldffiltered = urldf[~urldf["url"].str.contains("https://twitter.com/")].reset_index(drop=True)
urldffiltered

Unnamed: 0,text,created_at,author_id,twtid,url
0,RT @seymour172: https://t.co/EOFwVtbgqRより\n日本語...,2021-07-31T23:57:21.000Z,130808380,1421621035154038788,https://t.me/Hyer971/4320
1,"If I were a betting person, Id bet the vaccina...",2021-07-31T23:41:44.000Z,785902212480315392,1421617105363996672,https://childrenshealthdefense.org/defender/va...
2,https://t.co/EOFwVtbgqRより\n日本語文をそのまま。\n👇👇\nThe...,2021-07-31T23:40:06.000Z,1409050188346322949,1421616692644352003,https://t.me/Hyer971/4320
3,@TacereSol @TheFSMB @Applelaw1Sandra Between 1...,2021-07-31T23:31:59.000Z,381104227,1421614651729760259,http://childrenshealthdefense.org/defender/mod...
4,67% of #VAERS reports are from health service ...,2021-07-31T23:29:17.000Z,1339272245034835970,1421613970977431556,https://odysee.com/@DarkHorsePodcastClips:b/Va...
...,...,...,...,...,...
50120,Some covid vaccine background. It's all scienc...,2021-06-01T20:24:04.000Z,1250074491625095168,1399824085987758085,https://childrenshealthdefense.org/defender/va...
50121,@Drs4CovidEthics https://t.co/er17C54WYC\nhttp...,2021-05-23T20:48:11.000Z,1290031015151439882,1396568667069063169,https://www.aier.org/article/why-are-we-vaccin...
50122,@Drs4CovidEthics https://t.co/er17C54WYC\nhttp...,2021-05-23T18:10:51.000Z,1290031015151439882,1396529069840347138,https://www.aier.org/article/why-are-we-vaccin...
50123,@IkNet Wie moet gevaccineerd worden en wie nie...,2021-05-23T15:49:59.000Z,1290031015151439882,1396493619742322689,https://www.aier.org/article/why-are-we-vaccin...


In [183]:
urldffiltered.describe()

Unnamed: 0,text,created_at,author_id,twtid,url
count,50125,50125,50125,50125,50125
unique,37544,46683,23032,46859,9150
top,RT @heather_parisi: #VAERS by https://t.co/54a...,2021-07-29T20:46:00.000Z,1378056160645877762,1420848102865506308,https://www.openvaers.com/covid-data
freq,717,7,718,7,1569


__URL Feature Vector for AZ mentioned__

In [187]:
ftvectoraz = urldffiltered.groupby(["url"]).nunique().sort_values(["twtid"],ascending=False).reset_index()
ftvectoraz

ftvectoraz.to_csv("./az_url_feature_vector.csv",index=False)

Unnamed: 0,url,text,created_at,author_id,twtid
0,https://www.openvaers.com/covid-data,1362,1516,681,1516
1,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,1202,1245,675,1245
2,https://www.thegatewaypundit.com/2021/06/shock...,509,1222,1145,1227
3,https://vaers.hhs.gov/,759,1122,965,1122
4,https://wonder.cdc.gov/vaers.html,879,1097,750,1097
...,...,...,...,...,...
9145,https://medalerts.org/vaersdb/findfield.php?TA...,1,1,1,1
9146,https://medalerts.org/vaersdb/findfield.php?TA...,1,1,1,1
9147,https://medalerts.org/vaersdb/findfield.php?TA...,1,1,1,1
9148,https://medalerts.org/vaersdb/findfield.php?TA...,1,1,1,1
