# Load dataset

In [1]:
from pathlib import Path
from mumin import MuminDataset

# Where the pre-compiled dataset archive lives on disk
data_dir = Path("../../data")
dataset_file = "mumin-medium_no-images.zip"
dataset_path = data_dir / dataset_file

# Options handed to MuminDataset: dataset size, and skip tweet images / articles
size = "medium"
include_tweet_images = False
include_articles = False

# Build the dataset object, then compile it; compile() is left as the final
# expression so the cell displays its result
dataset = MuminDataset(
    dataset_path=dataset_path,
    size=size,
    include_tweet_images=include_tweet_images,
    include_articles=include_articles,
)
dataset.compile()

  from .autonotebook import tqdm as notebook_tqdm
2022-07-15 14:11:54,603 [INFO] Loading dataset


MuminDataset(num_nodes=805,586, num_relations=1,061,640, size='medium', compiled=True, bearer_token_available=False)

# Extract only the tweets that talk about COVID-19

The easiest way to do this seems to be to filter the claims before joining them with the tweets. However, I will also try filtering the joined tweet–claim table afterwards to see if that makes a difference.

## Get claims, tweets and relations

In [2]:
# Claim nodes, taken as-is
claims = dataset.nodes["claim"]

# Tweet nodes, keeping only rows that have no missing values
tweets = dataset.nodes["tweet"].dropna()

# Edge table for the (tweet) -[discusses]-> (claim) relation
rels = dataset.rels[("tweet", "discusses", "claim")]

## Filter claims

In [3]:
# Summarise the claim table: column names, non-null counts and dtypes
claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5537 entries, 0 to 5536
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   embedding         5537 non-null   object        
 1   label             5537 non-null   category      
 2   reviewers         5537 non-null   object        
 3   date              5537 non-null   datetime64[ns]
 4   language          5537 non-null   category      
 5   keywords          5537 non-null   object        
 6   cluster_keywords  5537 non-null   category      
 7   cluster           5537 non-null   category      
 8   train_mask        5537 non-null   bool          
 9   val_mask          5537 non-null   bool          
 10  test_mask         5537 non-null   bool          
dtypes: bool(3), category(4), datetime64[ns](1), object(3)
memory usage: 211.7+ KB


In [4]:
# Regex for COVID-related keywords. It uses non-capturing groups (?:...) because
# pandas emits a "pattern ... has match groups" UserWarning when str.contains is
# given capturing groups; the set of strings that match is unchanged.
covid_pattern = r"corona(?:.*virus)?|covid(?:.*19)?"

# Keep a claim when either its own keywords or its cluster's keywords match.
# Matching is case-sensitive (the str.contains default).
covid_mask = (
    claims.keywords.str.contains(covid_pattern)
    | claims.cluster_keywords.str.contains(covid_pattern)
)
claims_filtered = claims.loc[covid_mask, :]
claims_filtered

  covid_mask = claims.keywords.str.contains('(corona(.*virus)?|covid(.*19)?)') | claims.cluster_keywords.str.contains('(corona(.*virus)?|covid(.*19)?)')


Unnamed: 0,embedding,label,reviewers,date,language,keywords,cluster_keywords,cluster,train_mask,val_mask,test_mask
0,"[-0.04202667623758316, -0.00033039430854842067...",misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
2,"[0.05876247584819794, 0.02175525575876236, 0.0...",misinformation,[observador.pt],2020-03-23 01:55:11,pt,news corona virus vaccine ready,coronavirus china covid 19 treatments recommended,0,True,False,False
10,"[-0.058289170265197754, -0.014123783446848392,...",misinformation,[observador.pt],2020-02-23 18:31:23,pt,confirmed case coronavirus portugal,coronavirus china covid 19 treatments recommended,0,True,False,False
13,"[0.04389802739024162, 0.07366126775741577, 0.0...",misinformation,[observador.pt],2020-07-02 11:22:38,pt,advertises taking vaccine covid 19,coronavirus china covid 19 treatments recommended,0,True,False,False
16,"[-0.018961578607559204, 0.1207994893193245, 0....",misinformation,[aosfatos.org],2020-03-13 00:00:00,pt,cuba announces produces vaccine coronavirus,coronavirus china covid 19 treatments recommended,0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
5531,"[-0.0024142928887158632, -0.15177859365940094,...",misinformation,[dogrulukpayi.com],2021-04-06 00:00:00,tr,moderna astrazeneca vaccines 964 people,coronavirus china covid 19 treatments recommended,0,True,False,False
5533,"[0.0004821936599910259, -0.13943451642990112, ...",misinformation,[dogrulukpayi.com],2021-04-21 00:00:00,tr,pfizer biontech vaccination shingles,coronavirus china covid 19 treatments recommended,0,True,False,False
5534,"[0.05333065986633301, -0.1182158812880516, 0.0...",misinformation,[dogrulukpayi.com],2021-05-19 00:00:00,tr,coronavirus cause percent deaths described,coronavirus china covid 19 treatments recommended,0,True,False,False
5535,"[-0.06462758779525757, 0.0026502013206481934, ...",misinformation,[fakenews.pl],2021-05-10 00:00:00,pl,lift patents covid 19 vaccines,coronavirus china covid 19 treatments recommended,0,True,False,False


In [5]:
# Attach each tweet to the COVID-filtered claims it discusses:
# tweet index is matched against rels.src, then rels.tgt against the claim index
tc_filtered_claims = (
    tweets
    .merge(rels, left_index=True, right_on="src")
    .merge(claims_filtered, left_on="tgt", right_index=True)
    .reset_index(drop=True)
)

In [6]:
# Summarise the joined tweet/claim table: row count, columns, non-null counts and dtypes
tc_filtered_claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4483 entries, 0 to 4482
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   tweet_id          4483 non-null   uint64        
 1   text              4483 non-null   object        
 2   created_at        4483 non-null   datetime64[ns]
 3   lang              4483 non-null   category      
 4   source            4483 non-null   object        
 5   num_retweets      4483 non-null   uint64        
 6   num_replies       4483 non-null   uint64        
 7   num_quote_tweets  4483 non-null   uint64        
 8   src               4483 non-null   int64         
 9   tgt               4483 non-null   int64         
 10  embedding         4483 non-null   object        
 11  label             4483 non-null   category      
 12  reviewers         4483 non-null   object        
 13  date              4483 non-null   datetime64[ns]
 14  language          4483 n

In [7]:
# Preview the first rows of the joined tweet/claim table
tc_filtered_claims.head()

Unnamed: 0,tweet_id,text,created_at,lang,source,num_retweets,num_replies,num_quote_tweets,src,tgt,...,label,reviewers,date,language,keywords,cluster_keywords,cluster,train_mask,val_mask,test_mask
0,1238795119572049920,कॉरोना वायरस फेफड़ों में जाने से पहले तीन-चार ...,2020-03-14 11:52:26,hi,Twitter for Android,6,0,1,0,0,...,misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
1,1238947475471454220,Antes de llegar a los pulmones dura 4 días en ...,2020-03-14 21:57:51,es,Twitter for Android,8,3,0,1,0,...,misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
2,1240979964888899584,మంచి వార్త! కరోనా వైరస్ వ్యాక్సిన్ సిద్ధంగా ఉ...,2020-03-20 12:34:14,te,Twitter for Android,27,14,0,6,2,...,misinformation,[observador.pt],2020-03-23 01:55:11,pt,news corona virus vaccine ready,coronavirus china covid 19 treatments recommended,0,True,False,False
3,1241861910921871360,Great news! Carona virus vaccine ready. Able t...,2020-03-22 22:58:46,en,Twitter Web App,26,42,7,7,2,...,misinformation,[observador.pt],2020-03-23 01:55:11,pt,news corona virus vaccine ready,coronavirus china covid 19 treatments recommended,0,True,False,False
4,1240979964888899584,మంచి వార్త! కరోనా వైరస్ వ్యాక్సిన్ సిద్ధంగా ఉ...,2020-03-20 12:34:14,te,Twitter for Android,27,14,0,6,2269,...,misinformation,[boomlive.in],2020-03-27 06:56:30,en,coronavirus vaccine ready able cure,coronavirus china covid 19 treatments recommended,0,True,False,False


## Filter tweets after joining with claims

In [19]:
# Attach each tweet to every claim it discusses (claims are not filtered here):
# tweet index is matched against rels.src, then rels.tgt against the claim index
tc_filtered_after = (
    tweets
    .merge(rels, left_index=True, right_on="src")
    .merge(claims, left_on="tgt", right_index=True)
    .reset_index(drop=True)
)

In [21]:
# (rows, columns) of the unfiltered tweet/claim join
tc_filtered_after.shape

(39001, 21)

In [20]:
# Filter the resulting dataframe.
# The regex uses a non-capturing group (?:...) because pandas emits a
# "pattern ... has match groups" UserWarning when str.contains is given capturing
# groups; the rows that match are unchanged.
# NOTE: unlike the claim-level filter earlier in the notebook, this pattern
# requires "19" somewhere after "covid"; a bare "covid" does not match.
covid_text_pattern = r"corona(?:.*virus)?|covid.*19"

# A row is kept when the claim keywords, the cluster keywords or the tweet text
# match. Matching is case-sensitive (the str.contains default), so spellings such
# as "COVID-19" or "Coronavirus" in the raw tweet text do not match on their own.
covid_mask = (
    tc_filtered_after.keywords.str.contains(covid_text_pattern)
    | tc_filtered_after.cluster_keywords.str.contains(covid_text_pattern)
    | tc_filtered_after.text.str.contains(covid_text_pattern)
)
tc_filtered_after.loc[covid_mask, ["lang", "label", "keywords", "cluster_keywords", "text"]]

  covid_mask = tc_filtered_after.keywords.str.contains('(corona(.*virus)?|covid.*19)') | \
  tc_filtered_after.cluster_keywords.str.contains('(corona(.*virus)?|covid.*19)') | \
  tc_filtered_after.text.str.contains('(corona(.*virus)?|covid.*19)')


Unnamed: 0,lang,label,keywords,cluster_keywords,text
0,en,misinformation,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,To keep our upper respiratory tract healthy in...
1,en,misinformation,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,Gargling salt water does not 'kill' coronaviru...
2,hi,misinformation,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,कॉरोना वायरस फेफड़ों में जाने से पहले तीन-चार ...
3,es,misinformation,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,Antes de llegar a los pulmones dura 4 días en ...
4,en,misinformation,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,So they say the first symptons are #coughing\n...
...,...,...,...,...,...
38994,en,misinformation,indicating covid virus created laboratory,coronavirus china covid 19 treatments recommended,After ridiculing President Trump’s assertion l...
38995,fr,misinformation,covid 19 vaccines currently production,coronavirus china covid 19 treatments recommended,Coronovirus : le virus mutant détecté en Grand...
38996,en,misinformation,amantadine miracle cure covid 19,coronavirus china covid 19 treatments recommended,🔥 New Science Brief: Remdesivir for Hospitaliz...
38999,hi,misinformation,shows risk contracting coronavirus location,coronavirus china covid 19 treatments recommended,#CoronaVirusUpdates : जानें- गांवों में क्यों ...


# Comments

There doesn't seem to be much of a difference between filtering the claims before merging and filtering after merging (18164 vs 18269). Filtering after merging might yield more data if the tweet text were translated into English, but since it isn't, I'll go with filtering the claims before merging.