# Load dataset

In [1]:
from pathlib import Path
from mumin import MuminDataset

# Set file names and paths
data_dir = Path("data/mumin_archive/")
dataset_file = "mumin-large_no-article_image.zip"

# Load the compiled dataset
size = "large"
dataset_path = data_dir.joinpath(dataset_file)
include_tweet_images = False
include_articles = False
n_jobs = -1
dataset = MuminDataset(dataset_path=dataset_path, size=size, n_jobs=n_jobs, include_tweet_images=include_tweet_images, include_articles=include_articles)
dataset.compile()

  from .autonotebook import tqdm as notebook_tqdm
2022-08-04 09:42:55,327 [INFO] Loading dataset


MuminDataset(num_nodes=1,625,694, num_relations=2,394,768, size='large', rehydrated=True, compiled=True, bearer_token_available=True)

# Extract only the tweets that talk about COVID-19

The easiest way to do this seems to be to filter the claims before joining. However, I will also try filtering the tweets after joining to see if that makes a difference.

## Get claims, tweets and relations

In [2]:
# Get tweets, claims and the relations between them
tweets = dataset.nodes["tweet"].dropna()
claims = dataset.nodes["claim"]
rels = dataset.rels[("tweet", "discusses", "claim")]

## Filter claims

In [3]:
claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12885 entries, 0 to 12884
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   embedding         12885 non-null  object        
 1   label             12885 non-null  category      
 2   reviewers         12885 non-null  object        
 3   date              12885 non-null  datetime64[ns]
 4   language          12885 non-null  category      
 5   keywords          12885 non-null  object        
 6   cluster_keywords  12885 non-null  category      
 7   cluster           12885 non-null  category      
 8   train_mask        12885 non-null  bool          
 9   val_mask          12885 non-null  bool          
 10  test_mask         12885 non-null  bool          
dtypes: bool(3), category(4), datetime64[ns](1), object(3)
memory usage: 491.6+ KB


In [4]:
covid_mask = claims.keywords.str.contains('(corona(.*virus)?|covid(.*19)?)') | claims.cluster_keywords.str.contains('(corona(.*virus)?|covid(.*19)?)')
claims_filtered = claims.loc[covid_mask, :]
claims_filtered
#claims.loc[claims.keywords.str.contains('(corona(.*virus)?|covid 19)'), ["language", "label", "keywords", "cluster_keywords"]]
#claims.loc[claims.keywords.str.contains('corona(.*virus)?'), ["language", "label", "keywords", "cluster_keywords"]]

  covid_mask = claims.keywords.str.contains('(corona(.*virus)?|covid(.*19)?)') | claims.cluster_keywords.str.contains('(corona(.*virus)?|covid(.*19)?)')


Unnamed: 0,embedding,label,reviewers,date,language,keywords,cluster_keywords,cluster,train_mask,val_mask,test_mask
0,"[-0.04202667623758316, -0.00033039430854842067...",misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
2,"[0.05876247584819794, 0.02175525575876236, 0.0...",misinformation,[observador.pt],2020-03-23 01:55:11,pt,news corona virus vaccine ready,coronavirus china covid 19 treatments recommended,0,True,False,False
10,"[-0.058289170265197754, -0.014123783446848392,...",misinformation,[observador.pt],2020-02-23 18:31:23,pt,confirmed case coronavirus portugal,coronavirus china covid 19 treatments recommended,0,True,False,False
13,"[0.04389802739024162, 0.07366126775741577, 0.0...",misinformation,[observador.pt],2020-07-02 11:22:38,pt,advertises taking vaccine covid 19,coronavirus china covid 19 treatments recommended,0,True,False,False
16,"[-0.018961578607559204, 0.1207994893193245, 0....",misinformation,[aosfatos.org],2020-03-13 00:00:00,pt,cuba announces produces vaccine coronavirus,coronavirus china covid 19 treatments recommended,0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
12879,"[-0.13542534410953522, -0.029435541480779648, ...",misinformation,[fakenews.pl],2020-11-02 00:00:00,pl,amantadine miracle cure covid 19,coronavirus china covid 19 treatments recommended,0,True,False,False
12880,"[0.01928618736565113, 0.0550604909658432, 0.08...",misinformation,[fakenews.pl],2020-12-20 00:00:00,pl,nurse vaccinated covid 19 died,coronavirus china covid 19 treatments recommended,0,True,False,False
12882,"[-0.1658276617527008, -0.08447681367397308, -0...",misinformation,[fakenews.pl],2021-02-01 00:00:00,pl,shows risk contracting coronavirus location,coronavirus china covid 19 treatments recommended,0,True,False,False
12883,"[-0.030005285516381264, 0.0005143969319760799,...",misinformation,[fakenews.pl],2021-05-04 00:00:00,pl,died vaccinated covid 19,coronavirus china covid 19 treatments recommended,0,True,False,False


In [5]:
# Join tweets on filtered claims
tc_filtered_claims = (tweets.merge(rels, left_index=True, right_on='src')
                      .merge(claims_filtered, left_on='tgt', right_index=True)
                      .reset_index(drop=True))

In [6]:
tc_filtered_claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18225 entries, 0 to 18224
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   tweet_id          18225 non-null  uint64        
 1   text              18225 non-null  object        
 2   created_at        18225 non-null  datetime64[ns]
 3   lang              18225 non-null  category      
 4   source            18225 non-null  object        
 5   num_retweets      18225 non-null  uint64        
 6   num_replies       18225 non-null  uint64        
 7   num_quote_tweets  18225 non-null  uint64        
 8   src               18225 non-null  int64         
 9   tgt               18225 non-null  int64         
 10  embedding         18225 non-null  object        
 11  label             18225 non-null  category      
 12  reviewers         18225 non-null  object        
 13  date              18225 non-null  datetime64[ns]
 14  language          1822

In [7]:
tc_filtered_claims.head()

Unnamed: 0,tweet_id,text,created_at,lang,source,num_retweets,num_replies,num_quote_tweets,src,tgt,...,label,reviewers,date,language,keywords,cluster_keywords,cluster,train_mask,val_mask,test_mask
0,1243046281326534661,To keep our upper respiratory tract healthy in...,2020-03-26 05:25:02,en,Hootsuite Inc.,96,6,6,0,0,...,misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
1,1243148522209161217,Gargling salt water does not 'kill' coronaviru...,2020-03-26 12:11:18,en,Twitter for iPhone,7,0,0,1,0,...,misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
2,1238795119572049920,कॉरोना वायरस फेफड़ों में जाने से पहले तीन-चार ...,2020-03-14 11:52:26,hi,Twitter for Android,6,0,1,2,0,...,misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
3,1238947475471454220,Antes de llegar a los pulmones dura 4 días en ...,2020-03-14 21:57:51,es,Twitter for Android,8,3,0,3,0,...,misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False
4,1239128401115516929,So they say the first symptons are #coughing\n...,2020-03-15 09:56:47,en,Twitter for Android,10,2,1,4,0,...,misinformation,[observador.pt],2020-03-15 12:30:21,pt,corona virus reaching lungs remains,coronavirus china covid 19 treatments recommended,0,True,False,False


In [10]:
# Write dataframe to csv file
cols_to_write = ["text", "label", "lang"]
data_file = "mumin_large-raw.csv"
tc_filtered_claims.reindex(columns=cols_to_write).to_csv(data_dir.parent.joinpath(data_file), index=False)

# Light pre-processing

Here we do some light preprocessing in order to make it easier to translate. Specifically, we will:

- Convert all text to lowercase
- Encode labels
- Remove newline characters as well as other strange characters
- Remove excess whitespace
- Replace some Twitter artifacts (mentions and URLs) with placeholder tokens (`<USER>` and `<URL>`)
- Remove records with a language code of `zxx`
- Remove duplicate records

## Convert all text to lowercase

In [11]:
# Create new dataframe for cleaned text
mumin_df = tc_filtered_claims[cols_to_write]

# Convert all text to lowercase
mumin_df.text = mumin_df.text.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mumin_df.text = mumin_df.text.str.lower()


## Encode labels

Here we encode the labels in the following way:

- `misinformation`: 1
- `factual`: 0

In [17]:
encodings = {"misinformation": 1, "factual": 0}
mumin_df.label.replace(to_replace=encodings, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mumin_df.label.replace(to_replace=encodings, inplace=True)


In [18]:
mumin_df.label.value_counts()

1    17625
0      600
Name: label, dtype: int64

## Remove newline characters

We'll also take this opportunity to remove other strange characters that don't add any additional information. So far, these other characters are:

- `|`

In [13]:
mumin_df.text = mumin_df.text.str.replace("(\n|\r|\|)", "")

  mumin_df.text = mumin_df.text.str.replace("(\n|\r|\|)", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mumin_df.text = mumin_df.text.str.replace("(\n|\r|\|)", "")


## Remove excess whitespace

In [15]:
mumin_df.text = mumin_df.text.str.replace("\s+", " ")

  mumin_df.text = mumin_df.text.str.replace("\s+", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mumin_df.text = mumin_df.text.str.replace("\s+", " ")


## Replace some Twitter artifacts

We'll replace mentions and URLs with some placeholder tokens:

- Mentions: `<USER>`
- URLs: `<URL>`

In [19]:
# Replace mentions with <USER>
mumin_df.text = mumin_df.text.str.replace("(?:@)\S+", "<USER>")

# Replace URLS with <URL>
mumin_df.text = mumin_df.text.str.replace("(?:https?://)\S+", "<URL>")

  mumin_df.text = mumin_df.text.str.replace("(?:@)\S+", "<USER>")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mumin_df.text = mumin_df.text.str.replace("(?:@)\S+", "<USER>")
  mumin_df.text = mumin_df.text.str.replace("(?:https?://)\S+", "<URL>")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mumin_df.text = mumin_df.text.str.replace("(?:https?://)\S+", "<URL>")


## Remove duplicate entries

In [21]:
size_before = mumin_df.shape[0]
# mumin_df.drop_duplicates(subset="text", inplace=True, ignore_index=True)
mumin_df.drop_duplicates(inplace=True, ignore_index=True)
size_after = mumin_df.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mumin_df.drop_duplicates(inplace=True, ignore_index=True)


In [22]:
print(f"# of records before dropping duplicates: {size_before}")
print(f"# of records after dropping duplicates: {size_after}")

# of records before dropping duplicates: 18225
# of records after dropping duplicates: 9479


## Remove records with `zxx` language code

In [26]:
mumin_df = mumin_df.loc[~(mumin_df.lang == "zxx"), :].reset_index(drop=True)

## Write pre-cleaned dataset to CSV file

In [27]:
# Write dataframe to csv file
# cols_to_write = ["text", "label", "lang"]
data_file = "mumin_large-preproc.csv"
mumin_df.to_csv(data_dir.parent.joinpath(data_file), index=False)