In [1]:
import string
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

"""
Library of various cleaning-related functions, regular expressions and variables.
"""


# All ASCII letters, lowercase first and then uppercase.
simple_latin = string.ascii_letters
# Characters counted as "dirty" by the cleaning heuristics: ASCII digits followed by ASCII punctuation.
dirty_chars = ''.join((string.digits, string.punctuation))


def is_clean_text(text: str) -> bool:
    """
    Tell whether `text` passes a simple cleanliness heuristic.

    Returns False for texts shorter than 25 characters, and for texts in which
    more than 50% of the characters belong to `dirty_chars`. Returns True otherwise.
    """
    # Short texts are rejected outright; this also keeps the ratio below from dividing by zero.
    if len(text) < 25:
        return False

    dirty_count = sum(1 for char in text if char in dirty_chars)
    return dirty_count / len(text) <= 0.5


# Source: https://gist.github.com/dperini/729294
# Matches http(s)/ftp URLs and bare `www.` hosts, skipping private/loopback IPv4 ranges.
# The host character classes use the regex escapes `\u00a1-\uffff` (single backslash inside a
# raw string) so that non-ASCII host names are accepted. A doubled backslash here would put a
# literal backslash and a bogus `1-\` range into the class instead of the Unicode range.
url_regex = re.compile(
    r'(?:^|(?<![\w\/\.]))'                                 # must not directly follow a word char, '/' or '.'
    r'(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))'           # scheme, or a leading www / www1 ... www999
    r'(?:\S+(?::\S*)?@)?' r'(?:'                           # optional user:password@
    r'(?!(?:10|127)(?:\.\d{1,3}){3})'                      # exclude 10.x.x.x and 127.x.x.x
    r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})'           # exclude 169.254.x.x and 192.168.x.x
    r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})'      # exclude 172.16.x.x - 172.31.x.x
    r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
    r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
    r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
    r'|'
    r'(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)'        # host name
    r'(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*'     # domain name(s)
    r'(?:\.(?:[a-z\u00a1-\uffff]{2,}))' r'|' r'(?:(localhost))' r')' # TLD, or literal localhost
    r'(?::\d{2,5})?'                                       # optional port
    r'(?:\/[^\)\]\}\s]*)?',                                # optional path, stops at ) ] } or whitespace
    flags=re.IGNORECASE,
)


def remove_urls(text: str) -> str:
    """Return `text` with every match of `url_regex` deleted."""
    return re.sub(url_regex, '', text)


# Source: https://gist.github.com/Nikitha2309/15337f4f593c4a21fb0965804755c41d
# Character class of emoji and emoji-related code points; one or more in a row form a match.
# The upstream pattern contained the ranges U+24C2-U+1F251 and U+10000-U+10FFFF, which swallow
# all CJK ideographs, kana, Hangul and every supplementary-plane character. They are replaced
# here by the emoji-bearing blocks only, so ordinary text in those scripts is preserved.
emoji_regex = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # regional indicators (flags)
    '\U0001F000-\U0001FAFF'  # remaining emoji-bearing supplementary blocks (cards, enclosed, extended pictographs)
    '\U000E0020-\U000E007F'  # tag characters used in subdivision-flag sequences
    '\u2500-\u2BEF'          # box drawing, geometric shapes, misc symbols, dingbats, arrows
    '\u24C2'                 # circled M
    '\u200d'                 # zero-width joiner
    '\u23cf'
    '\u23e9'
    '\u231a'
    '\ufe0f'                 # variation selector-16
    '\u3030'                 # wavy dash
    '\u303d'                 # part alternation mark
    '\u3297'                 # circled ideograph congratulation
    '\u3299'                 # circled ideograph secret
    ']+'
)


def remove_emojis(text: str) -> str:
    """Return `text` with every run of characters matched by `emoji_regex` deleted."""
    return re.sub(emoji_regex, '', text)


# One or more consecutive characters that are treated as a sentence terminator.
sentence_stop_regex = re.compile(
    '['
    '\u002e'  # full stop
    '\u2026'  # ellipsis
    '\u061F'  # arabic question mark
    '\u06D4'  # arabic full stop
    '\u2022'  # bullet point
    '\u3002'  # chinese period
    '\u25CB'  # white circle
    '|'       # pipe; literal inside a character class, so no backslash (avoids an invalid string escape)
    ']+'
)


def replace_stops(text: str) -> str:
    """
    Normalise sentence terminators: each run of characters matched by
    `sentence_stop_regex` becomes a single '.'. Used for sentence segmentation
    with sliding windows.
    """
    normalised = re.sub(sentence_stop_regex, '.', text)
    return normalised


# Any run of one or more whitespace characters.
whitespace_regex = re.compile(r'\s+')


def replace_whitespaces(text: str) -> str:
    """Collapse every whitespace run in `text` into a single space (the ends are not trimmed)."""
    single_space = ' '
    return re.sub(whitespace_regex, single_space, text)


def clean_ocr(ocr: str) -> str:
    """
    Filter OCR output line by line.

    A line is kept only when it is longer than 5 characters AND fewer than 50%
    of its characters are in `dirty_chars`; every other line is dropped.
    The surviving lines are re-joined with newline characters.
    """
    kept_lines = []
    for line in ocr.split('\n'):
        # The length check comes first so the ratio never divides by zero.
        if len(line) <= 5:
            continue
        dirty_count = sum(1 for char in line if char in dirty_chars)
        if dirty_count / len(line) < 0.5:
            kept_lines.append(line)
    return '\n'.join(kept_lines)


def clean_twitter_picture_links(text):
    """
    Replace each Twitter picture link (`pic.twitter.com/<non-space chars>`) with
    the bare token 'pic'.

    The dots in the host name are escaped so that only a literal
    `pic.twitter.com/` prefix is matched, not arbitrary characters in the
    dot positions.
    """
    return re.sub(r'pic\.twitter\.com/\S+', 'pic', text)


def clean_twitter_links(text):
    """
    Replace each shortened Twitter link (`<scheme>//t.co/<non-space chars>`)
    with the bare token 't.co'.

    The dot in `t.co` is escaped so that hosts such as `tXco` are not
    mistaken for the shortener.
    """
    return re.sub(r'\S+//t\.co/\S+', 't.co', text)


def remove_elongation(text):
    """
    Replaces any occurrence of a string of consecutive identical non-space
    characters (at least three in a row) with just one instance of that character.

    The pattern captures exactly ONE character and requires it to repeat at least
    twice more. Capturing a multi-character group instead would collapse repeated
    substrings ("hahaha" -> "ha") and leave long runs only partially reduced.
    The rule applies to every non-space character, digits and punctuation included.
    """
    return re.sub(r'(\S)\1{2,}', r'\1', text)


def clean_text(text):
    """
    Run the full cleaning pipeline over `text` and return the stripped result.

    Steps, in order: remove URLs, remove emojis, collapse whitespace runs,
    shorten Twitter picture links, shorten t.co links, remove character
    elongation. Leading and trailing whitespace is stripped at the end.

    Every step is applied unconditionally. The earlier `if remove_urls:`-style
    guards tested the function objects themselves, which are always truthy, so
    they never disabled anything.
    """
    pipeline = (
        remove_urls,
        remove_emojis,
        replace_whitespaces,
        clean_twitter_picture_links,
        clean_twitter_links,
        remove_elongation,
    )
    for step in pipeline:
        text = step(text)

    return text.strip()
    
    
def maybe_clean_ocr(ocr):
    """Thin pass-through to `clean_ocr`; the cleaning is always applied."""
    cleaned = clean_ocr(ocr)
    return cleaned

In [2]:
import ast
import os

import pandas as pd

# Root folder of the dataset and the two CSV files read from it.
our_dataset_path = '/SemEval_Task7_Test_Phase'

posts_path = os.path.join(our_dataset_path, 'posts.csv')
fact_checks_path = os.path.join(our_dataset_path, 'fact_checks.csv')


def parse_col(s):
    """
    Parse one CSV cell holding a Python literal.

    Falsy cells (the '' produced by `fillna('')`) are returned unchanged. Otherwise
    real newline characters are replaced by the two-character sequence backslash-n
    before the text is handed to `ast.literal_eval`.
    """
    if not s:
        return s
    return ast.literal_eval(s.replace('\n', '\\n'))


# Fact checks: missing values become '', rows are indexed by fact_check_id.
df_fact_checks = pd.read_csv(fact_checks_path).fillna('').set_index('fact_check_id')
for col in ('claim', 'title'):
    df_fact_checks[col] = df_fact_checks[col].apply(parse_col)

# Posts: same treatment, indexed by post_id.
df_posts = pd.read_csv(posts_path).fillna('').set_index('post_id')
for col in ('ocr', 'text'):
    df_posts[col] = df_posts[col].apply(parse_col)


In [3]:
# Display the fact-check frame loaded above (indexed by fact_check_id).
df_fact_checks

Unnamed: 0_level_0,claim,instances,title
fact_check_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"( Are avocados good for you?, Are avocados go...","[(1525653998.0, 'https://metafact.io/factcheck...",
1,"( Can animals have headaches?, Can animals ha...","[(1617955634.0, 'https://metafact.io/factcheck...",
2,"( Can we help prevent Alzheimer's with diet?, ...","[(1525653998.0, 'https://metafact.io/factcheck...",
3,( Do any benefits of alcohol outweigh the risk...,"[(1525653998.0, 'https://metafact.io/factcheck...",
4,"( Does acupuncture work for headaches?, Does ...","[(1617955595.0, 'https://metafact.io/factcheck...",
...,...,...,...
372883,(Claudia Sheinbaum quiere cambiar la Constituc...,"[('2024-05-28 02:48:00+02', 'https://factual.a...","(Agence France-Presse ..., France Media Agency..."
372884,(Embaixador de Israel levanta cartaz comemoran...,"[('2024-05-27 22:34:00+02', 'https://checamos....","(Agence France-Presse ..., France Media Agency..."
372891,(The viral video shows African protesting at Q...,"[('2022-09-26 02:00:00+02', 'https://www.newsm...",(Fact Check: Viral Video Does NOT Show African...
372893,(Instagram hikayede ekran görüntüsü (SS) alınc...,"[('2017-07-12 02:00:00+02', 'https://teyit.org...",(Instagram hikayede ekran görüntüsü (SS) alınc...


In [4]:
import ast

def safe_literal_eval(value):
    """
    Evaluate `str(value)` as a Python literal, returning None when that fails.

    `value` is stringified first, so non-string inputs (e.g. an already parsed
    tuple) are re-parsed from their string form. Any of the errors that
    `ast.literal_eval` may raise on malformed input (ValueError, TypeError,
    SyntaxError, MemoryError, RecursionError) results in None instead of an
    exception; the empty string therefore maps to None.
    """
    try:
        return ast.literal_eval(str(value))
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return None

In [5]:
# Run `safe_literal_eval` over the literal-valued columns of both frames (in place).
# Cells that cannot be evaluated, such as the '' placeholders, become None.
for frame, columns in (
    (df_posts, ('text', 'ocr')),
    (df_fact_checks, ('claim', 'title')),
):
    for column in columns:
        frame[column] = frame[column].apply(safe_literal_eval)

In [6]:
# post_id -> cleaned text built from the post text and its OCR.
# Element 1 of the parsed `text` tuple is used; the displayed outputs suggest this is
# the English translation (TODO confirm). For OCR only the first entry is used,
# again taking its element 1.
id_to_post = {}
for post_id, post_text, ocr_text in zip(df_posts.index, df_posts['text'], df_posts['ocr']):
    texts = [maybe_clean_ocr(post_text[1])] if post_text else []
    if ocr_text:
        texts += [maybe_clean_ocr(ocr_text[0][1])]
    id_to_post[post_id] = clean_text(' '.join(texts))

# fact_check_id -> cleaned text built from element 1 of `claim` and of `title`.
id_to_fact_check = {}
for fact_check_id, claim, title in zip(df_fact_checks.index, df_fact_checks['claim'], df_fact_checks['title']):
    texts = [maybe_clean_ocr(claim[1])] if claim else []
    if title:
        texts += [maybe_clean_ocr(title[1])]
    id_to_fact_check[fact_check_id] = clean_text(' '.join(texts))

In [7]:
# Number of fact checks that received a cleaned-text entry.
len(id_to_fact_check)

272447

In [8]:
# Turn the id -> cleaned-text dictionaries into two-column frames.
# The frames are built directly from the (id, text) pairs. Wrapping the dict in a
# one-row frame and transposing it would first create one column per id, which is
# very slow for hundreds of thousands of ids.
df_post_clean = pd.DataFrame(list(id_to_post.items()), columns=['post_id', 'clean_text'])
df_fact_check_clean = pd.DataFrame(
    list(id_to_fact_check.items()), columns=['fact_check_id', 'clean_text']
)

In [9]:
# Display the cleaned post texts (columns: post_id, clean_text).
df_post_clean

Unnamed: 0,post_id,clean_text
0,28094,The new BRICS will control 80 percent of globa...
1,28095,"&quot;No Zelensky, no war&quot; - New York ban..."
2,28101,"A student in Bremen, Germany found an online s..."
3,28102,Amazon now has a cookbook citing Klaus&#39; ne...
4,28112,DONALD TRUMP IS TOTALLY INSANE. Donald J. Trum...
...,...,...
8271,79490,This plant appeared in the Harry Potter movie ...
8272,79492,- The Minister of Planning has already suggest...
8273,79493,LARRETA&#39;S DELIRIUM IS TOTAL: Larreta has j...
8274,79494,And the question that won&#39;t go away. Who w...


In [10]:
# Display the cleaned fact-check texts (columns: fact_check_id, clean_text).
df_fact_check_clean

Unnamed: 0,fact_check_id,clean_text
0,0,Are avocados good for you?
1,1,Can animals have headaches?
2,2,Can we help prevent Alzheimer's with diet?
3,3,Do any benefits of alcohol outweigh the risks?
4,4,Does acupuncture work for headaches?
...,...,...
272442,372883,Claudia Sheinbaum wants to change the Constitu...
272443,372884,Israeli Ambassador raises poster commemorating...
272444,372891,The viral video shows African protesting at Qu...
272445,372893,Do you get notifications when you take a scree...


In [11]:
# Display the posts frame after the literal-eval pass over `text` and `ocr`.
df_posts

Unnamed: 0_level_0,instances,ocr,verdicts,text
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
28094,"[('2023-08-24 13:31:03+00', 'tg')]",,,(🌎🛢 Nowy BRICS będzie kontrolował 80 procent ś...
28095,"[('2023-06-29 16:42:00+00', 'tg')]",,,"(🔥🇺🇸🇺🇦 "" Nie ma Zelenskiego, nie ma wojny"" - b..."
28101,"[('2022-09-18 19:22:27+00', 'tg')]",,,"(🇩🇪 A student in Bremen, Germany found an onli..."
28102,"[('2023-02-07 00:05:36+00', 'tg')]",,,(W Amazonie można już nabyć książkę kucharską ...
28112,"[('2019-10-05 15:19:19+00', 'fb')]",[(Donald J. Trump [USER] We need a Civil War t...,['False information'],"(DONALD TRUMP IS TOTALLY INSANE., DONALD TRUMP..."
...,...,...,...,...
79490,"[('2024-01-16 22:38:06+00', 'ig')]",[([USER] نبتة الماندرا كورا الصارخة من شافها ا...,['False information.'],(هاد النبتة ظهرت ففيلم هاري بوتر وهي نبتة ذات ...
79492,"[('2024-03-20 10:26:45+00', 'ig')]",,['Partly false information. Reviewed by indepe...,(- A ministra do Planejamento já sugeriu 30% d...
79493,"[('2024-03-24 18:03:50+00', 'ig')]","[(Yo tenía, ponele, 12 años y me acuerdo que, ...",['Partly false information. Reviewed by indepe...,(EL DELIRIO DEL LARRETA ES TOTAL: Larreta acab...
79494,"[('2024-03-24 21:42:20+00', 'tw')]","[(LA Não dou, não dou, não dou, LA I don&#39;t...",,(E a pergunta que não quer calar..... Quem foi...


In [12]:
# Display the fact-check frame after the literal-eval pass over `claim` and `title`.
df_fact_checks

Unnamed: 0_level_0,claim,instances,title
fact_check_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"( Are avocados good for you?, Are avocados go...","[(1525653998.0, 'https://metafact.io/factcheck...",
1,"( Can animals have headaches?, Can animals ha...","[(1617955634.0, 'https://metafact.io/factcheck...",
2,"( Can we help prevent Alzheimer's with diet?, ...","[(1525653998.0, 'https://metafact.io/factcheck...",
3,( Do any benefits of alcohol outweigh the risk...,"[(1525653998.0, 'https://metafact.io/factcheck...",
4,"( Does acupuncture work for headaches?, Does ...","[(1617955595.0, 'https://metafact.io/factcheck...",
...,...,...,...
372883,(Claudia Sheinbaum quiere cambiar la Constituc...,"[('2024-05-28 02:48:00+02', 'https://factual.a...","(Agence France-Presse ..., France Media Agency..."
372884,(Embaixador de Israel levanta cartaz comemoran...,"[('2024-05-27 22:34:00+02', 'https://checamos....","(Agence France-Presse ..., France Media Agency..."
372891,(The viral video shows African protesting at Q...,"[('2022-09-26 02:00:00+02', 'https://www.newsm...",(Fact Check: Viral Video Does NOT Show African...
372893,(Instagram hikayede ekran görüntüsü (SS) alınc...,"[('2017-07-12 02:00:00+02', 'https://teyit.org...",(Instagram hikayede ekran görüntüsü (SS) alınc...


In [13]:
# Read the task definition file from the same dataset root as the CSV files, reusing
# `our_dataset_path` rather than repeating the folder name. `reset_index()` moves the
# JSON row labels into a regular column named 'index'.
lingual_data = pd.read_json(os.path.join(our_dataset_path, 'tasks.json')).reset_index()

In [14]:
# Display the task frame read from tasks.json (columns: index, monolingual, crosslingual).
lingual_data

Unnamed: 0,index,monolingual,crosslingual
0,ara,"{'fact_checks': [12, 18, 19, 20, 21, 161, 4844...",
1,deu,"{'fact_checks': [455, 803, 923, 968, 969, 1381...",
2,eng,"{'fact_checks': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,...",
3,fra,"{'fact_checks': [15, 136, 155, 156, 187, 214, ...",
4,msa,"{'fact_checks': [1096, 1461, 3178, 3459, 3504,...",
5,pol,"{'fact_checks': [53, 55, 95, 970, 1056, 1548, ...",
6,por,"{'fact_checks': [13, 14, 37, 40, 42, 43, 44, 4...",
7,spa,"{'fact_checks': [56, 58, 59, 64, 65, 73, 99, 1...",
8,tha,"{'fact_checks': [8144, 8145, 8857, 10587, 2064...",
9,tur,"{'fact_checks': [145, 560, 983, 3184, 3192, 38...",


In [15]:
# Move the id indexes (fact_check_id / post_id) back into ordinary columns so they can be
# used as merge keys later. NOTE: this rebinds the names, so re-running the cell resets
# the index a second time.
df_fact_checks, df_posts = df_fact_checks.reset_index(), df_posts.reset_index()

In [16]:
# Print how many ids the crosslingual task lists for fact checks and for test posts.
for key in ('fact_checks', 'posts_test'):
    crosslingual_ids = lingual_data.loc[lingual_data['index'] == key, 'crosslingual'].iloc[0]
    print(f'crosslingual_{key}', len(crosslingual_ids))

crosslingual_fact_checks 272447
crosslingual_posts_test 4000


In [17]:
# One row per language: expand each monolingual dict into columns (fact_checks, posts_test)
# and tag the row with its language code.
# The rows are filtered first and the language labels are taken from the SAME filtered rows.
# Labels are then attached by position within that selection, so they stay correct even when
# the rows without a monolingual entry are not the last rows of `lingual_data`.
monolingual_rows = lingual_data.dropna(subset=['monolingual'])
df = pd.DataFrame(monolingual_rows['monolingual'].tolist())
df['post_lang'] = monolingual_rows['index'].to_numpy()
df

Unnamed: 0,fact_checks,posts_test,post_lang
0,"[12, 18, 19, 20, 21, 161, 4844, 7348, 7349, 73...","[28205, 28227, 28401, 28641, 28881, 28950, 290...",ara
1,"[455, 803, 923, 968, 969, 1381, 1439, 1469, 14...","[28173, 28222, 28253, 28276, 28319, 28329, 283...",deu
2,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 22,...","[28112, 28221, 28274, 28451, 28650, 28687, 286...",eng
3,"[15, 136, 155, 156, 187, 214, 584, 1187, 1193,...","[28277, 28387, 28456, 28502, 28567, 28597, 287...",fra
4,"[1096, 1461, 3178, 3459, 3504, 4849, 4930, 494...","[28150, 28553, 28671, 28701, 29170, 29248, 295...",msa
5,"[53, 55, 95, 970, 1056, 1548, 1549, 3038, 3046...","[28094, 28095, 28102, 28178, 28193, 28204, 282...",pol
6,"[13, 14, 37, 40, 42, 43, 44, 45, 46, 47, 48, 4...","[28158, 28543, 29046, 29164, 29261, 29384, 297...",por
7,"[56, 58, 59, 64, 65, 73, 99, 175, 193, 209, 30...","[28258, 28334, 28342, 28361, 28393, 28434, 286...",spa
8,"[8144, 8145, 8857, 10587, 20642, 33944, 33945,...","[28399, 28649, 28688, 28724, 28964, 29092, 290...",tha
9,"[145, 560, 983, 3184, 3192, 3896, 4523, 5177, ...","[29970, 30010, 30157, 30851, 31031, 31278, 316...",tur


In [18]:
# Build two long tables from the per-language id lists:
#   df2: fact_check_id -> post_lang
#   df3: post_id       -> post_lang
# One small frame is collected per language and concatenated once at the end. Growing
# the result with `pd.concat` inside the loop would re-copy all earlier rows on every
# iteration. Each row of `df` is read directly, so no per-language filtering is needed.
fact_frames = []
post_frames = []

for lang, fact_ids, post_ids in zip(df['post_lang'], df['fact_checks'], df['posts_test']):
    fact_series = pd.Series(fact_ids)
    post_series = pd.Series(post_ids)

    fact_frames.append(pd.DataFrame({'fact_check_id': fact_series, 'post_lang': lang}))
    post_frames.append(pd.DataFrame({'post_id': post_series, 'post_lang': lang}))

    # Per-language sizes: language code, number of fact checks, number of test posts.
    print(lang, len(fact_series), len(post_series))

# Drop rows with missing ids and force the id columns to plain integers.
df2 = (
    pd.concat(fact_frames, ignore_index=True)
    .dropna()
    .astype({'fact_check_id': int})
)
df3 = (
    pd.concat(post_frames, ignore_index=True)
    .dropna()
    .astype({'post_id': int})
)

ara 21153 500
deu 7485 500
eng 145287 500
fra 6316 500
msa 686 93
pol 8796 500
por 32598 500
spa 25440 500
tha 583 183
tur 12536 500


In [19]:
# Count the rows of df3 (post ids) per language code.
df3['post_lang'].value_counts()

post_lang
ara    500
deu    500
eng    500
fra    500
pol    500
por    500
spa    500
tur    500
tha    183
msa     93
Name: count, dtype: int64

In [20]:
# Attach the cleaned text and the language code to every fact check (left joins keep
# all rows of the left-hand frame).
df_fact_check_2 = (
    df_fact_checks
    .merge(df_fact_check_clean, how='left', on='fact_check_id')
    .merge(df2, how='left', on='fact_check_id')
)

# Same for the posts, keyed on post_id.
df_posts_2 = (
    df_posts
    .merge(df_post_clean, how='left', on='post_id')
    .merge(df3, how='left', on='post_id')
)

In [21]:
# Display the merged fact-check frame (original columns plus clean_text and post_lang).
df_fact_check_2

Unnamed: 0,fact_check_id,claim,instances,title,clean_text,post_lang
0,0,"( Are avocados good for you?, Are avocados go...","[(1525653998.0, 'https://metafact.io/factcheck...",,Are avocados good for you?,eng
1,1,"( Can animals have headaches?, Can animals ha...","[(1617955634.0, 'https://metafact.io/factcheck...",,Can animals have headaches?,eng
2,2,"( Can we help prevent Alzheimer's with diet?, ...","[(1525653998.0, 'https://metafact.io/factcheck...",,Can we help prevent Alzheimer's with diet?,eng
3,3,( Do any benefits of alcohol outweigh the risk...,"[(1525653998.0, 'https://metafact.io/factcheck...",,Do any benefits of alcohol outweigh the risks?,eng
4,4,"( Does acupuncture work for headaches?, Does ...","[(1617955595.0, 'https://metafact.io/factcheck...",,Does acupuncture work for headaches?,eng
...,...,...,...,...,...,...
272442,372883,(Claudia Sheinbaum quiere cambiar la Constituc...,"[('2024-05-28 02:48:00+02', 'https://factual.a...","(Agence France-Presse ..., France Media Agency...",Claudia Sheinbaum wants to change the Constitu...,spa
272443,372884,(Embaixador de Israel levanta cartaz comemoran...,"[('2024-05-27 22:34:00+02', 'https://checamos....","(Agence France-Presse ..., France Media Agency...",Israeli Ambassador raises poster commemorating...,por
272444,372891,(The viral video shows African protesting at Q...,"[('2022-09-26 02:00:00+02', 'https://www.newsm...",(Fact Check: Viral Video Does NOT Show African...,The viral video shows African protesting at Qu...,eng
272445,372893,(Instagram hikayede ekran görüntüsü (SS) alınc...,"[('2017-07-12 02:00:00+02', 'https://teyit.org...",(Instagram hikayede ekran görüntüsü (SS) alınc...,Do you get notifications when you take a scree...,tur


In [22]:
# Write both merged frames to CSV in the working directory, without the row index.
for frame, filename in (
    (df_posts_2, 'test_posts_text.csv'),
    (df_fact_check_2, 'test_fact_checks_text.csv'),
):
    frame.to_csv(filename, index=False)