# Data

In [1]:
import os

import pandas as pd

In [2]:
HOME_DATA_DIR = os.path.join('../data')
assert os.path.isdir(HOME_DATA_DIR)

## Raw data

In [3]:
data_fpath_dict = {
    'english': {
        'train': os.path.join(HOME_DATA_DIR, 'english', 'v3', 'v3', 'covid19_disinfo_binary_english_train.tsv'),
        'dev': os.path.join(HOME_DATA_DIR, 'english', 'v3', 'v3', 'covid19_disinfo_binary_english_dev_input.tsv'),
        'test': os.path.join(HOME_DATA_DIR, 'english', 'test-input', 'test-input',
                             'covid19_disinfo_binary_english_test_input.tsv'),
    },
    'bulgarian': {
        'train': os.path.join(HOME_DATA_DIR, 'bulgarian', 'v1', 'v1', 'covid19_disinfo_binary_bulgarian_train.tsv'),
        'dev': os.path.join(HOME_DATA_DIR, 'bulgarian', 'v1', 'v1', 'covid19_disinfo_binary_bulgarian_dev.tsv'),
        'test': os.path.join(HOME_DATA_DIR, 'bulgarian', 'test-input', 'test-input',
                             'covid19_disinfo_binary_bulgarian_test_input.tsv'),
    },
    'arabic': {
        'train': os.path.join(HOME_DATA_DIR, 'arabic', 'v3', 'v3', 'covid19_disinfo_binary_arabic_train.tsv'),
        'dev': os.path.join(HOME_DATA_DIR, 'arabic', 'v3', 'v3', 'covid19_disinfo_binary_arabic_dev.tsv'),
        'test': os.path.join(HOME_DATA_DIR, 'arabic', 'test-input', 'test-input',
                             'covid19_disinfo_binary_arabic_test_input.tsv'),
    },

}

In [4]:
dfs_dict = {}
for lang, di in data_fpath_dict.items():
    for split_name, fpath in di.items():
        dfs_dict[lang, split_name] = pd.read_csv(fpath, sep='\t', encoding='utf-8', na_filter=False)

In [5]:
for (lang, split_name), df in dfs_dict.items():
    print(f'language: {lang}, split: {split_name}, total samples: {len(df)}')

language: english, split: train, total samples: 869
language: english, split: dev, total samples: 53
language: english, split: test, total samples: 418
language: bulgarian, split: train, total samples: 3000
language: bulgarian, split: dev, total samples: 350
language: bulgarian, split: test, total samples: 357
language: arabic, split: train, total samples: 2536
language: arabic, split: dev, total samples: 520
language: arabic, split: test, total samples: 1000


## Prepared data (train and dev splits)


In [6]:
table_dict = {
    'Language': [],
    'train': [],
    'dev': [],
}
for lang in ['en', 'bg', 'ar']:
    table_dict['Language'].append(lang)
    for split_name in ['train', 'dev']:
        table_dict[split_name].append(
            len(pd.read_csv(os.path.join(HOME_DATA_DIR, 'prepared_additional', f'{split_name}.{lang}.tsv'), sep='\t',
                            encoding='utf-8', na_filter=False))
        )

In [7]:
pd.DataFrame(table_dict)

Unnamed: 0,Language,train,dev
0,en,869,53
1,bg,3000,350
2,ar,2536,520


In [8]:
for lang in ['english', 'bulgarian', 'arabic']:
    print(f'*** {lang} ***')
    for split_name in ['train', 'dev']:
        print(f'{split_name}:')
        for col_name in [f'q{idx + 1}_label' for idx in range(7)]:
            print(col_name, ':', dfs_dict['bulgarian', split_name][col_name].value_counts().to_dict())
    print()

*** english ***
train:
q1_label : {'yes': 1933, 'no': 1067}
q2_label : {'no': 1897, 'nan': 1039, 'yes': 64}
q3_label : {'yes': 1910, 'nan': 1035, 'no': 55}
q4_label : {'no': 1770, 'nan': 1049, 'yes': 181}
q5_label : {'no': 1557, 'nan': 1051, 'yes': 392}
q6_label : {'no': 2680, 'yes': 316, 'nan': 4}
q7_label : {'no': 2655, 'yes': 300, 'nan': 45}
dev:
q1_label : {'yes': 315, 'no': 35}
q2_label : {'no': 316, 'nan': 29, 'yes': 5}
q3_label : {'yes': 308, 'nan': 30, 'no': 12}
q4_label : {'no': 288, 'nan': 37, 'yes': 25}
q5_label : {'no': 254, 'yes': 62, 'nan': 34}
q6_label : {'no': 288, 'yes': 62}
q7_label : {'no': 275, 'yes': 69, 'nan': 6}

*** bulgarian ***
train:
q1_label : {'yes': 1933, 'no': 1067}
q2_label : {'no': 1897, 'nan': 1039, 'yes': 64}
q3_label : {'yes': 1910, 'nan': 1035, 'no': 55}
q4_label : {'no': 1770, 'nan': 1049, 'yes': 181}
q5_label : {'no': 1557, 'nan': 1051, 'yes': 392}
q6_label : {'no': 2680, 'yes': 316, 'nan': 4}
q7_label : {'no': 2655, 'yes': 300, 'nan': 45}
dev:
q1

**Distribution of labels**

In [9]:
label_distribution_table = {
    'language': [],
    'split': [],
}
label_distribution_table.update({
    f'Q{idx + 1}': [] for idx in range(7)
})

for lang in ['english', 'bulgarian', 'arabic']:
    for split_name in ['train', 'dev']:
        label_distribution_table['language'].append(lang)
        label_distribution_table['split'].append(split_name)
        for idx in range(7):
            value_counts = dfs_dict[lang, split_name][f'q{idx + 1}_label'].value_counts()
            label_distribution_table[f'Q{idx + 1}'].append(f'{value_counts["yes"]} Y / {value_counts["no"]} N')

In [10]:
pd.DataFrame(label_distribution_table)

Unnamed: 0,language,split,Q1,Q2,Q3,Q4,Q5,Q6,Q7
0,english,train,569 Y / 300 N,39 Y / 460 N,510 Y / 51 N,156 Y / 409 N,185 Y / 384 N,138 Y / 729 N,229 Y / 634 N
1,english,dev,27 Y / 26 N,4 Y / 20 N,22 Y / 5 N,11 Y / 16 N,12 Y / 15 N,6 Y / 47 N,8 Y / 45 N
2,bulgarian,train,1933 Y / 1067 N,64 Y / 1897 N,1910 Y / 55 N,181 Y / 1770 N,392 Y / 1557 N,316 Y / 2680 N,300 Y / 2655 N
3,bulgarian,dev,315 Y / 35 N,5 Y / 316 N,308 Y / 12 N,25 Y / 288 N,62 Y / 254 N,62 Y / 288 N,69 Y / 275 N
4,arabic,train,1926 Y / 610 N,376 Y / 1545 N,1895 Y / 22 N,351 Y / 1566 N,936 Y / 990 N,459 Y / 2075 N,2208 Y / 328 N
5,arabic,dev,225 Y / 295 N,12 Y / 210 N,221 Y / 4 N,23 Y / 201 N,107 Y / 118 N,41 Y / 478 N,379 Y / 141 N


In [11]:
print(pd.DataFrame(label_distribution_table).to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
  language &  split &               Q1 &              Q2 &             Q3 &              Q4 &              Q5 &              Q6 &              Q7 \\
\midrule
   english &  train &    569 Y / 300 N &    39 Y / 460 N &   510 Y / 51 N &   156 Y / 409 N &   185 Y / 384 N &   138 Y / 729 N &   229 Y / 634 N \\
   english &    dev &      27 Y / 26 N &      4 Y / 20 N &     22 Y / 5 N &     11 Y / 16 N &     12 Y / 15 N &      6 Y / 47 N &      8 Y / 45 N \\
 bulgarian &  train &  1933 Y / 1067 N &   64 Y / 1897 N &  1910 Y / 55 N &  181 Y / 1770 N &  392 Y / 1557 N &  316 Y / 2680 N &  300 Y / 2655 N \\
 bulgarian &    dev &     315 Y / 35 N &     5 Y / 316 N &   308 Y / 12 N &    25 Y / 288 N &    62 Y / 254 N &    62 Y / 288 N &    69 Y / 275 N \\
    arabic &  train &   1926 Y / 610 N &  376 Y / 1545 N &  1895 Y / 22 N &  351 Y / 1566 N &   936 Y / 990 N &  459 Y / 2075 N &  2208 Y / 328 N \\
    arabic &    dev &    225 Y / 295 N &    12 Y / 210 N &   

## Look at some data samples

**English**

In [12]:
sorted(dfs_dict['english', 'train']['tweet_text'].to_list(), key=lambda x: len(x))[:10]

['It’s corona time URL',
 'Now they want to clean',
 'this is fucking bullshit',
 'But bitch y’all said— URL',
 'Exhibit A ð\x9f\x98\xadð\x9f¥´ URL URL',
 'COVID-19 health advice⚠️ URL',
 'chinese corona virus be like URL',
 'self quarantine #coronavirus URL',
 'Thank you, keep up the great work!',
 'Day 5: We have rediscovered farming']

In [13]:
sorted_df = dfs_dict['english', 'train'].sort_values(by=['tweet_text'], key=lambda x: x.str.len())

In [14]:
sorted_df.iloc[8]

tweet_no                                     303
tweet_text    Thank you, keep up the great work!
q1_label                                      no
q2_label                                     nan
q3_label                                     nan
q4_label                                     nan
q5_label                                     nan
q6_label                                      no
q7_label                                      no
Name: 302, dtype: object

**Bulgarian**

In [15]:
sorted(dfs_dict['bulgarian', 'train']['text'].to_list(), key=lambda x: len(x))[:10]

['Ж. 80. Covid пневмония.',
 'Туит на италиански. 😈 Corona ciao.',
 'Короната е умрена Да живее #Covid19',
 'Пълното име на вируса било Covid-1984',
 'Ей тъй са прай. https://t.co/byATViqUGE',
 'Шах и мат зоофоби.) https://t.co/YSz7x3WEF5',
 'И още малко сайънс. https://t.co/WMfJewLD4h',
 'Цял месец. Само 71. https://t.co/e6UF96FrLy',
 'Още малко благини ☺️ https://t.co/eZEiAwSTse',
 'Чат ботът COVID-19 във Вайбър е много удобен']

In [16]:
sorted_df = dfs_dict['bulgarian', 'train'].sort_values(by=['text'], key=lambda x: x.str.len())

In [17]:
sorted_df.iloc[6]


tweet_no                                           1319
text        Цял месец. Само 71. https://t.co/e6UF96FrLy
q1_label                                            yes
q2_label                                             no
q3_label                                            yes
q4_label                                             no
q5_label                                             no
q6_label                                             no
q7_label                                             no
Name: 1318, dtype: object

**Arabic**

In [18]:
sorted(dfs_dict['arabic', 'train']['text'].to_list(), key=lambda x: len(x))[:10]

['طيب هذي دعواتهم',
 'ما هو دي دعواتنا',
 'كوفيد ١٩ برو ماكس',
 'ماذا بعد #كورونا يا أهل قطر ؟😉',
 'لقاح الاديان اهم من لقاح #كورونا',
 '#كورونا راحت تقابل #كورونا 😊ً URL',
 'كورونا لما يشوف اللي بيحصل في اسكندريه URL',
 'ما هو أفضل علاج لفيروس كورونا كوفيد-١٩؟ URL',
 '❌ علاج ❌ لقاح ✅ مناعة https://t.co/WkoTQy7ok5',
 'روسيا تعلن عن أول لقاح مضاد لفيروس كورونا #كورونا']

In [19]:
sorted_df = dfs_dict['arabic', 'train'].sort_values(by=['text'], key=lambda x: x.str.len())

In [20]:
sorted_df.iloc[2]




tweet_no                   44
text        كوفيد ١٩ برو ماكس
q1_label                   no
q2_label                  nan
q3_label                  nan
q4_label                  nan
q5_label                  nan
q6_label                  yes
q7_label                  yes
Name: 43, dtype: object