# Data

In [1]:
import os

import pandas as pd

In [2]:
HOME_DATA_DIR = os.path.join('../data')
assert os.path.isdir(HOME_DATA_DIR)

## Raw data

In [3]:
data_fpath_dict = {
    'english': {
        'train': os.path.join(HOME_DATA_DIR, 'english', 'v1', 'v1', 'covid19_disinfo_binary_english_train.tsv'),
        'dev': os.path.join(HOME_DATA_DIR, 'english', 'v2', 'v2', 'covid19_disinfo_binary_english_dev_input.tsv'),
        'test': os.path.join(HOME_DATA_DIR, 'english', 'test-input', 'test-input', 'covid19_disinfo_binary_english_test_input.tsv'),
    },
    'bulgarian': {
        'train': os.path.join(HOME_DATA_DIR, 'bulgarian', 'v1', 'v1', 'covid19_disinfo_binary_bulgarian_train.tsv'),
        'dev': os.path.join(HOME_DATA_DIR, 'bulgarian', 'v1', 'v1', 'covid19_disinfo_binary_bulgarian_dev.tsv'),
        'test': os.path.join(HOME_DATA_DIR, 'bulgarian', 'test-input', 'test-input', 'covid19_disinfo_binary_bulgarian_test_input.tsv'),
    },
    'arabic': {
        'train': os.path.join(HOME_DATA_DIR, 'arabic', 'v1', 'v1', 'covid19_disinfo_binary_arabic_train.tsv'),
        'dev': os.path.join(HOME_DATA_DIR, 'arabic', 'v2', 'v2', 'covid19_disinfo_binary_arabic_dev.tsv'),
        'test': os.path.join(HOME_DATA_DIR, 'arabic', 'test-input', 'test-input', 'covid19_disinfo_binary_arabic_test_input.tsv'),
    },

}

In [4]:
dfs_dict = {}
for lang, di in data_fpath_dict.items():
    for split_name, fpath in di.items():
        dfs_dict[lang, split_name] = pd.read_csv(fpath, sep='\t', encoding='utf-8', na_filter=False)

In [5]:
for (lang, split_name), df in dfs_dict.items():
    print(f'language: {lang}, split: {split_name}, total samples: {len(df)}')

language: english, split: train, total samples: 451
language: english, split: dev, total samples: 53
language: english, split: test, total samples: 418
language: bulgarian, split: train, total samples: 3000
language: bulgarian, split: dev, total samples: 350
language: bulgarian, split: test, total samples: 357
language: arabic, split: train, total samples: 198
language: arabic, split: dev, total samples: 20
language: arabic, split: test, total samples: 1000


## Prepared data (train and dev splits)


In [6]:
table_dict = {
    'Language': [],
    'train': [],
    'dev': [],
}
for lang in ['en', 'bg', 'ar']:
    table_dict['Language'].append(lang)
    for split_name in ['train', 'dev']:
        table_dict[split_name].append(
            len(pd.read_csv(os.path.join(HOME_DATA_DIR, 'prepared', f'{split_name}.{lang}.tsv'), sep='\t', encoding='utf-8', na_filter=False))
        )

In [7]:
pd.DataFrame(table_dict)

Unnamed: 0,Language,train,dev
0,en,451,53
1,bg,3000,350
2,ar,198,20


In [8]:
for lang in ['english', 'bulgarian', 'arabic']:
    print(f'*** {lang} ***')
    for split_name in ['train', 'dev']:
        print(f'{split_name}:')
        for col_name in [f'q{idx + 1}_label' for idx in range(7)]:
            print(col_name, ':', dfs_dict['bulgarian', split_name][col_name].value_counts().to_dict())
    print()

*** english ***
train:
q1_label : {'yes': 1933, 'no': 1067}
q2_label : {'no': 1897, 'nan': 1039, 'yes': 64}
q3_label : {'yes': 1910, 'nan': 1035, 'no': 55}
q4_label : {'no': 1770, 'nan': 1049, 'yes': 181}
q5_label : {'no': 1557, 'nan': 1051, 'yes': 392}
q6_label : {'no': 2680, 'yes': 316, 'nan': 4}
q7_label : {'no': 2655, 'yes': 300, 'nan': 45}
dev:
q1_label : {'yes': 315, 'no': 35}
q2_label : {'no': 316, 'nan': 29, 'yes': 5}
q3_label : {'yes': 308, 'nan': 30, 'no': 12}
q4_label : {'no': 288, 'nan': 37, 'yes': 25}
q5_label : {'no': 254, 'yes': 62, 'nan': 34}
q6_label : {'no': 288, 'yes': 62}
q7_label : {'no': 275, 'yes': 69, 'nan': 6}

*** bulgarian ***
train:
q1_label : {'yes': 1933, 'no': 1067}
q2_label : {'no': 1897, 'nan': 1039, 'yes': 64}
q3_label : {'yes': 1910, 'nan': 1035, 'no': 55}
q4_label : {'no': 1770, 'nan': 1049, 'yes': 181}
q5_label : {'no': 1557, 'nan': 1051, 'yes': 392}
q6_label : {'no': 2680, 'yes': 316, 'nan': 4}
q7_label : {'no': 2655, 'yes': 300, 'nan': 45}
dev:
q1