# Data

In [1]:
import os

import pandas as pd

In [2]:
# Root directory of the data files; a relative path, so it is resolved against
# the current working directory of the kernel.
HOME_DATA_DIR = '../data'
# Fail fast, and show the resolved location, when the directory is not where the notebook expects it.
assert os.path.isdir(HOME_DATA_DIR), f'data directory not found: {os.path.abspath(HOME_DATA_DIR)}'

## Raw data

In [3]:
# File names of the raw TSV files, keyed by language and then by split name.
# NOTE(review): the English dev file is the only one carrying an `_input`
# suffix — confirm that this is the intended file.
_raw_fnames = {
    'english': {
        'train': 'covid19_disinfo_binary_english_train.tsv',
        'dev': 'covid19_disinfo_binary_english_dev_input.tsv',
    },
    'bulgarian': {
        'train': 'covid19_disinfo_binary_bulgarian_train.tsv',
        'dev': 'covid19_disinfo_binary_bulgarian_dev.tsv',
    },
    'arabic': {
        'train': 'covid19_disinfo_binary_arabic_train.tsv',
        'dev': 'covid19_disinfo_binary_arabic_dev.tsv',
    },
}

# Full path of every raw file: <HOME_DATA_DIR>/data_<language>_v1/v1/<file name>.
data_fpath_dict = {
    language: {
        split: os.path.join(HOME_DATA_DIR, f'data_{language}_v1', 'v1', fname)
        for split, fname in fnames_by_split.items()
    }
    for language, fnames_by_split in _raw_fnames.items()
}

In [4]:
# Options shared by every raw file: tab-separated, UTF-8 encoded, and with
# pandas' missing-value detection turned off (na_filter=False).
raw_read_kwargs = dict(sep='\t', encoding='utf-8', na_filter=False)

# One DataFrame per raw file, keyed by the (language, split name) tuple.
dfs_dict = {
    (language, split): pd.read_csv(raw_fpath, **raw_read_kwargs)
    for language, fpath_by_split in data_fpath_dict.items()
    for split, raw_fpath in fpath_by_split.items()
}

In [5]:
# Report the number of rows in each loaded raw split.
for key, df in dfs_dict.items():
    lang, split_name = key  # keys are (language, split name) tuples
    print(f'language: {lang}, split: {split_name}, total samples: {len(df)}')

language: english, split: train, total samples: 451
language: english, split: dev, total samples: 53
language: bulgarian, split: train, total samples: 3000
language: bulgarian, split: dev, total samples: 350
language: arabic, split: train, total samples: 198
language: arabic, split: dev, total samples: 20


## Prepared data (train and dev splits)


In [6]:
# Column-oriented table of prepared-split sizes: one entry per language code
# in each of the 'Language', 'train' and 'dev' lists.
table_dict = {'Language': [], 'train': [], 'dev': []}

for lang in ['en', 'bg', 'ar']:
    table_dict['Language'].append(lang)
    for split_name in ['train', 'dev']:
        # Prepared files live at <HOME_DATA_DIR>/prepared/<split>.<language code>.tsv.
        prepared_fpath = os.path.join(HOME_DATA_DIR, 'prepared', f'{split_name}.{lang}.tsv')
        prepared_df = pd.read_csv(prepared_fpath, sep='\t', encoding='utf-8', na_filter=False)
        table_dict[split_name].append(len(prepared_df))

In [7]:
# Render the collected prepared-split sizes as a table (one row per language code).
pd.DataFrame(table_dict)

Unnamed: 0,Language,train,dev
0,en,375,76
1,bg,3000,350
2,ar,165,33
