# Data

In [1]:
import os

import pandas as pd

In [2]:
# Data locations, relative to the notebook's working directory. Each path is
# built from its components rather than from a pre-joined string, so that
# os.path.join actually supplies the separator.
TRAIN_DATA_DIR = os.path.join('..', 'data', 'GermEval21_Toxic_Train')
TEST_DATA_DIR = os.path.join('..', 'data', 'GermEval21_Toxic_Test')
CROSS_VALIDATION_DATA_DIR = os.path.join('..', 'data', 'cross_validation')

# Fail fast, naming the missing directory, before any later cell reads from it.
assert os.path.isdir(TRAIN_DATA_DIR), f'missing directory: {TRAIN_DATA_DIR}'
assert os.path.isdir(TEST_DATA_DIR), f'missing directory: {TEST_DATA_DIR}'
assert os.path.isdir(CROSS_VALIDATION_DATA_DIR), f'missing directory: {CROSS_VALIDATION_DATA_DIR}'

## Raw data

In [3]:
# Path of the raw training CSV inside the training-data directory.
raw_data_fpath = os.path.join(TRAIN_DATA_DIR, 'GermEval21_Toxic_Train.csv')
# Stop here, with the offending path in the message, if the file is absent.
assert os.path.isfile(raw_data_fpath), f'missing file: {raw_data_fpath}'

In [4]:
# Parse the raw training CSV (comma-separated, UTF-8) into a DataFrame.
raw_data_df = pd.read_csv(
    raw_data_fpath,
    sep=',',
    encoding='utf-8',
)

In [5]:
# Display the column labels of the raw frame (bare expression -> cell output).
raw_data_df.columns

Index(['comment_id', 'comment_text', 'Sub1_Toxic', 'Sub2_Engaging',
       'Sub3_FactClaiming'],
      dtype='object')

In [6]:
# Report how often each value occurs in every column of the raw frame,
# except for columns whose name starts with 'comment'.
for col_name in raw_data_df.columns:
    if not col_name.startswith('comment'):
        class_counts = raw_data_df[col_name].value_counts().to_dict()
        print(col_name, ':', class_counts)

Sub1_Toxic : {0: 2122, 1: 1122}
Sub2_Engaging : {0: 2379, 1: 865}
Sub3_FactClaiming : {0: 2141, 1: 1103}


In [7]:
# Number of rows in the raw frame.
print('total number of raw samples:', raw_data_df.shape[0])

total number of raw samples: 3244


In [8]:
# The three label columns, in the order used to build the combination key.
label_col_names = ['Sub1_Toxic', 'Sub2_Engaging', 'Sub3_FactClaiming']

# Count how often each joint labelling occurs. Every row is reduced to a key
# made of its label values concatenated in the order above (e.g. '101').
# The key is built with vectorised string operations instead of a row-wise
# DataFrame.apply with a Python lambda; the resulting counts are the same.
label_strs = raw_data_df[label_col_names].astype(str)
label_patterns = label_strs[label_col_names[0]].str.cat(label_strs[label_col_names[1:]])
print(label_patterns.value_counts().to_dict())

{'000': 1074, '100': 739, '001': 406, '011': 403, '010': 239, '101': 160, '111': 134, '110': 89}


## Prepared data (train and dev splits)


In [9]:
# For the train and dev CSVs stored in the training-data directory, print the
# value counts of every non-'comment' column followed by the row count.
for split_name in ['train', 'dev']:
    print(f'\n*** {split_name} ***')
    data_fpath = os.path.join(TRAIN_DATA_DIR, f'{split_name}.csv')
    # Name the missing file so the failure is self-explanatory.
    assert os.path.isfile(data_fpath), f'missing file: {data_fpath}'
    data_df = pd.read_csv(data_fpath, encoding='utf-8', sep=',')
    for col_name in data_df.columns:
        # Columns whose name starts with 'comment' are skipped.
        if col_name.startswith('comment'):
            continue
        print(col_name, ':', data_df[col_name].value_counts().to_dict())

    print(f'total number of {split_name} samples:', len(data_df))


*** train ***
Sub1_Toxic : {0: 1690, 1: 905}
Sub2_Engaging : {0: 1909, 1: 686}
Sub3_FactClaiming : {0: 1713, 1: 882}
total number of train samples: 2595

*** dev ***
Sub1_Toxic : {0: 432, 1: 217}
Sub2_Engaging : {0: 470, 1: 179}
Sub3_FactClaiming : {0: 428, 1: 221}
total number of dev samples: 649


## Test data

In [10]:
# Load test.csv from the test-data directory and report its row count.
test_data_df = pd.read_csv(
    os.path.join(TEST_DATA_DIR, 'test.csv'),
    encoding='utf-8',
)
print(f'No. of test samples: {len(test_data_df)}')

No. of test samples: 944


In [11]:
# Load truth.csv from the test-data directory and print the value counts of
# every column whose name does not start with 'comment'.
test_labels_df = pd.read_csv(
    os.path.join(TEST_DATA_DIR, 'truth.csv'),
    encoding='utf-8',
)
for col_name in test_labels_df.columns:
    if not col_name.startswith('comment'):
        class_counts = test_labels_df[col_name].value_counts().to_dict()
        print(col_name, ':', class_counts)


Sub1_Toxic : {0: 594, 1: 350}
Sub2_Engaging : {0: 691, 1: 253}
Sub3_FactClaiming : {0: 630, 1: 314}


## Cross validation data

In [12]:
# For every cross-validation fold, read <fold>/train.csv and <fold>/dev.csv
# from the cross-validation directory and print the value counts of every
# non-'comment' column followed by the row count.
for fold_name in ['fold_A', 'fold_B', 'fold_C', 'fold_D', 'fold_E']:
    for split_name in ['train', 'dev']:
        print(f'\n*** {fold_name} {split_name} ***')
        data_fpath = os.path.join(CROSS_VALIDATION_DATA_DIR, fold_name, f'{split_name}.csv')
        # Name the missing file so a broken fold is identifiable from the error alone.
        assert os.path.isfile(data_fpath), f'missing file: {data_fpath}'
        data_df = pd.read_csv(data_fpath, encoding='utf-8', sep=',')
        for col_name in data_df.columns:
            # Columns whose name starts with 'comment' are skipped.
            if col_name.startswith('comment'):
                continue
            print(col_name, ':', data_df[col_name].value_counts().to_dict())

        print(f'total number of {split_name} samples:', len(data_df))




*** fold_A train ***
Sub1_Toxic : {0: 1690, 1: 905}
Sub2_Engaging : {0: 1909, 1: 686}
Sub3_FactClaiming : {0: 1713, 1: 882}
total number of train samples: 2595

*** fold_A dev ***
Sub1_Toxic : {0: 432, 1: 217}
Sub2_Engaging : {0: 470, 1: 179}
Sub3_FactClaiming : {0: 428, 1: 221}
total number of dev samples: 649

*** fold_B train ***
Sub1_Toxic : {0: 1689, 1: 907}
Sub2_Engaging : {0: 1910, 1: 686}
Sub3_FactClaiming : {0: 1713, 1: 883}
total number of train samples: 2596

*** fold_B dev ***
Sub1_Toxic : {0: 433, 1: 215}
Sub2_Engaging : {0: 469, 1: 179}
Sub3_FactClaiming : {0: 428, 1: 220}
total number of dev samples: 648

*** fold_C train ***
Sub1_Toxic : {0: 1716, 1: 879}
Sub2_Engaging : {0: 1902, 1: 693}
Sub3_FactClaiming : {0: 1713, 1: 882}
total number of train samples: 2595

*** fold_C dev ***
Sub1_Toxic : {0: 406, 1: 243}
Sub2_Engaging : {0: 477, 1: 172}
Sub3_FactClaiming : {0: 428, 1: 221}
total number of dev samples: 649

*** fold_D train ***
Sub1_Toxic : {0: 1686, 1: 909}
Sub2_