# Data: GermEval21 toxic comment training set

In [1]:
import os

import pandas as pd

In [2]:
# Directory holding the GermEval21 CSV files. The path is relative, so it is
# resolved against the kernel's current working directory.
HOME_DATA_DIR = '../data/GermEval21_Toxic_Train'
# Fail early and report the absolute path that was checked, since a bare
# AssertionError gives no hint when the notebook is started from another directory.
assert os.path.isdir(HOME_DATA_DIR), f'data directory not found: {os.path.abspath(HOME_DATA_DIR)}'

## Raw data

In [3]:
# Path of the raw training CSV inside the data directory.
raw_data_fpath = os.path.join(HOME_DATA_DIR, 'GermEval21_Toxic_Train.csv')
# Stop here with a readable message if the file is missing, instead of letting
# a later read fail or raising an empty AssertionError.
assert os.path.isfile(raw_data_fpath), f'raw data file not found: {raw_data_fpath}'

In [4]:
# Parse the raw training CSV (comma-separated, UTF-8) into a DataFrame.
raw_data_df = pd.read_csv(
    raw_data_fpath,
    sep=',',
    encoding='utf-8',
)

In [5]:
# Show the column index of the raw frame; as the cell's last expression, its repr is rendered.
raw_data_df.columns

Index(['comment_id', 'comment_text', 'Sub1_Toxic', 'Sub2_Engaging',
       'Sub3_FactClaiming'],
      dtype='object')

In [6]:
# Per-column value counts for every column whose name does not start with
# 'comment' (i.e. everything except the comment_* columns).
for col_name in raw_data_df.columns:
    if not col_name.startswith('comment'):
        print(col_name, ':', raw_data_df[col_name].value_counts().to_dict())

Sub1_Toxic : {0: 2122, 1: 1122}
Sub2_Engaging : {0: 2379, 1: 865}
Sub3_FactClaiming : {0: 2141, 1: 1103}


In [7]:
# Number of rows in the raw frame.
print('total number of raw samples:', raw_data_df.shape[0])

total number of raw samples: 3244


In [8]:
# Joint label distribution: every sample is turned into the concatenation of its
# label values, in the order given by label_col_names, and the printed dict maps
# each such combination to the number of samples that have it.
label_col_names = ['Sub1_Toxic', 'Sub2_Engaging', 'Sub3_FactClaiming']
# Select only the label columns before the row-wise join, so the comment columns
# are not carried into every row object just to read three values.
print(raw_data_df[label_col_names].astype(str).agg(''.join, axis=1).value_counts().to_dict())

{'000': 1074, '100': 739, '001': 406, '011': 403, '010': 239, '101': 160, '111': 134, '110': 89}


## Prepared data (train and dev splits)


In [9]:
# For each prepared split: load its CSV from the data directory, print the value
# counts of every non-comment column, then print the number of rows.
for split_name in ['train', 'dev']:
    print(f'\n*** {split_name} ***')
    data_fpath = os.path.join(HOME_DATA_DIR, f'{split_name}.csv')
    # Name the missing file in the failure message; a bare assert would not say
    # which split could not be found.
    assert os.path.isfile(data_fpath), f'{split_name} split file not found: {data_fpath}'
    data_df = pd.read_csv(data_fpath, encoding='utf-8', sep=',')
    for col_name in data_df.columns:
        # Columns starting with 'comment' are skipped; only the remaining columns are counted.
        if not col_name.startswith('comment'):
            print(col_name, ':', data_df[col_name].value_counts().to_dict())

    print(f'total number of {split_name} samples:', len(data_df))


*** train ***
Sub1_Toxic : {0: 1690, 1: 905}
Sub2_Engaging : {0: 1909, 1: 686}
Sub3_FactClaiming : {0: 1713, 1: 882}
total number of train samples: 2595

*** dev ***
Sub1_Toxic : {0: 432, 1: 217}
Sub2_Engaging : {0: 470, 1: 179}
Sub3_FactClaiming : {0: 428, 1: 221}
total number of dev samples: 649
