In [None]:
import random
import typing as t
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

In [None]:
# Labelled comments from the Jigsaw Toxic Comment Classification Challenge. Used below as the
# label lookup table: a `comment_text` column plus one column per class in CLS_LIST.
train_2017_df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')
# Validation pairs: each row provides a `less_toxic` and a `more_toxic` comment text.
valid_df = pd.read_csv('/kaggle/input/jt-combined/valid.csv')

In [None]:
# Class columns read from the labelled frame. The order matters: it fixes both the word order
# of the readable label and the token positions of the bitmap label.
CLS_LIST = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
def _build_readable_label(row: t.Dict[str, int]) -> str:
    """Return the names of the classes set on ``row``, separated by spaces.

    Names are emitted in ``CLS_LIST`` order. A row in which no class value is
    truthy yields the empty string.
    """
    active_classes = (name for name in CLS_LIST if row[name])
    return ' '.join(active_classes)


def _build_bitmap_label(row: t.Dict[str, int]) -> str:
    """Return the class values of ``row`` as a space-separated string.

    One token is produced per entry of ``CLS_LIST``, in that order
    (e.g. ``'1 0 1 0 0 0'`` when the values are 0/1 flags).
    """
    flags = map(str, (row[name] for name in CLS_LIST))
    return ' '.join(flags)


def assign_label_to_comment(df: pd.DataFrame, labels_df: pd.DataFrame) -> pd.DataFrame:
    """Attach class labels from ``labels_df`` to both comments of every pair in ``df``.

    Args:
        df: Frame with ``less_toxic`` and ``more_toxic`` comment-text columns.
        labels_df: Frame with a ``comment_text`` column and one column per
            entry of ``CLS_LIST``.

    Returns:
        A new frame with one row per row of ``df`` and the columns
        ``less_toxic``, ``less_toxic_readable_label``, ``less_toxic_bitmap_label``,
        ``more_toxic``, ``more_toxic_readable_label``, ``more_toxic_bitmap_label``
        (present even when ``df`` is empty). Comments are matched by exact
        text. When a text occurs several times in ``labels_df`` the first
        occurrence wins; when it does not occur at all, both of its label
        columns are ``''``.
    """
    result_columns = [
        'less_toxic', 'less_toxic_readable_label', 'less_toxic_bitmap_label',
        'more_toxic', 'more_toxic_readable_label', 'more_toxic_bitmap_label',
    ]

    # Index the label table once (text -> position of its first occurrence) so that every
    # lookup is a dict hit instead of a full scan of ``labels_df``.
    first_pos_by_text: t.Dict[t.Any, int] = {}
    for pos, text in enumerate(labels_df['comment_text']):
        first_pos_by_text.setdefault(text, pos)

    # The same comment usually appears in many pairs; build its label strings only once.
    label_cache: t.Dict[str, t.Tuple[str, str]] = {}

    def _labels_for(comment_text: str) -> t.Tuple[str, str]:
        """Return ``(readable_label, bitmap_label)``, or ``('', '')`` for an unknown text."""
        if comment_text not in label_cache:
            pos = first_pos_by_text.get(comment_text)
            if pos is None:
                label_cache[comment_text] = ('', '')
            else:
                label_row = labels_df.iloc[pos]
                label_cache[comment_text] = (
                    _build_readable_label(label_row),
                    _build_bitmap_label(label_row),
                )
        return label_cache[comment_text]

    result_row_list = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        less_toxic_comment_text = str(row['less_toxic'])
        more_toxic_comment_text = str(row['more_toxic'])
        less_toxic_readable_label, less_toxic_bitmap_label = _labels_for(less_toxic_comment_text)
        more_toxic_readable_label, more_toxic_bitmap_label = _labels_for(more_toxic_comment_text)
        result_row_list.append({
            'less_toxic': less_toxic_comment_text,
            'less_toxic_readable_label': less_toxic_readable_label,
            'less_toxic_bitmap_label': less_toxic_bitmap_label,
            'more_toxic': more_toxic_comment_text,
            'more_toxic_readable_label': more_toxic_readable_label,
            'more_toxic_bitmap_label': more_toxic_bitmap_label,
        })
    # Passing ``columns`` keeps the schema stable when ``df`` has no rows.
    return pd.DataFrame(result_row_list, columns=result_columns)

In [None]:
# Look up the class labels from `train_2017_df` for both comments of every validation pair.
valid_with_labels_df = assign_label_to_comment(df=valid_df, labels_df=train_2017_df)

In [None]:
# Keep only the pairs whose two comments both ended up with a non-empty readable label.
# NOTE(review): the readable label is also '' for a comment that was matched but has no class
# set, so such comments are dropped here together with the unmatched ones; confirm this is intended.
less_has_label = valid_with_labels_df['less_toxic_readable_label'] != ''
more_has_label = valid_with_labels_df['more_toxic_readable_label'] != ''
analyze_cls_label_df = valid_with_labels_df[less_has_label & more_has_label]

Are there any samples where `less_toxic` and `more_toxic` comments have the same labels?

In [None]:
# Share of the analysed pairs whose two comments carry exactly the same bitmap label.
same_bitmap_mask = analyze_cls_label_df['less_toxic_bitmap_label'] == analyze_cls_label_df['more_toxic_bitmap_label']
len(analyze_cls_label_df[same_bitmap_mask]) / len(analyze_cls_label_df)

Let's consider a pair of `(less_toxic_bitmap_label, more_toxic_bitmap_label)` ambiguous if there is at least one other pair of comments in the validation set with the two labels swapped, i.e. where a comment with `less_toxic_bitmap_label` is ranked as more toxic than a comment with `more_toxic_bitmap_label`. How many ambiguous samples do we have?

In [None]:
def get_amb_label_pair_set(df: pd.DataFrame) -> t.Set[t.Tuple[str, str]]:
    """Collect the bitmap-label pairs that occur in both orders in ``df``.

    Two distinct labels ``a`` and ``b`` form an ambiguous pair when ``df`` has
    at least one row with ``less_toxic_bitmap_label == a`` and
    ``more_toxic_bitmap_label == b`` and at least one row with the two labels
    swapped.

    Args:
        df: Frame with ``less_toxic_bitmap_label`` and
            ``more_toxic_bitmap_label`` columns.

    Returns:
        The ambiguous pairs, each stored once as ``(min(a, b), max(a, b))``.
        Rows whose two labels are equal never contribute.
    """
    # One pass to collect the distinct (less, more) combinations; checking for the swapped
    # combination is then a set lookup rather than a scan of the whole frame per row.
    observed_pairs = set(zip(df['less_toxic_bitmap_label'], df['more_toxic_bitmap_label']))

    amb_label_pair_set: t.Set[t.Tuple[str, str]] = set()
    for less_label, more_label in observed_pairs:
        less_toxic_bitmap_label, more_toxic_bitmap_label = str(less_label), str(more_label)
        if less_toxic_bitmap_label == more_toxic_bitmap_label:
            continue
        if (more_toxic_bitmap_label, less_toxic_bitmap_label) in observed_pairs:
            amb_label_pair_set.add((
                min(less_toxic_bitmap_label, more_toxic_bitmap_label),
                max(less_toxic_bitmap_label, more_toxic_bitmap_label),
            ))
    return amb_label_pair_set


def count_rows_with_bitmap_labels(df: pd.DataFrame, bitmap_label_set: t.Set[t.Tuple[str, str]]) -> int:
    """Count the rows of ``df`` whose label pair, in either order, is in ``bitmap_label_set``.

    Args:
        df: Frame with ``less_toxic_bitmap_label`` and
            ``more_toxic_bitmap_label`` columns.
        bitmap_label_set: Label pairs to look for. A pair matches a row
            regardless of which label sits in which column.

    Returns:
        The number of matching rows. Every row is counted at most once, even
        if ``bitmap_label_set`` contains both ``(a, b)`` and ``(b, a)``.
    """
    # Compare as unordered pairs so that one pass over the rows is enough.
    wanted_pairs = {frozenset(pair) for pair in bitmap_label_set}
    if not wanted_pairs:
        return 0
    row_pairs = zip(df['less_toxic_bitmap_label'], df['more_toxic_bitmap_label'])
    return sum(1 for pair in row_pairs if frozenset(pair) in wanted_pairs)

In [None]:
# Share of the analysed pairs whose combination of bitmap labels is ambiguous.
amb_label_pair_set = get_amb_label_pair_set(analyze_cls_label_df)
amb_row_count = count_rows_with_bitmap_labels(analyze_cls_label_df, amb_label_pair_set)
amb_row_count / len(analyze_cls_label_df)

To sum up: for 19.78% of the validation samples analysed above, the toxicity labels from the [Jigsaw Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) are identical for both comments. Moreover, for 63% of those samples the labels are ambiguous. This raises the question of how useful those toxicity labels are for the ranking task.