# Quality Checks for the Translated BeaverTails Dataset

This notebook performs a series of quality assurance checks on the translated version of the BeaverTails dataset.

Quality checks:
- Length ratio
- Token ratio
- Question preserved
- Exclamation preserved
- Truncated translations

In [None]:
import typing

from datasets import Dataset, load_dataset
from pandas import DataFrame
from transformers import AutoTokenizer

from src.evaluation.checks import exclamation_preserved, length_ratio, question_preserved, token_ratio

In [None]:
# Token-count threshold passed to check_truncation: translations whose token
# count is greater than or equal to this value are reported as truncated.
# NOTE(review): presumably the generation cap used when translating - confirm.
NEW_MAX_TOKENS = 512
# Horizontal rule printed between the groups of check outputs.
SEPARATOR = "-" * 80

In [None]:
# Tokenizer used by get_tokens to split both the original and the translated
# text of each example into tokens.
# NOTE(review): presumably the tokenizer of the model that produced the
# translations - confirm.
tokenizer = AutoTokenizer.from_pretrained("haoranxu/X-ALMA-13B-Group2")


def get_tokens(example: dict[str, str], src_field: str, mt_field: str) -> dict[str, str | list[str]]:
    """Pair a source text with its translation and tokenize both.

    Args:
        example: One dataset record, indexed by column name.
        src_field: Name of the column holding the original text.
        mt_field: Name of the column holding the translated text.

    Returns:
        A dict with the two texts under ``"src"`` / ``"mt"`` and the output of
        the module-level ``tokenizer.tokenize`` for each of them under
        ``"src_tokens"`` / ``"mt_tokens"``.
    """
    source_text = example[src_field]
    translated_text = example[mt_field]

    return {
        "src": source_text,
        "mt": translated_text,
        "src_tokens": tokenizer.tokenize(source_text),
        "mt_tokens": tokenizer.tokenize(translated_text),
    }

In [None]:
def compute_stats(df: DataFrame) -> DataFrame:
    """Add the per-example quality-check columns to ``df``.

    Expects the columns produced by ``get_tokens``: ``src``, ``mt``,
    ``src_tokens`` and ``mt_tokens``.  The following columns are added:

    - ``length_ratio``: ``length_ratio(src, mt)``
    - ``token_ratio``: ``token_ratio(src_tokens, mt_tokens)``
    - ``src_token_len`` / ``mt_token_len``: ``len`` of each token sequence
    - ``contains_question`` / ``contains_exclamation``: whether ``src``
      contains ``"?"`` / ``"!"``
    - ``question_preserved`` / ``exclamation_preserved``: result of the
      corresponding check called with ``(src, mt)``

    Values are built by iterating the columns directly rather than with
    ``df.apply(..., axis=1)``: a row-wise ``apply`` on an empty frame can
    return an empty DataFrame instead of a Series, which cannot be assigned to
    a single column, and it also constructs a Series object for every row.

    Args:
        df: Frame to annotate.  It is modified in place.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    src, mt = df["src"], df["mt"]
    src_tokens, mt_tokens = df["src_tokens"], df["mt_tokens"]

    # length and token ratios
    df["length_ratio"] = [length_ratio(s, m) for s, m in zip(src, mt)]
    df["token_ratio"] = [token_ratio(s, m) for s, m in zip(src_tokens, mt_tokens)]

    # token lengths
    df["src_token_len"] = [len(tokens) for tokens in src_tokens]
    df["mt_token_len"] = [len(tokens) for tokens in mt_tokens]

    # question marks
    df["contains_question"] = ["?" in s for s in src]
    df["question_preserved"] = [question_preserved(s, m) for s, m in zip(src, mt)]

    # exclamation marks
    df["contains_exclamation"] = ["!" in s for s in src]
    df["exclamation_preserved"] = [exclamation_preserved(s, m) for s, m in zip(src, mt)]

    return df

In [None]:
def check_truncation(df: DataFrame, new_max_tokens: int, field: str = "prompt"):
    """Report translations whose token count reaches ``new_max_tokens``.

    Prints how many rows have ``mt_token_len >= new_max_tokens`` and, when
    there are any, displays their ``src``, ``mt`` and ``mt_token_len`` columns
    ordered from the longest translation to the shortest.

    Args:
        df: Frame that contains the ``src``, ``mt`` and ``mt_token_len`` columns.
        new_max_tokens: Token count at or above which a translation is reported.
        field: Label used in the printed message.
    """
    reaches_limit = df["mt_token_len"] >= new_max_tokens
    suspects = df[reaches_limit]
    print(f"Number of truncated {field} translations: ", len(suspects))

    if suspects.empty:
        return

    shown_columns = ["src", "mt", "mt_token_len"]
    longest_first = suspects[shown_columns].sort_values("mt_token_len", ascending=False)
    display(longest_first)
    print()


def check_questions(df: DataFrame, field: str = "prompt"):
    """Report rows flagged as containing a question that was not preserved.

    Prints the number of rows where ``contains_question`` is true and
    ``question_preserved`` is false, and displays those rows when there are any.

    Args:
        df: Frame that contains the ``contains_question`` and
            ``question_preserved`` boolean columns.
        field: Label used in the printed message.
    """
    has_question = df["contains_question"]
    kept_in_translation = df["question_preserved"]
    lost = df[has_question & ~kept_in_translation]
    print(f"Number of {field}s with question but not preserved: ", len(lost))

    if lost.empty:
        return

    display(lost)
    print()


def check_exclamations(df: DataFrame, field: str = "prompt"):
    """Report rows flagged as containing an exclamation that was not preserved.

    Prints the number of rows where ``contains_exclamation`` is true and
    ``exclamation_preserved`` is false, and displays those rows when there are
    any.

    Args:
        df: Frame that contains the ``contains_exclamation`` and
            ``exclamation_preserved`` boolean columns.
        field: Label used in the printed message.
    """
    has_exclamation = df["contains_exclamation"]
    kept_in_translation = df["exclamation_preserved"]
    lost = df[has_exclamation & ~kept_in_translation]
    print(f"Number of {field}s with exclamation but not preserved: ", len(lost))

    if lost.empty:
        return

    display(lost)
    print()


def display_length_ratio(df: DataFrame, sort_by: str = "length_ratio", ascending: bool = False, field: str = "prompt"):
    """Display the length and token statistics ordered by one column.

    Prints a title built from ``field`` and then displays the ``src``, ``mt``,
    token-length and ratio columns of ``df`` sorted by ``sort_by``.

    Args:
        df: Frame that contains the columns listed in ``shown_columns`` below.
        sort_by: Column to order the displayed table by.
        ascending: Sort direction forwarded to ``sort_values``.
        field: Label used (capitalized) in the printed title.
    """
    shown_columns = ["src", "mt", "src_token_len", "mt_token_len", "length_ratio", "token_ratio"]
    # Selecting a list of columns yields a new frame and sort_values returns
    # another one, so the caller's frame is never modified.
    stats = df[shown_columns]

    print(f"{field.capitalize()} Length and Token Ratio Analysis")
    ordered = stats.sort_values(sort_by, ascending=ascending)
    display(ordered)
    print()

## PKU-Alignment/BeaverTails

In [None]:
# Load the translated dataset; every record carries a prompt/response pair
# together with their "*_it" translations.
dataset = load_dataset("saiteki-kai/BeaverTails-it", split="330k_test")

# Raw text columns dropped after get_tokens has copied the relevant pair into
# the "src" / "mt" columns.
raw_text_columns = ("prompt", "prompt_it", "response", "response_it")

# Prompts: tokenize the pair, drop the raw text, convert to pandas.
dataset_prompts = typing.cast(
    Dataset,
    dataset.map(lambda example: get_tokens(example, "prompt", "prompt_it")).remove_columns(list(raw_text_columns)),
)
dataset_prompts_df = typing.cast(DataFrame, dataset_prompts.to_pandas())

# Responses: same pipeline applied to the response pair.
dataset_responses = typing.cast(
    Dataset,
    dataset.map(lambda example: get_tokens(example, "response", "response_it")).remove_columns(list(raw_text_columns)),
)
dataset_responses_df = typing.cast(DataFrame, dataset_responses.to_pandas())

In [None]:
# Add the quality-check columns (ratios, token lengths, punctuation flags) to
# the prompt and response frames.
dataset_prompts_df = compute_stats(dataset_prompts_df)
dataset_responses_df = compute_stats(dataset_responses_df)

In [None]:
# Each check runs on the prompts first and then on the responses; a separator
# line is printed between the groups of checks.
labelled_frames = (("prompt", dataset_prompts_df), ("response", dataset_responses_df))

for label, frame in labelled_frames:
    check_truncation(frame, NEW_MAX_TOKENS, label)
print(SEPARATOR)

for label, frame in labelled_frames:
    check_questions(frame, label)
print(SEPARATOR)

for label, frame in labelled_frames:
    check_exclamations(frame, label)
print(SEPARATOR)

# Smallest length ratios are listed first.
for label, frame in labelled_frames:
    display_length_ratio(frame, sort_by="length_ratio", ascending=True, field=label)

## PKU-Alignment/BeaverTails-Evaluation

In [None]:
# Load the translated evaluation set and tokenize each prompt together with
# its "prompt_it" translation; the original columns are kept in this frame.
dataset = load_dataset("saiteki-kai/BeaverTails-it-Evaluation", split="test")

dataset_prompts = typing.cast(
    Dataset,
    dataset.map(lambda example: get_tokens(example, "prompt", "prompt_it")),
)
dataset_prompts_df = typing.cast(DataFrame, dataset_prompts.to_pandas())

In [None]:
# Add the quality-check columns (ratios, token lengths, punctuation flags) to
# the evaluation prompt frame.
dataset_prompts_df = compute_stats(dataset_prompts_df)

In [None]:
# Only prompts are checked for this dataset; the arguments below spell out the
# defaults of the helper functions.
check_truncation(dataset_prompts_df, NEW_MAX_TOKENS, field="prompt")
print(SEPARATOR)

check_questions(dataset_prompts_df, field="prompt")
print(SEPARATOR)

check_exclamations(dataset_prompts_df, field="prompt")
print(SEPARATOR)

# Largest length ratios are listed first.
# NOTE(review): the BeaverTails section sorts ascending instead - confirm the
# difference is intended.
display_length_ratio(dataset_prompts_df, sort_by="length_ratio", ascending=False, field="prompt")