In [None]:
import typing

import numpy as np

from datasets import Dataset, load_dataset
from pandas import DataFrame
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast


# Tokenizer used by the cells below to measure prompt/response lengths.
tokenizer = AutoTokenizer.from_pretrained(
    "haoranxu/X-ALMA-13B-Group2",
    padding_side="left",
)

In [8]:
def compute_stats(title, lengths):
    """Print min/max/mean/std of ``lengths`` under an underlined heading.

    Args:
        title: Heading printed first; the dashed underline has the same length.
        lengths: Numeric values accepted by ``np.min``, ``np.max``,
            ``np.mean`` and ``np.std``.

    Returns:
        None. The report is written to stdout and ends with a blank line.
    """
    reducers = {"min": np.min, "max": np.max, "mean": np.mean, "std": np.std}
    # Evaluate every statistic up front so nothing is printed if one of them fails.
    summary = {label: reduce(lengths) for label, reduce in reducers.items()}

    underline = "-" * len(title)
    print(title, underline, sep="\n")

    for label in summary:
        # Left-align the label in a 20-character column; show two decimals.
        print(f"{label:<20}: {summary[label]:.2f}")

    print()

In [None]:
def dataset_statistics(dataset: Dataset, tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast) -> DataFrame:
    """Measure tokenized prompt/response lengths and print summary statistics.

    Each example is mapped to two extra columns, ``prompt_len`` and
    ``response_len``, holding ``len(tokenizer.tokenize(...))`` of the
    example's ``"prompt"`` and ``"response"`` fields.

    Args:
        dataset: Dataset whose examples expose ``"prompt"`` and ``"response"``.
        tokenizer: Tokenizer whose ``tokenize`` method is used for counting.

    Returns:
        The mapped dataset converted to a pandas ``DataFrame``, including the
        two length columns.
    """

    def token_counts(example):
        # One tokenizer call per field; only the token count is kept.
        return {
            "prompt_len": len(tokenizer.tokenize(example["prompt"])),
            "response_len": len(tokenizer.tokenize(example["response"])),
        }

    with_lengths = dataset.map(token_counts)
    # typing.cast is a no-op at runtime; it only narrows the type for checkers.
    statistics_df = typing.cast(DataFrame, with_lengths.to_pandas())

    # Report the prompt statistics first, then the response statistics.
    for column, heading in (("prompt_len", "prompt"), ("response_len", "response")):
        compute_stats(heading, statistics_df[column])

    return statistics_df

In [None]:
# Token-length statistics for the BeaverTails "330k_train" split.
print("PKU-Alignment/BeaverTails - 330k_train")
dataset = typing.cast(
    Dataset,
    load_dataset("PKU-Alignment/BeaverTails", split="330k_train"),
)
df_train = dataset_statistics(dataset, tokenizer)


prompt
------
min                 : 1.00
max                 : 315.00
mean                : 17.23
std                 : 12.31
response
--------
min                 : 1.00
max                 : 487.00
mean                : 79.99
std                 : 50.42


In [None]:
# Token-length statistics for the BeaverTails "330k_test" split.
print("PKU-Alignment/BeaverTails - 330k_test")
dataset = typing.cast(
    Dataset,
    load_dataset("PKU-Alignment/BeaverTails", split="330k_test"),
)
df_test = dataset_statistics(dataset, tokenizer)


prompt
------
min                 : 1.00
max                 : 229.00
mean                : 16.90
std                 : 11.36

response
--------
min                 : 1.00
max                 : 423.00
mean                : 79.77
std                 : 49.02



In [None]:
# Train rows ordered by prompt length, ties broken by response length (ascending is the default).
df_train.sort_values(by=["prompt_len", "response_len"])

In [None]:
# Test rows ordered by prompt length, ties broken by response length (ascending is the default).
df_test.sort_values(by=["prompt_len", "response_len"])