In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# Load the training split of the dataset registered under the id "squad".
dataset = load_dataset(path="squad", split="train")

# Load the tokenizer identified by "llama3".
# NOTE(review): "llama3" is presumably a local directory or alias rather than
# a full Hub repo id -- confirm it resolves in a fresh environment.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="llama3")

# Per-example token counter, intended to be passed to `Dataset.map`.
def get_token_length(example):
    """Count the tokens in an example's ``context`` and ``question`` fields.

    Uses the module-level ``tokenizer`` and calls ``tokenizer.encode`` with
    its default arguments.
    NOTE(review): with default arguments, any special tokens the tokenizer
    adds are presumably included in the counts -- confirm this is intended.

    Args:
        example: Mapping that provides ``'context'`` and ``'question'`` text.

    Returns:
        dict with the integer entries ``'context_token_length'`` and
        ``'question_token_length'``.
    """
    lengths = {}
    for field in ('context', 'question'):
        token_ids = tokenizer.encode(example[field])
        lengths[f'{field}_token_length'] = len(token_ids)
    return lengths

# Draw a reproducible random sample: shuffle with a fixed seed, keep the first 1000 rows.
sampled_dataset = dataset.shuffle(seed=42).select(range(1000))

# Add the per-example token-length columns to the sample.
token_lengths = sampled_dataset.map(get_token_length)

# Extract each length column once so the three statistics below reuse the
# same array instead of re-reading the column from the dataset every time.
context_lengths = np.asarray(token_lengths['context_token_length'])
question_lengths = np.asarray(token_lengths['question_token_length'])

# Mean length.
avg_context_length = np.mean(context_lengths)
avg_question_length = np.mean(question_lengths)

# The dataset loaded above is "squad" (SQuAD v1.1), not "squad_v2", so the
# header names the version that is actually being measured.
print("SQuAD v1.1 数据集采样数据的平均token长度:")
print(f"上下文 (Context): {avg_context_length:.2f}")
print(f"问题 (Question): {avg_question_length:.2f}")

# Maximum length.
max_context_length = np.max(context_lengths)
max_question_length = np.max(question_lengths)

print("\n最大token长度:")
print(f"上下文 (Context): {max_context_length}")
print(f"问题 (Question): {max_question_length}")

# 90th-percentile length.
percentile_90_context = np.percentile(context_lengths, 90)
percentile_90_question = np.percentile(question_lengths, 90)

print("\n90%分位数的token长度:")
print(f"上下文 (Context): {percentile_90_context:.2f}")
print(f"问题 (Question): {percentile_90_question:.2f}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

SQuAD v2 数据集采样数据的平均token长度:
上下文 (Context): 160.19
问题 (Question): 13.41

最大token长度:
上下文 (Context): 741
问题 (Question): 33

90%分位数的token长度:
上下文 (Context): 245.00
问题 (Question): 19.00


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# Load the training split of the dataset registered under the id "hh-rlhf".
# This rebinds the notebook-wide `dataset` name.
dataset = load_dataset(path="hh-rlhf", split="train")

# Load the tokenizer identified by "llama3" (the same identifier as the
# previous cell; it is not the GPT-2 tokenizer).
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="llama3")

# Per-example token counter, intended to be passed to `Dataset.map`.
# This definition replaces the earlier `get_token_length` from the first cell.
def get_token_length(example):
    """Count the tokens in an example's ``chosen`` and ``rejected`` fields.

    Uses the module-level ``tokenizer`` and calls ``tokenizer.encode`` with
    its default arguments.
    NOTE(review): with default arguments, any special tokens the tokenizer
    adds are presumably included in the counts -- confirm this is intended.

    Args:
        example: Mapping that provides ``'chosen'`` and ``'rejected'`` text.

    Returns:
        dict with the integer entries ``'chosen_token_length'`` and
        ``'rejected_token_length'``.
    """
    lengths = {}
    for field in ('chosen', 'rejected'):
        token_ids = tokenizer.encode(example[field])
        lengths[f'{field}_token_length'] = len(token_ids)
    return lengths

# Draw a reproducible random sample: shuffle with a fixed seed, keep the first 1000 rows.
shuffled_dataset = dataset.shuffle(seed=42)
sampled_dataset = shuffled_dataset.select(range(1000))

# Add the per-example token-length columns to the sample.
token_lengths = sampled_dataset.map(get_token_length)

# Materialise both length columns as arrays for the statistics below.
chosen_lengths = np.asarray(token_lengths['chosen_token_length'])
rejected_lengths = np.asarray(token_lengths['rejected_token_length'])

# Mean length.
avg_chosen_length = chosen_lengths.mean()
avg_rejected_length = rejected_lengths.mean()

print("hh-rlhf 数据集采样数据的平均token长度:")
print(f"被选择的回复 (Chosen): {avg_chosen_length:.2f}")
print(f"被拒绝的回复 (Rejected): {avg_rejected_length:.2f}")

# Maximum length.
max_chosen_length = chosen_lengths.max()
max_rejected_length = rejected_lengths.max()

print("\n最大token长度:")
print(f"被选择的回复 (Chosen): {max_chosen_length}")
print(f"被拒绝的回复 (Rejected): {max_rejected_length}")

# 90th-percentile length.
percentile_90_chosen = np.percentile(chosen_lengths, 90)
percentile_90_rejected = np.percentile(rejected_lengths, 90)

print("\n90%分位数的token长度:")
print(f"被选择的回复 (Chosen): {percentile_90_chosen:.2f}")
print(f"被拒绝的回复 (Rejected): {percentile_90_rejected:.2f}")

Downloading data:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/743k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/875k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

hh-rlhf 数据集采样数据的平均token长度:
被选择的回复 (Chosen): 220.70
被拒绝的回复 (Rejected): 220.43

最大token长度:
被选择的回复 (Chosen): 1860
被拒绝的回复 (Rejected): 1860

90%分位数的token长度:
被选择的回复 (Chosen): 459.40
被拒绝的回复 (Rejected): 453.10
