In [57]:
from datasets import load_dataset, concatenate_datasets

# Fetch the WritingPrompts dataset; without a split argument this yields a
# mapping of split name -> dataset.
wp = load_dataset("euclaise/writingprompts")

# Merge every split that came back into one dataset so the statistics below
# describe the whole corpus rather than a single split.
all_splits = concatenate_datasets(list(wp.values()))

n = len(all_splits)
print("Total combined samples:", n)

# Single pass over the rows, accumulating whitespace-delimited word counts
# for both text fields at once.
total_prompt_words = 0
total_story_words = 0
for example in all_splits:
    prompt_tokens = example["prompt"].split()
    story_tokens = example["story"].split()
    total_prompt_words += len(prompt_tokens)
    total_story_words += len(story_tokens)

# Mean number of words per sample.
avg_prompt = total_prompt_words / n
avg_story = total_story_words / n

print("Average prompt length (words):", avg_prompt)
print("Average story length (words):", avg_story)


Total combined samples: 303358
Average prompt length (words): 28.398819216898847
Average story length (words): 553.1542830583007


In [105]:
from datasets import load_dataset

# Fetch the "pair" configuration of the ELI5 dataset and keep its train split.
eli5 = load_dataset("sentence-transformers/eli5", "pair")
train = eli5["train"]

n = len(train)
print("Number of samples:", n)

# Single pass over the rows, accumulating whitespace-delimited word counts
# for the question and the answer of each pair.
total_q_words = 0
total_a_words = 0
for ex in train:
    question_tokens = ex["question"].split()
    answer_tokens = ex["answer"].split()
    total_q_words += len(question_tokens)
    total_a_words += len(answer_tokens)

# Mean number of words per sample.
avg_q = total_q_words / n
avg_a = total_a_words / n

print("Average question length (words):", avg_q)
print("Average answer length (words):", avg_a)


Number of samples: 325475
Average question length (words): 15.072871956371458
Average answer length (words): 76.03513941162916


In [109]:
import re
import numpy as np
from datasets import load_dataset

# Fetch the StackExchange question-answering dataset, then keep only its
# train split for the statistics that follow.
se2_splits = load_dataset("PrimeIntellect/stackexchange-question-answering")
se2 = se2_splits["train"]

print("Number of samples:", len(se2))

def clean_question(raw_q):
    """Strip the prompt wrapper and return only the question text.

    The text returned is whatever follows the first ``Question:`` marker
    (leading whitespace skipped), up to but not including the first
    subsequent ``Now provide`` or, failing that, the end of the string.
    Surrounding whitespace is trimmed from the result.

    If ``raw_q`` contains no ``Question:`` marker, the whole input is
    returned with surrounding whitespace trimmed.

    NOTE(review): the capture is non-greedy, so a question whose own text
    contains the phrase "Now provide" is cut off at that phrase.
    """
    # re.S lets "." span newlines, so multi-line questions are captured whole.
    found = re.search(r"Question:\s*(.*?)(?:Now provide|$)", raw_q, flags=re.S)
    if found is None:
        return raw_q.strip()
    return found.group(1).strip()

# Collect per-sample word counts in ONE pass over the dataset. Building the
# two lists with separate comprehensions walks (and decodes) every row twice;
# a single loop yields the same lists with half the iteration work.
q_lengths = []
a_lengths = []
for ex in se2:
    # Question length is measured on the unwrapped question text only.
    q_lengths.append(len(clean_question(ex["prompt"]).split()))
    a_lengths.append(len(ex["gold_standard_solution"].split()))

# Mean number of whitespace-delimited words per sample.
print("Average question length (words):", np.mean(q_lengths))
print("Average answer length (words):", np.mean(a_lengths))


Number of samples: 313105
Average question length (words): 132.25116175085034
Average answer length (words): 219.61877325497835
