# Minimal Dataset Peek
This working notebook gives a tiny, fast view into the prepared ReasonIR HQ dataset.

Cells: load → inspect → quick stats. Add more below as needed.

In [None]:
# Imports & config
import os, json, textwrap
from datasets import load_from_disk

DATASET_PATH = 'data/prepared_reasonir_hq'  # adjust if needed

# Validate with an explicit raise: `assert` statements are stripped when Python
# runs with -O, which would let a missing directory fall through to a far less
# helpful error inside load_from_disk.
if not os.path.isdir(DATASET_PATH):
    raise FileNotFoundError(
        f'Missing dataset dir: {DATASET_PATH}. Run reason-prepare first.'
    )

ds = load_from_disk(DATASET_PATH)

# load_from_disk may hand back a DatasetDict (a dict subclass keyed by split
# name) or a single Dataset. Only do the 'train' key lookup on the dict form:
# on a plain Dataset, `'train' in ds` is not a split lookup and would fall back
# to scanning the rows.
train = ds['train'] if isinstance(ds, dict) and 'train' in ds else ds
print(f'Train examples: {len(train):,}')

In [None]:
# Show one sample (index 0)
# `i` selects which row of `train` to preview; change it to inspect another row.
i = 0
# `ex` is the selected record; the lines below read its fields with .get(...).
ex = train[i]
def fmt_pair(p):
    """Return a compact preview of an (instruction, text) pair.

    A list or tuple with at least two items becomes a dict holding the first
    item under 'instruction' and the first 300 elements/characters of the
    second item under 'text'. Anything else is returned unchanged.
    """
    if not isinstance(p, (list, tuple)) or len(p) < 2:
        return p
    instruction, body = p[0], p[1]
    return {'instruction': instruction, 'text': body[:300]}
# Normalise the query to a single string before truncating. A missing or None
# query becomes '' instead of raising TypeError on the slice, and a multi-part
# (list/tuple) query is joined with spaces -- the same treatment the length
# stats cell applies -- so the 300 limit counts characters, not list elements.
query_text = ex.get('query') or ''
if isinstance(query_text, (list, tuple)):
    query_text = ' '.join(map(str, query_text))

# Truncated preview of the record: query text plus the formatted pos/neg fields.
sample = {
    'query': query_text[:300],
    'pos': fmt_pair(ex.get('pos')),
    'neg': fmt_pair(ex.get('neg')),
}

import pprint
pprint.pp(sample)

In [None]:
# Tiny length stats over a small slice
# Look at no more than the first 50 rows so the cell stays quick.
SLICE = min(50, len(train))
queries = []
positive_texts = []
for j in range(SLICE):
    ex = train[j]
    # `or ''` also covers a key that is present but holds None, which
    # .get('query', '') alone would pass through and break len() below.
    q = ex.get('query') or ''
    if isinstance(q, (list, tuple)):
        # Multi-part query: flatten to one string; str() guards non-string parts.
        q = ' '.join(map(str, q))
    queries.append(q)
    pos = ex.get('pos')
    # Only count positives shaped like a pair; the second item is measured.
    if isinstance(pos, (list, tuple)) and len(pos) >= 2:
        positive_texts.append(pos[1])

print(f'Stats over first {SLICE} examples:')
# Guard both averages: an empty dataset leaves `queries` empty, and dividing by
# len(queries) would raise ZeroDivisionError.
if queries:
    print(f'Avg query chars: {sum(len(q) for q in queries)/len(queries):.1f}')
if positive_texts:
    print(f'Avg positive doc chars: {sum(len(t) for t in positive_texts)/len(positive_texts):.1f}')
print('Done.')