# Explore Educational Datasets
Interactive exploration and validation for Phase 3 data preparation.

In [None]:
import sys, json, logging
from pathlib import Path

# Configure the root logger so INFO-level (and higher) messages are shown.
logging.basicConfig(level=logging.INFO)

# The project root is taken to be the parent of the current working directory.
# Prepend it to sys.path, unless it is already listed, so the `src` package
# can be imported.
root = Path.cwd().parent
if sys.path.count(str(root)) == 0:
    sys.path[:0] = [str(root)]

from src.data_preparation import (
    load_eli5_dataset, load_sciq_dataset, load_openbookqa_dataset, load_wikihow_dataset,
    format_for_instruction_tuning, validate_instruction_format,
)


In [None]:
# Load a small sample of the 'train' split from each dataset, passing the
# same max_samples cap to every loader.
sample_cap = 50
eli5 = load_eli5_dataset('train', max_samples=sample_cap)
sciq = load_sciq_dataset('train', max_samples=sample_cap)
obqa = load_openbookqa_dataset('train', max_samples=sample_cap)
wkh = load_wikihow_dataset('train', max_samples=sample_cap)

# Last expression of the cell: the four sample counts, shown as a tuple.
tuple(map(len, (eli5, sciq, obqa, wkh)))

In [None]:
# Format the leading rows (at most 5) of every dataset for instruction tuning.
fmt_eli5 = format_for_instruction_tuning(
    eli5.select(range(min(len(eli5), 5))),
    'eli5',
)
fmt_sciq = format_for_instruction_tuning(
    sciq.select(range(min(len(sciq), 5))),
    'sciq',
)
fmt_obqa = format_for_instruction_tuning(
    obqa.select(range(min(len(obqa), 5))),
    'openbookqa',
)
fmt_wkh = format_for_instruction_tuning(
    wkh.select(range(min(len(wkh), 5))),
    'wikihow',
)

# Preview the first formatted record of each dataset as indented JSON, or
# print a placeholder message when a formatted result is empty.
for label, formatted in (
    ('ELI5', fmt_eli5),
    ('SciQ', fmt_sciq),
    ('OBQA', fmt_obqa),
    ('WikiHow', fmt_wkh),
):
    if formatted:
        print(json.dumps(formatted[0], indent=2))
    else:
        print(f'No {label} sample')


In [None]:
# Validate all formatted examples together in one call.
all_formatted = fmt_eli5 + fmt_sciq + fmt_obqa + fmt_wkh
validation = validate_instruction_format(all_formatted)
is_valid, errs = validation

# Report the overall verdict and only the first five errors.
print('Valid:', is_valid)
print('Errors:', errs[0:5])
