# Explore Prepared Dataset (Minimal)

Loads samples from the prepared dataset, builds a tiny cosine-similarity index using Qwen3-Embedding-0.6B embeddings, and lets you run a quick text query against it.

In [2]:
# Switch the kernel's working directory to the project checkout so that the
# relative DATA_DIR configured below resolves (presumably the repository root;
# adjust this path for your own machine).
%cd ~/development/lab/reasoning_embedder

/Users/shishirjoshi/development/lab/reasoning_embedder


In [3]:
# Minimal imports
import os, json, glob, math
from typing import List, Tuple

import numpy as np

# Try sentence-transformers first, fall back to transformers if needed
try:
    from sentence_transformers import SentenceTransformer
    _USE_ST = True
except Exception:
    from transformers import AutoTokenizer, AutoModel
    import torch
    _USE_ST = False

# --- Notebook configuration -------------------------------------------------
DATA_DIR = 'data/prepared_reasonir_hq'  # change if your prepared set lives elsewhere
MODEL_NAME = 'Qwen/Qwen3-Embedding-0.6B'

# Device selection only matters for the raw-transformers fallback: when
# sentence-transformers imported successfully (or torch was never imported)
# this stays 'cpu'. Otherwise prefer CUDA, then Apple MPS, then CPU.
if _USE_ST or 'torch' not in globals():
    DEVICE = 'cpu'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'

MAX_DOCS = 300  # keep minimal for a quick demo index
TOP_K = 5


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Minimal loader: support HF dataset saved to disk OR fallback JSONL scan
def _extract_texts_from_json(obj) -> List[str]:
    texts = []
    if isinstance(obj, dict):
        for key in ('document_text', 'text', 'doc', 'content'):
            if key in obj and isinstance(obj[key], str):
                texts.append(obj[key])
        for key in ('pos', 'neg', 'positives', 'negatives', 'pairs'):
            if key in obj and isinstance(obj[key], list):
                for item in obj[key]:
                    if isinstance(item, list) and len(item) == 2 and isinstance(item[1], str):
                        texts.append(item[1])
    elif isinstance(obj, list):
        if len(obj) == 2 and isinstance(obj[1], str):
            texts.append(obj[1])
        else:
            for it in obj:
                if isinstance(it, str):
                    texts.append(it)
    return texts

def load_corpus_from_prepared(data_dir: str, max_docs: int = 300) -> List[str]:
    """Collect at most ``max_docs`` document strings from a prepared dataset directory.

    Readers are tried in this order:

    1. ``datasets.load_from_disk`` -- only when ``dataset_dict.json`` exists.
       Uses the 'train' split if present, otherwise the first split. Per
       example it takes 'document_text' if it is a string; otherwise the text
       of the first pair under 'pos'/'neg'/'positives'/'negatives'/'pairs',
       plus the first string found under 'text'/'doc'/'content'.
    2. Direct pyarrow read of ``train/**/*.arrow`` -- used when step 1 raises.
       Takes the first text ``_extract_texts_from_json`` finds in each row.
    3. Recursive ``*.jsonl`` scan -- used when neither of the above produced
       a result. Takes every text ``_extract_texts_from_json`` finds per line;
       lines that are blank or not valid JSON are skipped.

    Args:
        data_dir: Directory containing the prepared dataset.
        max_docs: Upper bound on the number of returned texts; values <= 0
            return an empty list.

    Returns:
        A list of document strings, never longer than ``max_docs``. Empty when
        nothing could be loaded (a hint is printed in that case).
    """
    corpus: List[str] = []
    if max_docs <= 0:
        return corpus

    pair_keys = ('pos', 'neg', 'positives', 'negatives', 'pairs')
    ds_path = os.path.join(data_dir, 'dataset_dict.json')
    if os.path.exists(ds_path):
        # Try HuggingFace load_from_disk first
        try:
            from datasets import load_from_disk
            dsd = load_from_disk(data_dir)  # DatasetDict with splits like 'train'
            split = dsd['train'] if 'train' in dsd else list(dsd.values())[0]
            for ex in split.select(range(min(max_docs, len(split)))):
                # One example may contribute two texts, so enforce the cap here
                # rather than relying on the number of selected examples.
                if len(corpus) >= max_docs:
                    break
                if not isinstance(ex, dict):
                    continue
                if isinstance(ex.get('document_text'), str):
                    corpus.append(ex['document_text'])
                    continue
                for key in pair_keys:
                    val = ex.get(key)
                    if isinstance(val, list) and val:
                        itm = val[0]
                        if isinstance(itm, list) and len(itm) == 2 and isinstance(itm[1], str):
                            corpus.append(itm[1])
                            break
                if len(corpus) < max_docs:
                    for key in ('text', 'doc', 'content'):
                        if isinstance(ex.get(key), str):
                            corpus.append(ex[key])
                            break
            return corpus
        except Exception as e:
            print(f'Failed to read HF dataset at {data_dir}: {e}')
            # Discard anything gathered before the failure so the fallback
            # readers cannot duplicate documents or overshoot the cap.
            corpus = []
            # Fallback to reading Arrow directly via pyarrow
            try:
                import pyarrow as pa, pyarrow.ipc as pa_ipc
                # Sorted so the same files are read in the same order on every run.
                arrow_files = sorted(glob.glob(os.path.join(data_dir, 'train', '**', '*.arrow'), recursive=True))
                if not arrow_files:
                    arrow_files = sorted(glob.glob(os.path.join(data_dir, 'train', '*.arrow')))
                for ap in arrow_files:
                    with pa.memory_map(ap, 'r') as source:
                        try:
                            reader = pa_ipc.open_file(source)
                        except pa.ArrowInvalid:
                            # Not the IPC *file* format; Hugging Face datasets
                            # writes the IPC *stream* format, so retry as a stream.
                            source.seek(0)
                            reader = pa_ipc.open_stream(source)
                        # Materialize rows while the memory map is still open.
                        rows = reader.read_all().to_pylist()
                    for row in rows:
                        if len(corpus) >= max_docs:
                            return corpus
                        # Reuse JSON extractor on the row dict
                        texts = _extract_texts_from_json(row)
                        if texts:
                            corpus.append(texts[0])
                if corpus:
                    return corpus
            except Exception as e2:
                print(f'Failed to read Arrow directly at {data_dir}: {e2}')
            # fall through to JSONL scan
    # JSONL fallback
    jsonl_paths = sorted(glob.glob(os.path.join(data_dir, '**', '*.jsonl'), recursive=True))
    if not jsonl_paths:
        print(f'No HF dataset or JSONL files found in {data_dir}. Ensure you have run `reason-prepare`.')
        return corpus
    for p in jsonl_paths:
        with open(p, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except Exception:
                    # Best-effort scan: skip lines that are not valid JSON.
                    continue
                for t in _extract_texts_from_json(obj):
                    if t and isinstance(t, str):
                        corpus.append(t)
                        if len(corpus) >= max_docs:
                            return corpus
    return corpus

# Materialize the demo corpus once; the indexing and query cells reuse `corpus_texts`.
corpus_texts = load_corpus_from_prepared(data_dir=DATA_DIR, max_docs=MAX_DOCS)
print(f'Loaded {len(corpus_texts)} docs for the mini index.')


Failed to read HF dataset at data/prepared_reasonir_hq: Feature type 'List' not found. Available feature types: ['Value', 'ClassLabel', 'Translation', 'TranslationVariableLanguages', 'LargeList', 'Sequence', 'Array2D', 'Array3D', 'Array4D', 'Array5D', 'Audio', 'Image', 'Video', 'Pdf']
No HF dataset or JSONL files found in data/prepared_reasonir_hq. Ensure you have run `reason-prepare`.
Loaded 0 docs for the mini index.


In [5]:
# Load embedding model (minimal)
# NOTE: the `qwen3` architecture needs a recent `transformers` release (the model
# card asks for >= 4.51.0); older versions fail with "does not recognize this
# architecture" while loading the checkpoint.
if _USE_ST:
    model = SentenceTransformer(MODEL_NAME)

    def encode_texts(texts: List[str]) -> np.ndarray:
        """Embed `texts` with sentence-transformers.

        Returns a float32 array with one L2-normalized row per input text.
        """
        embs = model.encode(texts, convert_to_numpy=True, show_progress_bar=False, normalize_embeddings=True)
        return embs.astype(np.float32)
else:
    import torch

    # Left padding puts every sequence's final real token at the last position,
    # which is what last-token pooling (below) reads.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side='left')
    base_model = AutoModel.from_pretrained(MODEL_NAME)
    if DEVICE != 'cpu':
        base_model = base_model.to(DEVICE)
    base_model.eval()

    def _last_token_pool(last_hidden_states, attention_mask):
        """Return the hidden state of each sequence's last non-padding token."""
        left_padded = bool(attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padded:
            return last_hidden_states[:, -1]
        # Right-padded batch: index each row at its own final real token.
        last_idx = attention_mask.sum(dim=1) - 1
        rows = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
        return last_hidden_states[rows, last_idx]

    @torch.no_grad()
    def encode_texts(texts: List[str]) -> np.ndarray:
        """Embed `texts` with the raw transformers model, 32 texts per batch.

        Pools the last token, as the Qwen3-Embedding model card does. In a
        causal model the first position only attends to itself, so pooling
        position 0 would ignore everything after the first token.

        Returns a float32 array with one L2-normalized row per input text;
        an empty input yields an array with zero rows.
        """
        if not texts:
            # torch.cat([]) raises, so return an empty matrix of the right width.
            return np.zeros((0, base_model.config.hidden_size), dtype=np.float32)
        all_embs = []
        for i in range(0, len(texts), 32):
            batch = texts[i:i+32]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            if DEVICE != 'cpu':
                inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            outputs = base_model(**inputs)
            pooled = _last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])
            # L2 normalize
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
            all_embs.append(pooled.detach().cpu().float())
        embs = torch.cat(all_embs, dim=0).numpy()
        return embs.astype(np.float32)

print('Model ready.')


ValueError: The checkpoint you are trying to load has model type `qwen3` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git`

In [None]:
# Build a tiny cosine-sim index
if not corpus_texts:
    # Nothing was loaded: keep a zero-row placeholder so `doc_embs` is always defined.
    doc_embs = np.zeros((0, 1), dtype=np.float32)
else:
    # One embedding row per document, shape [N, D].
    doc_embs = encode_texts(corpus_texts)
    # Re-normalize defensively (some encoders already normalize); the epsilon
    # avoids dividing by zero for an all-zero row.
    norms = np.linalg.norm(doc_embs, axis=1, keepdims=True) + 1e-8
    doc_embs = doc_embs / norms

def search(query: str, top_k: int = TOP_K) -> List[Tuple[int, float]]:
    """Return the `top_k` most similar documents for `query`.

    Args:
        query: Free-text query; embedded with `encode_texts`.
        top_k: Maximum number of hits. Values larger than the index size are
            clamped to it; values <= 0 yield no hits.

    Returns:
        `(doc_index, cosine_similarity)` tuples sorted by descending
        similarity, where `doc_index` is a row of `doc_embs`. Empty when no
        documents are loaded.
    """
    n_docs = doc_embs.shape[0]
    k = min(top_k, n_docs)
    if len(corpus_texts) == 0 or k <= 0:
        return []
    q = encode_texts([query])[0]
    q = q / (np.linalg.norm(q) + 1e-8)
    sims = (doc_embs @ q)  # cosine since both normalized
    if k < n_docs:
        # Partial selection: the first k entries are the k largest similarities (unordered).
        idx = np.argpartition(-sims, k - 1)[:k]
    else:
        idx = np.arange(n_docs)
    # Order the selected candidates from most to least similar.
    idx = idx[np.argsort(-sims[idx])]
    return [(int(i), float(sims[i])) for i in idx]

print(f'Index built with {len(corpus_texts)} docs.')


In [None]:
# Query the index (edit `query` and re-run this cell)
query = 'how to structure reasoning steps for QA over documents?'
hits = search(query, top_k=TOP_K)
# The newline must be the escape sequence: a single-quoted f-string cannot span lines.
print(f'Query: {query}\n')
if not hits:
    # Make the empty case explicit instead of printing nothing.
    print('No results: the index is empty (no documents were loaded).')
for rank, (i, score) in enumerate(hits, 1):
    text = corpus_texts[i]
    # Show at most 400 characters of each hit.
    preview = (text[:400] + '...') if len(text) > 400 else text
    print(f'#{rank} | score={score:.4f} | doc_id={i}')
    print(preview)
    print('-'*80)
