In [2]:
# Cell 1 - imports & config
import os
import json
import time
import re
import random
from pathlib import Path
from typing import List, Tuple, Dict
from tqdm.auto import tqdm

import requests
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# local config: everything a reader might tune lives here
SAMUDRIK_TXT = "hasta_samudrik_cleaned.txt"   # path to your book text
# JSONL file written by the generation loop and read back by the inspection cell
OUTPUT_JSONL = "json_to_interpretation_pairs.jsonl"
# directory holding the cached FAISS index and its metadata file
INDEX_DIR = "faiss_index"
# NOTE: the chunking cell assigns CHUNK_SIZE / CHUNK_OVERLAP again (800 / 100), and
# chunk_file takes its defaults from those later values, not from the two lines below.
CHUNK_SIZE = 800            # characters per chunk
CHUNK_OVERLAP = 200
EMB_MODEL = "all-MiniLM-L6-v2"   # small & fast
# dimension handed to faiss.IndexFlatIP; presumably must equal the vector size EMB_MODEL produces
EMB_DIM = 384
# number of passages retrieved per query
TOP_K = 5
NUM_PAIRS = 1000            # target number of pairs to produce
# both Ollama settings can be overridden through environment variables of the same name
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "mistral:7b-instruct-q4_0")  # set to the model you pulled
BATCH_JSONS_PER_CALL = 10   # how many random JSONs to request per Ollama call
OLLAMA_SLEEP = 0.08         # small delay between Ollama requests
RAG_TEMPERATURE = 0.2       # low temp for grounded interpretations
RNG_SEED = 42

# numeric ranges that match typical CV outputs; tune as needed
# (min, max) pairs: used to clamp model-provided counts and as random.randint bounds
BREAKS_RANGE = (0, 10)
BRANCHES_RANGE = (0, 300)

# seed both the stdlib and the NumPy global generators
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

# make sure the index directory exists before anything tries to write into it
Path(INDEX_DIR).mkdir(exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm





In [3]:
import re
from typing import List

# These two assignments replace the CHUNK_SIZE / CHUNK_OVERLAP values set in the config cell.
CHUNK_SIZE = 800  # adjust if needed
CHUNK_OVERLAP = 100

def chunk_file(path: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split a UTF-8 text file into overlapping character chunks.

    Lines are stripped and joined with single spaces. Each chunk is cut at
    ``chunk_size`` characters, extended to the end of the next sentence
    (``.``, ``?`` or ``!`` followed by whitespace) when one is found within
    the following 200 characters. Consecutive chunks share ``overlap``
    characters.

    Args:
        path: Path of the text file to read.
        chunk_size: Minimum chunk length in characters; must be positive.
        overlap: Characters repeated at the start of the next chunk; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The list of stripped chunks, in file order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # With overlap >= chunk_size the buffer would never shrink and the loop below
    # would not terminate; a negative overlap would silently drop text.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    lookahead = 200  # how far past chunk_size to look for a sentence end
    chunks = []
    buffer = ""

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            buffer += line.strip() + " "
            while len(buffer) > chunk_size:
                # cut at chunk_size
                j = chunk_size
                # try to cut nicely at sentence boundary
                tail = buffer[j:j + lookahead]
                m = re.search(r'([.?!]\s)', tail)
                if m:
                    j = j + m.end()
                chunks.append(buffer[:j].strip())
                # prepare buffer for next chunk (with overlap)
                buffer = buffer[j - overlap:]

    # add any leftover
    if buffer.strip():
        chunks.append(buffer.strip())

    return chunks

# Chunk the book text using chunk_file's default size/overlap and report the chunk count.
chunks = chunk_file(SAMUDRIK_TXT)
print(f"Created {len(chunks)} chunks.")


Created 750 chunks.


In [4]:
# Cell 3 - embeddings + FAISS index
# Sentence-embedding model shared by index construction and query encoding.
embedder = SentenceTransformer(EMB_MODEL)

# On-disk cache: the FAISS index and the JSON list of chunk records that goes with it.
INDEX_PATH = Path(INDEX_DIR) / "index.faiss"
META_PATH = Path(INDEX_DIR) / "meta.json"

def build_or_load_index(chunks: List[str]):
    """Return a FAISS inner-product index over ``chunks`` plus its metadata.

    A cached index/metadata pair under ``INDEX_PATH`` / ``META_PATH`` is reused
    only when it is internally consistent (one metadata record per indexed
    vector) and its stored texts equal ``chunks``. Otherwise the index is
    rebuilt and the cache overwritten, so a changed book text or chunking
    configuration can no longer be served from a stale cache.

    Args:
        chunks: Text chunks to embed and index.

    Returns:
        ``(index, metas)`` where ``metas[i]`` is ``{"id": i, "text": chunks[i]}``
        and row ``i`` of the index is the L2-normalised embedding of that text.

    Raises:
        ValueError: If the index has to be built and ``chunks`` is empty.
    """
    if INDEX_PATH.exists() and META_PATH.exists():
        print("Loading existing FAISS index & metadata ...")
        index = faiss.read_index(str(INDEX_PATH))
        with open(META_PATH, "r", encoding="utf-8") as f:
            metas = json.load(f)
        cached_texts = [m.get("text") if isinstance(m, dict) else None for m in metas]
        if index.ntotal == len(metas) and cached_texts == list(chunks):
            return index, metas
        print(f"Cached index does not match the current chunks "
              f"(index={index.ntotal}, meta={len(metas)}, chunks={len(chunks)}); rebuilding ...")

    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    print("Computing embeddings and building FAISS index (this may take a moment)...")
    batch = 64
    embeddings = np.vstack([
        embedder.encode(chunks[i:i+batch], show_progress_bar=False, convert_to_numpy=True)
        for i in range(0, len(chunks), batch)
    ]).astype('float32')
    # normalize for IP search (inner product of unit vectors == cosine similarity)
    faiss.normalize_L2(embeddings)
    # take the dimension from the data so it cannot disagree with the embedding model
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, str(INDEX_PATH))
    metas = [{"id": idx, "text": text} for idx, text in enumerate(chunks)]
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(metas, f, ensure_ascii=False, indent=2)
    print("Index built and saved.")
    return index, metas

# Module-level `index` and `metas` are the globals that retrieve_passages_for_json reads.
index, metas = build_or_load_index(chunks)


Loading existing FAISS index & metadata ...


In [5]:
# Cell 4 - retrieval helper
def retrieve_passages_for_json(json_input: Dict, top_k: int = TOP_K) -> List[Tuple[int, float, str]]:
    """Retrieve the book passages closest to a palm-feature JSON object.

    The features under ``json_input["lines"]`` are flattened into one short
    text query, embedded with the module-level ``embedder``, normalised, and
    searched against the module-level FAISS ``index``.

    Args:
        json_input: Object with a ``"lines"`` mapping of line name -> feature dict.
        top_k: Number of neighbours requested from the index.

    Returns:
        ``(passage_id, score, passage_text)`` tuples in the order FAISS returns
        them; slots with a negative id are dropped.
    """
    def _summarise(ln, info):
        # one "name:features" fragment per palm line
        if info.get("present"):
            return f"{ln}:{info.get('length','')},{info.get('clarity','')},breaks={info.get('breaks',0)},branches={info.get('branches',0)}"
        return f"{ln}:absent"

    query = " ; ".join(_summarise(ln, info) for ln, info in json_input["lines"].items())

    query_vec = embedder.encode([query], convert_to_numpy=True).astype('float32')
    faiss.normalize_L2(query_vec)
    all_scores, all_ids = index.search(query_vec, top_k)

    return [
        (int(pid), float(score), metas[pid]["text"])
        for pid, score in zip(all_ids[0], all_scores[0])
        if pid >= 0
    ]

# Retrieval smoke test: only the life line is present, the other three lines are absent.
test_json = {
    "lines": {
        "life_line": {"present": True, "clarity": "clear", "length": "long", "breaks": 1, "branches": 5},
        **{
            ln: {"present": False, "clarity": "", "length": "", "breaks": 0, "branches": 0}
            for ln in ("head_line", "heart_line", "fate_line")
        },
    }
}
print("Sample retrieval (top2):")
for pid, sc, txt in retrieve_passages_for_json(test_json, top_k=2):
    # show id, score and a one-line preview of the passage
    preview = txt[:200].replace("\n"," ") + "..."
    print(pid, sc, preview)


Sample retrieval (top2):
208 0.594800591468811 , in many cases, death, unless this break is supported bv a sister line or is enclosed in a square. But the break should be verified from both the hands. If the line is laddered and broken up, a perio...
619 0.55412757396698 him with the gifts of organization and administration denoted, by a straight line of Head. Fic. 60. THE HAND OF A POLITICIAN AND ADMINISTRATOR 1. Double line of Life. 2. More than three up-going lines...


In [6]:
# Cell 5 - Ollama call and parse utilities
# Full URL of the generate endpoint; a trailing "/" on OLLAMA_URL is tolerated.
OLLAMA_GENERATE_ENDPOINT = OLLAMA_URL.rstrip("/") + "/api/generate"
# Both patterns are greedy and use re.S ("." also matches newlines), so each one
# matches from the FIRST opening bracket/brace to the LAST closing one in the text.
JSON_ARRAY_RE = re.compile(r'\[.*\]', re.S)
JSON_OBJ_RE = re.compile(r'\{.*\}', re.S)

def call_ollama(prompt: str, model: str = OLLAMA_MODEL, max_tokens: int = 1024, temperature: float = 1.0, timeout: int = 60) -> str:
    """Send one non-streaming generation request to Ollama and return the text.

    Args:
        prompt: Prompt text.
        model: Name of a model available on the Ollama server.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        timeout: ``requests`` timeout in seconds.

    Returns:
        The ``"response"`` field of the JSON reply, or ``str(reply)`` when the
        reply has no such field.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx status (via ``raise_for_status``).
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,   # ask for one JSON document instead of a chunked stream
        # Ollama reads sampling parameters from "options"; top-level
        # "max_tokens" / "temperature" keys are not applied. The output-length cap
        # is called "num_predict".
        "options": {
            "num_predict": max_tokens,
            "temperature": temperature,
        },
    }
    r = requests.post(OLLAMA_GENERATE_ENDPOINT, json=payload, timeout=timeout)
    r.raise_for_status()
    dd = r.json()

    if isinstance(dd, dict) and "response" in dd:
        return dd["response"]
    # unexpected reply shape: hand the raw payload back so the caller can inspect it
    return str(dd)


def extract_json_list_from_text(text: str) -> List[dict]:
    """Extract a list of JSON objects from free-form model output.

    The text is scanned for the first position at which a complete JSON array
    or object can be decoded, so commentary before or after the JSON (even
    commentary that contains brackets) does not break parsing. A single object
    is wrapped into a one-element list, and non-object array items are dropped.
    If nothing decodes, one best-effort retry is made with single quotes
    replaced by double quotes.

    Args:
        text: Raw model output.

    Returns:
        The decoded objects; ``[]`` when the only decodable array holds no objects.

    Raises:
        RuntimeError: If no JSON array or object can be decoded from ``text``.
    """
    text = text.strip()
    decoder = json.JSONDecoder()

    candidates = [text]
    requoted = text.replace("'", '"')  # best-effort repair of Python-style quoting
    if requoted != text:
        candidates.append(requoted)

    first_error = None
    saw_array_without_objects = False
    for candidate in candidates:
        # Try every '[' plus the first '{', in order of appearance. Inner braces are
        # deliberately not tried so that a truncated reply cannot yield a nested fragment.
        starts = [pos for pos, ch in enumerate(candidate) if ch == "["]
        first_brace = candidate.find("{")
        if first_brace != -1:
            starts.append(first_brace)
        for start in sorted(starts):
            try:
                parsed, _end = decoder.raw_decode(candidate, start)
            except ValueError as exc:  # json.JSONDecodeError is a ValueError
                if first_error is None:
                    first_error = exc
                continue
            if isinstance(parsed, dict):
                return [parsed]
            objects = [item for item in parsed if isinstance(item, dict)]
            if objects:
                return objects
            saw_array_without_objects = True

    if saw_array_without_objects:
        return []
    reason = first_error if first_error is not None else "no JSON array or object found"
    raise RuntimeError(f"Failed to parse JSON from Ollama output. Raw text:\n{text[:1000]}\nError: {reason}")


In [7]:
# Cell 6 - prompt builders

def prompt_generate_jsons(batch_n: int = BATCH_JSONS_PER_CALL) -> str:
    """Build the prompt asking the model for ``batch_n`` random palm-feature objects.

    The integer ranges quoted in the prompt come from ``BREAKS_RANGE`` and
    ``BRANCHES_RANGE``, the same bounds the normaliser clamps to, so the model
    is not asked for values that would later be clipped to the upper bound.

    Args:
        batch_n: Number of objects to request in a single JSON array.

    Returns:
        The prompt text.
    """
    # ask for an array of JSON objects to speed up generation
    return f"""
Generate exactly {batch_n} RANDOM JSON objects, and return them as a single JSON ARRAY (i.e. start with '[' and end with ']').
Each object must follow this EXACT schema (use only these fields and types):

{{ "lines": {{
  "life_line":  {{ "present": true|false, "clarity": "clear|faint|broken", "length": "short|medium|long", "breaks": <int>, "branches": <int> }},
  "head_line":  {{ "present": true|false, "clarity": "clear|faint|broken", "length": "short|medium|long", "breaks": <int>, "branches": <int> }},
  "heart_line": {{ "present": true|false, "clarity": "clear|faint|broken", "length": "short|medium|long", "breaks": <int>, "branches": <int> }},
  "fate_line":  {{ "present": true|false, "clarity": "clear|faint|broken", "length": "short|medium|long", "breaks": <int>, "branches": <int> }}
}} }}

Rules:
- If 'present' is false for a line, then 'clarity' and 'length' MUST be empty strings (""), and 'breaks' & 'branches' MUST be 0.
- If 'present' is true, pick clarity from (clear,faint,broken), length from (short,medium,long).
- Use realistic integers: breaks ({BREAKS_RANGE[0]}-{BREAKS_RANGE[1]}), branches ({BRANCHES_RANGE[0]}-{BRANCHES_RANGE[1]}).
- Return ONLY the JSON ARRAY and nothing else (no commentary, no numbering).
- Produce varied combinations (not the same repeated object).
"""
def prompt_rag_interpretation(json_input: dict, passages: List[Tuple[int,float,str]]) -> str:
    """Assemble the grounded-interpretation prompt.

    The prompt has four parts, in order: a role/instruction header, the palm
    features pretty-printed as JSON, the retrieved passages (newlines
    flattened, each cut to 800 characters plus "..."), and a closing
    instruction that asks for an answer starting with 'Interpretation:'.

    Args:
        json_input: Palm-feature object to interpret.
        passages: ``(id, score, text)`` tuples to ground the answer on.

    Returns:
        The complete prompt string.
    """
    sections = [
        "You are an expert in traditional Indian palmistry (Hasta Samudrik Shastra). "
        "Using ONLY the passages provided below (do not invent or add extra classical references), "
        "write a concise interpretation (2-6 sentences) for the following palm features in JSON form. "
        "If no passage directly supports a certain feature, do not assert it.\n\n",
        "Palm features (JSON):\n" + json.dumps(json_input, ensure_ascii=False, indent=2) + "\n\n",
        "Retrieved passages (id => excerpt):\n",
    ]

    for pid, score, text in passages:
        flattened = text.replace("\n", " ").strip()
        # keep each passage bounded so the prompt stays a predictable size
        excerpt = flattened if len(flattened) <= 800 else flattened[:800] + "..."
        sections.append(f"PASSAGE {pid} (score={score:.4f}): {excerpt}\n\n")

    sections.append(
        "\nNow provide a concise, grounded interpretation in 2-6 sentences. Use only the passages above. "
        "Start the interpretation with 'Interpretation:' and then the content. Return just the interpretation text."
    )
    return "".join(sections)


In [8]:
# Cell 7 - normalization and sanity check
# Allowed categorical values for a present palm line (the same vocabulary the generation prompt lists).
LENGTH_CHOICES = {"short","medium","long"}
CLARITY_CHOICES = {"clear","faint","broken"}

def _as_bool(value) -> bool:
    """Interpret a model-provided flag; textual "false"/"no"/"0" count as False."""
    if isinstance(value, str):
        return value.strip().lower() in {"true", "1", "yes", "y"}
    return bool(value)


def _clamped_int(value, bounds) -> int:
    """Coerce ``value`` to an int inside ``bounds``; sample within bounds if it is not a number."""
    try:
        number = int(value)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric text, NaN or infinity
        number = random.randint(*bounds)
    return max(bounds[0], min(bounds[1], number))


def normalize_json_obj(j: dict) -> dict:
    """Ensure object matches schema strictly; coerce types and clamp numeric ranges.

    A line that is missing, is not a JSON object, or is not flagged as present
    is emitted in the canonical "absent" form. For a present line, an unknown
    clarity/length value is replaced by a random valid one, and the counts are
    coerced to int and clamped to ``BREAKS_RANGE`` / ``BRANCHES_RANGE``.

    Args:
        j: Candidate object, typically parsed from model output.

    Returns:
        A new ``{"lines": {...}}`` dict containing exactly the four palm lines.

    Raises:
        TypeError: If ``j`` is not a dict.
    """
    if not isinstance(j, dict):
        raise TypeError(f"expected a JSON object, got {type(j).__name__}")
    lines = j.get("lines")
    if not isinstance(lines, dict):
        lines = {}

    out = {"lines": {}}
    for ln in ["life_line","head_line","heart_line","fate_line"]:
        v = lines.get(ln)
        if not isinstance(v, dict) or not _as_bool(v.get("present")):
            out["lines"][ln] = {"present": False, "clarity": "", "length": "", "breaks": 0, "branches": 0}
            continue
        # present True
        clarity = str(v.get("clarity","")).strip().lower()
        length = str(v.get("length","")).strip().lower()
        # Sets of strings iterate in a hash-dependent order that can change between
        # interpreter runs; sorting first keeps the seeded RNG reproducible.
        if clarity not in CLARITY_CHOICES:
            clarity = random.choice(sorted(CLARITY_CHOICES))
        if length not in LENGTH_CHOICES:
            length = random.choice(sorted(LENGTH_CHOICES))
        breaks = _clamped_int(v.get("breaks", 0), BREAKS_RANGE)
        branches = _clamped_int(v.get("branches", 0), BRANCHES_RANGE)
        out["lines"][ln] = {"present": True, "clarity": clarity, "length": length, "breaks": breaks, "branches": branches}
    return out

def validate_json_schema(j: dict) -> bool:
    """Return True when ``j`` has the exact palm-feature structure and field types.

    Checks are explicit ``isinstance`` tests rather than ``assert`` statements,
    so validation still happens when Python runs with ``-O``. ``bool`` values
    are rejected for the integer fields even though ``bool`` subclasses ``int``.

    Args:
        j: Candidate object.

    Returns:
        True if ``j["lines"]`` holds all four palm lines, each with a bool
        ``present``, str ``clarity`` / ``length`` and int ``breaks`` /
        ``branches``; False otherwise.
    """
    if not isinstance(j, dict):
        return False
    lines = j.get("lines")
    if not isinstance(lines, dict):
        return False
    for ln in ["life_line","head_line","heart_line","fate_line"]:
        v = lines.get(ln)
        if not isinstance(v, dict):
            return False
        if not isinstance(v.get("present"), bool):
            return False
        if not all(isinstance(v.get(field), str) for field in ("clarity", "length")):
            return False
        for field in ("breaks", "branches"):
            count = v.get(field)
            if isinstance(count, bool) or not isinstance(count, int):
                return False
    return True


In [None]:
# Cell 8 - main generation loop
def _random_palm_json() -> dict:
    """Sample one palm-feature object locally (used when Ollama JSON generation fails)."""
    lines = {}
    for ln in ["life_line","head_line","heart_line","fate_line"]:
        if random.random() < 0.15:
            lines[ln] = {"present": False, "clarity": "", "length": "", "breaks": 0, "branches": 0}
        else:
            # sorted(): sets of strings iterate in a hash-dependent order, which would
            # defeat the seeded RNG across interpreter runs
            lines[ln] = {"present": True,
                         "clarity": random.choice(sorted(CLARITY_CHOICES)),
                         "length": random.choice(sorted(LENGTH_CHOICES)),
                         "breaks": random.randint(*BREAKS_RANGE),
                         "branches": random.randint(*BRANCHES_RANGE)}
    return {"lines": lines}


def produce_jsonl_pairs(target_n: int = NUM_PAIRS, out_file: str = OUTPUT_JSONL,
                        batch_size: int = BATCH_JSONS_PER_CALL, top_k: int = TOP_K):
    """Generate (palm JSON -> interpretation) pairs and write them as JSONL.

    Each iteration asks Ollama for a batch of random palm-feature objects
    (falling back to local sampling if that fails), normalises and
    de-duplicates them, retrieves supporting passages for each, and asks Ollama
    for a grounded interpretation. One JSON line is written per pair with the
    keys ``input``, ``output`` and ``retrieved``.

    When an interpretation cannot be generated, the entry is still written with
    the placeholder output "(No interpretation generated - fallback) ". The
    number of such entries is reported at the end so they can be filtered out
    before the file is used as training data.

    Args:
        target_n: Number of entries to write.
        out_file: Destination path; an existing file is overwritten.
        batch_size: Number of palm objects requested per generation call.
        top_k: Number of passages retrieved per object.
    """
    produced = 0
    placeholders = 0
    seen_inputs = set()
    Path(out_file).parent.mkdir(parents=True, exist_ok=True)
    # both context managers are released even if the run is interrupted
    with open(out_file, "w", encoding="utf-8") as fout, tqdm(total=target_n, desc="Generating pairs") as pbar:
        while produced < target_n:
            # 1) request a batch of random JSONs from Ollama
            try:
                prompt = prompt_generate_jsons(batch_n=batch_size)
                raw = call_ollama(prompt, model=OLLAMA_MODEL, max_tokens=2048, temperature=1.0)
                objs = extract_json_list_from_text(raw)
            except Exception as e:
                print("Ollama JSON generation error:", e)
                # fallback to simple python sampling if Ollama fails
                objs = [_random_palm_json() for _ in range(batch_size)]

            # 2) normalize & dedup
            normalized_batch = []
            for obj in objs:
                try:
                    norm = normalize_json_obj(obj)
                except Exception:
                    # malformed object from the model: skip it rather than abort the run
                    continue
                key = json.dumps(norm, sort_keys=True)
                if key in seen_inputs:
                    continue
                seen_inputs.add(key)
                normalized_batch.append(norm)

            # 3) for each normalized json do RAG retrieve and ask Ollama to interpret
            for j_in in normalized_batch:
                if produced >= target_n:
                    break
                passages = retrieve_passages_for_json(j_in, top_k=top_k)
                # build prompt and generate interpretation (low temperature)
                rag_prompt = prompt_rag_interpretation(j_in, passages)
                try:
                    interp_raw = call_ollama(rag_prompt, model=OLLAMA_MODEL, max_tokens=1024, temperature=RAG_TEMPERATURE)
                except Exception as e:
                    print("Ollama error on interpretation:", e)
                    interp_raw = ""
                # keep only what follows the 'Interpretation:' marker we asked for, if present
                interp = interp_raw.strip()
                m = re.search(r'Interpretation:\s*(.+)', interp, re.S)
                if m:
                    interp = m.group(1).strip()
                # fallback: if empty, set a placeholder
                if not interp:
                    placeholders += 1
                    interp = "(No interpretation generated - fallback) "

                entry = {
                    "input": j_in,
                    "output": interp,
                    "retrieved": [{"id": pid, "score": score, "excerpt": txt[:800]} for pid, score, txt in passages]
                }
                fout.write(json.dumps(entry, ensure_ascii=False) + "\n")
                # flush so an interrupted multi-hour run keeps everything produced so far
                fout.flush()
                produced += 1
                pbar.update(1)
                time.sleep(OLLAMA_SLEEP)  # small delay to be polite
    print(f"Finished. Wrote {produced} pairs to {out_file}")
    if placeholders:
        print(f"Warning: {placeholders} of {produced} entries contain the placeholder output "
              f"(interpretation failed); filter them out before training.")

# run
# Long-running: one Ollama request per batch plus one per pair. OUTPUT_JSONL is
# opened in "w" mode, so any previous output file is overwritten.
produce_jsonl_pairs(target_n=NUM_PAIRS, out_file=OUTPUT_JSONL, batch_size=BATCH_JSONS_PER_CALL, top_k=TOP_K)


Generating pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   0%|          | 1/1000 [02:04<34:27:26, 124.17s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   0%|          | 3/1000 [03:57<20:02:50, 72.39s/it] 

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   0%|          | 4/1000 [04:59<18:54:21, 68.34s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   0%|          | 5/1000 [06:01<18:16:30, 66.12s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   1%|          | 6/1000 [07:03<17:52:59, 64.77s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   1%|          | 9/1000 [09:56<16:37:55, 60.42s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   1%|          | 10/1000 [10:58<16:45:40, 60.95s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   1%|          | 11/1000 [13:03<22:03:34, 80.30s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   1%|▏         | 13/1000 [15:05<19:20:28, 70.55s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   1%|▏         | 14/1000 [16:07<18:37:35, 68.01s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▏         | 15/1000 [17:09<18:07:29, 66.24s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▏         | 18/1000 [20:09<17:00:32, 62.36s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▏         | 19/1000 [21:11<16:58:26, 62.29s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▏         | 20/1000 [22:13<16:56:39, 62.24s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▏         | 21/1000 [24:17<21:59:00, 80.84s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▏         | 22/1000 [25:19<20:26:21, 75.24s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▏         | 23/1000 [26:21<19:20:59, 71.30s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▏         | 24/1000 [27:23<18:35:13, 68.56s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   2%|▎         | 25/1000 [28:26<18:02:52, 66.64s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   3%|▎         | 26/1000 [29:28<17:39:52, 65.29s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   3%|▎         | 27/1000 [30:30<17:23:24, 64.34s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   3%|▎         | 28/1000 [31:32<17:11:37, 63.68s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   3%|▎         | 29/1000 [32:34<17:03:10, 63.22s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   3%|▎         | 30/1000 [33:36<16:57:00, 62.91s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   3%|▎         | 31/1000 [35:41<21:53:01, 81.30s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   3%|▎         | 32/1000 [36:43<20:18:58, 75.56s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   3%|▎         | 33/1000 [37:45<19:12:50, 71.53s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▎         | 35/1000 [39:49<17:51:08, 66.60s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▎         | 36/1000 [40:51<17:28:33, 65.26s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▎         | 37/1000 [41:53<17:12:25, 64.33s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▍         | 38/1000 [42:55<17:00:49, 63.67s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▍         | 39/1000 [43:57<16:52:26, 63.21s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▍         | 40/1000 [44:59<16:46:11, 62.89s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▍         | 41/1000 [47:03<21:39:07, 81.28s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▍         | 43/1000 [49:02<18:43:18, 70.43s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▍         | 44/1000 [50:05<18:02:28, 67.94s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   4%|▍         | 45/1000 [51:07<17:33:40, 66.20s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   5%|▍         | 46/1000 [52:09<17:13:18, 64.99s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   5%|▍         | 47/1000 [53:11<16:58:35, 64.13s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   5%|▌         | 50/1000 [56:16<16:30:47, 62.58s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   5%|▌         | 51/1000 [58:20<21:22:00, 81.05s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   5%|▌         | 52/1000 [59:22<19:51:01, 75.38s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   5%|▌         | 54/1000 [1:01:04<16:46:56, 63.86s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   6%|▌         | 57/1000 [1:03:29<14:28:04, 55.23s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   6%|▌         | 60/1000 [1:05:48<12:46:36, 48.93s/it]

Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   6%|▋         | 63/1000 [1:09:14<15:13:20, 58.48s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   6%|▋         | 64/1000 [1:10:16<15:29:27, 59.58s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   6%|▋         | 65/1000 [1:11:18<15:40:26, 60.35s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   7%|▋         | 68/1000 [1:13:57<14:42:33, 56.82s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   7%|▋         | 70/1000 [1:15:38<13:49:17, 53.50s/it]

Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   7%|▋         | 71/1000 [1:17:42<19:16:42, 74.71s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   7%|▋         | 72/1000 [1:18:44<18:17:24, 70.95s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   7%|▋         | 73/1000 [1:19:46<17:35:26, 68.31s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   7%|▋         | 74/1000 [1:20:49<17:05:47, 66.47s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   8%|▊         | 75/1000 [1:21:51<16:44:50, 65.18s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   8%|▊         | 76/1000 [1:22:53<16:29:41, 64.27s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   8%|▊         | 78/1000 [1:24:56<16:06:50, 62.92s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   8%|▊         | 79/1000 [1:25:58<16:02:13, 62.69s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   8%|▊         | 80/1000 [1:26:55<15:32:46, 60.83s/it]

Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   8%|▊         | 81/1000 [1:28:59<20:22:58, 79.85s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   9%|▉         | 88/1000 [1:34:24<13:38:16, 53.83s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:   9%|▉         | 90/1000 [1:35:54<12:25:31, 49.16s/it]

Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  10%|▉         | 95/1000 [1:41:08<14:19:54, 57.01s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  10%|█         | 100/1000 [1:45:34<13:49:21, 55.29s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)
Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  10%|█         | 101/1000 [1:47:38<18:58:22, 75.98s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  10%|█         | 103/1000 [1:49:35<16:45:35, 67.26s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  11%|█         | 109/1000 [1:54:34<13:13:04, 53.41s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  11%|█         | 110/1000 [1:55:18<12:29:37, 50.54s/it]

Ollama JSON generation error: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  11%|█▏        | 114/1000 [1:59:54<14:59:01, 60.88s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  12%|█▏        | 119/1000 [2:04:05<12:59:23, 53.08s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


Generating pairs:  12%|█▏        | 120/1000 [2:05:08<13:38:30, 55.81s/it]

Ollama error on interpretation: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=60)


In [1]:
# Cell 9 - inspect a few random examples
def sample_jsonl(path: str, k: int = 5):
    """Print up to ``k`` randomly chosen entries from a generated JSONL file.

    For each sampled entry the input object, the first two retrieved passages
    and the generated output are printed. Blank lines in the file are ignored,
    so they neither inflate the entry count nor break JSON parsing.

    Args:
        path: Path of the JSONL file to inspect.
        k: Maximum number of entries to show; values below 1 show none.
    """
    with open(path, "r", encoding="utf-8") as f:
        lines = [ln for ln in f if ln.strip()]
    print(f"Total entries: {len(lines)}")
    # clamp k so random.sample never receives a negative or oversized sample size
    for item in random.sample(lines, min(max(k, 0), len(lines))):
        j = json.loads(item)
        print("INPUT:")
        print(json.dumps(j["input"], indent=2, ensure_ascii=False))
        print("RETRIEVED (top2):")
        for r in j.get("retrieved", [])[:2]:
            print("  id:", r["id"], "score:", r["score"])
            print("  excerpt:", r["excerpt"][:300].replace("\n"," "), "...")
        print("OUTPUT:")
        print(j["output"])
        print("-"*60)

# sample a few
# OUTPUT_JSONL is defined in the config cell; running this cell on a fresh kernel
# before that cell raises NameError.
sample_jsonl(OUTPUT_JSONL, k=3)


NameError: name 'OUTPUT_JSONL' is not defined