In [2]:
import os, json, uuid, re, pickle, time
from pathlib import Path
from typing import List, Dict
import sys, os
from pathlib import Path
sys.path.append(str(Path('.').resolve()))
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import requests  # simple Ollama client
from src.setup_path import setup_path
setup_path()
from src.rag.preprocess1 import extract_text, extract_tables, extract_images_and_ocr
from src.rag.embedding   import build_corpus, create_embeddings, build_faiss_index

In [3]:
DATA_DIR   = Path("data/Clinical Files")      # raw PDFs
PRE_DIR    = Path("preprocessed")       # output of preprocess
VDB_DIR    = Path("vectordb")
CLAIMS_JSON = Path("data/Flublok_Claims.json")       # marketing claims input
OUTPUT_DIR  = Path("outputs")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
BATCH_SIZE = 64
EMB_MODEL  = "sentence-transformers/paraphrase-MiniLM-L3-v2"

OLLAMA_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral"

In [4]:
PRE_DIR.mkdir(exist_ok=True, parents=True)
for pdf_path in sorted(DATA_DIR.glob("*.pdf")):
    out_dir = PRE_DIR / pdf_path.stem
    out_dir.mkdir(exist_ok=True, parents=True)
    if not (out_dir / "text_chunks.jsonl").exists():
        extract_text(pdf_path, out_dir)
    if not (out_dir / "tables.jsonl").exists():
        extract_tables(pdf_path, out_dir)
    if not (out_dir / "image_ocr.jsonl").exists():
        extract_images_and_ocr(pdf_path, out_dir)
print(" Pre‑processing done")

 Pre‑processing done


In [None]:


index_path = VDB_DIR / "index.faiss"

if not index_path.exists():
    corpus = build_corpus(PRE_DIR)

    model = SentenceTransformer(EMB_MODEL, device="cpu")   # swap to "cuda" if it fits
    dim   = model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatIP(dim)                         # for cosine (embs normalised)
    metadata = []

    for rec in tqdm(corpus, desc="embed ➜ index"):
        text = (
            rec.get("text")
            or " ".join([w for line in rec.get("data", []) for w in line])
            or rec.get("ocr_text", "")
        )

        vec = model.encode(text, normalize_embeddings=True)
        index.add(np.expand_dims(vec.astype("float32"), 0))
        metadata.append(rec)

    VDB_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(index_path))
    with (VDB_DIR / "docstore.pkl").open("wb") as f:
        pickle.dump(metadata, f)

    print(f"✅  Stored {len(metadata):,} vectors to {index_path}")
else:
    print("🔎 FAISS index already present – skipping embedding step")


🗂️  Loaded 15,922 records from preprocessed


  _torch_pytree._register_pytree_node(
embed ➜ index: 100%|██████████| 15922/15922 [01:03<00:00, 249.08it/s]


✅  Stored 15,922 vectors to vectordb/index.faiss


In [7]:
index = faiss.read_index(str(VDB_DIR / "index.faiss"))
with (VDB_DIR / "docstore.pkl").open("rb") as f:
    docstore: List[Dict] = pickle.load(f)
sbert = SentenceTransformer(EMB_MODEL)

def retrieve(query: str, k: int = 5) -> List[Dict]:
    q_emb = sbert.encode([query], normalize_embeddings=True)
    D, I = index.search(q_emb, k)
    return [docstore[i] | {"score": float(D[0][j])} for j,i in enumerate(I[0])]

In [8]:
def ollama_chat(prompt: str, temperature: float = 0.0) -> str:
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "temperature": temperature,
        "stream": False,
    }
    r = requests.post(OLLAMA_URL, json=payload, timeout=120)
    r.raise_for_status()
    return r.json()["response"].strip()


In [20]:
# def corag_answer(claim: str, max_steps: int = 6) -> List[Dict]:
#     qa_pairs = []
#     q = ollama_chat(f"Given the claim:\n\"{claim}\"\nGenerate one focused question to help verify it.")
#     for _ in range(max_steps):
#         hits = retrieve(q, k=5)
#         context = "\n".join([f"[{h['score']:.2f}] {h.get('text') or h.get('ocr_text') or h['data']}" for h in hits])
#         a = ollama_chat(f"Claim: {claim}\nQuestion: {q}\nEvidence snippets:\n{context}\nAnswer the question succinctly.")
#         qa_pairs.append({"q": q, "a": a, "evidence": hits})
#         follow = ollama_chat(
#             f"Claim: {claim}\nCollected Q/A so far:\n{json.dumps(qa_pairs, indent=2)}\n"
#             "Is the evidence sufficient to decide the claim? Answer only yes or no."
#         )
#         if follow.lower().startswith("y"):
#             break
#         q = ollama_chat(
#             f"Claim: {claim}\nGiven the evidence and answers so far:\n{json.dumps(qa_pairs, indent=2)}\n"
#             "Ask one follow‑up question that would best help verify the claim."
#         )
#     return qa_pairs
# def torag_answer(claim: str, max_steps: int = 6, branches: int = 3) -> List[Dict]:
#     best_pairs = []
#     qs = [ollama_chat(f"Claim: \"{claim}\"\nGenerate question #{i+1} to verify.") for i in range(branches)]
#     for _ in range(max_steps):
#         branch_pairs = []
#         for q in qs:
#             hits = retrieve(q, k=5)
#             ctx = "\n".join([f"[{h['score']:.2f}] {h.get('text') or h.get('ocr_text') or h['data']}" for h in hits])
#             a = ollama_chat(f"Claim: {claim}\nQuestion: {q}\nEvidence:\n{ctx}\nAnswer succinctly.")
#             branch_pairs.append({"q": q, "a": a, "evidence": hits})
#         # elimination
#         prompt = (
#             f"Claim: {claim}\nHere are {len(branch_pairs)} question‑answer pairs with evidence.\n"
#             f"{json.dumps(branch_pairs, indent=2)}\n"
#             "Select the single pair most helpful to verify the claim. Return its index (0‑based)."
#         )
#         idx = int(re.findall(r'\d+', ollama_chat(prompt))[0])
#         best = branch_pairs[idx]
#         best_pairs.append(best)
#         follow = ollama_chat(
#             f"Claim: {claim}\nBest pairs so far:\n{json.dumps(best_pairs, indent=2)}\n"
#             "Is the evidence now sufficient? Answer yes or no."
#         )
#         if follow.lower().startswith("y"):
#             break
#         # new follow‑up questions
#         qs = [
#             ollama_chat(
#                 f"Claim: {claim}\nCurrent best evidence:\n{json.dumps(best_pairs, indent=2)}\n"
#                 f"Generate follow‑up question #{i+1}."
#             )
#             for i in range(branches)
#         ]
#     return best_pairs

In [21]:

def decide_claim(claim: str, qa_pairs: List[Dict]) -> Dict:
    prompt = (
        f"Claim: {claim}\nEvidence Q/A:\n{json.dumps(qa_pairs, indent=2)}\n"
        "Based on the evidence, respond with JSON {{\"verdict\": \"supported|refuted|failed\", "
        "\"explanation\": \"concise justification\"}}"
    )
    return json.loads(ollama_chat(prompt, temperature=0.1))

claims = json.load(CLAIMS_JSON.open())


In [23]:
def infer_matches_for_claim(claim: str, candidates: List[Dict]) -> List[Dict]:
    """
    For a given claim and its candidate evidence, use Mistral (via ollama_chat) to determine
    which clinical evidence snippets best support the claim. Each match output has keys:
      - document_name: source document name,
      - matching_text: the text snippet from the evidence,
      - score: the similarity score from the retrieval.
    Output is a JSON array.
    """
    evidence_str = "\n".join(
        [f"[{c['score']:.2f}] Document: {c.get('source_pdf', 'Unknown')} | Text: {c.get('text') or ' '.join(sum(c.get('data', []), [])) or c.get('ocr_text', '')}"
         for c in candidates]
    )
    
    prompt = f"""
You are provided with a marketing claim and several candidate clinical evidence snippets extracted from various documents.
Your task is to select the snippets that best support the claim. A claim may be supported by multiple pieces of evidence.
For each supporting snippet, output a JSON object with the following keys:
- "document_name": the name of the clinical document,
- "matching_text": the exact text of the evidence snippet,
- "score": the similarity score (as provided).
If no evidence is relevant, output an empty JSON array.

Claim:
"{claim}"

Candidate Evidence:
{evidence_str}

Return only a valid JSON array.
"""
    response = ollama_chat(prompt, temperature=0.0)
    try:
        matches = json.loads(response)
    except Exception as e:
        print("Error parsing JSON from LLM response for claim:")
        print(claim)
        print("Response was:", response)
        matches = []
    return matches


In [24]:
with CLAIMS_JSON.open("r", encoding="utf-8") as f:
    marketing_claims = json.load(f)

final_results = {"claims": []}

for c in tqdm(marketing_claims["claims"], desc="Processing claims"):
    claim_text = c["claim"]
    
    # Retrieve candidates from your prebuilt FAISS index
    candidates = retrieve(claim_text, k=5)  # Adjust k as needed
    
    # Use the LLM (Mistral) to infer the best matching evidence from the candidates
    match_sources = infer_matches_for_claim(claim_text, candidates)
    
    final_results["claims"].append({
        "claim": claim_text,
        "match_sources": match_sources
    })

# Save final results as JSON
output_file = OUTPUT_DIR / "ollama_results.json"
with output_file.open("w", encoding="utf-8") as f:
    json.dump(final_results, f, ensure_ascii=False, indent=2)

print(f"Results saved to {output_file}")


Processing claims: 100%|██████████| 9/9 [02:12<00:00, 14.67s/it]

Results saved to outputs/ollama_results.json





In [25]:
def infer_matches_with_reason(claim: str, candidates: List[Dict]) -> List[Dict]:
    """
    Uses Mistral via ollama to infer support and reasoning per candidate match.
    Returns list of dicts with fields: document_name, matching_text, score, supports, reason
    """
    evidence_str = "\n".join(
        [f"[{c['score']:.2f}] Document: {c.get('source_pdf', 'Unknown')} | Text: {c.get('text') or ' '.join(sum(c.get('data', []), [])) or c.get('ocr_text', '')}"
         for c in candidates]
    )

    prompt = f"""
You are provided with a marketing claim and a list of candidate evidence snippets from clinical documents.

Your task is to analyze whether each snippet supports the claim. For each, output a JSON object with:
- "document_name": source document name,
- "matching_text": the full evidence text,
- "score": similarity score (float),
- "supports": true or false (does it support the claim?),
- "reason": 1–2 sentence explanation.

Here is the claim:
"{claim}"

Candidate Evidence:
{evidence_str}

Return only a JSON array of decisions, one per candidate.
    """.strip()

    try:
        response = ollama_chat(prompt, temperature=0.0)
        return json.loads(response)
    except Exception as e:
        print(" JSON parsing failed for claim:", claim)
        print("Raw response:", response)
        return []


In [26]:
with CLAIMS_JSON.open("r", encoding="utf-8") as f:
    marketing_claims = json.load(f)

enhanced_results = {"claims": []}

for c in tqdm(marketing_claims["claims"], desc="Running LLM with reasoning"):
    claim_text = c["claim"]
    candidates = retrieve(claim_text, k=7)  # optional: increase k
    detailed_matches = infer_matches_with_reason(claim_text, candidates)

    enhanced_results["claims"].append({
        "claim": claim_text,
        "match_sources": detailed_matches
    })

# Save to outputs/ollama2_results.json
with (OUTPUT_DIR / "ollama2_results.json").open("w", encoding="utf-8") as f:
    json.dump(enhanced_results, f, ensure_ascii=False, indent=2)

print("✅ Saved to outputs/ollama2_results.json")


Running LLM with reasoning: 100%|██████████| 9/9 [09:18<00:00, 62.00s/it]

✅ Saved to outputs/ollama2_results.json



