# In [None]:  (notebook cell marker, kept as a comment so the file parses)
import os

# SECURITY: credentials must not be committed to source control. The Gemini
# key is read from the GOOGLE_API_KEY environment variable; an empty string
# means "not configured". Any key that was ever hard-coded here should be
# treated as leaked and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

import os
import re
import time
import json
from datetime import datetime
from typing import List, Dict, Any, Optional
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
# Hard cap so we don't ship an entire 10-Q into a single call: section text is
# sliced to this many characters before it is placed in the prompt.
MAX_SECTION_CHARS_FOR_LLM = 120_000
# NOTE(review): not referenced by any code visible in this file; presumably a
# pause between LLM requests -- confirm it is still needed.
LLM_REQ_SLEEP = 0.2
# Default Gemini model name for the annotation call.
GEMINI_MODEL = "gemini-2.5-flash"

# Instruction preamble placed at the very top of every annotation prompt
# (see _build_annotation_user_prompt). This text is sent to the model verbatim.
ANNOTATION_SYSTEM = """\
You are a precise financial document tagger for SEC filings (10-K, 10-Q, 8-K, Forms 3/4/5).
Your job is to prepare a consice summary not exceeding 1800 characters. Factual summary of THIS part, using only information present in the text (no speculation)

RULES:
- Do not fabricate numbers, periods, or items. Only extract what is present.
- If a number lacks units/periods, leave those fields empty; do NOT guess.
"""

def _build_annotation_user_prompt(section_text: str,
                                  section_title: str,
                                  form: str,
                                  filed_at: str,
                                  ticker: str) -> str:
    """Assemble the full prompt text for one filing section.

    The prompt is ANNOTATION_SYSTEM, followed by a [CONTEXT] block naming the
    ticker, form, filing date and section title, an [INSTRUCTIONS] block, and
    finally the section text itself.
    """
    preamble_lines = [
        ANNOTATION_SYSTEM,
        "[CONTEXT]",
        f"Ticker: {ticker}",
        f"Form: {form}",
        f"Filed Date: {filed_at}",
        f"Section Title: {section_title}",
        "",
        "[INSTRUCTIONS]",
        "Output a single paragraph summary",
        "",
        "",  # the two empty entries yield the blank line before the section text
    ]
    return "\n".join(preamble_lines) + section_text

def annotate_and_split_with_llm(section: Dict[str, Any],
                                ticker: str,
                                filed_at: str,
                                form: str,
                                model_name: str = GEMINI_MODEL,
                                timeout: int = 90) -> List[Dict[str, Any]]:
    """
    Ask Gemini for a factual summary of one filing section.

    Despite the name, the section is not split: at most one part is returned.

    Args:
        section: Parsed section dict; reads "text", "section_title" and
            "section_id".
        ticker: Ticker symbol, used in the prompt and the meta header.
        filed_at: Filing date string, used in the prompt and the meta header.
        form: Form type, used in the prompt and the meta header.
        model_name: Gemini model to call.
        timeout: Per-request timeout in seconds passed to the Gemini client.

    Returns:
        A list with a single dict holding 'meta_header', 'text' (the section
        text, truncated to MAX_SECTION_CHARS_FOR_LLM) and 'summary'.
        An empty list when the section has no text, the Gemini call fails, or
        no summary comes back.
    """
    text = (section.get("text") or "")[:MAX_SECTION_CHARS_FOR_LLM]
    if not text.strip():
        return []

    # Compact, human-readable header that still encodes rich tags.
    # Keep it short so it doesn't eat too many tokens during embedding.
    meta_bits = [
        f"TICKER={ticker}",
        f"FORM={form}",
        f"DATE={filed_at}",
    ]
    if section.get("section_id"):
        meta_bits.append(f"SEC={section['section_id']}")
    meta_header = "[" + " | ".join(meta_bits) + "]"

    # Configure credentials before the model object is created.
    genai.configure(api_key=GOOGLE_API_KEY)
    model = genai.GenerativeModel(model_name)
    prompt = _build_annotation_user_prompt(
        section_text=text,
        section_title=section.get("section_title") or "",
        form=form,
        filed_at=filed_at,
        ticker=ticker,
    )

    try:
        resp = model.generate_content(
            [
                {"role": "user", "parts": [prompt]}
            ],
            safety_settings=None,
            generation_config={
                "temperature": 0.2,  # low temperature for a factual summary
                "top_p": 0.9,
                "max_output_tokens": 4096,
            },
            request_options={"timeout": timeout},
        )
        # Reading .text can itself raise (e.g. when no candidate was
        # returned), so it stays inside the try block.
        summary = (resp.text or "").strip()
    except Exception as e:
        # Boundary with a remote service: report and skip this section rather
        # than continuing with an undefined result.
        print(f"[ERROR] Gemini call failed for {meta_header}: {e}")
        return []

    if not summary:
        print(f"[INFO] No summary returned by gemini for: {meta_header}")
        return []

    print("summary: ", summary)
    return [{"meta_header": meta_header, "text": text, "summary": summary}]


# ==== CELL A: INDEX (multiple tickers & multiple forms in a date range) ====
import os
import re
import time
from datetime import datetime
from typing import List, Dict, Any, Optional
import requests
from bs4 import BeautifulSoup
import json  # <-- for saving artifacts

# --- sec-parser ---
import warnings
import sec_parser as sp
from sec_parser.semantic_tree import TreeBuilder
from sec_parser.semantic_elements import TopSectionTitle
from sec_parser.processing_steps import (
    TopSectionManagerFor10Q,
    IndividualSemanticElementExtractor,
    TopSectionTitleCheck
)

# --- vector store & embeddings ---
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# =========================
# CONFIG
# =========================
# Sent as the User-Agent header on every SEC request (see _headers).
USER_AGENT = "Your Name your_email@example.com"  # <-- use a real UA
# Passed to build_chroma as persist_dir and printed in the final summary line.
CHROMA_DIR = "./chroma_sec_multi"  #  store path
# SentenceTransformer / tokenizer model name used for embeddings.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Gemini model name handed to annotate_and_split_with_llm during indexing.
GEMINI_MODEL = "gemini-2.5-flash"
# NOTE(review): only the commented-out _trim_to_tokens helper appears to need
# this; no active code in this file reads it -- confirm before relying on it.
MAX_TOKENS = 512
# Seconds slept after each SEC request.
REQ_SLEEP = 0.25  # be kind to SEC endpoints (<= 10 req/sec)
# Tickers to index; the run loop processes each one in turn.
TICKERS = ["AAPL"]  # <-- multi-ticker support
# Form types to collect. "10-Q", "10-K" and "8-K" are prefix-matched (so
# amendments such as "10-K/A" are included); anything else is matched exactly.
FORMS = ["10-Q","10-K","8-K","3","4","5"]  # <-- choose which forms "10-Q", "10-K", "8-K","4", "5"
# Inclusive filing-date window, formatted YYYY-MM-DD.
START_DATE = "2025-06-01"
END_DATE = "2025-08-01"

# =========================
# SEC helpers (no API key)
# =========================
def _headers():
    """Return a fresh dict of HTTP headers for SEC requests.

    The User-Agent is taken from the module-level USER_AGENT setting.
    """
    request_headers = {"User-Agent": USER_AGENT}
    request_headers.update({
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    })
    return request_headers

def get_cik_for_ticker(ticker: str) -> Dict[str, str]:
    """Look up the SEC CIK (the SEC's unique company identifier) for a ticker.

    Downloads company_tickers.json from sec.gov and compares tickers
    case-insensitively.

    Returns:
        {"cik": <CIK without leading zeros>, "cik_padded": <CIK zero-padded to
        10 digits>}.

    Raises:
        ValueError: if no entry matches the ticker.
        requests.HTTPError: if the download returns an error status.
    """
    response = requests.get(
        "https://www.sec.gov/files/company_tickers.json",
        headers=_headers(),
        timeout=60,
    )
    response.raise_for_status()
    listing = response.json()
    wanted = ticker.upper()

    entry = next(
        (row for row in listing.values() if row.get("ticker", "").upper() == wanted),
        None,
    )
    if entry is None:
        raise ValueError(f"Ticker {ticker} not found in SEC company_tickers.json")

    cik_number = int(entry["cik_str"])
    return {"cik": str(cik_number), "cik_padded": f"{cik_number:010d}"}

def list_filings_in_range(ticker: str, forms: List[str], start_date: str, end_date: str) -> List[Dict[str, str]]:
    """List a company's filings of the requested forms within a date range.

    Args:
        ticker: Ticker symbol, resolved to a CIK via get_cik_for_ticker.
        forms: Form types to keep. "10-K", "10-Q" and "8-K" are prefix-matched
            (so "10-K/A" etc. are included); every other entry must match
            exactly.
        start_date: First filing date to include, YYYY-MM-DD (inclusive).
        end_date: Last filing date to include, YYYY-MM-DD (inclusive).

    Returns:
        One dict per filing with keys "url", "filed_at", "accession", "form"
        and "primary", de-duplicated by accession number and sorted from
        oldest to newest.

    Raises:
        ValueError: if a date is malformed or the ticker is unknown.
        requests.HTTPError: if the main submissions request fails.
    """
    # Validate the date window first so bad input fails before any network call.
    sd = datetime.strptime(start_date, "%Y-%m-%d").date()
    ed = datetime.strptime(end_date, "%Y-%m-%d").date()

    info = get_cik_for_ticker(ticker)
    cik, cik_padded = info["cik"], info["cik_padded"]

    def _form_matches(f: str) -> bool:
        for frm in forms:
            if frm in ("10-K", "10-Q", "8-K"):
                if f.startswith(frm):  # allow "10-K/A", etc.
                    return True
            elif f == frm:  # exact match for 3, 4, 5
                return True
        return False

    def _collect(sub_json: dict, acc: List[Dict[str, str]]) -> None:
        # The main submissions document nests the columnar table under
        # filings.recent; the additional index pages are expected to carry the
        # same columns at the top level, so fall back to the document itself.
        table = sub_json.get("filings", {}).get("recent")
        if table is None:
            table = sub_json
        columns = zip(
            table.get("form", []),
            table.get("filingDate", []),
            table.get("accessionNumber", []),
            table.get("primaryDocument", []),
        )
        for f, d, a, p in columns:
            if not f or not d or not a or not p:
                continue
            if not _form_matches(f):
                continue
            accnod = a.replace("-", "")
            url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accnod}/{p}"
            acc.append({"url": url, "filed_at": d, "accession": a, "form": f, "primary": p})

    def _may_overlap(entry: dict) -> bool:
        # Skip an index page only when its advertised date span is readable
        # and provably outside the requested window; otherwise fetch it.
        try:
            first = datetime.strptime(entry["filingFrom"], "%Y-%m-%d").date()
            last = datetime.strptime(entry["filingTo"], "%Y-%m-%d").date()
        except (KeyError, TypeError, ValueError):
            return True
        return first <= ed and last >= sd

    # main submissions
    base = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"
    r = requests.get(base, headers=_headers(), timeout=60)
    r.raise_for_status()
    sub = r.json()
    time.sleep(REQ_SLEEP)

    rows: List[Dict[str, str]] = []
    _collect(sub, rows)

    # older files index pages
    for entry in sub.get("filings", {}).get("files", []):
        name = entry.get("name")
        if not name or not _may_overlap(entry):
            continue
        url = f"https://data.sec.gov/submissions/{name}"
        rr = requests.get(url, headers=_headers(), timeout=60)
        if rr.status_code == 200:
            _collect(rr.json(), rows)
        else:
            # Best effort: keep going, but make the gap visible.
            print(f"[WARN] Could not fetch {url} (HTTP {rr.status_code}); older filings may be missing")
        time.sleep(REQ_SLEEP)

    # date filter inclusive
    rows = [
        row for row in rows
        if sd <= datetime.strptime(row["filed_at"], "%Y-%m-%d").date() <= ed
    ]

    # de-dup by accession (first occurrence wins); sort oldest -> newest
    unique: Dict[str, Dict[str, str]] = {}
    for row in sorted(rows, key=lambda x: x["filed_at"]):
        unique.setdefault(row["accession"], row)
    return list(unique.values())

def normalize_edgar_url(url: str) -> str:
    """Unwrap an inline-XBRL viewer link (/ix?doc=/Archives/...) to its document URL.

    When the URL path is "/ix" (case-insensitive) and a non-empty "doc" query
    parameter is present, that target is returned; a target starting with "/"
    is prefixed with https://www.sec.gov. Every other URL, and any URL that
    cannot be parsed, is returned unchanged.
    """
    from urllib.parse import parse_qs, urlparse

    try:
        parts = urlparse(url)
        if parts.path.lower() == "/ix":
            target = parse_qs(parts.query).get("doc", [None])[0]
        else:
            target = None
    except Exception:
        return url

    if not target:
        return url
    if target.startswith("/"):
        return "https://www.sec.gov" + target
    return target

def fetch_html(url: str) -> str:
    """Download a filing document and return its body as text.

    The URL is first passed through normalize_edgar_url. After a successful
    response the function sleeps for REQ_SLEEP seconds before returning.

    Raises:
        requests.HTTPError: if the response carries an error status.
    """
    response = requests.get(
        normalize_edgar_url(url),
        headers=_headers(),
        timeout=90,
        allow_redirects=True,
    )
    response.raise_for_status()
    time.sleep(REQ_SLEEP)
    return response.text

# =========================
# Parsing by form
# =========================
# Matches an item marker such as "Item 1", "Item 1A" or "Item 2.02"
# (case-insensitive). Note that the optional letter suffix accepts only "A";
# a heading like "Item 1B" is not matched by this pattern.
ITEM_RE = re.compile(r"\bItem\s+\d+[A]?(?:\.\d+)?\b", re.IGNORECASE)  # supports 8-K item 2.02 etc.

def _parse_items_10q(html: str) -> List[Dict[str, Any]]:
    """Split a 10-Q into one section per "Item ..." heading.

    Headings are the TopSectionTitle nodes whose text contains an item marker;
    when there are none, any node whose text starts with an item marker is
    used instead. Each section spans from its heading up to the next heading
    (or the end of the document). Sections with no text are dropped.

    Returns:
        Dicts with "section_id", "section_title", "html" and "text".
    """
    semantic_elements = sp.Edgar10QParser().parse(html)
    nodes = list(TreeBuilder().build(semantic_elements).nodes)

    def node_text(node) -> str:
        return (node.text or "").strip()

    headings = [
        (pos, node_text(node))
        for pos, node in enumerate(nodes)
        if isinstance(node.semantic_element, TopSectionTitle)
        and ITEM_RE.search(node_text(node))
    ]
    if not headings:
        headings = [
            (pos, node_text(node))
            for pos, node in enumerate(nodes)
            if ITEM_RE.match(node_text(node))
        ]

    # A section ends where the next heading starts; the last one ends at EOF.
    stops = [pos for pos, _ in headings[1:]] + [len(nodes)]

    sections = []
    for (start, title), stop in zip(headings, stops):
        span = nodes[start:stop]

        fragments = []
        for node in span:
            try:
                fragments.append(node.get_source_code(pretty=False))
            except Exception:
                # Best effort: nodes whose source cannot be produced are skipped.
                continue
        plain_lines = [line for line in map(node_text, span) if line]

        markup = "".join(fragments).strip()
        if markup:
            body = BeautifulSoup(markup, "html.parser").get_text(separator="\n").strip()
        else:
            # No markup at all: fall back to the nodes' plain text.
            body = "\n".join(plain_lines).strip()

        id_match = re.search(r"\bItem\s+(\d+[A]?(?:\.\d+)?)\b", title, flags=re.IGNORECASE)
        clean_title = re.sub(r"\s+", " ", title.replace("\xa0", " ")).strip()
        body = re.sub(r"\n{3,}", "\n\n", body.replace("\xa0", " ")).strip()
        if not body:
            continue

        sections.append({
            "section_id": id_match.group(1).upper() if id_match else None,
            "section_title": clean_title,
            "html": markup or clean_title,
            "text": body
        })

    return sections

def _get_steps_10k_generic():
    """
    Build a parsing pipeline without the 10-Q specific top-section logic.

    Starts from Edgar10QParser's default steps, drops every
    TopSectionManagerFor10Q step, and replaces every
    IndividualSemanticElementExtractor with one whose checks exclude
    TopSectionTitleCheck.
    """
    def checks_without_top_section_title():
        default_checks = sp.Edgar10QParser().get_default_single_element_checks()
        return [
            check for check in default_checks
            if not isinstance(check, TopSectionTitleCheck)
        ]

    pipeline = []
    for step in sp.Edgar10QParser().get_default_steps():
        if isinstance(step, TopSectionManagerFor10Q):
            continue
        if isinstance(step, IndividualSemanticElementExtractor):
            step = IndividualSemanticElementExtractor(
                get_checks=checks_without_top_section_title
            )
        pipeline.append(step)
    return pipeline

# 10-K item heading at the start of a node's text: "Item 1", "Item 1A",
# "Item 1B", "Item 9C", ... Any single letter suffix is accepted so that
# lettered items other than "A" are recognized as headings as well.
_ITEM_10K_RE = re.compile(r"\bItem\s+(\d+[A-Z]?)\b", re.IGNORECASE)


def _parse_items_10k(html: str) -> List[Dict[str, Any]]:
    """Parse a 10-K by skipping 10-Q-specific steps, then split by Item titles.

    Every node whose text starts with an item heading opens a new section that
    runs up to the next heading (or the end of the document). Sections with no
    text are dropped.

    Returns:
        Dicts with "section_id" (e.g. "1A", "9B"), "section_title", "html" and
        "text".
    """
    parser = sp.Edgar10QParser(get_steps=_get_steps_10k_generic)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Invalid section type for")
        elements = parser.parse(html)

    tree = TreeBuilder().build(elements)
    nodes = list(tree.nodes)

    # Find Item titles, remembering the id captured by the heading match.
    item_idx = []
    for i, node in enumerate(nodes):
        title = (node.text or "").strip()
        heading = _ITEM_10K_RE.match(title)
        if heading:
            item_idx.append((i, title, heading.group(1).upper()))

    sections = []
    for j, (start_i, title, section_id) in enumerate(item_idx):
        end_i = item_idx[j + 1][0] if j + 1 < len(item_idx) else len(nodes)
        html_parts, text_parts = [], []

        for n in nodes[start_i:end_i]:
            try:
                html_parts.append(n.get_source_code(pretty=False))
            except Exception:
                # Best effort: nodes whose source cannot be produced are skipped.
                pass
            t = (n.text or "").strip()
            if t:
                text_parts.append(t)

        html_chunk = "".join(html_parts).strip()
        text_chunk = (
            BeautifulSoup(html_chunk, "html.parser").get_text(separator="\n").strip()
            if html_chunk
            else "\n".join(text_parts).strip()
        )

        title_norm = re.sub(r"\s+", " ", title.replace("\xa0", " ")).strip()
        text_chunk = re.sub(r"\n{3,}", "\n\n", text_chunk.replace("\xa0", " ")).strip()

        if text_chunk:
            sections.append({
                "section_id": section_id,
                "section_title": title_norm,
                "html": html_chunk or title_norm,
                "text": text_chunk
            })

    return sections

def _parse_items_8k(html: str) -> List[Dict[str, Any]]:
    """
    Split an 8-K on headings like 'Item 2.02 Results of Operations and
    Financial Condition'.

    Nodes whose text starts with "Item <n>.<nn>" open a section; when none
    start that way, nodes that contain such a marker anywhere are used
    instead. Sections with no text are dropped.
    """
    # Same generic pipeline as the 10-K parser (no 10-Q top-section manager).
    parser = sp.Edgar10QParser(get_steps=_get_steps_10k_generic)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Invalid section type for")
        semantic_elements = parser.parse(html)

    nodes = list(TreeBuilder().build(semantic_elements).nodes)

    def node_text(node) -> str:
        return (node.text or "").strip()

    marker = r"\bItem\s+\d+\.\d+\b"
    headings = [
        (pos, node_text(node))
        for pos, node in enumerate(nodes)
        if re.match(marker, node_text(node), flags=re.IGNORECASE)
    ]
    if not headings:
        # fallback: any "Item x.xx" found anywhere
        headings = [
            (pos, node_text(node))
            for pos, node in enumerate(nodes)
            if re.search(marker, node_text(node), flags=re.IGNORECASE)
        ]

    # A section ends where the next heading starts; the last one ends at EOF.
    stops = [pos for pos, _ in headings[1:]] + [len(nodes)]

    sections = []
    for (start, title), stop in zip(headings, stops):
        span = nodes[start:stop]

        fragments = []
        for node in span:
            try:
                fragments.append(node.get_source_code(pretty=False))
            except Exception:
                # Best effort: nodes whose source cannot be produced are skipped.
                continue
        plain_lines = [line for line in map(node_text, span) if line]

        markup = "".join(fragments).strip()
        if markup:
            body = BeautifulSoup(markup, "html.parser").get_text(separator="\n").strip()
        else:
            body = "\n".join(plain_lines).strip()

        id_match = re.search(r"\bItem\s+(\d+\.\d+)\b", title, flags=re.IGNORECASE)
        clean_title = re.sub(r"\s+", " ", title.replace("\xa0", " ")).strip()
        body = re.sub(r"\n{3,}", "\n\n", body.replace("\xa0", " ")).strip()
        if not body:
            continue

        sections.append({
            "section_id": id_match.group(1) if id_match else None,
            "section_title": clean_title,
            "html": markup or clean_title,
            "text": body
        })

    return sections

def _parse_whole_doc(html: str) -> List[Dict[str, Any]]:
    """Return the whole document as one section (used for Forms 3/4/5).

    The text is extracted with BeautifulSoup, non-breaking spaces become plain
    spaces, and runs of three or more newlines collapse to two. An empty list
    is returned when no text remains.
    """
    extracted = BeautifulSoup(html, "html.parser").get_text(separator="\n").strip()
    extracted = re.sub(r"\n{3,}", "\n\n", extracted.replace("\xa0", " "))

    if extracted:
        return [{
            "section_id": "ALL",
            "section_title": "Entire Document",
            "html": html,
            "text": extracted
        }]
    return []

def parse_sections_by_form(html: str, form: str) -> List[Dict[str, Any]]:
    """Pick the section parser for a form type and run it on the document.

    The form is upper-cased and matched by prefix: "10-Q", "10-K" and "8-K"
    each have an item-based parser. Everything else, including Forms 3/4/5,
    is treated as a single whole-document section.
    """
    form_key = form.upper()
    item_parsers = (
        ("10-Q", _parse_items_10q),
        ("10-K", _parse_items_10k),
        ("8-K", _parse_items_8k),
    )
    for prefix, parse in item_parsers:
        if form_key.startswith(prefix):
            return parse(html)
    # Forms 3/4/5 and any unrecognized form.
    return _parse_whole_doc(html)

# =========================
# Chunk, embed, upsert (metadata inside embedding text)
# =========================
# def chunk_text(s: str, max_chars: int = 1800, overlap: int = 200) -> List[str]:
#     s = s.strip()
#     if len(s) <= max_chars:
#         return [s]
    
#     chunks, start = [], 0
#     while start < len(s):
#         end = min(len(s), start + max_chars)
#         split = s.rfind("\n\n", start, end)
#         if split == -1 or split <= start + 200:
#             split = end
#         chunks.append(s[start:split].strip())
#         if split == len(s):
#             break
#         start = max(split - overlap, split)
    
#     return [c for c in chunks if c]

# def _trim_to_tokens(text: str, tokenizer, max_tokens: int) -> str:
#     enc = tokenizer(text, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
#     ids = enc["input_ids"]
#     if len(ids) <= max_tokens:
#         return text
#     trimmed = tokenizer.decode(ids[:max_tokens], skip_special_tokens=True, clean_up_tokenization_spaces=True)
#     return trimmed.strip()

def build_chroma(persist_dir: str = CHROMA_DIR):
    """Create a Chroma client and open (or create) the "sec_multi_forms" collection.

    NOTE(review): persist_dir is accepted but never used; chromadb.Client() is
    called without a path, so the data is presumably not written to that
    directory -- confirm against the chromadb version in use.

    Returns:
        (client, collection)
    """
    chroma_client = chromadb.Client()
    return chroma_client, chroma_client.get_or_create_collection(name="sec_multi_forms")
    
def upsert_sections_to_chroma(
    coll,
    sections: List[Dict[str, Any]],
    ticker: str,
    filed_at: str,
    form: str,
    model: SentenceTransformer
) -> int:
    """Summarize, embed and upsert the sections of one filing.

    For each section an LLM summary is requested; sections without one are
    skipped. For every summarized part a JSON artifact is written to
    ARTIFACT_DIR, and the text "<meta header> / <metadata> / <summary>" is
    embedded and upserted into the collection.

    Args:
        coll: Chroma collection to upsert into.
        sections: Parsed section dicts (reads "section_id", "section_title").
        ticker: Ticker symbol stored in metadata and ids.
        filed_at: Filing date string stored in metadata and ids.
        form: Form type stored in metadata and ids.
        model: Embedding model exposing encode().

    Returns:
        Number of chunks upserted.
    """
    import hashlib

    def _fs_safe(value: Any) -> str:
        # Forms such as "10-K/A" contain "/", which must not reach a file name.
        return re.sub(r"[^A-Za-z0-9._-]+", "-", str(value))

    ids, docs, metas, embs = [], [], [], []
    uid = 0

    for s in sections:
        try:
            annotated_parts = annotate_and_split_with_llm(
                section=s,
                ticker=ticker,
                filed_at=filed_at,
                form=form,
                model_name=GEMINI_MODEL
            )
        except Exception as exc:
            # Best effort per section, but never silently.
            print(
                f"[ERROR] Annotation failed for {ticker} {form} {filed_at} "
                f"section {s.get('section_id')}: {exc}"
            )
            annotated_parts = []

        # Sections without an LLM summary are skipped entirely.
        for part in annotated_parts:
            header = part["meta_header"].strip()
            part_text = part["text"].strip()
            part_summary = (part.get("summary") or "").strip()
            if not part_summary:
                print(f"[INFO] No summary found for: {header}")

            meta = {
                "ticker": ticker,
                "form": form,
                "filed_at": filed_at,
                "section_id": s.get("section_id"),
                "section_title": s.get("section_title")
            }

            # Several filings of the same form can share a filing date (e.g.
            # multiple Form 4s), and the counter restarts on every call. A
            # short content digest keeps their ids and artifact files from
            # overwriting each other while staying stable across re-runs.
            digest = hashlib.sha1(part_text.encode("utf-8")).hexdigest()[:10]
            chunk_id = f"{ticker}_{form}_{filed_at}_{s.get('section_id')}_{uid:06d}_{digest}"

            # --- Save chunk metadata, summary, and text before embedding ---
            artifact = {
                "meta_header": header,
                "meta": meta,
                "summary": part_summary,
                "text": part_text
            }
            artifact_path = os.path.join(ARTIFACT_DIR, f"{_fs_safe(chunk_id)}_chunk.json")
            with open(artifact_path, "w", encoding="utf-8") as af:
                json.dump(artifact, af, indent=2, ensure_ascii=False)

            doc_text = f"{header}\n\n{meta}\n\n{part_summary}".strip()
            emb = model.encode(doc_text, normalize_embeddings=True).tolist()

            ids.append(chunk_id)
            docs.append(doc_text)
            metas.append(meta)
            embs.append(emb)
            uid += 1

    if ids:
        coll.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
    return len(ids)


# =========================
# RUN: iterate tickers & forms over date range
# =========================
client, collection = build_chroma(CHROMA_DIR)
model = SentenceTransformer(EMBED_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)

# Directory to store artifacts
ARTIFACT_DIR = "form_artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

total = 0
for tkr in TICKERS:
    filings = list_filings_in_range(tkr, FORMS, START_DATE, END_DATE)
    print(f"\n{tkr}: {len(filings)} filings from {START_DATE} to {END_DATE}")

    for f in filings:
        print("  ", f["filed_at"], f["form"], "→", f["url"])
        html = fetch_html(f["url"])

        # File-name stem for this filing's artifacts. The accession number
        # keeps same-day filings of the same form apart, and any character
        # that is unsafe in a file name (e.g. the "/" in "10-K/A") is replaced.
        stem = re.sub(
            r"[^A-Za-z0-9._-]+",
            "-",
            f"{tkr}_{f['form']}_{f['filed_at']}_{f['accession']}",
        )

        # --- Save raw HTML (pre-parsing) ---
        raw_path = os.path.join(ARTIFACT_DIR, f"{stem}_raw.html")
        with open(raw_path, "w", encoding="utf-8") as fh:
            fh.write(html)

        sections = parse_sections_by_form(html, f["form"])

        # --- Save parsed sections (post-parsing) ---
        parsed_path = os.path.join(ARTIFACT_DIR, f"{stem}_sections.json")
        with open(parsed_path, "w", encoding="utf-8") as fp:
            json.dump(sections, fp, indent=2, ensure_ascii=False)

        # --- Summarize, embed and upsert; accumulate the chunk count ---
        total += upsert_sections_to_chroma(collection, sections, tkr, f["filed_at"], f["form"], model)
print(f"\nDONE. Total chunks upserted: {total}. DB: {CHROMA_DIR}")



import os
import json
from typing import List, Dict, Any, Optional
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import chromadb
import google.generativeai as genai

# Reuse the same settings used at index time
# NOTE(review): CHROMA_DIR is not read by load_chroma below -- confirm whether
# the query side is meant to open this directory.
CHROMA_DIR = "./chroma_sec_multi"
# Model used to embed the user query in retrieve_top_k.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Model used by gemini_answer.
GEMINI_MODEL = "gemini-2.5-flash"   # or "gemini-1.5-pro"
# Default number of hits for retrieve_top_k; gemini_answer also sends at most
# this many context blocks to the model.
TOP_K = 5

def load_chroma():
    """Open (or create) the "sec_multi_forms" collection on a default Chroma client.

    NOTE(review): the client is created without a path, so this presumably
    only sees data indexed in the same process -- confirm.
    """
    return chromadb.Client().get_or_create_collection(name="sec_multi_forms")

import functools


@functools.lru_cache(maxsize=4)
def _load_embedding_model(model_name: str) -> SentenceTransformer:
    """Load a SentenceTransformer once per model name and reuse it afterwards."""
    return SentenceTransformer(model_name)


def retrieve_top_k(coll, query: str, k: int = TOP_K):
    """Return the k stored chunks closest to the query.

    The query is embedded with the model named by EMBED_MODEL_NAME (loaded
    once and cached, so repeated queries do not reload it) and sent to the
    collection.

    Args:
        coll: Chroma collection to search.
        query: Free-text user query.
        k: Number of results to request.

    Returns:
        A list of dicts with "id", "text", "meta" and "distance", sorted by
        ascending distance as reported by the collection (smaller is better).
        Empty when the collection returns no documents.
    """
    encoder = _load_embedding_model(EMBED_MODEL_NAME)
    q_emb = encoder.encode(query, normalize_embeddings=True).tolist()
    res = coll.query(query_embeddings=[q_emb], n_results=k)

    hits = []
    if res and res.get("documents"):
        # One query was sent, so each result column has a single row at [0].
        rows = zip(
            res["ids"][0],
            res["documents"][0],
            res["metadatas"][0],
            res["distances"][0],
        )
        hits = [
            {"id": hit_id, "text": text, "meta": meta, "distance": distance}
            for hit_id, text, meta, distance in rows
        ]
    hits.sort(key=lambda hit: hit["distance"])
    return hits

def gemini_answer(query: str, contexts: List[Dict[str, Any]]) -> str:
    """Answer a question with Gemini using only the supplied excerpts.

    Args:
        query: The user's question.
        contexts: Retrieved hits; each needs "text" and may carry "meta" with
            "ticker", "form", "filed_at", "section_id" and "section_title".
            At most TOP_K of them are sent to the model.

    Returns:
        The model's answer text.

    Raises:
        RuntimeError: if no API key is configured.
    """
    # Fall back to the environment so the error message below is accurate.
    api_key = GOOGLE_API_KEY or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("Set GOOGLE_API_KEY env var for Gemini.")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(GEMINI_MODEL)

    blocks = []
    for c in contexts:
        m = c.get("meta") or {}
        # Only fields that are actually stored at index time go into the tag.
        tag_bits = [m.get("ticker"), m.get("form"), m.get("filed_at"), m.get("section_title")]
        if m.get("section_id"):
            tag_bits.append(f"section {m.get('section_id')}")
        tag = "[" + " | ".join(str(bit) for bit in tag_bits) + "]"
        blocks.append(f"{tag}\n{c['text']}\n")

    # Each instruction ends with an explicit separator so adjacent sentences
    # and the question header do not run together.
    prompt = (
        "You are a financial research assistant.\n"
        "Use the provided SEC excerpts to answer the user question. "
        "Use no data except the one provided in the context. "
        "Cite the section titles inline, and be concise. If uncertain, say so.\n"
        "whenever numerical values need to be analysed or compared give output in markdown tabular format\n\n"
        f"USER QUESTION:\n{query}\n\n" +
        "\n".join(f"CONTEXT {i+1}:\n{blk}" for i, blk in enumerate(blocks[:TOP_K]))
    )

    resp = model.generate_content(prompt)
    return resp.text

# ======= RUN THIS CELL MULTIPLE TIMES =======
# Rebinds the module-level `collection` to the query-side collection.
collection = load_chroma()

# Free-text question; it is used both for retrieval and in the LLM prompt.
user_query = "khan sabih initial rsu holding data"
# Ten hits are retrieved and printed for inspection...
hits = retrieve_top_k(collection, user_query, k=10)
print("Top hits:\n", json.dumps(hits, indent=2))

# ...but only the two closest ones are passed to the model as context.
answer = gemini_answer(user_query, hits[:2])
print("\n=== GEMINI ANSWER ===\n")
print(answer)