In [13]:
import boto3
from botocore.client import Config
from dotenv import load_dotenv
import os, re, math
from typing import List, Tuple, Dict, Optional
import uuid
from pathlib import Path
import mimetypes
from supabase import create_client, Client
from postgrest import APIError 
import base64
from openai import OpenAI
from pdf2image import convert_from_path
from PIL import Image
import pypdfium2 as pdfium


load_dotenv(override=True)

True

In [14]:
BASE_DIR = Path.cwd()
endpoint_url=os.getenv("B2_ENDPOINT_URL")
aws_access_key_id=os.getenv("B2_KEY_ID")
aws_secret_access_key=os.getenv("B2_APP_KEY")
bucket_name = os.getenv("B2_BUCKET_NAME")
user_id   = os.getenv("B2_USER_ID")
region_name  = os.getenv("B2_REGION")

In [15]:
s3 = boto3.client(
    "s3",
    endpoint_url=endpoint_url,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    config=Config(signature_version="s3v4", s3={"addressing_style": "virtual"}),
)

In [16]:
# Helper: generate sensible keys for uploaded files
def make_report_key(user_id: str, report_id: str, filename: str, subfolder: str = "source") -> str:
    """
    Build an object key like:
    acct/{user_id}/reports/{report_id}/{subfolder}/{filename}
    """
    return f"acct/{user_id}/reports/{report_id}/{subfolder}/{filename}"

In [17]:
def get_presigned_url(user_id: str, report_id: str, filename: str, expires_in: int = 900) -> str:
    """
    Generate a presigned URL so the client can download/view the file directly.
    """
    key = make_report_key(user_id, report_id, filename)
    url = s3.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket_name, "Key": key},
        ExpiresIn=expires_in
    )
    print(f"🔗 Presigned URL (valid {expires_in}s): {url}")
    return url

In [18]:
def infer_content_type(filename: str) -> str:
    ctype, _ = mimetypes.guess_type(filename)
    if ctype:
        return ctype
    ext = Path(filename).suffix.lower()
    return {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".pdf": "application/pdf",
    }.get(ext, "application/octet-stream")

def infer_extension(local_path: str) -> str:
    ext = Path(local_path).suffix.lstrip(".") 
    return ext

In [19]:
supabase_url = os.getenv("SUPABASE_URL")
supabase_service_role_key = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
supabase_default_account_id = os.getenv("SUPABASE_DEFAULT_ACCOUNT_ID")

supabase: Client = create_client(supabase_url, supabase_service_role_key)

In [20]:
def upload_report(user_id: str, local_path: str, report_id: str, filename: str, content_type: str) -> str:
    key = make_report_key(user_id, report_id, filename, subfolder="source")
    with open(local_path, "rb") as f:
        s3.put_object(
            Bucket=bucket_name,
            Key=key,
            Body=f,
            ContentType=content_type,
            Metadata={"report-id": report_id},
        )
    print(f"✅ Uploaded → {key}")
    return key

In [21]:
client = OpenAI()

def pdf_to_images_pypdfium2(pdf_path: str, dpi: int = 200):
    pdf = pdfium.PdfDocument(str(pdf_path))
    scale = dpi / 72.0  # PDF base is 72 dpi
    images = []
    for i in range(len(pdf)):
        page = pdf[i]
        pil = page.render(scale=scale).to_pil()   # PIL.Image
        images.append(pil.convert("RGB"))
    pdf.close()
    return images

def encode_image_to_base64(img: Image.Image) -> str:
    """Convert PIL image to base64 string."""
    from io import BytesIO
    buf = BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")

def ocr_page(image: Image.Image) -> str:
    """Send one image to OpenAI for OCR."""
    b64 = encode_image_to_base64(image)
    resp = client.chat.completions.create(
        model="gpt-4o-mini",  # cheap + handles handwriting
        messages=[
            {"role": "system", "content": "You are an OCR engine. Output the text exactly as written."},
            {"role": "user", "content": [
                {"type": "text", "text": "Extract the text from this document page. Only return the extracted text and nothing else"},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
            ]},
        ],
        temperature=0
    )
    return resp.choices[0].message.content.strip()

def ocr_file(file_path: str) -> str:
    """Auto-detect type and OCR accordingly."""
    path = Path(file_path)
    ext = path.suffix.lower()

    texts = []

    if ext == ".pdf":
        #pages = convert_from_path(file_path, dpi=200)
        pages = pdf_to_images_pypdfium2(file_path, dpi=200)
        for i, page in enumerate(pages, start=1):
            print(f"OCR page {i}/{len(pages)}...")
            texts.append(ocr_page(page))
    else:
        img = Image.open(file_path).convert("RGB")
        texts.append(ocr_page(img))

    return "\n\n--- PAGE BREAK ---\n\n".join(texts)

In [22]:
def add_database_report(account_id: str, report_id: str, filename: str, mime_type: str, size_bytes: int) -> None:
    try:
        payload = {
        "id": report_id,
        "account_id": account_id,
        "filename": filename,
        "mime_type": mime_type,
        "size_bytes": size_bytes,
        "upload_status": "uploaded",
        "ocr_status": "queued",
        }
        supabase.table("reports").insert(payload).execute()
        print(f"🗄️  Report metadata added to database (ID: {report_id})")
    except APIError as e:
        print("❌ Failed to add report metadata to database:", e)

In [23]:
# connect vector store
EMBED_MODEL = "text-embedding-3-small"  # 1536 dims

# ---- tiny splitter (roughly 700–900 tokens per chunk if English-like)
def simple_chunk(text: str, max_chars: int = 3500, overlap: int = 200) -> List[str]:
    text = re.sub(r'\s+', ' ', text).strip()
    chunks = []
    i = 0
    while i < len(text):
        end = min(i + max_chars, len(text))
        # try to break at sentence boundary near the end
        cut = text.rfind(". ", i, end)
        if cut == -1 or cut < i + int(max_chars * 0.6):
            cut = end
        else:
            cut += 1  # include the period
        chunks.append(text[i:cut].strip())
        i = max(cut - overlap, cut)  # ensure progress if overlap > chunk
    return [c for c in chunks if c]


def embed_texts(texts: List[str]) -> List[List[float]]:
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
    return [d.embedding for d in resp.data]


def upsert_chunks(
    account_id: str, report_id: str, page_no: int | None, chunks: List[str]
) -> None:
    if not chunks:
        return
    embs = embed_texts(chunks)
    rows = []
    for idx, (c, e) in enumerate(zip(chunks, embs)):
        rows.append({
            "account_id": account_id,
            "report_id": report_id,
            "page_no": page_no,
            "chunk_no": idx,
            "content": c,
            "content_tokens": None,   # fill if you count tokens later
            "embedding": e,
        })
    # Supabase can insert lists of JSON rows directly
    supabase.table("report_chunks").upsert(rows, on_conflict="report_id,chunk_no").execute()
    

# Example: from your OCR text (whole doc) → chunk → embed → store
def index_ocr_text(account_id: str, report_id: str, full_text: str, page_map: List[Tuple[int, str]] | None = None):
    """
    If you have per-page OCR, pass page_map=[(1, text1), (2, text2)...].
    Otherwise pass full_text and leave page_map=None.
    """
    if page_map:
        for page_no, page_text in page_map:
            chunks = simple_chunk(page_text)
            upsert_chunks(account_id, report_id, page_no, chunks)
    else:
        chunks = simple_chunk(full_text)
        upsert_chunks(account_id, report_id, None, chunks)
    print("✅ Upserting embeddings to database completed")

In [24]:
def prepare_and_upload_report(account_id: str, file_path: str):
    
    path = Path(file_path)
    report_id = str(uuid.uuid4())
    filename = "file." + infer_extension(file_path)
    mime_type = infer_content_type(filename)
    size_bytes = path.stat().st_size

    add_database_report(account_id, report_id, filename, mime_type, size_bytes)
    upload_report(user_id=user_id, local_path=file_path, report_id=report_id, filename=filename, content_type=mime_type)
    full_text = ocr_file(file_path)
    index_ocr_text(supabase_default_account_id, report_id, full_text)

    return report_id

In [25]:
img_path = BASE_DIR / "resources" / "sample_report.png"
pdf_path = BASE_DIR / "resources" / "sample_report.pdf"
report_id = prepare_and_upload_report(supabase_default_account_id, pdf_path)
url = get_presigned_url(user_id, report_id, "file." + infer_extension(pdf_path), expires_in=600)  # 10 minutes
print("Download from:", url)

🗄️  Report metadata added to database (ID: fe41627e-82fe-4121-8fb0-49d24b9fb130)
✅ Uploaded → acct/tester/reports/fe41627e-82fe-4121-8fb0-49d24b9fb130/source/file.pdf
OCR page 1/1...
✅ Upserting embeddings to database completed
🔗 Presigned URL (valid 600s): https://medical-reports.s3.us-east-005.backblazeb2.com/acct/tester/reports/fe41627e-82fe-4121-8fb0-49d24b9fb130/source/file.pdf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=005704149b704450000000002%2F20250920%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250920T055037Z&X-Amz-Expires=600&X-Amz-SignedHeaders=host&X-Amz-Signature=629ac37e287e8984089c7184700ff9bdb70eda7aaec0f271d4fc8e7c4a9246a4
Download from: https://medical-reports.s3.us-east-005.backblazeb2.com/acct/tester/reports/fe41627e-82fe-4121-8fb0-49d24b9fb130/source/file.pdf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=005704149b704450000000002%2F20250920%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250920T055037Z&X-Amz-Expires=600&X-Amz-SignedHeaders=host&X-Amz-Sign

In [26]:
def embed_query(q: str) -> list[float]:
    return client.embeddings.create(
        model="text-embedding-3-small", input=[q]
    ).data[0].embedding

def search_chunks(sb, account_id: str, query: str, k: int = 5, report_id: str | None = None):
    q_emb = embed_query(query)
    resp = sb.rpc(
        "search_report_chunks_json_simple",
        {"p_account_id": account_id, "p_query": q_emb, "p_limit": k, "p_report_id": report_id}
    ).execute()
    return resp.data

In [27]:
def get_context_from_embeddings(
    supabase,
    account_id: str,
    query: str,
    report_id: Optional[str],
    *,
    k: int = 5,
    sim_threshold: float = 0.10,     # tweak: 0.30 (weak) · 0.50 (medium) · 0.70 (strong)
    max_snippet: int = 1200,
    max_total_chars: int = 4000,     # safety cap on total context
    min_hits: int = 1,               # require at least this many hits above threshold
) -> str:
    rows: List[Dict] = search_chunks(supabase, account_id, query, k=k, report_id=report_id) or []

    # keep only rows with similarity >= threshold
    good: List[Dict] = []
    for r in rows:
        try:
            sim = float(r.get("similarity", 0.0))
        except (TypeError, ValueError):
            sim = 0.0
        if sim >= sim_threshold:
            good.append(r)

    if len(good) < min_hits:
        return ""  # << no usable context

    parts: List[str] = []
    for r in good:
        sim = float(r.get("similarity", 0.0))
        page = r.get("page_no")
        page_str = str(page) if page is not None else "?"
        snippet = (r.get("content") or "").strip()
        if len(snippet) > max_snippet:
            snippet = snippet[:max_snippet] + " …"
        parts.append(f"[page {page_str} | sim {sim:.3f}] {snippet}")

    context = "\n\n".join(parts)
    if len(context) > max_total_chars:
        context = context[:max_total_chars] + " …"
    return context

In [28]:
context = get_context_from_embeddings(supabase, supabase_default_account_id, "Does this mention balance?", report_id=report_id)
print(context)

[page ? | sim 0.205] ``` MyPrime Simpletree Anarkali, Holding No. 89, Plot No. 03, Block: CWS(A), Gulshan Avenue, Dhaka North City Corporation, Bangladesh. TRANSACTION RECEIPT Other Bank Transfer(NPSB) Source Account/Card 2125214023710 Amount BDT 3892.00 Transaction Date Time 2025-09-17 09:45:16 Narration September 2025 due bill Transaction Info To Account/Card No. 1114112000000042 Reference No. 526009452078 Total 3892.00 This is an iBanking generated e-receipt and does not require any signature ```


In [29]:
context = get_context_from_embeddings(supabase, supabase_default_account_id, "Earth Rotation?", report_id=report_id)
print(context)




In [30]:
from typing import Annotated, TypedDict, List
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from dotenv import load_dotenv
from IPython.display import display, Markdown, Image
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from langgraph.prebuilt import ToolNode, tools_condition
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import Tool
import json
import uuid

import random
import gradio as gr

load_dotenv()

True

In [45]:
def search_medical_documents(query: str) -> str:
    """Useful when you need to look for information in the provided medical documents.
    Input should be a fully formed question."""
    return get_context_from_embeddings(supabase, supabase_default_account_id, query, report_id=report_id) or "No relevant information found in the documents."

tool_search_docs = Tool(
    name="search_medical_documents",
    description="Useful for when you need to look for information in the medical documents provided"
                "The input to this tool should be a fully formed question.",
    func=search_medical_documents
)

tools = [tool_search_docs]

In [46]:
llm = ChatOpenAI(model="gpt-4o-mini")
llm = llm.bind_tools(tools=tools)
system_prompt = """
    You are a medical assistant. Based on the user message, you will decide whether to response normally or to look for information
    in the medical documents provided. When relevant be sure to look into the documents to provide accurate information. 
    """
prompt = ChatPromptTemplate.from_messages([
    SystemMessage(content=system_prompt),
    MessagesPlaceholder(variable_name="messages")
    ])

chain = prompt | llm

class State(TypedDict):
    messages: Annotated[List[BaseMessage], add_messages]


def log_messages(messages: List[BaseMessage]) -> None:
    for message in messages:
        if isinstance(message, HumanMessage):
            print(f"Human: {message.content}")
        elif isinstance(message, AIMessage):
            print(f"AI: {message.content}")
        elif isinstance(message, SystemMessage):
            print(f"System: {message.content}")

In [47]:
def advisor_node(old_state: State) -> State:
    response = chain.invoke(old_state["messages"])
    print("________________________")
    print(log_messages(old_state["messages"] + [response]))
    return {"messages": [response]}

In [49]:
graph_builder = StateGraph(State)

graph_builder.add_node("advisor", advisor_node)
graph_builder.add_node("tools", ToolNode(tools=tools))

graph_builder.add_edge(START, "advisor")
graph_builder.add_conditional_edges("advisor", tools_condition, "tools")
graph_builder.add_edge("tools", "advisor")
graph_builder.add_edge("advisor", END)

memory = MemorySaver()
graph = graph_builder.compile(checkpointer=memory)
#display(Image(graph.get_graph().draw_mermaid_png()))

def make_thread_id() -> str:
    return str(uuid.uuid4())

config = {"configurable": {"thread_id": make_thread_id()}}

def chat(user_message, history):
    out = graph.invoke({"messages": [HumanMessage(content=user_message)]}, config=config)
    return out["messages"][-1].content

gr.ChatInterface(fn=chat, title="LangGraph LLM Test").launch()

  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7872
* To create a public link, set `share=True` in `launch()`.




________________________
Human: Hey
AI: Hello! How can I assist you today?
None
________________________
Human: Hey
AI: Hello! How can I assist you today?
Human: what can you do
AI: I can help you with a variety of tasks related to medical information and assistance. Some examples include:

1. Answering general health questions.
2. Providing information about medical terms and conditions.
3. Looking up specific medical documents or guidelines for accurate information.
4. Offering tips on wellness and preventative care.

Let me know if you have a specific question or topic in mind!
None
________________________
Human: Hey
AI: Hello! How can I assist you today?
Human: what can you do
AI: I can help you with a variety of tasks related to medical information and assistance. Some examples include:

1. Answering general health questions.
2. Providing information about medical terms and conditions.
3. Looking up specific medical documents or guidelines for accurate information.
4. Offering ti

In [None]:
### pdf and jpg image upload done
### make sure any kind of image can be uploaded
### refactor to reduce code duplication
### connect database supabase with vectorstore. Compare Supabase and Vector DBs (FAISS, Pinecone) + SQL
### double check industry standard
### OCR of pdf and images
### connect vector store
### connect langgraph
# convert to .py industry standard
# create front end (react)
# connect with python in an industry standard way (check FastAPI)
# DONE!!