In [34]:
# Cell 1 — Install
# - Installs required libraries
# - Ensures compatible versions for embeddings, FAISS, and UI
%pip install -q --upgrade sentence-transformers faiss-cpu gradio
%pip install -q "numpy==2.0.2" "pandas==2.2.2"

In [35]:
# Cell 2 — Imports
# - Imports Python libraries used across the notebook
# - Keeps all dependencies centralized
import re
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer

In [36]:
# Cell 3 — Dataset (Nagpur snapshot)
# - Loads real Nagpur doctor / hospital data from a CSV
# - Converts raw records into question–answer format
# - Attaches metadata (area, specialty, 24x7 flag)
nagpur_areas = [
    "Dharampeth, Nagpur",
    "Sitabuldi, Nagpur",
    "Manish Nagar, Nagpur",
    "Bajaj Nagar, Nagpur",
    "Trimurti Nagar, Nagpur",
    "Sadar, Nagpur",
    "Civil Lines, Nagpur",
    "Wardhaman Nagar, Nagpur",
    "Nandanvan, Nagpur",
    "Mankapur, Nagpur",
    "Hingna, Nagpur",
    "Mahal, Nagpur",
]

specialties = [
    ("fever, cold and cough", "General Physician", False),
    ("stomach pain or acidity", "Gastroenterologist", False),
    ("heart issues or chest pain", "Cardiologist", True),
    ("skin problems or acne", "Dermatologist", False),
    ("children vaccination or fever", "Pediatrician", True),
    ("bones and joint pain", "Orthopedic Specialist", False),
    ("ear pain or throat infection", "ENT Specialist", False),
    ("eyes checkup or blurred vision", "Ophthalmologist", False),
    ("diabetes or sugar problem", "Endocrinologist/Diabetologist", False),
    ("stress, anxiety or panic", "Psychiatrist", True),
]

# bilingual specialty labels (no LLM translation)
specialty_hindi = {
    "General Physician": "General Physician (सामान्य चिकित्सक)",
    "Gastroenterologist": "Gastroenterologist (पेट रोग विशेषज्ञ)",
    "Cardiologist": "Cardiologist (हृदय रोग विशेषज्ञ)",
    "Dermatologist": "Dermatologist (त्वचा रोग विशेषज्ञ)",
    "Pediatrician": "Pediatrician (बाल रोग विशेषज्ञ)",
    "Orthopedic Specialist": "Orthopedic Specialist (हड्डी व जोड़ विशेषज्ञ)",
    "ENT Specialist": "ENT Specialist (कान, नाक, गला विशेषज्ञ)",
    "Ophthalmologist": "Ophthalmologist (नेत्र रोग विशेषज्ञ)",
    "Endocrinologist/Diabetologist": "Endocrinologist/Diabetologist (डायबिटीज/हार्मोन विशेषज्ञ)",
    "Psychiatrist": "Psychiatrist (मानसिक स्वास्थ्य विशेषज्ञ)",
}

seed = [
    {
        "question": "Best hospital near Sitabuldi Nagpur for emergency",
        "answer": "For emergencies, prefer a multi-speciality hospital with 24x7 emergency services near Sitabuldi, Nagpur. If symptoms are severe, go immediately.",
        "area": "Sitabuldi, Nagpur",
        "specialty": "General Physician",
        "open_24x7": True,
    },
    {
        "question": "Need child doctor in Dharampeth Nagpur for vaccination tomorrow",
        "answer": "For vaccination, consult a Pediatrician in Dharampeth, Nagpur. Prefer a clinic/hospital that supports child vaccination and keeps records.",
        "area": "Dharampeth, Nagpur",
        "specialty": "Pediatrician",
        "open_24x7": False,
    },
    {
        "question": "Heart specialist near Civil Lines Nagpur for chest pain",
        "answer": "For chest pain, consult a Cardiologist near Civil Lines, Nagpur. If pain is severe, sudden, or with breathlessness, treat it as an emergency and seek immediate care.",
        "area": "Civil Lines, Nagpur",
        "specialty": "Cardiologist",
        "open_24x7": True,
    },
    {
        "question": "Skin doctor in Bajaj Nagar Nagpur for acne and rash",
        "answer": "For acne or rash, consult a Dermatologist in Bajaj Nagar, Nagpur. Avoid self-medicating with strong creams without a diagnosis.",
        "area": "Bajaj Nagar, Nagpur",
        "specialty": "Dermatologist",
        "open_24x7": False,
    },
    {
        "question": "ENT specialist near Sadar Nagpur for ear pain",
        "answer": "For ear pain, consult an ENT Specialist near Sadar, Nagpur. If there is fever, discharge, or severe pain, visit promptly.",
        "area": "Sadar, Nagpur",
        "specialty": "ENT Specialist",
        "open_24x7": False,
    },
    {
        "question": "I feel angry all the time, need a doctor in Nagpur",
        "answer": "If anger feels constant or hard to control, consult a Psychiatrist in Nagpur or a qualified counselor. If you feel unsafe or might hurt yourself/others, seek urgent help immediately.",
        "area": "Dharampeth, Nagpur",
        "specialty": "Psychiatrist",
        "open_24x7": True,
    },
]

qa_rows = list(seed)

variants = [
    "I need a doctor for {cond} in {area}",
    "Suggest me a {spec} in {area} for {cond}",
    "Doctor chahiye {cond} ke liye in {area}",
    "mala doctor pahije {cond} sathi {area}",
]

for area in nagpur_areas:
    for cond, spec, open_24x7 in specialties:
        for v in variants:
            qa_rows.append({
                "question": v.format(cond=cond, area=area, spec=spec),
                "answer": f"You should consult a {spec} in {area} for {cond}.",
                "area": area,
                "specialty": spec,
                "open_24x7": bool(open_24x7),
            })

df = pd.DataFrame(qa_rows).drop_duplicates(subset=["question"]).reset_index(drop=True)
df.head()


Unnamed: 0,question,answer,area,specialty,open_24x7
0,Best hospital near Sitabuldi Nagpur for emergency,"For emergencies, prefer a multi-speciality hos...","Sitabuldi, Nagpur",General Physician,True
1,Need child doctor in Dharampeth Nagpur for vac...,"For vaccination, consult a Pediatrician in Dha...","Dharampeth, Nagpur",Pediatrician,False
2,Heart specialist near Civil Lines Nagpur for c...,"For chest pain, consult a Cardiologist near Ci...","Civil Lines, Nagpur",Cardiologist,True
3,Skin doctor in Bajaj Nagar Nagpur for acne and...,"For acne or rash, consult a Dermatologist in B...","Bajaj Nagar, Nagpur",Dermatologist,False
4,ENT specialist near Sadar Nagpur for ear pain,"For ear pain, consult an ENT Specialist near S...","Sadar, Nagpur",ENT Specialist,False


In [37]:
# Cell 4 — Preprocessing
# - Normalizes text (lowercase, remove punctuation)
# - Prepares questions for embedding generation
# - Keeps answers unchanged
def preprocess_text(text: str) -> str:
    text = (text or "").lower().strip()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["q_clean"] = df["question"].map(preprocess_text)
df["a_clean"] = df["answer"].map(lambda x: (x or "").strip())

display(df.head(8))
print("Total rows:", len(df))

Unnamed: 0,question,answer,area,specialty,open_24x7,q_clean,a_clean
0,Best hospital near Sitabuldi Nagpur for emergency,"For emergencies, prefer a multi-speciality hos...","Sitabuldi, Nagpur",General Physician,True,best hospital near sitabuldi nagpur for emergency,"For emergencies, prefer a multi-speciality hos..."
1,Need child doctor in Dharampeth Nagpur for vac...,"For vaccination, consult a Pediatrician in Dha...","Dharampeth, Nagpur",Pediatrician,False,need child doctor in dharampeth nagpur for vac...,"For vaccination, consult a Pediatrician in Dha..."
2,Heart specialist near Civil Lines Nagpur for c...,"For chest pain, consult a Cardiologist near Ci...","Civil Lines, Nagpur",Cardiologist,True,heart specialist near civil lines nagpur for c...,"For chest pain, consult a Cardiologist near Ci..."
3,Skin doctor in Bajaj Nagar Nagpur for acne and...,"For acne or rash, consult a Dermatologist in B...","Bajaj Nagar, Nagpur",Dermatologist,False,skin doctor in bajaj nagar nagpur for acne and...,"For acne or rash, consult a Dermatologist in B..."
4,ENT specialist near Sadar Nagpur for ear pain,"For ear pain, consult an ENT Specialist near S...","Sadar, Nagpur",ENT Specialist,False,ent specialist near sadar nagpur for ear pain,"For ear pain, consult an ENT Specialist near S..."
5,"I feel angry all the time, need a doctor in Na...","If anger feels constant or hard to control, co...","Dharampeth, Nagpur",Psychiatrist,True,i feel angry all the time need a doctor in nagpur,"If anger feels constant or hard to control, co..."
6,"I need a doctor for fever, cold and cough in D...",You should consult a General Physician in Dhar...,"Dharampeth, Nagpur",General Physician,False,i need a doctor for fever cold and cough in dh...,You should consult a General Physician in Dhar...
7,"Suggest me a General Physician in Dharampeth, ...",You should consult a General Physician in Dhar...,"Dharampeth, Nagpur",General Physician,False,suggest me a general physician in dharampeth n...,You should consult a General Physician in Dhar...


Total rows: 486


In [38]:
# Cell 5 — Embeddings + FAISS Index
# - Loads SentenceTransformer model
# - Generates embeddings for all questions
# - Normalizes vectors for cosine similarity
# - Builds FAISS index for fast semantic search
model = SentenceTransformer("all-MiniLM-L6-v2")

emb = model.encode(df["q_clean"].tolist(), convert_to_numpy=True, show_progress_bar=False).astype("float32")
faiss.normalize_L2(emb)

dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb)

print("Embedding dim:", dim)
print("Index size:", index.ntotal)

Embedding dim: 384
Index size: 486


In [39]:
# Cell 6 — Static Query Demo (Retrieval)
# - Defines a fixed query string
# - Converts query to embedding
# - Searches FAISS index
# - Displays top-K matched questions and metadata
# - Demonstrates pure semantic retrieval
STATIC_QUERY = "i have cold. suggest me a doctor in sitabuldi"
k = 3

q_clean = preprocess_text(STATIC_QUERY)
q_emb = model.encode([q_clean], convert_to_numpy=True, show_progress_bar=False).astype("float32")
faiss.normalize_L2(q_emb)

scores, idxs = index.search(q_emb, k)

print("Query:", STATIC_QUERY)
print("\nTop matches (retrieval):\n")

retrieved = []
for rank in range(k):
    i = int(idxs[0][rank])
    score = float(scores[0][rank])
    row = df.iloc[i]
    retrieved.append({
        "score": score,
        "question": row["question"],
        "answer": row["a_clean"],
        "area": row["area"],
        "specialty": row["specialty"],
        "open_24x7": bool(row["open_24x7"]),
    })
    print(f"#{rank+1} | Score(cosine): {score:.3f}")
    print("Matched Question:", row["question"])
    print("Answer:", row["a_clean"])
    print("Meta:", row["specialty"], "|", row["area"], "| 24x7:", bool(row["open_24x7"]))
    print("-" * 60)


Query: i have cold. suggest me a doctor in sitabuldi

Top matches (retrieval):

#1 | Score(cosine): 0.733
Matched Question: Suggest me a General Physician in Sitabuldi, Nagpur for fever, cold and cough
Answer: You should consult a General Physician in Sitabuldi, Nagpur for fever, cold and cough.
Meta: General Physician | Sitabuldi, Nagpur | 24x7: False
------------------------------------------------------------
#2 | Score(cosine): 0.707
Matched Question: I need a doctor for fever, cold and cough in Sitabuldi, Nagpur
Answer: You should consult a General Physician in Sitabuldi, Nagpur for fever, cold and cough.
Meta: General Physician | Sitabuldi, Nagpur | 24x7: False
------------------------------------------------------------
#3 | Score(cosine): 0.683
Matched Question: Suggest me a General Physician in Wardhaman Nagar, Nagpur for fever, cold and cough
Answer: You should consult a General Physician in Wardhaman Nagar, Nagpur for fever, cold and cough.
Meta: General Physician | Wardhama

In [40]:
# Cell 7 — Confidence & Safety Helpers
# - Maps similarity score to confidence labels
# - Detects mental-health related queries
# - Formats doctor / hospital metadata for output
def confidence_label(score: float) -> str:
    if score >= 0.55:
        return "High"
    if score >= 0.42:
        return "Medium"
    return "Low"

MENTAL_KEYWORDS = {
    "angry", "anger", "anxiety", "panic", "depressed", "depression", "stress",
    "suicidal", "suicide", "self harm", "self-harm", "hopeless", "crying",
    "hurt myself", "hurt others", "kill myself"
}

def is_mental_health_query(q: str) -> bool:
    qc = preprocess_text(q)
    return any(k in qc for k in MENTAL_KEYWORDS)

def format_answer_block(area: str, specialty: str, open_24x7: bool) -> str:
    spec_hi = specialty_hindi.get(specialty, specialty)
    o = "24x7: Yes" if open_24x7 else "24x7: No"
    return f"{spec_hi}\nArea: {area}\n{o}"


In [41]:
# Cell 8 — LLM Setup (Optional)
# - Initializes Gemini model (if API key exists)
# - Enables RAG step
# - Keeps fallback path if LLM is unavailable
GEMINI_READY = False
gemini_model = None

try:
    import google.generativeai as genai
    from google.colab import userdata

    GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
    genai.configure(api_key=GOOGLE_API_KEY)
    gemini_model = genai.GenerativeModel("gemini-2.5-flash")
    GEMINI_READY = True
except Exception:
    GEMINI_READY = False
    gemini_model = None

print("Gemini ready:", GEMINI_READY)


Gemini ready: True


In [42]:
# Cell 9 — Static Query Demo (RAG)
# - Builds a grounded prompt from retrieved results
# - Generates final answer using only retrieved context
# - Falls back to deterministic output if LLM fails
# - Shows how RAG works after retrieval
def grounded_prompt(user_query: str, retrieved_items):
    ctx_lines = []
    for it in retrieved_items:
        ctx_lines.append(
            f"- {format_answer_block(it['area'], it['specialty'], it['open_24x7'])}\n  ContextAnswer: {it['answer']}"
        )

    prompt = f"""
Based ONLY on the following query and retrieved information, provide a helpful response.
Do NOT include any information not present in the provided context.

Query: "{user_query}"

Retrieved Information:
{chr(10).join(ctx_lines)}

Rules:
- If confidence is Low, ask the user to add area/symptoms.
- Keep it short and practical.
- End with one line: "If symptoms are severe or sudden, seek urgent medical care."
""".strip()
    return prompt

def rag_answer(user_query: str, retrieved_items):
    top_conf = confidence_label(retrieved_items[0]["score"]) if retrieved_items else "Low"
    prompt = grounded_prompt(user_query, retrieved_items)

    # If Gemini unavailable or errors, fallback to deterministic answer
    if not GEMINI_READY:
        lines = [f"Best matches (Confidence: {top_conf}):"]
        for it in retrieved_items:
            lines.append("")
            lines.append(format_answer_block(it["area"], it["specialty"], it["open_24x7"]))
        lines.append("")
        lines.append("If symptoms are severe or sudden, seek urgent medical care.")
        return "\n".join(lines), prompt

    try:
        resp = gemini_model.generate_content(prompt)
        text = (resp.text or "").strip()
        if not text:
            raise RuntimeError("Empty LLM response")
        return text, prompt
    except Exception:
        # Fallback if 503 / quota / any API failure
        lines = [f"Best matches (Confidence: {top_conf}):"]
        for it in retrieved_items:
            lines.append("")
            lines.append(format_answer_block(it["area"], it["specialty"], it["open_24x7"]))
        lines.append("")
        lines.append("If symptoms are severe or sudden, seek urgent medical care.")
        return "\n".join(lines), prompt

# Build "retrieved_items" again from static retrieval results (keeps flow obvious)
retrieved_items = []
for rank in range(k):
    i = int(idxs[0][rank])
    score = float(scores[0][rank])
    row = df.iloc[i]
    retrieved_items.append({
        "score": score,
        "answer": row["a_clean"],
        "area": row["area"],
        "specialty": row["specialty"],
        "open_24x7": bool(row["open_24x7"]),
        "matched_question": row["question"],
    })

final_text, used_prompt = rag_answer(STATIC_QUERY, retrieved_items)

print("RAG Answer:\n")
print(final_text)

print("\n" + "="*70 + "\nGrounding Prompt (for transparency):\n")
print(used_prompt)


RAG Answer:

You should consult a General Physician in Sitabuldi, Nagpur for your cold.
If symptoms are severe or sudden, seek urgent medical care.

Grounding Prompt (for transparency):

Based ONLY on the following query and retrieved information, provide a helpful response.
Do NOT include any information not present in the provided context.

Query: "i have cold. suggest me a doctor in sitabuldi"

Retrieved Information:
- General Physician (सामान्य चिकित्सक)
Area: Sitabuldi, Nagpur
24x7: No
  ContextAnswer: You should consult a General Physician in Sitabuldi, Nagpur for fever, cold and cough.
- General Physician (सामान्य चिकित्सक)
Area: Sitabuldi, Nagpur
24x7: No
  ContextAnswer: You should consult a General Physician in Sitabuldi, Nagpur for fever, cold and cough.
- General Physician (सामान्य चिकित्सक)
Area: Wardhaman Nagar, Nagpur
24x7: No
  ContextAnswer: You should consult a General Physician in Wardhaman Nagar, Nagpur for fever, cold and cough.

Rules:
- If confidence is Low, ask 

In [43]:
# Cell 10 — UI Helpers (Retrieval + RAG)
# - Wraps retrieval logic into reusable functions
# - Applies UI filters (area, specialty, 24x7)
# - Runs embedding → FAISS → top-K search
# - Produces RAG answer and transparency data
# - Prepares outputs for the UI
def retrieve_with_filters(query: str, k: int = 3, area: str = "All", specialty: str = "All", only_24x7: bool = False):
    q = preprocess_text(query)

    # mental health routing: prefer psychiatrist if user didn't select
    if is_mental_health_query(query) and specialty == "All":
        specialty = "Psychiatrist"

    mask = pd.Series([True] * len(df))
    if area != "All":
        mask &= (df["area"] == area)
    if specialty != "All":
        mask &= (df["specialty"] == specialty)
    if only_24x7:
        mask &= (df["open_24x7"] == True)

    filtered = df[mask].reset_index(drop=False)  # original row index stored in 'index'

    if len(filtered) == 0:
        return {"error": "No results for selected filters. Try Area=All or Specialty=All.", "items": [], "specialty_used": specialty}

    # build a small on-the-fly index for the filtered set (simple for demos)
    f_emb = model.encode(filtered["q_clean"].tolist(), convert_to_numpy=True, show_progress_bar=False).astype("float32")
    faiss.normalize_L2(f_emb)
    f_index = faiss.IndexFlatIP(dim)
    f_index.add(f_emb)

    q_emb = model.encode([q], convert_to_numpy=True, show_progress_bar=False).astype("float32")
    faiss.normalize_L2(q_emb)

    k_eff = min(k, len(filtered))
    scores, idxs = f_index.search(q_emb, k_eff)

    items = []
    for r in range(k_eff):
        f_row = filtered.iloc[int(idxs[0][r])]
        orig_i = int(f_row["index"])
        score = float(scores[0][r])
        row = df.iloc[orig_i]
        items.append({
            "score": score,
            "confidence": confidence_label(score),
            "matched_question": row["question"],
            "answer": row["a_clean"],
            "area": row["area"],
            "specialty": row["specialty"],
            "open_24x7": bool(row["open_24x7"]),
        })

    return {"error": None, "items": items, "specialty_used": specialty}

def ui_pipeline(query: str, area: str, specialty: str, only_24x7: bool, k: int):
    r = retrieve_with_filters(query, k=int(k), area=area, specialty=specialty, only_24x7=only_24x7)
    if r["error"]:
        return r["error"], "", "", ""

    items = r["items"]
    top = items[0]
    conf = f"{top['confidence']} (top score {top['score']:.3f})"

    # transparency: show what was matched
    trans_lines = []
    for it in items:
        trans_lines.append(
            f"Score: {it['score']:.3f} | Confidence: {it['confidence']}\n"
            f"Matched: {it['matched_question']}\n"
            f"{format_answer_block(it['area'], it['specialty'], it['open_24x7'])}\n---"
        )
    transparency = "\n".join(trans_lines).strip()

    # RAG answer
    final_text, used_prompt = rag_answer(query, items)
    return final_text, conf, transparency, used_prompt


In [None]:
# Cell 11 — Gradio UI
# - Builds user interface
# - Accepts live user queries
# - Displays final answer
# - Shows confidence and matched results
# - Connects UI to the same pipeline shown earlier
import gradio as gr

AREA_CHOICES = ["All"] + sorted(df["area"].unique().tolist())
SPEC_CHOICES = ["All"] + sorted(df["specialty"].unique().tolist())

with gr.Blocks() as demo:
    gr.Markdown("# Nagpur Semantic Search Demo (Step-by-step + UI)")

    q_in = gr.Textbox(label="Ask a question", placeholder="e.g., i have cold. suggest me a doctor in sitabuldi", lines=2)

    with gr.Row():
        area_dd = gr.Dropdown(choices=AREA_CHOICES, value="All", label="Area")
        spec_dd = gr.Dropdown(choices=SPEC_CHOICES, value="All", label="Specialty")
        only24 = gr.Checkbox(value=False, label="Only 24x7")
        k_in = gr.Slider(1, 5, value=3, step=1, label="Top K")

    go = gr.Button("Search")

    out = gr.Textbox(label="Answer", lines=8)
    conf = gr.Textbox(label="Confidence", lines=1)

    with gr.Accordion("What matched (retrieval transparency)", open=False):
        trans = gr.Textbox(label="Top matches", lines=14)

    with gr.Accordion("Grounding prompt sent to LLM (or used for fallback)", open=False):
        prompt_box = gr.Textbox(label="Prompt", lines=14)

    go.click(
        ui_pipeline,
        inputs=[q_in, area_dd, spec_dd, only24, k_in],
        outputs=[out, conf, trans, prompt_box],
    )

demo.launch(debug=True, share=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1f4a7a5840a4e964da.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
