# Medical Vector Database with DuckDB

This notebook demonstrates:

- Normalized relational schema
- Disease ↔ Symptom modeling
- Vector embeddings using SentenceTransformers
- Similarity search
- Hybrid SQL + vector queries


In [None]:
!pip install duckdb pandas numpy sentence-transformers scikit-learn

In [None]:
import duckdb
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## Load CSV File

In [None]:
# Location of the source CSV, relative to the notebook's working directory.
# Change this if the file lives somewhere else.
csv_path = 'data/DerivedKnowledgeGraph_final.csv'

# Read the whole file into a DataFrame, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(5)

## Parse Symptoms Column

In [None]:
# Matches "<name> (<number>)" entries. The number group only accepts a
# well-formed decimal ("1", "1.", "0.25", ".5"), so the float() conversion
# below cannot fail on tokens such as "1.2.3" or ".".
_SYMPTOM_PATTERN = re.compile(r"([^,]+?)\s*\((\d+(?:\.\d*)?|\.\d+)\)")


def parse_symptoms(symptom_string):
    """Parse a "name (value), name (value), ..." string into pairs.

    Args:
        symptom_string: Text holding comma-separated entries, each a symptom
            name followed by a number in parentheses. Non-string values
            (for example the NaN pandas produces for an empty cell, or
            None) are treated as "no symptoms".

    Returns:
        A list of ``(name, value)`` tuples in order of appearance, where
        ``name`` is stripped and lower-cased and ``value`` is a float.
        Entries that do not match the expected shape are skipped.
    """
    # re.findall raises TypeError on non-string input, so bail out early.
    if not isinstance(symptom_string, str):
        return []

    return [
        (name.strip().lower(), float(value))
        for name, value in _SYMPTOM_PATTERN.findall(symptom_string)
    ]

# Build one (disease_name, [(symptom, value), ...]) entry per CSV row.
# The first column is used as the disease name (stripped and lower-cased);
# the second column is handed to parse_symptoms.
structured_data = [
    (record.iloc[0].strip().lower(), parse_symptoms(record.iloc[1]))
    for _, record in df.iterrows()
]

# Preview the first two parsed entries.
structured_data[:2]

## Create DuckDB Schema

In [None]:
# Open (or create) the persistent database file in the working directory.
con = duckdb.connect('medical.db')

# DuckDB has no AUTOINCREMENT keyword (that is SQLite syntax and fails to
# parse). Auto-generated ids are obtained from a sequence used as the
# column default instead.
con.execute("CREATE SEQUENCE IF NOT EXISTS disease_id_seq START 1;")
con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_id_seq START 1;")

# One row per distinct disease name.
con.execute("""
CREATE TABLE IF NOT EXISTS disease (
    disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_id_seq'),
    name TEXT UNIQUE
);
""")

# One row per distinct symptom name.
con.execute("""
CREATE TABLE IF NOT EXISTS symptom (
    symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_id_seq'),
    name TEXT UNIQUE
);
""")

# Link table: at most one row per (disease, symptom) pair, carrying the
# numeric value parsed alongside the symptom.
con.execute("""
CREATE TABLE IF NOT EXISTS disease_symptom (
    disease_id INTEGER,
    symptom_id INTEGER,
    incidence FLOAT,
    PRIMARY KEY (disease_id, symptom_id)
);
""")

# Embedding vector per disease, stored as a variable-length list of doubles.
con.execute("""
CREATE TABLE IF NOT EXISTS disease_embedding (
    disease_id INTEGER,
    embedding DOUBLE[]
);
""")

## Populate Tables

In [None]:
def _lookup_id(query, value):
    """Run a one-parameter id query and return the first column of its first row."""
    return con.execute(query, [value]).fetchone()[0]


# Insert every disease, every symptom, and the link rows between them.
# INSERT OR IGNORE leaves existing rows alone, so the id is always looked up
# by name afterwards rather than assumed from the insert.
for disease_name, symptom_pairs in structured_data:
    con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease_name])
    d_id = _lookup_id("SELECT disease_id FROM disease WHERE name = ?", disease_name)

    for symptom_name, rate in symptom_pairs:
        con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom_name])
        s_id = _lookup_id("SELECT symptom_id FROM symptom WHERE name = ?", symptom_name)

        link_row = [d_id, s_id, rate]
        con.execute("""
            INSERT OR IGNORE INTO disease_symptom
            VALUES (?, ?, ?)
        """, link_row)

## Compute Disease Embeddings

In [None]:
# Sentence-embedding model; also used by the query cells further down.
model = SentenceTransformer('all-MiniLM-L6-v2')

diseases = con.execute("SELECT disease_id, name FROM disease").fetchall()

# For each disease, embed its symptom names and store the incidence-weighted
# average of those vectors as the disease embedding.
for disease_id, name in diseases:
    symptoms = con.execute("""
        SELECT s.name, ds.incidence
        FROM disease_symptom ds
        JOIN symptom s ON ds.symptom_id = s.symptom_id
        WHERE ds.disease_id = ?
    """, [disease_id]).fetchall()

    weights = np.array([incidence for _, incidence in symptoms], dtype=float)

    # np.average raises ZeroDivisionError when the weights sum to zero, which
    # also covers a disease with no linked symptoms (empty sum is 0). Such a
    # disease has no meaningful embedding, so skip it.
    if weights.sum() == 0:
        continue

    symptom_names = [symptom_name for symptom_name, _ in symptoms]
    embeddings = model.encode(symptom_names)
    weighted_embedding = np.average(embeddings, axis=0, weights=weights)

    # The table has no key and the database file persists between runs, so
    # drop any previous row first; otherwise re-running this cell would
    # accumulate duplicate embeddings for the same disease.
    con.execute(
        "DELETE FROM disease_embedding WHERE disease_id = ?",
        [disease_id]
    )
    con.execute(
        "INSERT INTO disease_embedding VALUES (?, ?)",
        [disease_id, weighted_embedding.tolist()]
    )

## Similarity Queries

In [None]:
def rank_diseases(symptom_list):
    """Rank stored diseases by similarity to a set of symptom descriptions.

    The symptoms are embedded with the module-level ``model``, averaged into
    one query vector, and compared by cosine similarity against every row of
    ``disease_embedding``.

    Args:
        symptom_list: An iterable of symptom strings. A single string is
            accepted and treated as a one-element list.

    Returns:
        A list of ``(disease_name, similarity)`` tuples sorted from most to
        least similar. Empty if no embeddings are stored.

    Raises:
        ValueError: If ``symptom_list`` is empty.
    """
    # A bare string would be encoded as a single 1-D vector, and the mean
    # over axis 0 would then collapse it to a scalar; wrap it instead.
    if isinstance(symptom_list, str):
        symptom_list = [symptom_list]
    symptom_list = list(symptom_list)
    if not symptom_list:
        raise ValueError("symptom_list must contain at least one symptom")

    query_vector = np.mean(model.encode(symptom_list), axis=0)

    results = con.execute("""
        SELECT d.name, de.embedding
        FROM disease_embedding de
        JOIN disease d ON de.disease_id = d.disease_id
    """).fetchall()

    if not results:
        return []

    names = [name for name, _ in results]
    embedding_matrix = np.array([embedding for _, embedding in results])

    # One call scores the query against every stored embedding; row 0 of the
    # result holds the similarity to each disease in `names` order.
    similarities = cosine_similarity([query_vector], embedding_matrix)[0]

    return sorted(zip(names, similarities), key=lambda x: x[1], reverse=True)

# Example:
# rank_diseases(['fever', 'cough'])

In [None]:
def _fetch_disease_embedding(disease_name):
    """Return the stored embedding for a disease, or raise if there is none."""
    # Names are stored stripped and lower-cased, so normalise the lookup key
    # the same way.
    key = disease_name.strip().lower()
    row = con.execute("""
        SELECT embedding FROM disease_embedding de
        JOIN disease d ON de.disease_id = d.disease_id
        WHERE d.name = ?
    """, [key]).fetchone()

    # fetchone() returns None when no row matches; report that clearly
    # instead of failing on a None subscript.
    if row is None:
        raise ValueError(f"No embedding stored for disease {disease_name!r}")
    return row[0]


def disease_similarity(d1, d2):
    """Return the cosine similarity between two diseases' stored embeddings.

    Args:
        d1: Name of the first disease (case and surrounding whitespace are
            ignored).
        d2: Name of the second disease (same normalisation).

    Returns:
        The cosine similarity of the two embedding vectors.

    Raises:
        ValueError: If either disease has no stored embedding.
    """
    e1 = _fetch_disease_embedding(d1)
    e2 = _fetch_disease_embedding(d2)

    return cosine_similarity([e1], [e2])[0][0]

# Example:
# disease_similarity('flu', 'covid-19')