# Medical Vector Database (TF-IDF Version)

This notebook demonstrates:
- Normalized relational schema
- DuckDB integration
- TF-IDF embeddings (no external downloads)
- SQL-registered embedding and cosine functions
- Fully SQL-driven similarity search


In [None]:
import duckdb
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer


## Load CSV

In [None]:
# Relative path to the source CSV.
csv_path = '../data/DerivedKnowledgeGraph_final.csv'
# Later cells read this frame by position: column 0 is used as the disease
# name and column 1 as the raw symptom string.
df = pd.read_csv(csv_path)
# Preview the first rows.
df.head()

## Parse Symptoms

In [None]:
def parse_symptoms(symptom_string):
    """Parse a ``"name (weight), name (weight), ..."`` string into pairs.

    Each match is a run of non-comma characters followed by a parenthesised
    number. The name is stripped and lowercased; the number is converted to
    ``float``.

    Args:
        symptom_string: Raw symptom cell. Non-string values (``None``, or
            the ``NaN`` pandas produces for an empty CSV field) are treated
            as "no symptoms" instead of raising ``TypeError``.

    Returns:
        A list of ``(symptom_name, weight)`` tuples in order of appearance;
        empty when nothing matches.
    """
    if not isinstance(symptom_string, str):
        return []
    pattern = r"([^,]+?)\s*\(([\d\.]+)\)"
    return [
        (name.strip().lower(), float(weight))
        for name, weight in re.findall(pattern, symptom_string)
    ]

# One (disease_name, [(symptom, weight), ...]) tuple per CSV row.
# Columns are addressed by position: 0 holds the disease name, 1 the raw
# symptom string handed to parse_symptoms.
structured_data = [
    (raw_name.strip().lower(), parse_symptoms(raw_symptoms))
    for raw_name, raw_symptoms in zip(df.iloc[:, 0], df.iloc[:, 1])
]

# Show the first two parsed entries.
structured_data[:2]

## Initialize DuckDB

In [None]:
# Connect to the database file 'medical.db' (a relative path). Every
# statement below uses IF NOT EXISTS, so this cell can be re-run against an
# existing file.
con = duckdb.connect('medical.db')

# Sequences that supply the default surrogate keys for disease and symptom.
con.execute("CREATE SEQUENCE IF NOT EXISTS disease_seq START 1;")
con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_seq START 1;")

# One row per distinct disease name.
con.execute("""
CREATE TABLE IF NOT EXISTS disease (
    disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_seq'),
    name TEXT UNIQUE
);
""")

# One row per distinct symptom name.
con.execute("""
CREATE TABLE IF NOT EXISTS symptom (
    symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_seq'),
    name TEXT UNIQUE
);
""")

# Link table: at most one row per (disease, symptom) pair, carrying the
# incidence weight. No foreign-key constraints are declared on the id columns.
con.execute("""
CREATE TABLE IF NOT EXISTS disease_symptom (
    disease_id INTEGER,
    symptom_id INTEGER,
    incidence FLOAT,
    PRIMARY KEY (disease_id, symptom_id)
);
""")

# Embedding vector per disease. NOTE(review): this table declares no key, so
# nothing prevents several embedding rows for the same disease_id.
con.execute("""
CREATE TABLE IF NOT EXISTS disease_embedding (
    disease_id INTEGER,
    embedding DOUBLE[]
);
""")

## Populate Tables

In [None]:
# Load every parsed (disease, symptoms) pair into the normalized tables.
# Each name is inserted with INSERT OR IGNORE and then looked up by name, so
# the id is obtained whether the row was just created or already existed.
for disease, symptoms in structured_data:
    con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease])
    disease_id = con.execute(
        "SELECT disease_id FROM disease WHERE name = ?",
        [disease]
    ).fetchone()[0]

    for symptom, incidence in symptoms:
        con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom])
        symptom_id = con.execute(
            "SELECT symptom_id FROM symptom WHERE name = ?",
            [symptom]
        ).fetchone()[0]

        # Positional VALUES follow the table's column order:
        # (disease_id, symptom_id, incidence). A pair that is already present
        # is skipped, so its stored incidence is left unchanged.
        con.execute("""
            INSERT OR IGNORE INTO disease_symptom
            VALUES (?, ?, ?)
        """, [disease_id, symptom_id, incidence])

## Compute Disease Embeddings (TF-IDF, Fully Offline)

In [None]:
diseases = con.execute("SELECT disease_id, name FROM disease").fetchall()

disease_texts = []
disease_ids = []

# Build one pseudo-document per disease: each symptom name is repeated in
# proportion to its incidence so that TF-IDF term frequency reflects it.
for disease_id, _name in diseases:
    symptoms = con.execute("""
        SELECT s.name, ds.incidence
        FROM disease_symptom ds
        JOIN symptom s ON ds.symptom_id = s.symptom_id
        WHERE ds.disease_id = ?
    """, [disease_id]).fetchall()

    weighted_text = []
    for symptom_name, incidence in symptoms:
        # Round rather than truncate: incidence is stored in a 32-bit FLOAT
        # column, so 0.7 reads back as 0.69999998... and int(6.9999998)
        # would yield 6 repetitions instead of 7.
        repetitions = max(1, int(round(incidence * 10)))
        weighted_text.extend([symptom_name] * repetitions)

    disease_texts.append(" ".join(weighted_text))
    disease_ids.append(disease_id)

# `vectorizer` stays a notebook global: embed_symptoms() uses it to project
# query symptoms into the same vector space.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(disease_texts)

# The stored vectors are only meaningful for the vocabulary fitted above, and
# disease_embedding has no key to reject duplicates. Clear rows left by an
# earlier run so each disease ends up with exactly one current embedding.
con.execute("DELETE FROM disease_embedding")

for i, disease_id in enumerate(disease_ids):
    # Densify one sparse row at a time to keep peak memory small.
    vector = tfidf_matrix[i].toarray()[0]
    con.execute(
        "INSERT INTO disease_embedding VALUES (?, ?)",
        [disease_id, vector.tolist()]
    )

print("Embeddings computed successfully (offline).")

## Register SQL Functions

In [None]:
def embed_symptoms(symptom_list):
    """Embed a list of symptom strings with the notebook's fitted vectorizer.

    The strings are joined with single spaces into one query document, which
    is transformed by the global ``vectorizer``.

    Args:
        symptom_list: Sequence of symptom strings, or ``None``.

    Returns:
        The dense TF-IDF vector as a list of floats, or ``None`` when
        ``symptom_list`` is ``None`` or empty.
    """
    has_symptoms = symptom_list is not None and len(symptom_list) > 0
    if not has_symptoms:
        return None
    document = " ".join(symptom_list)
    dense_rows = vectorizer.transform([document]).toarray()
    return dense_rows[0].tolist()

def cosine(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        v1: First vector (any sequence accepted by ``np.asarray``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a Python ``float``, or ``0.0``
        when either vector has zero norm (the ratio is undefined there).
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    # Compute each norm exactly once and reuse it for both the zero check
    # and the division.
    norm1 = np.linalg.norm(v1)
    if norm1 == 0:
        return 0.0
    norm2 = np.linalg.norm(v2)
    if norm2 == 0:
        return 0.0
    return float(np.dot(v1, v2) / (norm1 * norm2))

# Expose the two Python helpers to SQL on this connection under the names
# embed_symptoms and cosine. Only the return types are declared here.
# NOTE(review): argument types are left for DuckDB to work out — confirm the
# list arguments arrive as Python lists.
# NOTE(review): re-running this cell on the same connection presumably fails
# because the names are already registered — verify.
con.create_function("embed_symptoms", embed_symptoms, return_type="DOUBLE[]")
con.create_function("cosine", cosine, return_type="DOUBLE")

## SQL-Driven Helper Functions

In [None]:
def disease_similarity(d1, d2):
    """Return the cosine similarity between two diseases' stored embeddings.

    Args:
        d1: Name of the first disease, exactly as stored in ``disease.name``.
        d2: Name of the second disease, exactly as stored in ``disease.name``.

    Returns:
        The value computed by the SQL ``cosine`` function for the two
        embeddings, or ``None`` when the query yields no row (for example,
        one of the names is not present).
    """
    # Bind the names as parameters rather than formatting them into the SQL:
    # a name containing an apostrophe would otherwise break the statement,
    # and arbitrary input could inject SQL.
    query = """
    SELECT
        cosine(e1.embedding, e2.embedding) AS similarity
    FROM disease_embedding e1, disease d1, disease_embedding e2, disease d2
    WHERE d1.disease_id = e1.disease_id
          AND d2.disease_id = e2.disease_id
          AND d1.name = ?
          AND d2.name = ?;
    """
    row = con.execute(query, [d1, d2]).fetchone()
    # fetchone() returns None when nothing matched; avoid indexing into it.
    return None if row is None else row[0]

# Example: compare 'liver cancer' against three other diseases. The names are
# passed in lowercase because disease names are lowercased when loaded.
print("Disease similarity of colon cancer and liver cancer: ", disease_similarity('colon cancer', 'liver cancer'))
print("Disease similarity of cirrhosis of the liver and liver cancer: ", disease_similarity('cirrhosis of the liver', 'liver cancer'))
print("Disease similarity of common cold and liver cancer: ", disease_similarity('common cold', 'liver cancer'))

In [None]:
def rank_diseases(symptom_list):
    """Rank every disease by similarity to a set of query symptoms.

    The symptoms are embedded in SQL via ``embed_symptoms`` and compared to
    each stored disease embedding with ``cosine``.

    Args:
        symptom_list: Iterable of symptom strings describing the query.

    Returns:
        A list of ``(disease_name, similarity)`` tuples ordered by
        similarity, highest first.
    """
    # Materialize once so generators work and the placeholder count matches.
    symptoms = [str(s) for s in symptom_list]
    # Only "?" markers are formatted into the SQL; the symptom text itself is
    # bound as parameters, so apostrophes or SQL fragments in a symptom
    # cannot alter the statement.
    placeholders = ", ".join("?" for _ in symptoms)
    query = f"""
    SELECT d.name,
           cosine(
               embed_symptoms([{placeholders}]),
               de.embedding
            ) AS similarity
    FROM disease d, disease_embedding de
    WHERE d.disease_id = de.disease_id
    ORDER BY similarity DESC;
    """
    return con.execute(query, symptoms).fetchall()

# Example: rank all diseases against a two-symptom query. This is the cell's
# last expression, so the notebook displays the returned list.
rank_diseases(['fever', 'cough'])

In [None]:
def rank_similar_diseases(disease_name, top_k=5):
    """Return the diseases whose embeddings are closest to a given disease.

    Args:
        disease_name: Name of the reference disease, exactly as stored in
            ``disease.name``.
        top_k: Maximum number of rows to return (converted with ``int``).

    Returns:
        Up to ``top_k`` ``(disease_name, similarity)`` tuples ordered by
        similarity, highest first. The reference disease is not filtered
        out of the candidates, so it is ranked along with the others.
    """
    # Both the name and the limit are bound as parameters rather than
    # formatted into the SQL, so apostrophes in a name or a non-numeric
    # top_k cannot change the statement.
    query = """
    SELECT
        d.name,
        cosine(
            (
                SELECT de.embedding
                FROM disease_embedding de
                JOIN disease d2 ON d2.disease_id = de.disease_id
                WHERE d2.name = ?
            ),
            de.embedding
        ) AS similarity
    FROM disease d, disease_embedding de
    WHERE d.disease_id = de.disease_id
    ORDER BY similarity DESC
    LIMIT ?;
    """
    # Parameters are positional: the subquery's name filter comes first in
    # the statement text, then the LIMIT.
    return con.execute(query, [disease_name, int(top_k)]).fetchall()

# Example: nearest diseases to 'common cold' using the default top_k of 5.
rank_similar_diseases('common cold')