# Medical Vector Database (Fully Offline Version)

This notebook demonstrates:

- Normalized relational schema in DuckDB
- Disease ↔ Symptom modeling
- Vector representations using TF-IDF (fully offline)
- Similarity search using cosine similarity
- Hybrid SQL + vector logic

✅ No HuggingFace downloads
✅ Binder-safe
✅ Fully offline execution


In [2]:
import duckdb
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load CSV File

In [None]:
csv_path = '../data/DerivedKnowledgeGraph_final.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Diseases,Symptoms
0,abscess,"pain (0.318), fever (0.119), swelling (0.112),..."
1,acid reflux,"pain (0.225), nausea (0.140), pain in upper ab..."
2,acute renal failure,"kidney failure (0.127), weakness (0.087), diar..."
3,alcohol intoxication,"slurred speech (0.071), vomiting (0.064), unst..."
4,alcoholism,"seizures (0.125), depression (0.090), sadness ..."


## Parse Symptoms Column

In [4]:
def parse_symptoms(symptom_string):
    pattern = r"([^,]+?)\s*\(([\d\.]+)\)"
    matches = re.findall(pattern, symptom_string)
    return [(m[0].strip().lower(), float(m[1])) for m in matches]

structured_data = []
for _, row in df.iterrows():
    disease = row.iloc[0].strip().lower()
    symptoms = parse_symptoms(row.iloc[1])
    structured_data.append((disease, symptoms))

structured_data[:2]

[('abscess',
  [('pain', 0.318),
   ('fever', 0.119),
   ('swelling', 0.112),
   ('redness', 0.094),
   ('chills', 0.092),
   ('infection', 0.083),
   ('cyst', 0.047),
   ('tenderness', 0.037),
   ('rectal pain', 0.026),
   ('lesion', 0.025),
   ('lump', 0.023),
   ('sore throat', 0.021),
   ('facial swelling', 0.016),
   ('pimple', 0.016),
   ('discomfort', 0.014),
   ('difficulty swallowing', 0.013),
   ('cavity', 0.013),
   ('night sweats', 0.007),
   ('severe pain', 0.007),
   ('abdominal pain', 0.007),
   ('painful swallowing', 0.007),
   ('back pain', 0.006)]),
 ('acid reflux',
  [('pain', 0.225),
   ('nausea', 0.14),
   ('pain in upper abdomen', 0.064),
   ('abdominal pain', 0.058),
   ('sadness', 0.052),
   ('depression', 0.052),
   ('vomiting', 0.048),
   ('anxiety', 0.045),
   ('diarrhea', 0.044),
   ('discomfort', 0.034),
   ('indigestion', 0.03),
   ('chills', 0.022),
   ('constipation', 0.021),
   ('dizziness', 0.021),
   ('heartburn', 0.02),
   ('chest pressure', 0.017),


## Create DuckDB Schema (Sequence-based, version-safe)

In [5]:
con = duckdb.connect('medical.db')

con.execute("CREATE SEQUENCE IF NOT EXISTS disease_seq START 1;")
con.execute("CREATE SEQUENCE IF NOT EXISTS symptom_seq START 1;")

con.execute("""
CREATE TABLE IF NOT EXISTS disease (
    disease_id INTEGER PRIMARY KEY DEFAULT nextval('disease_seq'),
    name TEXT UNIQUE
);
""")

con.execute("""
CREATE TABLE IF NOT EXISTS symptom (
    symptom_id INTEGER PRIMARY KEY DEFAULT nextval('symptom_seq'),
    name TEXT UNIQUE
);
""")

con.execute("""
CREATE TABLE IF NOT EXISTS disease_symptom (
    disease_id INTEGER,
    symptom_id INTEGER,
    incidence FLOAT,
    PRIMARY KEY (disease_id, symptom_id)
);
""")

con.execute("""
CREATE TABLE IF NOT EXISTS disease_embedding (
    disease_id INTEGER,
    embedding DOUBLE[]
);
""")

<_duckdb.DuckDBPyConnection at 0x768964086f30>

## Populate Tables

In [6]:
for disease, symptoms in structured_data:
    con.execute("INSERT OR IGNORE INTO disease (name) VALUES (?)", [disease])
    disease_id = con.execute(
        "SELECT disease_id FROM disease WHERE name = ?",
        [disease]
    ).fetchone()[0]

    for symptom, incidence in symptoms:
        con.execute("INSERT OR IGNORE INTO symptom (name) VALUES (?)", [symptom])
        symptom_id = con.execute(
            "SELECT symptom_id FROM symptom WHERE name = ?",
            [symptom]
        ).fetchone()[0]

        con.execute("""
            INSERT OR IGNORE INTO disease_symptom
            VALUES (?, ?, ?)
        """, [disease_id, symptom_id, incidence])

## Compute Disease Embeddings (TF-IDF, Fully Offline)

In [7]:
diseases = con.execute("SELECT disease_id, name FROM disease").fetchall()

disease_texts = []
disease_ids = []

for disease_id, name in diseases:
    symptoms = con.execute("""
        SELECT s.name, ds.incidence
        FROM disease_symptom ds
        JOIN symptom s ON ds.symptom_id = s.symptom_id
        WHERE ds.disease_id = ?
    """, [disease_id]).fetchall()

    weighted_text = []
    for symptom_name, incidence in symptoms:
        repetitions = max(1, int(incidence * 10))
        weighted_text.extend([symptom_name] * repetitions)

    disease_texts.append(" ".join(weighted_text))
    disease_ids.append(disease_id)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(disease_texts)

for i, disease_id in enumerate(disease_ids):
    vector = tfidf_matrix[i].toarray()[0]
    con.execute(
        "INSERT INTO disease_embedding VALUES (?, ?)",
        [disease_id, vector.tolist()]
    )

print("Embeddings computed successfully (offline).")

Embeddings computed successfully (offline).


## Similarity Queries

In [16]:
def disease_similarity(d1, d2):
    e1 = con.execute("""
        SELECT embedding FROM disease_embedding de
        JOIN disease d ON de.disease_id = d.disease_id
        WHERE d.name = ?
    """, [d1]).fetchone()[0]

    e2 = con.execute("""
        SELECT embedding FROM disease_embedding de
        JOIN disease d ON de.disease_id = d.disease_id
        WHERE d.name = ?
    """, [d2]).fetchone()[0]

    return cosine_similarity([e1], [e2])[0][0]

# Example:
print("Disease similarity of colon cancer and liver cancer: ", disease_similarity('colon cancer', 'liver cancer'))
print("Disease similarity of cirrhosis of the liver and liver cancer: ", disease_similarity('cirrhosis of the liver', 'liver cancer'))
print("Disease similarity of common cold and liver cancer: ", disease_similarity('common cold', 'liver cancer'))

Disease similarity of colon cancer and liver cancer:  0.3124396075604429
Disease similarity of cirrhosis of the liver and liver cancer:  0.5681451154620615
Disease similarity of common cold and liver cancer:  0.1154716329368155
Disease similarity of common cold and common cold:  1.0000000000000002


In [17]:
def rank_diseases(symptom_list):
    query_text = " ".join(symptom_list)
    query_vector = vectorizer.transform([query_text]).toarray()[0]

    results = con.execute("""
        SELECT d.name, de.embedding
        FROM disease_embedding de
        JOIN disease d ON de.disease_id = d.disease_id
    """).fetchall()

    scores = []

    for name, embedding in results:
        sim = cosine_similarity([query_vector], [embedding])[0][0]
        scores.append((name, sim))

    return sorted(scores, key=lambda x: x[1], reverse=True)

# Example:
rank_diseases(['fever', 'cough'])

[('lung cancer', np.float64(0.40553979272363117)),
 ('neutropenia', np.float64(0.39178540168096154)),
 ('chronic obstructive pulmonary disease', np.float64(0.3725686656307814)),
 ('upper respiratory infection', np.float64(0.3390834449914737)),
 ('pleural effusion', np.float64(0.32891418898032254)),
 ('pneumonia', np.float64(0.32570797767656867)),
 ('lupus', np.float64(0.3197374424421777)),
 ('bronchitis', np.float64(0.3196381611445093)),
 ('congestive heart failure', np.float64(0.2988765887788456)),
 ('hiv/aids', np.float64(0.29217915783811077)),
 ('common cold', np.float64(0.27047643256789866)),
 ('asthma', np.float64(0.26097873634139335)),
 ('multiple myeloma', np.float64(0.2555597050395575)),
 ('hepatitis b', np.float64(0.2381620549684526)),
 ('sleep apnea', np.float64(0.23273754682543676)),
 ('meningitis', np.float64(0.2302834615812488)),
 ('pulmonary hypertension', np.float64(0.21849552591063434)),
 ('sinusitis', np.float64(0.21780861331835244)),
 ('cardiomyopathy', np.float64(0.2

In [None]:
def rank_similar_diseases(disease_name):
    # Get embedding of the reference disease
    base_embedding = con.execute("""
        SELECT de.embedding
        FROM disease_embedding de
        JOIN disease d ON de.disease_id = d.disease_id
        WHERE d.name = ?
    """, [disease_name]).fetchone()
    
    if base_embedding is None:
        raise ValueError(f"Disease '{disease_name}' not found.")
    
    base_vector = base_embedding[0]
    
    # Get all other diseases
    results = con.execute("""
        SELECT d.name, de.embedding
        FROM disease_embedding de
        JOIN disease d ON de.disease_id = d.disease_id
        WHERE d.name != ?
    """, [disease_name]).fetchall()
    
    scores = []
    
    for name, embedding in results:
        sim = cosine_similarity([base_vector], [embedding])[0][0]
        scores.append((name, sim))
    
    return sorted(scores, key=lambda x: x[1], reverse=True)

# Example:
rank_similar_diseases('common cold')