In [None]:
import pandas as pd
import re

# =========================
# Paths
# =========================
INPUT_CSV = "../data/DerivedKnowledgeGraph_final.csv"
DISEASES_CSV = "../data/diseases.csv"
SYMPTOMS_CSV = "../data/symptoms.csv"

# =========================
# Read the knowledge graph
# =========================
df = pd.read_csv(INPUT_CSV)

# Columns are addressed by position; the first is assumed to hold the
# disease name and the second the comma-separated symptom list.
disease_col, symptoms_col = df.columns[0], df.columns[1]

# =========================
# Unique disease names
# =========================
disease_names = df[disease_col].dropna()
disease_names = disease_names.astype(str).str.strip()
diseases = disease_names.unique()

diseases_df = pd.DataFrame(diseases, columns=["disease"])
diseases_df.to_csv(DISEASES_CSV, index=False)

# =========================
# Unique symptom names
# =========================
# Matches a parenthesised annotation such as the incidence "(0.45)".
parenthetical = re.compile(r"\(.*?\)")

# Every cell is split on commas; each piece has its parenthesised parts
# removed and is trimmed. Pieces that end up empty are discarded.
all_symptoms = {
    name
    for cell in df[symptoms_col].dropna()
    for name in (
        parenthetical.sub("", piece).strip()
        for piece in str(cell).split(",")
    )
    if name
}

# Sorted so the output file has a stable order.
symptoms_df = pd.DataFrame(sorted(all_symptoms), columns=["symptom"])
symptoms_df.to_csv(SYMPTOMS_CSV, index=False)

# =========================
# Summary
# =========================
print(f"Total number of diseases: {len(diseases_df)}")
print(f"Total number of distinct symptoms: {len(symptoms_df)}")

print("\nGenerated files:")
for written_path in (DISEASES_CSV, SYMPTOMS_CSV):
    print(f"- {written_path}")

Total number of diseases: 156
Total number of distinct symptoms: 330

Generated files:
- diseases.csv
- symptoms.csv


In [None]:
import pandas as pd
import re

# =========================
# Configuration
# =========================
INPUT_CSV = "../data/DerivedKnowledgeGraph_final.csv"
OUTPUT_MATRIX_CSV = "../data/disease-symptom-matrix.csv"

# A parenthesised group without nested parentheses, e.g. "(0.45)".
_PAREN_GROUP_RE = re.compile(r"\(([^()]*)\)")


def _parse_symptom_item(item):
    """Split one comma-separated entry into ``(symptom, incidence)``.

    The incidence is taken from the first parenthesised group whose content
    parses as a float; the symptom name is the text in front of that group.
    Parenthesised groups that are not numeric (for example "pain (chronic)")
    are treated as part of the name instead of raising ``ValueError``.

    Args:
        item: Raw text of a single entry, e.g. ``" fever (0.45) "``.

    Returns:
        A ``(symptom, incidence)`` tuple. When no numeric group is found the
        whole trimmed entry is the symptom and the incidence is ``0.0``.
    """
    item = item.strip()
    for group in _PAREN_GROUP_RE.finditer(item):
        try:
            incidence = float(group.group(1))
        except ValueError:
            # Not a number, so this parenthetical belongs to the name.
            continue
        return item[:group.start()].strip(), incidence
    return item, 0.0


# =========================
# Load data
# =========================
df = pd.read_csv(INPUT_CSV)

# Assume:
# column 0 -> Disease
# column 1 -> Symptoms
disease_col = df.columns[0]
symptoms_col = df.columns[1]

# =========================
# Parse data into records
# =========================
records = []

for disease_cell, symptoms_cell in zip(df[disease_col], df[symptoms_col]):
    # Rows without a disease name would otherwise show up as a literal
    # "nan" disease; rows without symptoms contribute nothing.
    if pd.isna(disease_cell) or pd.isna(symptoms_cell):
        continue

    disease = str(disease_cell).strip()

    # Split symptoms by comma
    for item in str(symptoms_cell).split(","):
        symptom, incidence = _parse_symptom_item(item)

        # Trailing or doubled commas yield empty entries; skipping them
        # avoids an unnamed column in the matrix.
        if not symptom:
            continue

        records.append((disease, symptom, incidence))

# =========================
# Create long-form DataFrame
# =========================
long_df = pd.DataFrame(
    records,
    columns=["disease", "symptom", "incidence"]
)

# =========================
# Pivot to matrix form
# =========================
# One row per disease, one column per symptom. Pairs that never occur are
# filled with 0; a disease with no parsed symptoms has no row at all.
matrix_df = (
    long_df
    .pivot_table(
        index="disease",
        columns="symptom",
        values="incidence",
        aggfunc="max",   # in case of duplicates
        fill_value=0
    )
    .sort_index(axis=0)
    .sort_index(axis=1)
)

# =========================
# Save output
# =========================
matrix_df.to_csv(OUTPUT_MATRIX_CSV)

# =========================
# Console output
# =========================
print("Disease–Symptom incidence matrix created.")
print(f"Number of diseases: {matrix_df.shape[0]}")
print(f"Number of symptoms: {matrix_df.shape[1]}")
print(f"Output file: {OUTPUT_MATRIX_CSV}")

Disease–Symptom incidence matrix created.
Number of diseases: 156
Number of symptoms: 330
Output file: disease_symptom_matrix.csv


In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# =========================
# Settings
# =========================
INPUT_CSV = "../data/DerivedKnowledgeGraph_final.csv"

# Model identifiers for the two encoders
BERT_MODEL = "bert-base-uncased"
CLINICAL_BERT_MODEL = "emilyalsentzer/Bio_ClinicalBERT"

# Destination files, one per encoder
BERT_OUTPUT = "../data/bert-embeddings.csv"
CLINICAL_BERT_OUTPUT = "../data/bert-clinical-embeddings.csv"

# Use CUDA when torch reports it as available, otherwise the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# =========================
# Disease names
# =========================
df = pd.read_csv(INPUT_CSV)
disease_col = df.columns[0]

# Drop missing values, trim whitespace, and keep each name once
disease_names = df[disease_col].dropna()
disease_names = disease_names.astype(str).str.strip()
diseases = disease_names.unique()

# =========================
# Embedding function
# =========================
def generate_embeddings(texts, model_name):
    """Encode each text with a pretrained model and collect its [CLS] vector.

    Args:
        texts: Iterable of strings. Each text is tokenized and encoded on its
            own, truncated to at most 32 tokens.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.

    Returns:
        A list with one NumPy array per input text, in input order, holding
        the hidden state at position 0 of the model's last layer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name).to(DEVICE)
    encoder.eval()

    tokenizer_options = dict(
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=32,
    )

    def encode_one(text):
        # Tokenize, run the model, and keep the vector at position 0 ([CLS]).
        batch = tokenizer(text, **tokenizer_options).to(DEVICE)
        hidden = encoder(**batch).last_hidden_state
        first_token = hidden[:, 0, :]
        return first_token.squeeze(0).cpu().numpy()

    progress = tqdm(texts, desc=f"Encoding with {model_name}")

    # Gradients are not needed for encoding.
    with torch.no_grad():
        return [encode_one(text) for text in progress]

# =========================
# Encode the diseases with both models
# =========================
# The models run one after the other, BERT first.
bert_embeddings, clinical_bert_embeddings = [
    generate_embeddings(diseases, model_name)
    for model_name in (BERT_MODEL, CLINICAL_BERT_MODEL)
]

# =========================
# Save to CSV
# =========================
def save_embeddings(diseases, embeddings, output_file):
    """Write one row per disease (name plus embedding values) to a CSV file.

    Args:
        diseases: Sequence of disease names.
        embeddings: Sequence of 1-D arrays aligned with ``diseases``. Each
            must provide ``.shape`` and ``.tolist()``.
        output_file: Path of the CSV file to write.

    Raises:
        ValueError: If ``diseases`` and ``embeddings`` differ in length.
            Zipping them would silently drop the surplus entries and leave
            the file incomplete.

    The file has a ``disease`` column followed by ``dim_0 .. dim_{n-1}``.
    With no embeddings the dimension is unknown, so only the ``disease``
    header is written.
    """
    diseases = list(diseases)
    embeddings = list(embeddings)

    if len(diseases) != len(embeddings):
        raise ValueError(
            f"Got {len(diseases)} diseases but {len(embeddings)} embeddings; "
            "they must be aligned one-to-one."
        )

    # The first embedding defines the column count; empty input has none.
    emb_dim = embeddings[0].shape[0] if embeddings else 0
    columns = ["disease"] + [f"dim_{i}" for i in range(emb_dim)]

    data = [
        [disease] + embedding.tolist()
        for disease, embedding in zip(diseases, embeddings)
    ]

    df_out = pd.DataFrame(data, columns=columns)
    df_out.to_csv(output_file, index=False)

# Write each set of vectors to its own file, BERT first.
for vectors, target in (
    (bert_embeddings, BERT_OUTPUT),
    (clinical_bert_embeddings, CLINICAL_BERT_OUTPUT),
):
    save_embeddings(diseases, vectors, target)

# =========================
# Summary
# =========================
print(f"Total diseases encoded: {len(diseases)}")
print(f"BERT embedding dimension: {bert_embeddings[0].shape[0]}")
print(f"Clinical BERT embedding dimension: {clinical_bert_embeddings[0].shape[0]}")
print("\nGenerated files:")
for written_path in (BERT_OUTPUT, CLINICAL_BERT_OUTPUT):
    print(f"- {written_path}")

Encoding with bert-base-uncased: 100%|██████████| 156/156 [00:00<00:00, 176.89it/s]
Encoding with emilyalsentzer/Bio_ClinicalBERT: 100%|██████████| 156/156 [00:00<00:00, 172.70it/s]


Total diseases encoded: 156
BERT embedding dimension: 768
Clinical BERT embedding dimension: 768

Generated files:
- bert-embeddings.csv
- bert-clinical-embeddings.csv
