In [1]:
# ===== Notebook 3: Clustering Analysis =====
# Axora: Clinical Document Classification

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import (
    homogeneity_score, completeness_score, v_measure_score,
    adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
)

from collections import Counter
import textwrap
import warnings, os, sys, re, math, random

# Reproducibility: one shared seed for NumPy's global RNG and the stdlib RNG.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Figure styling: the seaborn theme is applied before the default figure
# size is written into matplotlib's rcParams.
sns.set(style="whitegrid", font_scale=1.05)
plt.rcParams.update({"figure.figsize": (10, 6)})

# Install a blanket "ignore" filter so warnings do not clutter cell output.
# NOTE(review): this hides every warning category, including any emitted by
# the libraries imported above.
warnings.filterwarnings("ignore")
print("✅ Imports ready.")

✅ Imports ready.


In [2]:
# Fetch the dataset (the same one Notebook 1 uses).
dataset = load_dataset("hpe-ai/medical-cases-classification-tutorial")

# Only the note text and its specialty label are carried forward.
cols = ["transcription", "medical_specialty"]

# One DataFrame per split, restricted to `cols`; .copy() detaches each
# selection from the intermediate full-width frame.
train_df = pd.DataFrame(dataset["train"])[cols].copy()
val_df = pd.DataFrame(dataset["validation"])[cols].copy()
test_df = pd.DataFrame(dataset["test"])[cols].copy()

# Stack the three splits into a single frame for unsupervised clustering.
# Each row is tagged with its originating split, and the true labels are
# kept so the clusters can be evaluated against them.
frames_by_split = {"train": train_df, "val": val_df, "test": test_df}
all_df = pd.concat(
    [frame.assign(split=tag) for tag, frame in frames_by_split.items()],
    axis=0,
    ignore_index=True,
)

# Quick overview: sizes, number of classes, and the most frequent labels.
print("=== Dataset Overview (Notebook 3) ===")
print(f"Total documents: {len(all_df)}")
print(f"  Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
print(f"Unique specialties: {all_df['medical_specialty'].nunique()}")
print(all_df["medical_specialty"].value_counts().head())

Repo card metadata block was not found. Setting CardData to empty.


=== Dataset Overview (Notebook 3) ===
Total documents: 2464
  Train: 1724 | Val: 370 | Test: 370
Unique specialties: 13
medical_specialty
Cardiovascular / Pulmonary    742
Orthopedic                    408
Neurology                     282
Gastroenterology              222
Obstetrics / Gynecology       182
Name: count, dtype: int64


In [3]:
# Minimal cleaner for robustness (keep consistent with earlier logic)
def clean_text(text: str) -> str:
    """Normalise a raw transcription to lowercase ASCII alphanumeric tokens.

    The text is lowercased, every run of characters outside ``a-z`` / ``0-9``
    (punctuation, symbols, any whitespace) is collapsed to a single space,
    and leading/trailing spaces are removed.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values are accepted for robustness:
        ``None`` and float NaN are treated as missing and yield ``""``;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing or contains no
        ASCII letters or digits.
    """
    # Missing values must not be stringified: str(None) / str(nan) would
    # inject the fake tokens "none" / "nan" into the corpus vocabulary.
    # NaN is detected via self-inequality (it is the only float != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Whitespace is itself outside [a-z0-9], so a single pass both strips
    # symbols and collapses whitespace runs into one space.
    return re.sub(r"[^a-z0-9]+", " ", text.lower()).strip()

# Store the normalised text next to the raw transcription.
all_df["text_clean"] = [clean_text(raw_note) for raw_note in all_df["transcription"]]

# Preview the first cleaned document, truncated to at most 200 characters.
first_cleaned = all_df["text_clean"].iloc[0]
print("Sample cleaned text:\n", textwrap.shorten(first_cleaned, width=200))

Sample cleaned text:
 procedure note pacemaker icd interrogation history of present illness the patient is a 67 year old gentleman who was admitted to the hospital he has had icd pacemaker implantation this is a st [...]


In [4]:
# TF-IDF configuration, chosen to match Notebook 1: vocabulary capped at
# 5000 terms, English stop words removed, unigrams and bigrams.
tfidf_params = dict(
    max_features=5000,
    stop_words="english",
    ngram_range=(1, 2),
    lowercase=False,  # text_clean was already lowercased by clean_text
    norm="l2",        # the default; unit-length rows make euclidean distance behave cosine-like
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary and build the document-term matrix over all documents.
X_all = vectorizer.fit_transform(all_df["text_clean"])
feature_names = np.array(vectorizer.get_feature_names_out())

print("=== TF-IDF Matrix ===")
print("Shape:", X_all.shape)
print("Features (first 10):", feature_names[:10])

=== TF-IDF Matrix ===
Shape: (2464, 5000)
Features (first 10): ['00' '000' '000 epinephrine' '01' '02' '03' '04' '05' '06' '07']
