In [16]:
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score


try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    !pip install -q sentence-transformers
    from sentence_transformers import SentenceTransformer


In [17]:
from google.colab import drive
drive.mount('/content/drive')

import os

EXCEL_PATH = "/content/drive/MyDrive/Harvard HW#5/derm.xlsx"

print("Using path:", EXCEL_PATH)
print("File exists:", os.path.exists(EXCEL_PATH))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using path: /content/drive/MyDrive/Harvard HW#5/derm.xlsx
File exists: True


In [18]:
# Load and expload inclusion criteria

def load_and_expand_criteria(
    path: str,
    id_col: str = "nctid",
    crit_col: str = "inclusion"
) -> pd.DataFrame:

    df_raw = pd.read_excel(path)

    print("Columns:", df_raw.columns.tolist())

    df = df_raw[[id_col, crit_col]].copy()

    df[crit_col] = df[crit_col].astype(str)

    rows = []
    splitter = re.compile(r"(\n|\r|\r\n|\*)")

    for _, row in df.iterrows():
        nctid = row[id_col]
        text = row[crit_col]

        raw_parts = [p.strip() for p in splitter.split(text)
                     if p.strip() and p.strip() != "*"]

        cleaned_parts = []
        for p in raw_parts:
            p = re.sub(r"^[\-\*\u2022]?\s*(\d+[\.\)]\s*)?", "", p).strip()
            if len(p) > 0:
                cleaned_parts.append(p)

        for crit in cleaned_parts:
            rows.append({"nctid": nctid, "criterion": crit})

    expanded = pd.DataFrame(rows).drop_duplicates().reset_index(drop=True)
    print("Expanded to", len(expanded), "rows")
    return expanded


In [19]:
criteria_df = load_and_expand_criteria(EXCEL_PATH, crit_col="inclusion")
criteria_df.head()

Columns: ['nctid', 'inclusion']
Expanded to 26369 rows


Unnamed: 0,nctid,criterion
0,NCT00001137,HIV-1 infected
1,NCT00001137,Enrolled in an AIDS Clinical Trial Group (ACTG...
2,NCT00001137,Willing to provide consent for the release and...
3,NCT00001137,Life expectancy of at least 24 weeks
4,NCT00001137,Parent or guardian willing to provide informed...


In [20]:
# text normalization
def normalize_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

criteria_df["clean_text"] = criteria_df["criterion"].apply(normalize_text)
criteria_df.head()

Unnamed: 0,nctid,criterion,clean_text
0,NCT00001137,HIV-1 infected,hiv 1 infected
1,NCT00001137,Enrolled in an AIDS Clinical Trial Group (ACTG...,enrolled in an aids clinical trial group actg ...
2,NCT00001137,Willing to provide consent for the release and...,willing to provide consent for the release and...
3,NCT00001137,Life expectancy of at least 24 weeks,life expectancy of at least 24 weeks
4,NCT00001137,Parent or guardian willing to provide informed...,parent or guardian willing to provide informed...


In [21]:
# run a tf-idf + Vectorization
# KMeans Clustering Algorithm
def vectorize_tfidf(texts, max_features=5000, ngram_range=(1, 2)):
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words="english"
    )
    X = vectorizer.fit_transform(texts)
    return X, vectorizer

def cluster_kmeans(X, n_clusters: int, random_state: int = 42):
    model = KMeans(
        n_clusters=n_clusters,
        n_init=10,
        random_state=random_state
    )
    labels = model.fit_predict(X)
    return model, labels


K_TFIDF = 25

X_tfidf, tfidf_vec = vectorize_tfidf(criteria_df["clean_text"].tolist())
kmeans_tfidf, labels_tfidf = cluster_kmeans(X_tfidf, n_clusters=K_TFIDF)

criteria_df["cluster_tfidf"] = labels_tfidf
# now make DBSCAN
criteria_df.head()




Unnamed: 0,nctid,criterion,clean_text,cluster_tfidf
0,NCT00001137,HIV-1 infected,hiv 1 infected,3
1,NCT00001137,Enrolled in an AIDS Clinical Trial Group (ACTG...,enrolled in an aids clinical trial group actg ...,6
2,NCT00001137,Willing to provide consent for the release and...,willing to provide consent for the release and...,6
3,NCT00001137,Life expectancy of at least 24 weeks,life expectancy of at least 24 weeks,5
4,NCT00001137,Parent or guardian willing to provide informed...,parent or guardian willing to provide informed...,7


In [22]:
from sklearn.cluster import DBSCAN

def clusterDBSCAN(X, eps, min_smples):
    model = DBSCAN(eps=eps, min_samples=min_smples, metric='euclidean')
    labels = model.fit_predict(X)
    return model, labels

# run dbscan
eps = 0.5
min_samples = 5
dbscan_model, dbscan_labels = clusterDBSCAN(X_tfidf, eps, min_samples)
newdf = criteria_df.copy()

newdf["cluster_dbscan"] = dbscan_labels

newdf.head(20)

Unnamed: 0,nctid,criterion,clean_text,cluster_tfidf,cluster_dbscan
0,NCT00001137,HIV-1 infected,hiv 1 infected,3,-1
1,NCT00001137,Enrolled in an AIDS Clinical Trial Group (ACTG...,enrolled in an aids clinical trial group actg ...,6,-1
2,NCT00001137,Willing to provide consent for the release and...,willing to provide consent for the release and...,6,-1
3,NCT00001137,Life expectancy of at least 24 weeks,life expectancy of at least 24 weeks,5,-1
4,NCT00001137,Parent or guardian willing to provide informed...,parent or guardian willing to provide informed...,7,-1
5,NCT00001137,Active alcohol or drug abuse that may interfer...,active alcohol or drug abuse that may interfer...,3,-1
6,NCT00003199,Patients with inflammatory (stage IIIb) or res...,patients with inflammatory stage iiib or respo...,4,-1
7,NCT00003199,Patients should have received 4-7 cycles of an...,patients should have received 4 7 cycles of an...,4,-1
8,NCT00003199,Patient has received Cytoxan 4 gm/m\^2 x 1 and...,patient has received cytoxan 4 gm m 2 x 1 and ...,15,-1
9,NCT00003199,Stem cells were collected after mobilization w...,stem cells were collected after mobilization w...,3,-1


In [23]:
print(dbscan_labels[0]*5*-1==5)

True


In [24]:
# Comoute silhouette score for Kmeans
sample_idx = np.random.choice(X_tfidf.shape[0], size=min(5000, X_tfidf.shape[0]), replace=False)
sil_tfidf = silhouette_score(X_tfidf[sample_idx], labels_tfidf[sample_idx])
print("TFIDF KMeans silhouette (sample):", sil_tfidf)




# Compute silhouette score for DBSCAN
sample_idx = np.random.choice(
    X_tfidf.shape[0],
    size=min(5000, X_tfidf.shape[0]),
    replace=False
)

X_sample = X_tfidf[sample_idx]
labels_sample = dbscan_labels[sample_idx]

# Silhouette requires at least 2 non-noise clusters
unique_clusters = [c for c in set(labels_sample) if c != -1]

if len(unique_clusters) >= 2:
    sil_dbscan = silhouette_score(X_sample, labels_sample)
    print("DBSCAN silhouette (sample):", sil_dbscan)
else:
    print("DBSCAN silhouette cannot be computed — fewer than 2 clusters found.")

# in this case  DBSCAN is worse than random assignment, so we ignore it
# This means that we have to optimize it

TFIDF KMeans silhouette (sample): 0.03876892628178892
DBSCAN silhouette (sample): -0.16892565724074193


In [25]:
# run sentence embedding to breakdown words into roots
def build_embeddings(texts, model_name: str = "all-MiniLM-L6-v2", batch_size: int = 64):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    return embeddings, model

embeddings, sbert_model = build_embeddings(criteria_df["criterion"].tolist())
print("Embedding shape:", embeddings.shape)


# run hierarchial clustering in bottom up (algglomertive clustering)
def cluster_agglomerative(X, n_clusters: int):
    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage="ward"
    )
    labels = model.fit_predict(X)
    return model, labels

K_EMB = 30  # 20,30

agg_model, labels_emb = cluster_agglomerative(embeddings, n_clusters=K_EMB)
criteria_df["cluster_emb"] = labels_emb
sample_idx = np.random.choice(embeddings.shape[0], size=min(5000, embeddings.shape[0]), replace=False)
sil_emb = silhouette_score(embeddings[sample_idx], labels_emb[sample_idx])
print("Embedding Agglomerative silhouette (sample):", sil_emb)


Batches:   0%|          | 0/413 [00:00<?, ?it/s]

Embedding shape: (26369, 384)
Embedding Agglomerative silhouette (sample): 0.05606275


In [26]:
# --- UMAP dimensionality reduction on SBERT embeddings ---
try:
    import umap
except ImportError:
    !pip install -q umap-learn
    import umap

reducer = umap.UMAP(
    n_components=10,    # dimensionality of reduced space
    n_neighbors=15,     # how many neighbors define local structure
    min_dist=0.0,       # how tightly UMAP packs points
    metric="cosine",
    random_state=42
)

X_umap = reducer.fit_transform(embeddings)
print("UMAP shape:", X_umap.shape)


  warn(


UMAP shape: (26369, 10)


In [27]:
from sklearn.cluster import DBSCAN

# Try some reasonable first parameters
eps = 0.5         # neighborhood radius
min_samples = 10  # points required for a cluster

dbscan_umap = DBSCAN(
    eps=eps,
    min_samples=min_samples,
    metric="euclidean"
)

dbscan_umap_labels = dbscan_umap.fit_predict(X_umap)
criteria_df["cluster_dbscan_umap"] = dbscan_umap_labels

print("DBSCAN+UMAP unique labels:", sorted(set(dbscan_umap_labels)))
print("Counts:", {c: int((dbscan_umap_labels == c).sum()) for c in set(dbscan_umap_labels)})


DBSCAN+UMAP unique labels: [np.int64(-1), np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44)]
Counts: {np.int64(0): 14754, np.int64(1): 1763, np.int64(2): 214, np.int64(3): 1194, np.int64(4): 423, np.int64(5): 60, np.int64(6): 274, np.int64(7): 271, np.int64(8): 381, np.int64(9): 432, np.int64(10): 399, np.int64(11): 1849, np.int64(12): 611, np.int64(13): 202, np.int64(14): 57, np.int64(15): 1486, np.int64(16): 106, np.int64(17)

In [28]:
# DBSCAN produces -1 for noise → silhouette must ignore these
mask = dbscan_umap_labels != -1
X_non_noise = X_umap[mask]
labels_non_noise = dbscan_umap_labels[mask]

if len(np.unique(labels_non_noise)) >= 2 and len(labels_non_noise) > 50:
    sample_idx = np.random.choice(
        X_non_noise.shape[0],
        size=min(5000, X_non_noise.shape[0]),
        replace=False
    )
    sil_dbscan_umap = silhouette_score(
        X_non_noise[sample_idx],
        labels_non_noise[sample_idx]
    )
    print("DBSCAN+UMAP silhouette (sample non-noise):", sil_dbscan_umap)
else:
    print("DBSCAN+UMAP silhouette cannot be computed (not enough clusters).")


DBSCAN+UMAP silhouette (sample non-noise): -0.008147223


In [31]:
def inspect_dbscan_umap(df, label_col="cluster_dbscan_umap", n_examples=5):
    for label in sorted(df[label_col].unique()):
        if label == -1:   # skip noise
            continue
        subset = df[df[label_col] == label].head(n_examples)
        print("="*80)
        print(f"DBSCAN+UMAP Cluster = {label}")
        for _, row in subset.iterrows():
            print(f"- [{row['nctid']}] {row['criterion']}")
        print()

inspect_dbscan_umap(criteria_df, n_examples=5)


DBSCAN+UMAP Cluster = 0
- [NCT00001137] HIV-1 infected
- [NCT00001137] Enrolled in an AIDS Clinical Trial Group (ACTG) parent study and has enrolled in this study on or before the Week 16 visit of the parent study, including the visit window of the parent study. More information on this criterion can be found in the protocol.
- [NCT00001137] Active alcohol or drug abuse that may interfere with the study
- [NCT00003199] Patients with inflammatory (stage IIIb) or responsive stage IV breast cancer with metastasis to soft tissue and/or bone; responsive stage IV disease is defined as patients who achieve a PR (\>= 50% reduction in measurable tumor burden) or CR following initial chemotherapy for metastatic disease or patients with locally recurrent disease (chest wall/axillary nodes) who are rendered disease-free following surgery or radiation therapy without receiving chemotherapy; bone disease is categorized as responsive if there is demonstrated sclerosis of prior lesions with no new les

In [29]:
def inspect_clusters(df: pd.DataFrame,
                     label_col: str,
                     n_examples: int = 5):
    for label in sorted(df[label_col].unique()):
        subset = df[df[label_col] == label].head(n_examples)
        print("=" * 80)
        print(f"Cluster {label_col} = {label}  (showing {len(subset)} examples)")
        for _, row in subset.iterrows():
            print(f"- [{row['nctid']}] {row['criterion']}")
        print()

# Example: quickly look at a few clusters in each method
inspect_clusters(criteria_df, "cluster_tfidf", n_examples=4)
inspect_clusters(criteria_df, "cluster_emb", n_examples=4)
print("DBSCAN AND DIM REDUCTION WITH UMAP")
inspect_clusters(criteria_df, "cluster_dbscan_umap", n_examples=5)


Cluster cluster_tfidf = 0  (showing 4 examples)
- [NCT00003199] ANC \> 1,000 cells/mm\^3 and platelets \> 30,000/cells/mm\^3 (transfusion independent) for at least 5 days before starting therapy
- [NCT00003895] Platelets (Plt) \>= 100,000/mm\^3
- [NCT00004067] The postoperative absolute neutrophil count (ANC) must be ≥ 1500/mm3 (or \<1500/mm3 if, in the opinion of the investigator, this represents an ethnic or racial variation of normal).
- [NCT00004067] Postoperative platelet count must be ≥ 100,000/mm3. Significant underlying hematologic disorders must be excluded when the platelet count is above the upper limit of normal for the lab.

Cluster cluster_tfidf = 1  (showing 4 examples)
- [NCT00024154] Male or female
- [NCT00201929] Female
- [NCT00217672] Female 18 and over
- [NCT00270972] Male or female

Cluster cluster_tfidf = 2  (showing 4 examples)
- [NCT00005617] Adults over the age of 18 with malignant melanoma.
- [NCT00028405] Age 18 or older.
- [NCT00044291] Women age 18 years or

In [30]:
# Map numbers to string labels
# Look at all clustering algorithms and see which ones are better


def numeric_to_string_labels(int_labels: np.ndarray) -> np.ndarray:
    uniq = np.unique(int_labels)
    mapping = {c: f"label{chr(ord('A') + i)}" for i, c in enumerate(uniq)}
    return np.array([mapping[c] for c in int_labels])

criteria_df["cluster_final"] = numeric_to_string_labels(criteria_df["cluster_emb"].values)

results_df = criteria_df[["nctid", "criterion", "cluster_final"]].copy()
results_df.columns = ["nctid", "criterion", "cluster"]


RESULTS_PATH = "results_sendroff_partner.txt"

results_df.to_csv(
    RESULTS_PATH,
    sep="\t",
    index=False
)

print("Saved results to", RESULTS_PATH)
results_df.head()


Saved results to results_sendroff_partner.txt


Unnamed: 0,nctid,criterion,cluster
0,NCT00001137,HIV-1 infected,labelK
1,NCT00001137,Enrolled in an AIDS Clinical Trial Group (ACTG...,labelE
2,NCT00001137,Willing to provide consent for the release and...,labelE
3,NCT00001137,Life expectancy of at least 24 weeks,label[
4,NCT00001137,Parent or guardian willing to provide informed...,labelH
