In [None]:
# 1. Import dependencies

import json
import pandas as pd
import numpy as np
import ast
from pathlib import Path
import hdbscan

In [None]:
# 2. Configuration

# Paths are relative to the notebook's working directory.
INPUT_JSON = "../data/processed/input_sequences.json"
# NOTE(review): unlike INPUT_JSON, this path is not under ../data — confirm it is intended.
OUTPUT_JSON = "../processed/selected_ngs/sele_ngs.json"

# Total number of sequences to select.
N_SELECT = 1_200
# Passed to HDBSCAN as `min_cluster_size`.
MIN_CLUSTER_SIZE = 1_000

In [None]:
# 3. Load & Parse Data

# Read the input table and fail fast if a column used downstream is missing.
df = pd.read_json(INPUT_JSON)
required_columns = {"sequence", "embedding_pca", "enrichment"}
assert required_columns.issubset(df.columns), \
       "Input JSON must contain 'sequence', 'embedding_pca', 'enrichment'."

# Convert stringified embeddings to lists if needed
def parse_embedding(val):
    """Return `val` as a Python object, decoding it first when it is a string.

    Strings are evaluated with `ast.literal_eval`, which accepts Python literal
    syntax only; any non-string value is returned unchanged.
    """
    return ast.literal_eval(val) if isinstance(val, str) else val

# Decode every entry of 'embedding_pca' into a new 'embedding' column
df["embedding"] = df["embedding_pca"].apply(parse_embedding)

# Stack the per-row embeddings vertically: one matrix row per DataFrame row
embedding_rows = df["embedding"].to_numpy()
emb_matrix = np.vstack(embedding_rows)

In [None]:
# 4. Expand Data by √Enrichment

# Replica count per row: floor(sqrt(enrichment)), with a floor of one so that
# every sequence appears at least once in the expanded set.
replicas = np.floor(np.sqrt(df["enrichment"])).astype(int).clip(lower=1)

# Repeat each row label `replicas` times, then renumber the rows from zero
repeated_labels = df.index.repeat(replicas)
df_expanded = df.loc[repeated_labels].reset_index(drop=True)
# Repeat the embedding rows with the same counts so both stay row-aligned
emb_expanded = np.repeat(emb_matrix, replicas, axis=0)

In [None]:
# 5. Cluster with HDBSCAN

# Fit HDBSCAN on the replica-expanded embeddings and keep the per-row label
# both as a standalone array and as a 'cluster' column.
clusterer = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE)
df_expanded["cluster"] = labels = clusterer.fit_predict(emb_expanded)

In [None]:
# 6. Select Top Sequences per Cluster

# Identify non-noise clusters (label -1 is the noise cluster)
clusters = [c for c in np.unique(labels) if c >= 0]
if not clusters:
    # Fallback: no real cluster was found, so treat all rows as one cluster
    df_expanded["cluster"] = 0
    clusters = [0]

# Per-cluster quota, rounded up so the quotas add up to at least N_SELECT
per_cluster = int(np.ceil(N_SELECT / len(clusters)))

selected = []
print(len(clusters))
for c in clusters:
    sub = df_expanded[df_expanded["cluster"] == c]
    # Highest enrichment first; replicas of the same sequence collapse to one row
    top = (
        sub.sort_values("enrichment", ascending=False)
           .drop_duplicates("sequence")
           .head(per_cluster)
    )
    selected.extend(top["sequence"].tolist())

# Deduplicate (keeping first occurrence order) and limit to N_SELECT
selected = list(dict.fromkeys(selected))[:N_SELECT]

# If underfilled, top up from the noise cluster (-1), highest enrichment first
if len(selected) < N_SELECT:
    noise = df_expanded[df_expanded["cluster"] == -1]
    noise_top = (
        noise.sort_values("enrichment", ascending=False)
             .drop_duplicates("sequence")
    )
    # noise_top is already unique per sequence, so one hashed membership filter
    # plus head() picks exactly the sequences the element-by-element scan would,
    # without re-scanning the `selected` list for every candidate.
    already_selected = set(selected)
    is_new = ~noise_top["sequence"].isin(already_selected)
    remaining = N_SELECT - len(selected)
    selected.extend(noise_top.loc[is_new, "sequence"].head(remaining).tolist())

In [None]:
# 7. Save Selected Sequences

# Filter original df for selected sequences
final_df = df[df["sequence"].isin(selected)].reset_index(drop=True)

# Map each sequence to its cluster label (from df_expanded)
# We take the first cluster seen for each sequence
cluster_map = (
    df_expanded
      .drop_duplicates("sequence")
      .set_index("sequence")["cluster"]
)
final_df["cluster"] = final_df["sequence"].map(cluster_map)

# Flag which sequences made it into r1_test
final_df["r1_test"] = True

# Create the destination directory first: to_json does not create missing
# parent directories and would otherwise fail on a fresh checkout.
Path(OUTPUT_JSON).parent.mkdir(parents=True, exist_ok=True)

# Write to JSON (now with 'cluster' and 'r1_test' columns);
# orient="records" with lines=True emits one JSON object per line.
final_df.to_json(OUTPUT_JSON, orient="records", lines=True)

print(f"Selected {len(final_df)} sequences saved to {OUTPUT_JSON}")
final_df.head()