# Import Packages

In [1]:
import pandas as pd
import numpy as np
import re, html, time, warnings, torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.preprocessing import LabelEncoder, normalize, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score, confusion_matrix
from scipy.optimize import linear_sum_assignment
from skopt import BayesSearchCV
from skopt.space import Categorical
from sklearn.base import BaseEstimator
from skopt.callbacks import DeltaYStopper
from skopt.utils import point_asdict
from collections import deque
import umap

warnings.filterwarnings("ignore")

# Load Data

In [2]:
# Full minimally-cleaned tweet table; `texts` is its clean_text column as plain strings.
df_min_cleaned = pd.read_csv("min_cleaned_tweets.csv")
texts = df_min_cleaned["clean_text"].astype(str).tolist()

# Separate CSV with a sample of the tweets (later cells label it the 20% subset).
df_min_cleaned_sample = pd.read_csv("min_cleaned_sample_tweets.csv")

# BERTweet

In [None]:
RANDOM_STATE = 42
BERTWEET_MODEL = "finiteautomata/bertweet-base-sentiment-analysis"

# ==========================================================
# 1️⃣ Load & Clean Datasets
# ==========================================================

# Encode sentiments consistently: the encoder is fitted on the full table only,
# so the sample table is mapped with exactly the same class -> integer mapping.
le = LabelEncoder()
le.fit(df_min_cleaned["airline_sentiment"].astype(str))
df_min_cleaned["label_encoded"] = le.transform(df_min_cleaned["airline_sentiment"].astype(str))
df_min_cleaned_sample["label_encoded"] = le.transform(df_min_cleaned_sample["airline_sentiment"].astype(str))
print(f"✅ Sentiment classes: {list(le.classes_)}")

# Cast to str (as the data-loading cell does) so that cells read back from CSV
# as NaN are handed to the tokenizer as strings rather than floats.
texts_full = df_min_cleaned["clean_text"].astype(str).tolist()
texts_sample = df_min_cleaned_sample["clean_text"].astype(str).tolist()
y_full = df_min_cleaned["label_encoded"].values
y_sample = df_min_cleaned_sample["label_encoded"].values

# ==========================================================
# 2️⃣ Load Pretrained Model (BERTweet)
# ==========================================================
model_name = BERTWEET_MODEL
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_hidden_states=True makes every forward pass also return the per-layer hidden states.
model = AutoModelForSequenceClassification.from_pretrained(model_name, output_hidden_states=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode
print(f"✅ Model loaded on device: {device}")

# ==========================================================
# 3️⃣ Extract Embeddings + Logits
# ==========================================================
def get_logits(texts, batch_size=32):
    """Run the classifier over `texts` in batches and return the stacked logits.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Args:
        texts: sliceable sequence of strings.
        batch_size: number of texts sent through the model per forward pass.

    Returns:
        NumPy array with one row of logits per input text.
    """
    collected = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Extracting logits"):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors='pt',
        ).to(device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            result = model(**encoded)
        collected.append(result.logits.detach().cpu().numpy())
    return np.vstack(collected)

def get_hidden_embeddings(texts, batch_size=32, layer=-2):
    """Return one mean-pooled hidden-state vector per text.

    Relies on the module-level `tokenizer`, `model` and `device`.

    The mean is taken over real tokens only. Batches are padded to their longest
    member (`padding=True`), so an unweighted `mean(dim=1)` would mix padding
    positions into the average and make a text's embedding depend on which other
    texts share its batch. Weighting by the attention mask removes that dependence.

    Args:
        texts: sliceable sequence of strings.
        batch_size: number of texts per forward pass.
        layer: index into the model's `hidden_states` tuple (default -2, the
            second-to-last entry).

    Returns:
        NumPy array with one pooled embedding row per input text.
    """
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting embeddings"):
        batch = texts[i:i+batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(**tokens, output_hidden_states=True)
            hidden_states = outputs.hidden_states[layer]
            # 1 for real tokens, 0 for padding; the trailing axis broadcasts over features.
            mask = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            token_sum = (hidden_states * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1)
            all_embs.append((token_sum / token_count).cpu().numpy())
    return np.vstack(all_embs)

# Hybrid features = pooled hidden-state embedding concatenated column-wise with
# the classifier logits, stored as float64. Built first for the sample texts...
print("\n🔹 Extracting SAMPLE (20%) embeddings...")
X_emb_sample = get_hidden_embeddings(texts_sample)
X_logits_sample = get_logits(texts_sample)
X_hybrid_sample = np.concatenate([X_emb_sample, X_logits_sample], axis=1).astype(np.float64)
print(f"✅ SAMPLE hybrid features shape: {X_hybrid_sample.shape}")

# ...then, with the same recipe, for the full dataset.
print("\n🔹 Extracting FULL dataset embeddings...")
X_emb_full = get_hidden_embeddings(texts_full)
X_logits_full = get_logits(texts_full)
X_hybrid_full = np.concatenate([X_emb_full, X_logits_full], axis=1).astype(np.float64)
print(f"✅ FULL hybrid features shape: {X_hybrid_full.shape}")

# ==========================================================
# 4️⃣ Clustering Optimization Setup
# ==========================================================
def hungarian_accuracy(y_true, y_pred):
    """Clustering accuracy under the best one-to-one matching of clusters to classes.

    Builds the confusion matrix of true labels vs. cluster ids, finds the
    assignment that maximises the matched count, and returns that count divided
    by the total number of samples.
    """
    contingency = confusion_matrix(y_true, y_pred)
    # Maximising the matched counts is the same problem as minimising their negation.
    true_idx, cluster_idx = linear_sum_assignment(contingency, maximize=True)
    matched = contingency[true_idx, cluster_idx].sum()
    return matched / contingency.sum()

def evaluate_full(X, y_true_int, labels, sil_metric):
    """Compute the four clustering metrics for one labelling.

    Args:
        X: feature matrix the silhouette is computed on.
        y_true_int: ground-truth labels.
        labels: predicted cluster labels.
        sil_metric: distance metric name forwarded to `silhouette_score`.

    Returns:
        Tuple `(silhouette, ARI, NMI, Hungarian accuracy)`.
    """
    return (
        silhouette_score(X, labels, metric=sil_metric),
        adjusted_rand_score(y_true_int, labels),
        normalized_mutual_info_score(y_true_int, labels),
        hungarian_accuracy(y_true_int, labels),
    )

# ==========================================================
# 5️⃣ PCA Scree Plot + UMAP Grids
# ==========================================================
def pick_pca_candidates(X, pct_low=0.80, pct_high=0.95, k=5, plot=True):
    """Pick PCA component counts that reach evenly spaced cumulative-variance targets.

    X is standardised, a PCA with as many components as possible is fitted, and
    for each of `k` thresholds spaced evenly between `pct_low` and `pct_high` the
    smallest number of components whose cumulative explained-variance ratio
    reaches the threshold is recorded.

    Args:
        X: 2-D feature matrix.
        pct_low: lowest cumulative-variance threshold.
        pct_high: highest cumulative-variance threshold.
        k: number of thresholds.
        plot: when True, draw a scree plot with one marker per threshold.

    Returns:
        `(comps, cum)`: the sorted, de-duplicated component counts and the
        cumulative explained-variance-ratio array.
    """
    # PCA cannot extract more components than min(n_samples, n_features).
    max_components = min(X.shape)
    Xs = StandardScaler().fit_transform(X)
    pca = PCA(n_components=max_components, random_state=RANDOM_STATE)
    pca.fit(Xs)
    evr = pca.explained_variance_ratio_
    cum = np.cumsum(evr)
    thresholds = np.linspace(pct_low, pct_high, k)

    # One component count per threshold, kept aligned with `thresholds` for the plot.
    per_threshold = []
    for t in thresholds:
        reached = cum >= t
        # argmax of an all-False mask is 0, which would wrongly report a single
        # component; fall back to "all components" when the target is never reached.
        per_threshold.append(int(np.argmax(reached)) + 1 if reached.any() else len(cum))
    comps = sorted(set(per_threshold))

    if plot:
        xs = np.arange(1, len(evr) + 1)
        plt.figure(figsize=(8, 5))
        plt.plot(xs, evr, marker="o", label="Individual")
        plt.plot(xs, cum, marker="x", label="Cumulative")
        # Pair thresholds with the un-deduplicated counts so every label is correct.
        for t, c in zip(thresholds, per_threshold):
            plt.axvline(c, linestyle="--", alpha=0.3)
            plt.text(c, 0.02, f"{int(t*100)}%→{c}", rotation=90, va="bottom", ha="right", fontsize=8)
        plt.title("Scree Plot — Explained & Cumulative Variance (BERTweet Hybrid)")
        plt.xlabel("Principal Component"); plt.ylabel("Variance Ratio")
        plt.legend(); plt.grid(True); plt.tight_layout(); plt.show()
    return comps, cum

# PCA component counts are derived from the sample features (80%-95% cumulative variance).
pca_candidates, _cum = pick_pca_candidates(X_hybrid_sample, pct_low=0.80, pct_high=0.95, k=5, plot=True)
# Fixed candidate grids for the UMAP hyperparameters used in the search spaces below.
umap_neighbors  = [15, 30, 45, 60, 75, 100, 150, 200]
umap_min_dist   = [0.1, 0.2, 0.3, 0.4, 0.5]
umap_components = [16, 32, 48, 64, 96, 128]

# ==========================================================
# 6️⃣ Clustering Pipeline + BayesSearchCV
# ==========================================================
# Single-slot record of the hyperparameters used by the most recent fit() in this
# process. NOTE(review): with n_jobs != 1 fits presumably run in worker processes,
# in which case the parent's copy is not updated — confirm before relying on it.
_LAST_TRIAL = deque(maxlen=1)


class ClusteringPipeline(BaseEstimator):
    """Reduce features with PCA or UMAP, cluster into 3 groups, score by silhouette.

    Parameters
    ----------
    reducer : 'pca' selects StandardScaler + PCA; any other value selects
        L2-normalisation followed by cosine UMAP.
    n_components : PCA output dimensionality (PCA branch only).
    n_neighbors, min_dist, n_components_umap : UMAP settings (UMAP branch only).
    model : 'kmeans', 'gmm', or any other value for agglomerative clustering.
    cov_type : GaussianMixture covariance type (GMM branch only).
    linkage : AgglomerativeClustering linkage (agglomerative branch only).

    Attributes set by fit()
    -----------------------
    labels_ : cluster label per sample.
    X_use_ : the matrix that was actually clustered.
    metric : distance metric used for the silhouette ('cosine' or 'euclidean').
    score_ : silhouette of `labels_` on `X_use_`, or -1.0 when undefined.
    """

    _PARAM_NAMES = ('reducer', 'n_components', 'n_neighbors', 'min_dist',
                    'n_components_umap', 'model', 'cov_type', 'linkage')

    def __init__(self, reducer='pca', n_components=50,
                 n_neighbors=15, min_dist=0.1, n_components_umap=16,
                 model='kmeans', cov_type='full', linkage='average'):
        self.reducer = reducer
        self.n_components = n_components
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.n_components_umap = n_components_umap
        self.model = model
        self.cov_type = cov_type
        self.linkage = linkage

    def _reduce(self, X):
        """Return the dimensionality-reduced version of X for the configured reducer."""
        if self.reducer == 'pca':
            X_std = StandardScaler().fit_transform(X)
            return PCA(n_components=self.n_components, random_state=RANDOM_STATE).fit_transform(X_std)
        X_l2 = normalize(X)
        return umap.UMAP(
            n_neighbors=self.n_neighbors,
            min_dist=self.min_dist,
            n_components=self.n_components_umap,
            metric='cosine',
            random_state=RANDOM_STATE
        ).fit_transform(X_l2)

    def _cluster(self, X_red):
        """Cluster X_red; return (labels, matrix that was clustered, silhouette metric)."""
        if self.model == 'kmeans':
            X_use = normalize(X_red)
            labels = KMeans(n_clusters=3, n_init=10, algorithm='elkan', random_state=RANDOM_STATE).fit_predict(X_use)
            return labels, X_use, "cosine"
        if self.model == 'gmm':
            X_use = np.asarray(X_red, dtype=np.float64)
            try:
                labels = GaussianMixture(
                    n_components=3, n_init=10, random_state=RANDOM_STATE,
                    covariance_type=self.cov_type
                ).fit_predict(X_use)
            except ValueError:
                # Retry once with a larger covariance regulariser.
                labels = GaussianMixture(
                    n_components=3, n_init=10, random_state=RANDOM_STATE,
                    covariance_type=self.cov_type, reg_covar=1e-5
                ).fit_predict(X_use)
            return labels, X_use, "euclidean"
        X_use = normalize(X_red)
        labels = AgglomerativeClustering(
            n_clusters=3, linkage=self.linkage, metric='cosine'
        ).fit_predict(X_use)
        return labels, X_use, "cosine"

    def fit(self, X, y=None):
        """Reduce, cluster and score X; `y` is ignored. Returns self."""
        # Record a snapshot of the hyperparameters. Storing self.__dict__ itself
        # would keep a live reference that later fills up with labels_, X_use_, etc.
        _LAST_TRIAL.clear()
        _LAST_TRIAL.append({name: getattr(self, name) for name in self._PARAM_NAMES})

        X_red = self._reduce(X)
        labels, X_use, metric = self._cluster(X_red)

        self.metric = metric
        self.labels_ = labels
        self.X_use_ = X_use
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1; treat a
        # degenerate clustering as the worst possible score instead of aborting.
        n_found = np.unique(labels).size
        if 2 <= n_found <= len(labels) - 1:
            self.score_ = silhouette_score(X_use, labels, metric=self.metric)
        else:
            self.score_ = -1.0
        return self

    def score(self, X, y=None):
        """Return the silhouette cached by fit(); `X` and `y` are not used."""
        return self.score_

def _pca_branch(model, **extra):
    """Search-space dict for a PCA-reduced branch using clustering `model`."""
    branch = {
        'reducer': Categorical(['pca']),
        'n_components': Categorical(pca_candidates),
        'model': Categorical([model]),
    }
    branch.update(extra)
    return branch


def _umap_branch(model, **extra):
    """Search-space dict for a UMAP-reduced branch using clustering `model`."""
    branch = {
        'reducer': Categorical(['umap']),
        'n_neighbors': Categorical(umap_neighbors),
        'min_dist': Categorical(umap_min_dist),
        'n_components_umap': Categorical(umap_components),
        'model': Categorical([model]),
    }
    branch.update(extra)
    return branch


def _cov_types():
    """Fresh dimension over the GaussianMixture covariance types."""
    return Categorical(['full', 'tied', 'diag', 'spherical'])


def _linkages():
    """Fresh dimension over the agglomerative linkage options."""
    return Categorical(['average', 'complete', 'single'])


# One dict per (reducer, model) branch; every dict gets its own dimension objects.
search_spaces = [
    _pca_branch('kmeans'),
    _pca_branch('gmm', cov_type=_cov_types()),
    _pca_branch('agglo', linkage=_linkages()),
    _umap_branch('kmeans'),
    _umap_branch('gmm', cov_type=_cov_types()),
    _umap_branch('agglo', linkage=_linkages()),
]

TOTAL_ITERS = 60
# Mutable timing state shared with progress_callback.
_timings = []
_start = [None]
_prev = [None]

def _infer_branch_from_space(space):
    """Derive a 'reducer+model' label from the single-choice dimensions of `space`.

    Dimensions without a non-empty `categories` attribute, or with more than one
    category, are ignored. A part that cannot be determined is shown as '?'.
    """
    known_reducers = ("pca", "umap")
    known_models = ("kmeans", "gmm", "agglo")
    reducer = None
    model = None
    for dimension in getattr(space, "dimensions", []):
        choices = getattr(dimension, "categories", None)
        if not choices or len(choices) != 1:
            continue
        only_choice = choices[0]
        if only_choice in known_reducers:
            reducer = only_choice
        elif only_choice in known_models:
            model = only_choice
    return f"{reducer or '?'}+{model or '?'}"

def progress_callback(res):
    """Optimizer callback: print iteration count, timing and branch for `res`.

    Updates the module-level `_start`, `_prev` and `_timings` state. Returns
    None, so it never asks the optimizer to stop.
    """
    import numpy as np
    import time

    tick = time.perf_counter()
    if _start[0] is None:
        _start[0] = tick
    if _prev[0] is not None:
        # Wall time since the previous callback invocation.
        _timings.append(tick - _prev[0])

    k = len(res.x_iters)
    avg = np.mean(_timings) if _timings else 0
    elapsed = tick - _start[0]
    remaining = max(TOTAL_ITERS - k, 0) * avg
    branch = _infer_branch_from_space(res.space)
    print(f"[Bayes] iter {k:>3}/{TOTAL_ITERS} ({k/TOTAL_ITERS:5.1%}) "
          f"| avg {avg:5.2f}s | elapsed {elapsed/60:4.1f}m ETA {remaining/60:4.1f}m | branch {branch}")
    _prev[0] = tick

# The estimator is unsupervised: y is a placeholder, and the single CV "split"
# uses every row for both fitting and scoring.
n = X_hybrid_sample.shape[0]
dummy_y = np.zeros(n, dtype=int)
cv_full = [(np.arange(n), np.arange(n))]

# scoring=None makes the search use ClusteringPipeline.score(), i.e. the silhouette cached by fit().
search = BayesSearchCV(
    estimator=ClusteringPipeline(),
    search_spaces=search_spaces,
    n_iter=TOTAL_ITERS,
    random_state=RANDOM_STATE,
    scoring=None,
    cv=cv_full,
    n_points=4,
    n_jobs=-1,
    return_train_score=False
)

print("\n=== Running BayesSearchCV (PCA/UMAP × KMeans/GMM/Agglo) ===")
# Prime the timers so the first callback measures from the start of the search.
_prev[0] = time.perf_counter(); _start[0] = _prev[0]
# NOTE(review): res.x_iters in the callback is presumably per search-space branch,
# so the printed "iter k/TOTAL_ITERS" may restart for each branch — confirm.
search.fit(X_hybrid_sample, dummy_y, callback=[DeltaYStopper(delta=1e-4, n_best=15), progress_callback])

print("\n🏁 DONE — Bayesian optimization complete.")
print("Best parameters:", search.best_params_)
print("Best score:", search.best_score_)

✅ Sentiment classes: ['negative', 'neutral', 'positive']


pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

  [2m2025-10-24T07:37:21.318663Z[0m [31mERROR[0m  [31mPython exception updating progress:, error: PyErr { type: <class 'LookupError'>, value: LookupError(<ContextVar name='shell_parent' at 0x120843fb0>), traceback: Some(<traceback object at 0x147791480>) }, [1;31mcaller[0m[31m: "src/progress_update.rs:313"[0m
    [2;3mat[0m /Users/runner/work/xet-core/xet-core/error_printer/src/lib.rs:28

  [2m2025-10-24T07:37:21.322651Z[0m [31mERROR[0m  [31mPython exception updating progress:, error: PyErr { type: <class 'LookupError'>, value: LookupError(<ContextVar name='shell_parent' at 0x120843fb0>), traceback: Some(<traceback object at 0x147817600>) }, [1;31mcaller[0m[31m: "src/progress_update.rs:313"[0m
    [2;3mat[0m /Users/runner/work/xet-core/xet-core/error_printer/src/lib.rs:28

  [2m2025-10-24T07:37:21.355769Z[0m [31mERROR[0m  [31mPython exception updating progress:, error: PyErr { type: <class 'LookupError'>, value: LookupError(<ContextVar name='shell_parent' at 

In [None]:
# ---------------------------
# CONFIG
# ---------------------------
# EMB_DIM is used below as the cap on the number of PCA components.
EMB_DIM = 768
RANDOM_STATE = 42

# ==========================================================
# Load BERTweet embeddings (no normalization)
# ==========================================================
model_name = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_hidden_states=True makes every forward pass also return the per-layer hidden states.
model = AutoModelForSequenceClassification.from_pretrained(model_name, output_hidden_states=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
print(f"✅ Model loaded on device: {device}")

def get_logits(texts, batch_size=32):
    """Return the classifier logits for `texts`, computed batch by batch.

    Uses the module-level `tokenizer`, `model` and `device`. The result is a
    NumPy array with one row per input text.
    """
    def _batch_logits(batch):
        # Tokenise one batch, run it without gradient tracking, return logits on CPU.
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(**tokens)
        return outputs.logits.detach().cpu().numpy()

    offsets = tqdm(range(0, len(texts), batch_size), desc="Extracting logits")
    return np.vstack([_batch_logits(texts[off:off + batch_size]) for off in offsets])

def get_hidden_embeddings(texts, batch_size=32, layer=-2):
    """Return one attention-mask-weighted mean hidden-state vector per text.

    Uses the module-level `tokenizer`, `model` and `device`.

    Batches are padded to their longest text, so a plain `mean(dim=1)` would
    average padding positions into the embedding and make it depend on batch
    composition. Here only positions whose attention mask is 1 contribute.

    Args:
        texts: sliceable sequence of strings.
        batch_size: number of texts per forward pass.
        layer: index into the model's `hidden_states` tuple (default -2, the
            second-to-last entry).

    Returns:
        NumPy array with one pooled embedding row per input text.
    """
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting embeddings"):
        batch = texts[i:i+batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(**tokens, output_hidden_states=True)
            hidden_states = outputs.hidden_states[layer]
            # Mask is 1 for real tokens and 0 for padding; add a feature axis to broadcast.
            weights = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            weighted_sum = (hidden_states * weights).sum(dim=1)
            # clamp avoids dividing by zero if a row had no real tokens.
            n_tokens = weights.sum(dim=1).clamp(min=1)
            all_embs.append((weighted_sum / n_tokens).cpu().numpy())
    return np.vstack(all_embs)

# ---------------------------
# Metrics
# ---------------------------
def hungarian_accuracy(y_true, y_pred):
    """Fraction of samples matched under the optimal cluster-to-class assignment."""
    table = confusion_matrix(y_true, y_pred)
    # Negate the counts so the cost-minimising solver maximises the matched total.
    pairing = linear_sum_assignment(-table)
    hits = table[pairing].sum()
    total = table.sum()
    return hits / total

def evaluate_full(X, y_true_int, labels, sil_metric):
    """Return `(silhouette, ARI, NMI, Hungarian accuracy)` for a clustering.

    The silhouette is computed on `X` with distance `sil_metric`; the other three
    metrics compare `labels` against the ground truth `y_true_int`.
    """
    silhouette = silhouette_score(X, labels, metric=sil_metric)
    external = [
        scorer(y_true_int, labels)
        for scorer in (adjusted_rand_score, normalized_mutual_info_score, hungarian_accuracy)
    ]
    return (silhouette, *external)

# ---------------------------
# Prepare Data
# ---------------------------
# Tweet texts as plain strings; astype(str) keeps NaN cells from reaching the
# tokenizer as floats.
texts_sample = df_min_cleaned_sample["clean_text"].astype(str).tolist()
texts_full   = df_min_cleaned["clean_text"].astype(str).tolist()

# Encode sentiments consistently: fit on the full table, apply to both tables.
le = LabelEncoder()
le.fit(df_min_cleaned["airline_sentiment"].astype(str))
df_min_cleaned["label_encoded"] = le.transform(df_min_cleaned["airline_sentiment"].astype(str))
df_min_cleaned_sample["label_encoded"] = le.transform(df_min_cleaned_sample["airline_sentiment"].astype(str))

# The texts are intentionally NOT re-assigned here. A second assignment used to
# read a "cleaned_text" column, while every other access in this notebook uses
# "clean_text", and it dropped the astype(str) cast applied above.
y_full = df_min_cleaned["label_encoded"].values
y_sample = df_min_cleaned_sample["label_encoded"].values

# Hybrid features: pooled hidden-state embedding + classifier logits, as float64.
X_emb_sample = get_hidden_embeddings(texts_sample)
X_logits_sample = get_logits(texts_sample)
X_hybrid_sample = np.concatenate([X_emb_sample, X_logits_sample], axis=1).astype(np.float64)

X_emb_full = get_hidden_embeddings(texts_full)
X_logits_full = get_logits(texts_full)
X_hybrid_full = np.concatenate([X_emb_full, X_logits_full], axis=1).astype(np.float64)

# ---------------------------
# PCA Scree Plot + pick 5 n_components (80–95% cum var, evenly spaced)
# ---------------------------
def pick_pca_candidates(X, pct_low=0.80, pct_high=0.95, k=5, max_components=None, plot=True):
    """Pick up to `k` PCA component counts spanning a cumulative-variance range.

    X is standardised and a PCA is fitted. For each of `k` thresholds spaced
    evenly in [pct_low, pct_high] the smallest component count reaching that
    cumulative explained-variance ratio is taken. Duplicates are removed and, on
    a best-effort basis, extra larger counts are appended to get back to `k`.

    Args:
        X: 2-D feature matrix.
        pct_low: lowest cumulative-variance threshold.
        pct_high: highest cumulative-variance threshold.
        k: number of thresholds / desired number of candidates.
        max_components: upper bound on fitted PCA components; defaults to
            EMB_DIM. It is always clamped to min(n_samples, n_features), the
            most components PCA can extract.
        plot: when True, draw a scree plot with one marker per threshold.

    Returns:
        `(candidates, cum)`: sorted unique component counts (at most `k`) and the
        cumulative explained-variance-ratio array of the fitted PCA.
    """
    feasible = min(X.shape)
    if max_components is None:
        max_components = min(EMB_DIM, feasible)
    else:
        max_components = min(max_components, feasible)
    Xs   = StandardScaler().fit_transform(X)
    pca  = PCA(n_components=max_components, random_state=RANDOM_STATE)
    pca.fit(Xs)
    evr  = pca.explained_variance_ratio_
    cum  = np.cumsum(evr)

    thresholds = np.linspace(pct_low, pct_high, k)
    comps = []
    for t in thresholds:
        reached = cum >= t
        # argmax of an all-False mask is 0 and would report one component; when the
        # fitted components never reach the threshold, use all of them instead.
        comps.append(int(np.argmax(reached)) + 1 if reached.any() else len(cum))

    # dedupe while preserving order
    comps_unique = list(dict.fromkeys(comps))
    seen = set(comps_unique)

    # ensure exactly k values (best-effort padding): extend past the largest
    # value by the average spacing, nudging by one on a collision.
    while len(comps_unique) < k:
        step = max(1, (comps_unique[-1] - comps_unique[0]) // (k-1))
        candidate = min(max_components, comps_unique[-1] + step)
        if candidate in seen:
            candidate = min(max_components, candidate + 1)
            if candidate in seen:
                break  # already at the component cap; nothing left to add
        comps_unique.append(candidate)
        seen.add(candidate)

    comps_unique = sorted(comps_unique)[:k]

    if plot:
        xs = np.arange(1, len(evr)+1)
        plt.figure(figsize=(8,5))
        plt.plot(xs, evr, marker='o', label='Individual')
        plt.plot(xs, cum, marker='x', label='Cumulative')
        # `comps` is still aligned one-to-one with `thresholds`.
        for t, c in zip(thresholds, comps):
            plt.axvline(c, linestyle='--', alpha=0.3)
            plt.text(c, 0.02, f'{int(t*100)}%→{c}', rotation=90, va='bottom', ha='right', fontsize=8)
        plt.title('Scree Plot — Explained & Cumulative Variance')
        plt.xlabel('Principal Component'); plt.ylabel('Variance Ratio')
        plt.legend(); plt.grid(True); plt.tight_layout(); plt.show()

    return comps_unique, cum

# Candidate PCA sizes come from the sample features, with the PCA capped at EMB_DIM components.
pca_candidates, _cum = pick_pca_candidates(
    X_hybrid_sample, pct_low=0.80, pct_high=0.95, k=5, max_components=EMB_DIM, plot=True
)
print("PCA n_components candidates (80–95% cum var):", pca_candidates)

# Fixed UMAP candidate grids (your choices)
umap_neighbors  = [15, 30, 45, 60, 75, 100, 150, 200]
umap_min_dist   = [0.1, 0.2, 0.3, 0.4, 0.5]
umap_components = [16, 32, 48, 64, 96, 128]

# ---------------------------
# ClusteringPipeline class (for BayesSearchCV)
# ---------------------------
# Single-slot record of the hyperparameters used by the most recent fit() in this process.
_LAST_TRIAL = deque(maxlen=1)


class ClusteringPipeline(BaseEstimator):
    """Search-time pipeline: reduce (PCA or UMAP), cluster into 3 groups, score.

    Parameters
    ----------
    reducer : 'pca' for StandardScaler + PCA; any other value for
        L2-normalisation + cosine UMAP (n_epochs=120, low_memory=True).
    n_components : PCA output dimensionality (PCA branch only).
    n_neighbors, min_dist, n_components_umap : UMAP settings (UMAP branch only).
    model : 'kmeans', 'gmm', or any other value for agglomerative clustering.
    cov_type : GaussianMixture covariance type (GMM branch only).
    linkage : AgglomerativeClustering linkage (agglomerative branch only).

    Attributes set by fit()
    -----------------------
    labels_ : cluster label per sample.
    X_use_ : the matrix that was clustered.
    metric : silhouette distance metric ('cosine' or 'euclidean').
    score_ : silhouette of `labels_` on `X_use_`, or -1.0 when undefined.
    """

    def __init__(self,
                 reducer='pca', n_components=50,
                 n_neighbors=15, min_dist=0.1, n_components_umap=16,
                 model='kmeans', cov_type='full', linkage='average'):
        self.reducer = reducer
        self.n_components = n_components
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.n_components_umap = n_components_umap
        self.model = model
        self.cov_type = cov_type
        self.linkage = linkage

    def fit(self, X, y=None):
        """Reduce, cluster and score X; `y` is ignored. Returns self."""
        # record the params actually used (for robust logging)
        _LAST_TRIAL.clear()
        _LAST_TRIAL.append({
            name: getattr(self, name)
            for name in ('reducer', 'n_components', 'n_neighbors', 'min_dist',
                         'n_components_umap', 'model', 'cov_type', 'linkage')
        })

        # ---------------- Reducer ----------------
        if self.reducer == 'pca':
            X_std = StandardScaler().fit_transform(X)
            X_red = PCA(n_components=self.n_components, random_state=RANDOM_STATE).fit_transform(X_std)
        else:
            X_l2 = normalize(X)
            X_red = umap.UMAP(
                n_neighbors=self.n_neighbors,
                min_dist=self.min_dist,
                n_components=self.n_components_umap,
                metric='cosine',
                random_state=RANDOM_STATE,
                n_epochs=120,
                low_memory=True
            ).fit_transform(X_l2)

        # ---------------- Clustering ----------------
        if self.model == 'kmeans':
            X_use = normalize(X_red)
            labels = KMeans(n_clusters=3, n_init=10, algorithm='elkan', random_state=RANDOM_STATE).fit_predict(X_use)
            self.metric = "cosine"
        elif self.model == 'gmm':
            X_use = np.asarray(X_red, dtype=np.float64)
            gmm_kwargs = dict(n_components=3, n_init=10, random_state=RANDOM_STATE,
                              covariance_type=self.cov_type)
            try:
                labels = GaussianMixture(**gmm_kwargs).fit_predict(X_use)
            except ValueError:
                # Retry once with a larger covariance regulariser.
                labels = GaussianMixture(reg_covar=1e-5, **gmm_kwargs).fit_predict(X_use)
            self.metric = "euclidean"
        else:  # Agglo
            X_use = normalize(X_red)
            labels = AgglomerativeClustering(
                n_clusters=3, linkage=self.linkage, metric='cosine'
            ).fit_predict(X_use)
            self.metric = "cosine"

        # store for score()
        self.labels_ = labels
        self.X_use_  = X_use
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1; score a
        # degenerate clustering as the worst value instead of aborting the search.
        n_found = np.unique(labels).size
        if 2 <= n_found <= len(labels) - 1:
            self.score_ = silhouette_score(X_use, labels, metric=self.metric)
        else:
            self.score_ = -1.0
        return self

    def score(self, X, y=None):
        """Return the silhouette cached by fit(); `X` and `y` are not used."""
        return self.score_

# ---------------------------
# Bayesian Search Space 
# ---------------------------
def _dim(name, choices):
    """Fresh named categorical dimension over `choices`."""
    return Categorical(choices, name=name)


def _branch(reducer, model, tail=None):
    """Search-space dict for one (reducer, model) branch.

    `tail` is an optional `(param_name, choices)` pair for the model-specific
    hyperparameter that goes last in the dict.
    """
    if reducer == 'pca':
        space = {
            'reducer': _dim('reducer', ['pca']),
            'n_components': _dim('n_components', pca_candidates),
        }
    else:
        space = {
            'reducer': _dim('reducer', ['umap']),
            'n_neighbors': _dim('n_neighbors', umap_neighbors),
            'min_dist': _dim('min_dist', umap_min_dist),
            'n_components_umap': _dim('n_components_umap', umap_components),
        }
    space['model'] = _dim('model', [model])
    if tail is not None:
        tail_name, tail_choices = tail
        space[tail_name] = _dim(tail_name, tail_choices)
    return space


# Six branches: {PCA, UMAP} x {KMeans, GMM, Agglo}.
# Agglo has no 'ward' linkage since the pipeline clusters with metric='cosine'.
search_spaces = [
    _branch(reducer, model, tail)
    for reducer in ('pca', 'umap')
    for model, tail in (
        ('kmeans', None),
        ('gmm', ('cov_type', ['full','tied','diag','spherical'])),
        ('agglo', ('linkage', ['average','complete','single'])),
    )
]

# --- progress/timing state for the BayesSearchCV callback ---
TOTAL_ITERS = 60  # keep in sync with BayesSearchCV(n_iter=...)

_timings = []
_start = [None]
_prev  = [None]

def _short_params(d):
    """Return the pipeline hyperparameters present in `d`, in a fixed display order."""
    display_order = (
        'reducer', 'n_components', 'n_neighbors', 'min_dist', 'n_components_umap',
        'model', 'cov_type', 'linkage',
    )
    picked = {}
    for name in display_order:
        if name in d:
            picked[name] = d[name]
    return picked

def _infer_branch_from_space(space):
    """Infer branch name (reducer+model) from single-choice categories in subspace."""
    reducer = None
    model = None
    for dim in getattr(space, "dimensions", []):
        cats = getattr(dim, "categories", None)
        # Only non-empty, iterable, single-choice dimensions pin down a branch part.
        usable = bool(cats) and hasattr(cats, "__iter__")
        if not usable or len(cats) != 1:
            continue
        choice = cats[0]
        if choice in ("pca", "umap"):
            reducer = choice
        elif choice in ("kmeans", "gmm", "agglo"):
            model = choice
    return f"{reducer or '?'}+{model or '?'}"

def progress_callback(res):
    """Optimizer callback: print progress, timing and the current branch.

    Updates the module-level `_start`, `_prev` and `_timings` state and always
    returns False, so it never requests early stopping.
    """
    import time
    import numpy as np

    now = time.perf_counter()
    if _start[0] is None:
        _start[0] = now
    if _prev[0] is not None:
        _timings.append(now - _prev[0])

    k = len(res.x_iters)
    avg = float(np.mean(_timings)) if _timings else 0.0
    elapsed = now - _start[0]
    iters_left = max(TOTAL_ITERS - k, 0)
    remaining = iters_left * (avg if avg > 0 else 0.0)

    # Preferred source: reducer/model of the most recently evaluated point.
    branch = None
    try:
        if res.x_iters:
            latest = point_asdict(res.space, res.x_iters[-1])
            reducer_name = latest.get("reducer")
            model_name_ = latest.get("model")
            if reducer_name and model_name_:
                branch = f"{reducer_name}+{model_name_}"
    except Exception:
        # Best effort only; the fallback below still produces a label.
        pass

    # Fallback: read the single-choice dimensions of this subspace.
    if branch is None:
        branch = _infer_branch_from_space(res.space)

    last_dt = _timings[-1] if _timings else 0.0
    print(
        f"[Bayes] iter {k:>3}/{TOTAL_ITERS} ({k/TOTAL_ITERS:5.1%}) "
        f"| last {last_dt:5.2f}s avg {avg:5.2f}s "
        f"| elapsed {elapsed/60:4.1f}m ETA ~{remaining/60:4.1f}m "
        f"| branch {branch}",
        flush=True
    )
    _prev[0] = now
    return False

# ---------------------------
# Run Bayesian SearchCV
# ---------------------------
# The estimator is unsupervised: y is a placeholder, and the single CV "split"
# uses every row for both fitting and scoring.
n = X_hybrid_sample.shape[0]
dummy_y = np.zeros(n, dtype=int)
cv_full = [(np.arange(n), np.arange(n))]

search = BayesSearchCV(
    estimator=ClusteringPipeline(),
    search_spaces=search_spaces,           # <-- your original list-of-branches
    n_iter=TOTAL_ITERS,
    random_state=RANDOM_STATE,
    scoring=None,                          # uses estimator.score() (silhouette)
    cv=cv_full,
    n_points=4,                            # parallel proposals
    n_jobs=-1,                             # parallel fits
    return_train_score=False
)

print("\n=== Running BayesSearchCV (PCA/UMAP × KMeans/GMM/Agglo) ===")
# DeltaYStopper may end a run early; progress_callback only reports and returns False.
callbacks = [
    DeltaYStopper(delta=1e-4, n_best=15),
    progress_callback
]

# prime timers for clean first measurement
_prev[0] = time.perf_counter()
_start[0] = _prev[0]

search.fit(X_hybrid_sample, dummy_y, callback=callbacks)

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import umap

# ==========================================================
# Final (Full-Data) ClusteringPipeline — no speed tweaks
# ==========================================================
class ClusteringPipeline(BaseEstimator):
    """Final pipeline: reduce (PCA or UMAP), cluster into 3 groups, score by silhouette.

    Parameters
    ----------
    reducer : 'pca' for StandardScaler + PCA; any other value for
        L2-normalisation + cosine UMAP.
    n_components : PCA output dimensionality (PCA branch only).
    n_neighbors, min_dist, n_components_umap : UMAP settings (UMAP branch only).
    model : 'kmeans', 'gmm', or any other value for agglomerative clustering.
    cov_type : GaussianMixture covariance type (GMM branch only).
    linkage : AgglomerativeClustering linkage (agglomerative branch only).

    Attributes set by fit()
    -----------------------
    labels_ : cluster label per sample.
    X_use_ : the matrix that was clustered.
    metric : silhouette distance metric ('cosine' or 'euclidean').
    score_ : silhouette of `labels_` on `X_use_`, or -1.0 when undefined.
    """

    def __init__(self,
                 reducer='pca', n_components=50,
                 n_neighbors=15, min_dist=0.1, n_components_umap=16,
                 model='kmeans', cov_type='full', linkage='average'):
        self.reducer = reducer
        self.n_components = n_components
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.n_components_umap = n_components_umap
        self.model = model
        self.cov_type = cov_type
        self.linkage = linkage

    def fit(self, X, y=None):
        """Reduce, cluster and score X; `y` is ignored. Returns self."""
        # ---- dimensionality reduction ----
        if self.reducer == 'pca':
            X_std = StandardScaler().fit_transform(X)
            X_red = PCA(n_components=self.n_components, random_state=RANDOM_STATE).fit_transform(X_std)
        else:
            X_l2 = normalize(X)
            X_red = umap.UMAP(
                n_neighbors=self.n_neighbors,
                min_dist=self.min_dist,
                n_components=self.n_components_umap,
                metric='cosine',
                random_state=RANDOM_STATE
            ).fit_transform(X_l2)

        # ---- clustering ----
        if self.model == 'kmeans':
            X_use = normalize(X_red)
            labels = KMeans(n_clusters=3, n_init=10, algorithm='elkan',
                            random_state=RANDOM_STATE).fit_predict(X_use)
            self.metric = "cosine"
        elif self.model == 'gmm':
            X_use = np.asarray(X_red, dtype=np.float64)
            try:
                labels = GaussianMixture(
                    n_components=3, n_init=10, random_state=RANDOM_STATE,
                    covariance_type=self.cov_type
                ).fit_predict(X_use)
            except ValueError:
                # Retry once with a larger covariance regulariser.
                labels = GaussianMixture(
                    n_components=3, n_init=10, random_state=RANDOM_STATE,
                    covariance_type=self.cov_type, reg_covar=1e-5
                ).fit_predict(X_use)
            self.metric = "euclidean"
        else:  # Agglomerative
            X_use = normalize(X_red)
            labels = AgglomerativeClustering(
                n_clusters=3, linkage=self.linkage, metric='cosine'
            ).fit_predict(X_use)
            self.metric = "cosine"

        self.labels_ = labels
        self.X_use_ = X_use
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1; report a
        # degenerate clustering as the worst score so labels_ stay available.
        n_found = np.unique(labels).size
        if 2 <= n_found <= len(labels) - 1:
            self.score_ = silhouette_score(X_use, labels, metric=self.metric)
        else:
            self.score_ = -1.0
        return self

    def score(self, X, y=None):
        """Return the silhouette cached by fit(); `X` and `y` are not used."""
        return self.score_


# ==========================================================
# 1️⃣  Extract best per-branch on 20% subset
# ==========================================================
def _params_compact(d):
    order = ["reducer","model","n_components",
             "n_neighbors","min_dist","n_components_umap",
             "cov_type","linkage"]
    return ", ".join(f"{k}={d[k]}" for k in order if k in d and pd.notnull(d[k]))

# Tabulate every evaluated configuration; one "param_<name>" column per
# hyper-parameter plus the scoring columns.
cv = pd.DataFrame(search.cv_results_)
param_cols = [c for c in cv.columns if c.startswith("param_")]
score_col  = "mean_test_score"  # silhouette

# Ensure plain Python types (anything else is stringified)
for c in param_cols:
    cv[c] = cv[c].apply(lambda x: x if isinstance(x, (str,int,float,type(None))) else str(x))

# A "branch" is a reducer+model combination; keep the top-scoring row of each.
cv["branch"] = cv.apply(lambda r: f"{r.get('param_reducer','?')}+{r.get('param_model','?')}", axis=1)
best_idx = cv.groupby("branch")[score_col].idxmax()
best_rows = cv.loc[best_idx].reset_index(drop=True)

subset_records = []
for _, r in best_rows.iterrows():
    # Strip only the leading "param_" prefix; str.replace would also remove
    # the substring if it appeared again inside a parameter name.
    params = {p[len("param_"):]: r[p] for p in param_cols if pd.notnull(r[p])}
    subset_records.append({
        "branch": r["branch"],
        "best_sil_subset": round(float(r[score_col]), 4),
        **params
    })

df_best_subset = pd.DataFrame(subset_records).sort_values("branch")

# Null parameters are dropped from each record above, so a column exists in
# df_best_subset only if at least one winning row used it. reindex() shows
# such absent columns as NaN instead of raising KeyError.
display_cols = [
    "branch","best_sil_subset","reducer","model","n_components",
    "n_neighbors","min_dist","n_components_umap","cov_type","linkage"
]

print("\n================ Best Parameters per Branch (20% subset) ================\n")
print(df_best_subset.reindex(columns=display_cols).to_string(index=False))


# ==========================================================
# 2️⃣  Refit each best branch on FULL data + compute metrics
# ==========================================================
# Parameters forwarded to ClusteringPipeline, and the ones that must be ints.
_REFIT_KEYS = ("reducer", "model", "n_components", "n_neighbors", "min_dist",
               "n_components_umap", "cov_type", "linkage")
_INT_KEYS = ("n_components", "n_neighbors", "n_components_umap")

final_rows = []
for _, row in df_best_subset.iterrows():
    # Rebuild the constructor kwargs from the non-null cells of this row,
    # restoring numeric types that the DataFrame round trip may have changed.
    params = {}
    for k in _REFIT_KEYS:
        if k not in row or pd.isnull(row[k]):
            continue
        val = row[k]
        if k in _INT_KEYS:
            val = int(val)
        elif k == "min_dist":
            val = float(val)
        params[k] = val

    mdl = ClusteringPipeline(**params)
    mdl.fit(X_glove_full)
    labels = mdl.labels_
    metric = getattr(mdl, "metric", "cosine")
    sil, ari, nmi, acc = evaluate_full(mdl.X_use_, y_full, labels, metric)

    # Assemble the report row in display order, metrics rounded to 3 places.
    record = {"branch": row["branch"]}
    for column, value in (("Silhouette", sil), ("ARI", ari),
                          ("NMI", nmi), ("Hungarian", acc)):
        record[column] = round(value, 3)
    record["Params"] = _params_compact(params)
    final_rows.append(record)

df_final = pd.DataFrame(final_rows).sort_values("branch")

report_columns = ["branch","Silhouette","ARI","NMI","Hungarian","Params"]
print("\n================ Final Evaluation on Full Minimal-Cleaned Data (All 6 best models) ================\n")
print(df_final[report_columns].to_string(index=False))


# ==========================================================
# 3️⃣  Quick metric winners
# ==========================================================
for metric in ("Silhouette", "ARI", "NMI", "Hungarian"):
    # idxmax() yields an index label, so the row is fetched with .loc
    top_label = df_final[metric].idxmax()
    r = df_final.loc[top_label]
    print(f"\nWinner by {metric}: {r['branch']} | {metric}={r[metric]:.3f} | {r['Params']}")


In [None]:
# ===========================
# UMAP viz of best 6 branches
# ===========================
import numpy as np
import matplotlib.pyplot as plt
import umap
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

# Seed passed as random_state to the PCA, UMAP, KMeans and GaussianMixture
# calls in this cell.
RANDOM_STATE = 42
# Number of clusters (KMeans / Agglomerative) and mixture components (GMM).
N_CLUSTERS = 3  # airline sentiment: neg / neu / pos

def fit_reduce(X, reducer, params):
    """Apply the dimensionality reduction named by ``reducer`` to ``X``.

    * ``"pca"``  - standardise the features, then project with PCA onto
      ``params["n_components"]`` components.
    * ``"umap"`` - pass ``X`` through ``normalize``, then embed with
      cosine-metric UMAP configured from ``params["n_neighbors"]``,
      ``params["min_dist"]`` and ``params["n_components_umap"]``.
    * anything else - return ``X`` unchanged.
    """
    if reducer == "pca":
        standardized = StandardScaler().fit_transform(X)
        projector = PCA(n_components=params["n_components"],
                        random_state=RANDOM_STATE)
        return projector.fit_transform(standardized)

    if reducer == "umap":
        unit_rows = normalize(X)  # cosine-friendly
        embedder = umap.UMAP(
            n_neighbors=params["n_neighbors"],
            min_dist=params["min_dist"],
            n_components=params["n_components_umap"],
            metric="cosine",
            random_state=RANDOM_STATE,
        )
        return embedder.fit_transform(unit_rows)

    # Unrecognised reducer: no reduction is applied.
    return X

def fit_cluster(X_red, model, params):
    """Cluster the reduced features into ``N_CLUSTERS`` groups.

    * ``"kmeans"`` - KMeans on ``normalize(X_red)``.
    * ``"gmm"``    - GaussianMixture on ``X_red`` cast to float64, using
      ``params["cov_type"]``; if fitting raises ``ValueError`` it is retried
      once with ``reg_covar=1e-5``.
    * ``"agglo"``  - AgglomerativeClustering with cosine metric and
      ``params["linkage"]`` on ``normalize(X_red)``.

    Returns the label array from ``fit_predict``.
    Raises ``ValueError`` for any other ``model``.
    """
    if model == "kmeans":
        unit_rows = normalize(X_red)
        km = KMeans(n_clusters=N_CLUSTERS, n_init=10, algorithm="elkan",
                    random_state=RANDOM_STATE)
        return km.fit_predict(unit_rows)

    if model == "gmm":
        data = np.asarray(X_red, dtype=np.float64)
        try:
            mixture = GaussianMixture(
                n_components=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE,
                covariance_type=params["cov_type"],
            )
            return mixture.fit_predict(data)
        except ValueError:
            # Second attempt with an explicit covariance regularisation term.
            mixture = GaussianMixture(
                n_components=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE,
                covariance_type=params["cov_type"], reg_covar=1e-5,
            )
            return mixture.fit_predict(data)

    if model == "agglo":
        unit_rows = normalize(X_red)
        agglo = AgglomerativeClustering(
            n_clusters=N_CLUSTERS, linkage=params["linkage"], metric="cosine"
        )
        return agglo.fit_predict(unit_rows)

    raise ValueError("Unknown model")

# Hard-coded settings per branch, keyed "<reducer>+<model>". Each entry holds
# the values that fit_reduce / fit_cluster look up for that branch.
models_params = {
    "pca+agglo": {
        "reducer": "pca", "model": "agglo",
        "n_components": 27, "linkage": "complete",
    },
    "pca+gmm": {
        "reducer": "pca", "model": "gmm",
        "n_components": 27, "cov_type": "spherical",
    },
    "pca+kmeans": {
        "reducer": "pca", "model": "kmeans",
        "n_components": 27,
    },
    "umap+agglo": {
        "reducer": "umap", "model": "agglo",
        "n_neighbors": 200, "min_dist": 0.1, "n_components_umap": 16,
        "linkage": "average",
    },
    "umap+gmm": {
        "reducer": "umap", "model": "gmm",
        "n_neighbors": 200, "min_dist": 0.1, "n_components_umap": 48,
        "cov_type": "spherical",
    },
    "umap+kmeans": {
        "reducer": "umap", "model": "kmeans",
        "n_neighbors": 150, "min_dist": 0.1, "n_components_umap": 16,
    },
}

# One panel per branch: reduce, cluster, then project the branch's own reduced
# features to 2D with a fresh UMAP purely for plotting.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.flatten()

for i, (name, prm) in enumerate(models_params.items()):
    # Branch-specific feature space and cluster assignment
    X_red = fit_reduce(X_hybrid_full, prm["reducer"], prm)
    labels = fit_cluster(X_red, prm["model"], prm)

    # 2D embedding used only for display
    umap_2d = umap.UMAP(
        n_neighbors=50,
        min_dist=0.1,
        n_components=2,
        metric="euclidean",
        random_state=RANDOM_STATE,
    )
    X_vis = umap_2d.fit_transform(X_red)

    # Points coloured by cluster label; axis ticks hidden
    ax = axes[i]
    sc = ax.scatter(X_vis[:, 0], X_vis[:, 1], c=labels, s=5, cmap="Spectral")
    ax.set_title(name, fontsize=12)
    ax.set_xticks([])
    ax.set_yticks([])

plt.suptitle(
    "UMAP Visualization — Best Models on FULL BERTweet Hybrid Features",
    fontsize=16,
    fontweight="bold",
)
plt.tight_layout()
plt.show()
