In [None]:
from __future__ import annotations

from dataclasses import dataclass
from typing import Optional, Tuple, Dict, Any, List

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

try:
    import hdbscan
    HAS_HDBSCAN = True
except Exception:
    HAS_HDBSCAN = False

from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer


# -----------------------------
# Utilities
# -----------------------------

def batched(iterable, batch_size: int):
    """Yield ``(start, chunk)`` pairs that cover *iterable* in order.

    *iterable* must support ``len()`` and slicing (e.g. a list or a string).
    ``start`` is the offset of the chunk's first element; every chunk has
    ``batch_size`` elements except possibly the last one.
    """
    offsets = range(0, len(iterable), batch_size)
    yield from ((offset, iterable[offset:offset + batch_size]) for offset in offsets)

def safe_text(x: Any) -> str:
    """Convert an arbitrary cell value to a stripped string.

    Missing values (``None``, float NaN, ``pd.NA``, ``pd.NaT``) become the
    empty string so that callers can drop them with a simple length check;
    without this, ``str(float('nan'))`` would yield the literal text ``"nan"``.
    Every other value is passed through ``str()`` and stripped of surrounding
    whitespace.
    """
    if x is None or x is pd.NA or x is pd.NaT:
        return ""
    # NaN is the only float that is not equal to itself (covers np.float64 too).
    if isinstance(x, float) and x != x:
        return ""
    return str(x).strip()

def l2norm(x: np.ndarray) -> np.ndarray:
    """Return *x* with every row rescaled to unit L2 norm.

    Thin wrapper around scikit-learn's ``normalize`` applied along ``axis=1``.
    """
    row_normalized = normalize(x, norm="l2", axis=1)
    return row_normalized


# -----------------------------
# c-TF-IDF style keyword labeling
# -----------------------------
def c_tf_idf_keywords(
    texts: List[str],
    labels: np.ndarray,
    top_n: int = 10,
    min_df: int = 3,
    max_df: float = 0.90,
    stop_words: str = "english",
) -> Dict[int, List[Tuple[str, float]]]:
    """
    Compute cluster keywords with a c-TF-IDF-like approach:
    - Aggregate documents per cluster (noise label -1 is ignored)
    - CountVectorizer over the per-cluster documents (uni- and bi-grams)
    - cTFIDF = (TF normalized within cluster) * IDF across clusters

    Args:
        texts: One text per chunk.
        labels: Cluster label per chunk, aligned with ``texts``.
        top_n: Maximum number of keywords returned per cluster.
        min_df / max_df: Passed to ``CountVectorizer``. Note that a "document"
            here is a whole cluster, so these thresholds count clusters.
        stop_words: Passed to ``CountVectorizer``.

    Returns:
        Mapping ``cluster_id -> [(term, score), ...]`` sorted by descending
        score. Only terms with a positive score are kept, so a list may be
        shorter than ``top_n`` or empty.

    If the requested ``min_df``/``max_df`` pruning leaves no vocabulary (which
    always happens with very few clusters), the vocabulary is rebuilt without
    pruning instead of raising. If even that yields no terms, every cluster
    gets an empty keyword list.
    """
    df = pd.DataFrame({"text": texts, "label": labels})
    # filter noise label -1
    df = df[df["label"] >= 0]
    if df.empty:
        return {}

    grouped = df.groupby("label")["text"].apply(lambda xs: " ".join(xs)).reset_index()
    cluster_docs = grouped["text"].tolist()
    cluster_ids = [int(cid) for cid in grouped["label"].tolist()]
    n_clusters = len(cluster_docs)

    def _fit(min_df_value, max_df_value):
        # Build a fresh vectorizer so a failed fit leaves no partial state behind.
        vec = CountVectorizer(
            stop_words=stop_words,
            min_df=min_df_value,
            max_df=max_df_value,
            ngram_range=(1, 2),
        )
        return vec, vec.fit_transform(cluster_docs)

    try:
        vectorizer, X = _fit(min_df, max_df)
    except ValueError:
        # Pruning removed every term (or min_df/max_df are incompatible for
        # this number of clusters): retry without document-frequency pruning.
        try:
            vectorizer, X = _fit(1, 1.0)
        except ValueError:
            return {cid: [] for cid in cluster_ids}

    vocab = np.array(vectorizer.get_feature_names_out())

    # Keep everything in CSR so rows stay indexable; multiplying a sparse
    # matrix by a dense vector would hand back a COO matrix, which cannot be
    # sliced by row.
    counts = X.astype(np.float64).tocsr()

    # TF: normalize term counts by total terms in cluster (guard empty clusters)
    row_totals = np.asarray(counts.sum(axis=1)).ravel()
    inv_totals = 1.0 / np.clip(row_totals, 1.0, None)

    # IDF across clusters (smoothed)
    df_term = np.asarray((counts > 0).sum(axis=0)).ravel()
    idf = np.log((n_clusters + 1) / (df_term + 1)) + 1.0

    out: Dict[int, List[Tuple[str, float]]] = {}
    for row_idx, cid in enumerate(cluster_ids):
        # Densify one cluster row at a time; the number of clusters is small.
        row = counts[row_idx].toarray().ravel() * inv_totals[row_idx] * idf
        if row.size == 0 or row.max() <= 0:
            out[cid] = []
            continue
        top_idx = np.argsort(-row)[:top_n]
        out[cid] = [(str(vocab[i]), float(row[i])) for i in top_idx if row[i] > 0]
    return out


# -----------------------------
# Data structures
# -----------------------------
@dataclass
class CanonicalTheme:
    """One canonical theme: a cluster of chunk embeddings plus its keyword label."""

    theme_id: int  # cluster label produced by the clustering step
    size: int  # number of chunks assigned to this cluster
    centroid: np.ndarray  # shape (dim,)
    keywords: List[Tuple[str, float]]  # (term, score) pairs, as returned by c_tf_idf_keywords
    label: str  # human-readable label string for the theme

@dataclass
class CanonicalThemeSet:
    """All canonical themes of one sector, plus the embedding model metadata."""

    sector: str
    model_name: str
    embedding_dim: int
    themes: List[CanonicalTheme]

    def to_dataframe(self) -> pd.DataFrame:
        """Flatten the theme set into one DataFrame row per theme.

        Keywords and their scores are split into two parallel list columns;
        the centroid is stored as a float32 array in a single cell.
        """
        records = [
            {
                "sector": self.sector,
                "theme_id": theme.theme_id,
                "size": theme.size,
                "label": theme.label,
                "keywords": [term for term, _score in theme.keywords],
                "keyword_scores": [score for _term, score in theme.keywords],
                "centroid": theme.centroid.astype(np.float32),
            }
            for theme in self.themes
        ]
        return pd.DataFrame(records)


# -----------------------------
# Canonical theme builder
# -----------------------------
class CanonicalThemeBuilder:
    """Embed text chunks, cluster them, and derive sector-level canonical themes."""

    def __init__(
        self,
        model_name: str = "BAAI/bge-large-en-v1.5",
        device: Optional[str] = None,
        batch_size: int = 64,
    ):
        """Load the sentence-embedding model.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            device: Device passed to ``SentenceTransformer`` (``None`` lets the
                library choose).
            batch_size: Number of texts encoded per call in ``embed_texts``.
        """
        self.model_name = model_name
        self.batch_size = batch_size
        self.model = SentenceTransformer(model_name, device=device)

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Encode *texts* in batches and return one normalized embedding per row.

        An empty input yields an empty ``(0, dim)`` array instead of failing
        inside ``np.vstack``.
        """
        if not texts:
            dim = self.model.get_sentence_embedding_dimension() or 0
            return np.empty((0, dim), dtype=np.float32)

        embs = []
        for start in tqdm(range(0, len(texts), self.batch_size), desc="Embedding"):
            e = self.model.encode(
                texts[start:start + self.batch_size],
                show_progress_bar=False,
                normalize_embeddings=True,  # important for cosine similarity
            )
            embs.append(e)
        return np.vstack(embs)

    def cluster_embeddings(
        self,
        embeddings: np.ndarray,
        method: str = "hdbscan",
        min_cluster_size: int = 40,
        min_samples: int = 10,
        kmeans_k: int = 50,
        random_state: int = 42,
    ) -> np.ndarray:
        """Cluster *embeddings* and return one integer label per row.

        Args:
            embeddings: Array with one embedding per row.
            method: ``"hdbscan"`` or ``"kmeans"``.
            min_cluster_size / min_samples: HDBSCAN parameters.
            kmeans_k: Requested number of k-means clusters; capped at the
                number of rows so small inputs do not raise.
            random_state: Seed for k-means.

        Returns:
            Label array; HDBSCAN marks noise points with ``-1``.

        Raises:
            RuntimeError: ``method="hdbscan"`` but hdbscan is not installed.
            ValueError: Unknown ``method``.
        """
        if method == "hdbscan":
            if not HAS_HDBSCAN:
                raise RuntimeError("hdbscan not installed; either install hdbscan or use method='kmeans'.")
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                metric="euclidean",
                prediction_data=False,
            )
            return clusterer.fit_predict(embeddings)
        if method == "kmeans":
            # KMeans rejects n_clusters > n_samples, so cap the request.
            n_clusters = max(1, min(kmeans_k, len(embeddings)))
            km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init="auto")
            return km.fit_predict(embeddings)
        raise ValueError("method must be 'hdbscan' or 'kmeans'")

    def build(
        self,
        df_chunks: pd.DataFrame, # a data frame for chunks
        sector: str,
        text_col: str = "text",
        sector_col: str = "sector",
        method: str = "hdbscan",
        min_cluster_size: int = 40,
        min_samples: int = 10,
        kmeans_k: int = 50,
        top_keywords: int = 10,
        min_df: int = 3,
        max_df: float = 0.90,
    ) -> Tuple[CanonicalThemeSet, pd.DataFrame]:
        """
        Build the canonical themes of one sector.

        Rows of ``df_chunks`` whose ``sector_col`` equals ``sector`` and whose
        text is non-empty after ``safe_text`` are embedded, clustered with
        ``method`` and labelled with c-TF-IDF keywords.

        Returns:
          - CanonicalThemeSet: sector-level canonical themes
          - df_out: original chunks with assigned theme_id (cluster label), including noise (-1).
            Rows from other sectors, and sector rows with empty text, keep NaN.

        Raises:
          ValueError: the sector has no rows with non-empty text.
        """
        # Work with positions into df_chunks so labels can be written back to
        # exactly the rows that were clustered, whatever the frame's index is.
        sector_mask = (df_chunks[sector_col] == sector).fillna(False).to_numpy(dtype=bool)
        if not sector_mask.any():
            raise ValueError(f"No rows found for sector {sector!r} in column {sector_col!r}.")

        cleaned = df_chunks[text_col][sector_mask].map(safe_text)
        nonempty = (cleaned.str.len() > 0).to_numpy(dtype=bool)
        kept_pos = np.flatnonzero(sector_mask)[nonempty]
        texts = cleaned[nonempty].tolist()
        if not texts:
            raise ValueError(f"No non-empty {text_col!r} values found for sector {sector!r}.")

        embs = self.embed_texts(texts)  # one embedding per kept chunk

        # Cluster the embeddings into themes with the requested method
        # (HDBSCAN by default, k-means as the alternative).
        labels = np.asarray(
            self.cluster_embeddings(
                embs,
                method=method,
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                kmeans_k=kmeans_k,
            )
        )

        # keywords / labels (c-TF-IDF); these are raw keyword lists, not
        # polished human-readable names.
        kw = c_tf_idf_keywords(
            texts=texts,
            labels=labels,
            top_n=top_keywords,
            min_df=min_df,
            max_df=max_df,
            stop_words="english",
        )

        themes: List[CanonicalTheme] = []
        for raw_id in np.unique(labels):  # sorted ascending
            theme_id = int(raw_id)
            if theme_id < 0:
                continue  # HDBSCAN noise is not a theme
            idx = np.where(labels == theme_id)[0]
            # Centroid = mean embedding of the cluster, re-normalized to unit length.
            centroid = embs[idx].mean(axis=0)
            centroid = centroid / (np.linalg.norm(centroid) + 1e-12)

            keywords = kw.get(theme_id, [])
            label_str = ", ".join([k for k, _ in keywords[:5]]) if keywords else f"theme_{theme_id}"
            themes.append(
                CanonicalTheme(
                    theme_id=theme_id,
                    size=int(len(idx)),
                    centroid=centroid.astype(np.float32),
                    keywords=keywords,
                    label=label_str,
                )
            )

        theme_set = CanonicalThemeSet(
            sector=sector,
            model_name=self.model_name,
            embedding_dim=int(embs.shape[1]),
            themes=themes,
        )

        # Attach labels only to the rows that were actually clustered; sector
        # rows dropped for empty text must not shift the alignment.
        theme_col = np.full(len(df_chunks), np.nan, dtype=np.float64)
        theme_col[kept_pos] = labels
        df_out = df_chunks.copy()
        df_out["theme_id"] = theme_col

        return theme_set, df_out


def save_theme_set(theme_set: CanonicalThemeSet, path_parquet: str, path_npz: str) -> None:
    """
    Save:
      - Parquet for metadata/keywords/labels
      - NPZ for centroid matrix (keys: sector, theme_ids, centroids, model)

    A theme set without themes (e.g. clustering produced only noise) is saved
    with an empty ``(0, embedding_dim)`` centroid matrix instead of failing.
    """
    df = theme_set.to_dataframe()
    df.to_parquet(path_parquet, index=False)

    # pack centroids
    theme_ids = np.asarray([t.theme_id for t in theme_set.themes], dtype=np.int64)
    if theme_set.themes:
        centroids = np.vstack([t.centroid for t in theme_set.themes]).astype(np.float32)
    else:
        # np.vstack rejects an empty list; keep the matrix 2-D for loaders.
        centroids = np.empty((0, theme_set.embedding_dim), dtype=np.float32)

    np.savez_compressed(
        path_npz,
        sector=theme_set.sector,
        theme_ids=theme_ids,
        centroids=centroids,
        model=theme_set.model_name,
    )
 

In [None]:
import pandas as pd
from canonical_themes import CanonicalThemeBuilder, save_theme_set

# Load every chunk, then build the canonical themes for a single sector.
df = pd.read_parquet("chunks.parquet")  # all chunks
builder = CanonicalThemeBuilder(model_name="BAAI/bge-large-en-v1.5", batch_size=64)

# build() returns the theme set and a copy of the chunks with a theme_id column.
theme_set, df_labeled = builder.build(
    df_chunks=df,
    sector="beauty/care",
    method="hdbscan",          # hdbscan or kmeans
    min_cluster_size=50,
    min_samples=10,
    top_keywords=12,
)

# Save the theme set as two files: a parquet file with the metadata/keywords/labels
# and an npz file with the centroid matrix.
save_theme_set(theme_set, "beauty_care_themes.parquet", "beauty_care_centroids.npz")
# Persist the chunks together with their assigned theme_id.
df_labeled.to_parquet("chunks_with_theme_id.parquet", index=False)


In [None]:
# Inference: assign new chunks to the existing canonical themes
from __future__ import annotations
from typing import Optional, Tuple
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def load_centroids(npz_path: str) -> Tuple[np.ndarray, np.ndarray, str]:
    """Load theme ids, the centroid matrix and the model name from an NPZ file.

    The archive is expected to contain the keys ``theme_ids``, ``centroids``
    and ``model``. Pickle loading is disabled: these entries are plain
    numeric/string arrays, and unpickling data from a file can execute
    arbitrary code.

    Returns:
        ``(theme_ids, centroids, model_name)`` where ``theme_ids`` is an int
        array, ``centroids`` is float32 and ``model_name`` is a ``str``.
    """
    # The context manager closes the underlying zip file handle.
    with np.load(npz_path, allow_pickle=False) as z:
        theme_ids = np.array(z["theme_ids"], dtype=int)
        centroids = np.array(z["centroids"], dtype=np.float32)
        model_name = str(z["model"].item())
    return theme_ids, centroids, model_name

def embed_texts(model: SentenceTransformer, texts: list[str], batch_size: int = 64) -> np.ndarray:
    """Encode *texts* in batches and return one normalized embedding per row.

    An empty input yields an empty ``(0, dim)`` array, where ``dim`` is the
    model's embedding dimension, instead of failing inside ``np.vstack``.
    """
    if not texts:
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    embs = [
        model.encode(texts[i:i + batch_size], show_progress_bar=False, normalize_embeddings=True)
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding new chunks")
    ]
    return np.vstack(embs)

def assign_themes_by_centroid(
    df_new_chunks: pd.DataFrame, # new chunks to be assigned to existing themes
    centroids_npz: str,
    text_col: str = "text",
    batch_size: int = 64,
    sim_threshold: float = 0.35, # chunks whose best similarity is below this are left unassigned (-1)
    device: Optional[str] = None,
) -> pd.DataFrame:
    """
    For each chunk, compute cosine similarity to all theme centroids.
    If max_sim < threshold => theme_id = -1 (new candidate)

    Returns a copy of ``df_new_chunks`` with two added columns:
      - ``theme_id``: id of the most similar theme, or -1
      - ``theme_sim``: similarity to that theme

    If there are no chunks or the centroid file holds no themes, every chunk
    gets ``theme_id = -1`` and ``theme_sim = NaN`` and no model is loaded.
    """
    theme_ids, centroids, model_name = load_centroids(centroids_npz)

    df = df_new_chunks.copy()
    df[text_col] = df[text_col].fillna("").astype(str)

    # Nothing to compare: argmax over an empty axis would raise.
    if df.empty or centroids.shape[0] == 0:
        df["theme_id"] = np.full(len(df), -1, dtype=int)
        df["theme_sim"] = np.full(len(df), np.nan, dtype=np.float32)
        return df

    model = SentenceTransformer(model_name, device=device)
    texts = df[text_col].tolist()

    embs = embed_texts(model, texts, batch_size=batch_size)  # normalized
    # shape (n_chunks, n_themes): similarity of each chunk to each theme centroid
    sims = cosine_similarity(embs, centroids)

    best_idx = sims.argmax(axis=1)
    best_sim = sims.max(axis=1)

    df["theme_id"] = theme_ids[best_idx]
    df["theme_sim"] = best_sim
    df.loc[df["theme_sim"] < sim_threshold, "theme_id"] = -1

    return df


In [None]:
# generate the new themes (for chunks that are not assigned to any existing themes)
from __future__ import annotations
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans

try:
    import hdbscan
    HAS_HDBSCAN = True
except Exception:
    HAS_HDBSCAN = False

from canonical_themes import c_tf_idf_keywords

def propose_new_themes(
    df_unassigned: pd.DataFrame,
    emb_col: str = "embedding", # the column name for embeddings
    text_col: str = "text",
    method: str = "hdbscan",
    min_cluster_size: int = 30,
    min_samples: int = 10,
    kmeans_k: int = 20,
    top_keywords: int = 10,
) -> pd.DataFrame:
    """
    Cluster unassigned chunks into candidate themes.

    Input should contain only chunks with theme_id == -1.
    Requires embeddings (one vector per row in ``emb_col``).

    Returns a copy of the input with two added columns:
      - ``candidate_cluster``: cluster label (HDBSCAN noise is -1)
      - ``candidate_label``: top c-TF-IDF keywords of the cluster; rows whose
        cluster has no keyword entry (e.g. noise) get NaN

    Raises:
      ValueError: ``method`` is neither 'hdbscan' nor 'kmeans'.
      RuntimeError: ``method='hdbscan'`` but hdbscan is not installed.
    """
    # Reject typos up front instead of silently falling back to k-means.
    if method not in ("hdbscan", "kmeans"):
        raise ValueError("method must be 'hdbscan' or 'kmeans'")

    df = df_unassigned.copy()
    if df.empty:
        # Nothing to cluster; np.vstack would fail on an empty sequence.
        df["candidate_cluster"] = np.empty(0, dtype=int)
        df["candidate_label"] = np.empty(0, dtype=object)
        return df

    texts = df[text_col].fillna("").astype(str).tolist()
    embs = np.vstack(df[emb_col].values).astype(np.float32)

    if method == "hdbscan":
        if not HAS_HDBSCAN:
            raise RuntimeError("hdbscan not installed; use method='kmeans' or install hdbscan")
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
        labels = clusterer.fit_predict(embs)
    else:
        # KMeans rejects n_clusters > n_samples, so cap the request.
        n_clusters = max(1, min(kmeans_k, len(embs)))
        km = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
        labels = km.fit_predict(embs)

    kw = c_tf_idf_keywords(texts=texts, labels=labels, top_n=top_keywords)

    df["candidate_cluster"] = labels
    # attach label string: top-5 keywords, or a placeholder when none were found
    label_map = {
        cid: ", ".join([k for k, _ in kws[:5]]) if kws else f"candidate_{cid}"
        for cid, kws in kw.items()
    }
    df["candidate_label"] = df["candidate_cluster"].map(label_map)

    return df




In [None]:
# Use RAG to make the themes understandable/readable for humans.
# Build the FAISS index used for retrieval.
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

def embed_texts(model: SentenceTransformer, texts: list[str], batch_size: int = 64) -> np.ndarray:
    """Encode *texts* with *model* and return the embeddings as a float32 array.

    Batching and the progress bar are delegated to ``model.encode``; the
    embeddings are requested in normalized form.
    """
    encoded = model.encode(
        texts,
        batch_size=batch_size,
        normalize_embeddings=True,
        show_progress_bar=True,
    )
    return np.asarray(encoded, dtype=np.float32)

def build_faiss_index(
    chunks_df: pd.DataFrame,
    model_name: str = "BAAI/bge-large-en-v1.5",
    text_col: str = "text",
    batch_size: int = 64,
):
    """Embed every chunk and build an inner-product FAISS index over them.

    Args:
        chunks_df: Chunk table; row order defines the ids stored in the index.
        model_name: Name passed to ``SentenceTransformer``.
        text_col: Column holding the chunk text.
        batch_size: Batch size forwarded to ``embed_texts``.

    Returns:
        ``(model, index, embs)``: the loaded model, the populated
        ``faiss.IndexFlatIP`` and the embedding matrix that was added to it.
    """
    model = SentenceTransformer(model_name)
    texts = chunks_df[text_col].fillna("").astype(str).tolist()
    # FAISS expects a C-contiguous float32 matrix.
    embs = np.ascontiguousarray(embed_texts(model, texts, batch_size=batch_size), dtype=np.float32)

    dim = embs.shape[1]
    index = faiss.IndexFlatIP(dim)  # inner product == cosine (if normalized)
    index.add(embs)
    return model, index, embs


In [None]:
# find the chunks that are most relevant to each theme (retrieve process)
def retrieve_chunks(
    query: str,
    model: SentenceTransformer,
    index: faiss.IndexFlatIP,
    chunks_df: pd.DataFrame,
    top_k: int = 30,
    source_type: str | None = None,     # "transcript"/"comment"/None
    video_id: str | None = None,
    min_like: int | None = None
) -> pd.DataFrame:
    """Return up to ``top_k`` chunks most similar to *query*, best first.

    The index is assumed to have been built over ``chunks_df`` in row order,
    so the ids it returns are used as row positions. Optional filters
    (``source_type``, ``video_id``, ``min_like``) are applied after the
    search, which therefore over-fetches ``top_k * 10`` candidates.

    Returns:
        The matching rows of ``chunks_df`` with a fresh 0..n-1 index; an empty
        frame if nothing passes the filters.
    """
    # Row-position mask of the chunks that satisfy every requested filter.
    keep = np.ones(len(chunks_df), dtype=bool)
    if source_type is not None:
        keep &= (chunks_df["source_type"] == source_type).to_numpy(dtype=bool)
    if video_id is not None:
        keep &= (chunks_df["video_id"] == video_id).to_numpy(dtype=bool)
    if min_like is not None and "like_count" in chunks_df.columns:
        keep &= (chunks_df["like_count"].fillna(0) >= min_like).to_numpy(dtype=bool)

    if top_k <= 0 or not keep.any():
        return chunks_df.iloc[0:0].copy()

    # IMPORTANT: FAISS index is built on full df order.
    # For filtering, simplest is to search wider then filter down.
    q_emb = np.asarray(model.encode([query], normalize_embeddings=True), dtype=np.float32)

    k = top_k * 10
    ntotal = getattr(index, "ntotal", None)
    if ntotal is not None:
        # Never ask for more neighbours than the index holds.
        k = min(k, int(ntotal))
    if k <= 0:
        return chunks_df.iloc[0:0].copy()

    _scores, neighbours = index.search(q_emb, k)
    hit_pos = np.asarray(neighbours[0], dtype=np.int64)
    # FAISS pads missing results with -1; as a position that would silently
    # select the last row of the frame.
    hit_pos = hit_pos[hit_pos >= 0]
    hit_pos = hit_pos[keep[hit_pos]]

    return chunks_df.iloc[hit_pos[:top_k]].copy().reset_index(drop=True)


In [None]:
# based on the retrieved chunks, we can use LLM to generate the theme labels
import json
from typing import Any, Dict, List

def _compact_evidence_rows(df_hits: pd.DataFrame, max_items: int = 12, max_chars: int = 260) -> List[Dict[str, Any]]:
    rows = []
    for _, r in df_hits.head(max_items).iterrows():
        text = str(r["text"])
        text = text.replace("\n", " ").strip()
        if len(text) > max_chars:
            text = text[:max_chars].rstrip() + "…"

        rows.append({
            "chunk_id": r.get("chunk_id"),
            "video_id": r.get("video_id"),
            "source_type": r.get("source_type"),
            "start_sec": r.get("start_sec", None),
            "end_sec": r.get("end_sec", None),
            "comment_id": r.get("comment_id", None),
            "thread_id": r.get("thread_id", None),
            "like_count": int(r.get("like_count", 0) or 0),
            "snippet": text
        })
    return rows

def _parse_llm_json(raw: Any) -> Any:
    """Parse an LLM reply as JSON, tolerating text around the JSON object."""
    text = raw.strip() if isinstance(raw, str) else ""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # very common: model returns extra text; attempt to extract JSON
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end <= start:
            raise  # no JSON object in the reply; surface the decode error
        return json.loads(text[start:end + 1])


def name_theme_with_rag(
    theme_id: int,
    theme_query: str,  # e.g., generated from keywords or "Summarize the core theme..."
    df_evidence: pd.DataFrame,
    llm_call,          # function(prompt)->str  returns JSON string
) -> Dict[str, Any]:
    """Ask an LLM to name a theme, grounded in retrieved evidence chunks.

    Args:
        theme_id: Id of the theme being named; written into the result.
        theme_query: Context/query text describing the theme.
        df_evidence: Retrieved chunks; compacted into prompt snippets.
        llm_call: Callable taking the prompt and returning the model's reply.

    Returns:
        The parsed JSON object from the reply, with ``theme_id`` forced to the
        value passed in (the model only echoes it and may get it wrong).

    Raises:
        json.JSONDecodeError: The reply contains no parseable JSON.
        ValueError: The reply parsed to something other than a JSON object.
    """
    evidence_rows = _compact_evidence_rows(df_evidence)

    schema_hint = {
        "theme_id": theme_id,
        "theme_name": "Short readable name (3-8 words)",
        "one_line_definition": "One sentence definition",
        "keywords": ["keyword1", "keyword2", "..."],
        "evidence_chunks": [
            {"chunk_id": "…", "why_representative": "…"}
        ]
    }

    prompt = f"""
You are naming a canonical theme for a sector-level taxonomy.
You MUST ground the name in the provided evidence snippets.
Return STRICT JSON only (no markdown, no commentary).

Theme query/context:
{theme_query}

Evidence snippets (each has chunk_id and location metadata):
{json.dumps(evidence_rows, ensure_ascii=False, indent=2)}

Output JSON schema example:
{json.dumps(schema_hint, ensure_ascii=False, indent=2)}

Rules:
- theme_name must be concise and readable.
- evidence_chunks must reference ONLY chunk_id from the evidence list above.
- Provide 5-10 keywords that reflect the theme.
"""

    out = _parse_llm_json(llm_call(prompt))
    if not isinstance(out, dict):
        raise ValueError(f"LLM reply for theme {theme_id} is not a JSON object")

    out["theme_id"] = theme_id
    return out

# pip install openai
from openai import OpenAI

# Module-level client shared by llm_call_openai. No API key is passed here, so
# the client presumably picks up credentials from the environment (confirm).
client = OpenAI()

def llm_call_openai(prompt: str, model: str = "gpt-4.1-mini", temperature: float = 0.2) -> str:
    """Send *prompt* to the chat completions API and return the reply text.

    Args:
        prompt: User message content.
        model: Model name (example default; replace with a model available to you).
        temperature: Sampling temperature.

    Returns:
        The first choice's message content, or ``""`` if the API returned no
        content, so callers always receive a string.
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You output strict JSON only."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
    )
    return resp.choices[0].message.content or ""



In [None]:
# generate the names 
def build_theme_cards(
    theme_ids: list[int],
    theme_queries: dict[int, str],   # theme_id -> query string
    model: SentenceTransformer,
    index: faiss.IndexFlatIP,
    chunks_df: pd.DataFrame,
    llm_call,
    top_k_evidence: int = 30
) -> pd.DataFrame:
    """Build one LLM-named "theme card" per theme id.

    For every id, evidence chunks are retrieved from the whole corpus with the
    theme's query (a generic query is used when none is given), and the LLM is
    asked to name the theme from that evidence. The cards are returned as a
    DataFrame with one row per theme, in the order of ``theme_ids``.
    """

    def _make_card(tid):
        if tid in theme_queries:
            query_text = theme_queries[tid]
        else:
            query_text = f"Identify the central theme and key aspects for theme {tid}."

        # RAG evidence: retrieve from whole sector corpus
        evidence = retrieve_chunks(
            query=query_text,
            model=model,
            index=index,
            chunks_df=chunks_df,
            top_k=top_k_evidence,
            source_type=None,
        )
        return name_theme_with_rag(
            theme_id=tid,
            theme_query=query_text,
            df_evidence=evidence,
            llm_call=llm_call,
        )

    return pd.DataFrame([_make_card(tid) for tid in theme_ids])
