In [9]:
# --- Install only what's needed, WITHOUT upgrading pandas/numpy ---
!pip -q install transformers accelerate sentencepiece tqdm

import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
from transformers import pipeline

# =========================
# 0) Load the input CSV
# =========================
CSV_PATH = "7_Topics_data.csv"

df = pd.read_csv(CSV_PATH)

# Print the header row so the chosen text column can be checked by eye.
print("Columns in CSV:")
print(list(df.columns))

# -------------------------
# Choose the text column
# -------------------------
# Lookup from normalized header (stripped, lower-cased) to the header exactly
# as it appears in the CSV.
col_map = {header.strip().lower(): header for header in df.columns}

# Preference order: "text", then "title". The for/else falls through to the
# first column (with a warning) only when neither header exists.
for _wanted in ("text", "title"):
    if _wanted in col_map:
        TEXT_COL = col_map[_wanted]
        break
else:
    TEXT_COL = df.columns[0]  # fallback
    print(f"WARNING: No 'Text' or 'Title' column found. Using first column: {TEXT_COL}")

print("Using TEXT_COL =", TEXT_COL)

# Missing values become empty strings and every value is cast to str before
# the column is written back and exported as a plain list.
_text_values = df[TEXT_COL].fillna("").astype(str)
df[TEXT_COL] = _text_values
texts = _text_values.tolist()

# =========================
# 1) Rules
# =========================
# Question types handled by the rule layer; score vectors follow this order.
CLASSES = ["why", "what", "how"]

# Keyword patterns per question type, one pattern per line.
# Some phrases are intentionally listed alongside a broader pattern
# (e.g. r"\bwhy\b" and the "why is/are/..." form), so one text can hit both.
RULES = {
    "why": [
        r"\bwhy\b",
        r"\bwhy (is|are|does|do|did|would|should|can|could)\b",
        r"\bbecause\b",
        r"\bdue to\b",
        r"\breason(s)?\b",
        r"\brationale\b",
        r"\bcause(s|d|ing)?\b",
        r"\broot cause\b",
        r"\bwhat causes\b",
        r"\bwhat is causing\b",
        r"\bexplain why\b",
    ],
    "what": [
        r"\bwhat\b",
        r"\bwhat('s| is| are| does| do)\b",
        r"\bmeaning\b",
        r"\bdefinition\b",
        r"\bdefine\b",
        r"\bwhat does .* mean\b",
        r"\bdescribe\b",
        r"\boverview\b",
        r"\bdifference between\b",
        r"\bcompare\b",
        r"\bcomparison\b",
        r"\bvs\.?\b",
        r"\bwhich\b|\bwho\b|\bwhere\b|\bwhen\b",
    ],
    "how": [
        r"\bhow\b",
        r"\bhow (to|do i|can i|can we|should i|could i)\b",
        r"\bstep(s)?\b",
        r"\bguide\b",
        r"\btutorial\b",
        r"\bworkflow\b",
        r"\binstall\b",
        r"\bsetup\b",
        r"\bset up\b",
        r"\bconfigure\b",
        r"\brun\b",
        r"\bbuild\b",
        r"\bimplement\b",
        r"\buse\b",
        r"\bfix\b",
        r"\bsolve\b",
        r"\bdebug\b",
        r"\btroubleshoot\b",
        r"\berror\b",
        r"\bexception\b",
        r"\btraceback\b",
        r"\bstack trace\b",
        r"\bfailed\b",
        r"\bnot working\b",
        r"\bdoesn't work\b",
        r"\bcannot\b",
        r"\bunable\b",
        r"\bbug\b",
        r"\bissue\b",
        r"\bproblem\b",
    ],
}


def _compile_all(patterns):
    """Compile each pattern string case-insensitively, keeping list order."""
    return [re.compile(pattern, flags=re.IGNORECASE) for pattern in patterns]


# Same keys and ordering as RULES, holding compiled regex objects.
RULE_REGEX = {label: _compile_all(patterns) for label, patterns in RULES.items()}

# Optional leading whitespace and opening punctuation/quotes/dashes that may
# precede the first word of a text; shared by the two start-of-text patterns.
_LEADING_NOISE = r"^\s*[\(\[\{\'\"\-–—:]*\s*"

# Text that begins with one of the question words themselves.
START_QWORD = re.compile(_LEADING_NOISE + r"(why|what|how)\b", flags=re.IGNORECASE)

# Text that begins with an auxiliary/modal verb ("can ...", "does ...", ...).
AUX_START = re.compile(
    _LEADING_NOISE
    + r"(can|could|would|should|is|are|am|do|does|did|will|may|might|has|have|had)\b",
    flags=re.IGNORECASE,
)

def rule_scores(text: str) -> np.ndarray:
    """Score a text against the keyword rules for each question type.

    Args:
        text: The text to score. A falsy value is treated as an empty string.

    Returns:
        A float array of length 3 with one raw score per entry of ``CLASSES``
        (same order). Empty or whitespace-only input yields all zeros.
    """
    scores = np.zeros(3, dtype=float)
    cleaned = (text or "").strip()
    if not cleaned:
        return scores

    # A question word opening the text is the strongest single signal.
    opener = START_QWORD.search(cleaned)
    if opener is not None:
        scores[CLASSES.index(opener.group(1).lower())] += 6.0

    # Every keyword pattern that matches anywhere contributes 2 points.
    for position, label in enumerate(CLASSES):
        hits = sum(1 for pattern in RULE_REGEX[label] if pattern.search(cleaned))
        scores[position] += 2.0 * hits

    # Question-like phrasing nudges all classes equally.
    question_like = "?" in cleaned or AUX_START.search(cleaned)
    if question_like:
        scores += 0.5

    # Troubleshooting vocabulary gets an extra push towards "how".
    trouble = re.search(
        r"\b(error|exception|traceback|bug|issue|problem|not working|cannot|unable|failed)\b",
        cleaned,
        flags=re.IGNORECASE,
    )
    if trouble:
        scores[CLASSES.index("how")] += 2.5

    # "explain why" leans towards "why"; a bare "explain" leans mildly
    # towards "what".
    if re.search(r"\bexplain why\b", cleaned, flags=re.IGNORECASE):
        scores[CLASSES.index("why")] += 2.0
    elif re.search(r"\bexplain\b", cleaned, flags=re.IGNORECASE):
        scores[CLASSES.index("what")] += 0.8

    return scores

def rule_predict(text: str, strong_thresh=0.45, margin=0.10):
    """Return a rule-based label only when the rules are decisive.

    Args:
        text: The text to classify.
        strong_thresh: Minimum normalized top score required to accept.
        margin: Minimum gap between the top and second-highest normalized
            scores required to accept.

    Returns:
        A ``(label, normalized_scores)`` tuple. ``label`` is an entry of
        ``CLASSES`` when both conditions hold, otherwise ``None``.
        ``normalized_scores`` is the raw score vector clipped to [0, 18]
        and divided by 18.
    """
    scaled = np.clip(rule_scores(text), 0, 18) / 18.0
    winner = scaled.argmax()
    best_score = scaled.max()
    # Second-largest entry of the score vector (0.0 if there is only one).
    runner_up = np.sort(scaled)[-2] if len(scaled) >= 2 else 0.0

    gap = best_score - runner_up
    decisive = (best_score >= strong_thresh) and (gap >= margin)
    label = CLASSES[winner] if decisive else None
    return label, scaled

# =========================
# 2) Zero-shot NLI model (public checkpoint, no token needed)
# =========================
MODEL_NAME = "facebook/bart-large-mnli"  # safe default

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
_cuda_ok = torch.cuda.is_available()
device = 0 if _cuda_ok else -1
print("CUDA available:", _cuda_ok, "| device:", "cpu" if device == -1 else "gpu")

zsc = pipeline("zero-shot-classification", model=MODEL_NAME, device=device)

# Each label ends with its short class name in parentheses; zshot_classify
# looks for that short name inside the winning label.
CANDIDATE_LABELS = [
    "instructions or troubleshooting (how)",
    "definition or explanation (what)",
    "reason or cause (why)",
]

# "{}" is the slot where the pipeline substitutes each candidate label.
HYPOTHESIS_TEMPLATE = "This question is asking for {}."

def zshot_classify(batch_texts, min_conf=0.30, margin=0.03):
    """Classify texts as how/what/why/other with the zero-shot pipeline.

    Args:
        batch_texts: Iterable of texts. ``None`` entries and entries that are
            empty after stripping are not sent to the model.
        min_conf: Minimum top score required to keep the model's label.
        margin: Minimum gap between the top two scores required to keep the
            model's label.

    Returns:
        ``(preds, confs)``: two lists aligned with ``batch_texts``. ``preds``
        holds "how", "what", "why" or "other"; ``confs`` holds the top score
        reported by the pipeline (also when the label was demoted to
        "other"). Skipped entries get "other" and 0.0.
    """
    cleaned = ["" if item is None else str(item).strip() for item in batch_texts]
    preds = ["other" for _ in cleaned]
    confs = [0.0 for _ in cleaned]

    # Positions of the entries that actually go to the model.
    kept_positions = [pos for pos, item in enumerate(cleaned) if len(item) > 0]
    if not kept_positions:
        return preds, confs

    # Candidate labels are passed positionally.
    # NOTE(review): no batch_size is passed to the pipeline call here.
    results = zsc(
        [cleaned[pos] for pos in kept_positions],
        CANDIDATE_LABELS,
        hypothesis_template=HYPOTHESIS_TEMPLATE,
        multi_label=False,
    )

    # A single input may come back as one dict instead of a list of dicts.
    if isinstance(results, dict):
        results = [results]

    for order, pos in enumerate(kept_positions):
        result = results[order]
        ranked_labels = result["labels"]
        ranked_scores = result["scores"]

        # The first entry is treated as the top-ranked label/score.
        top_label = ranked_labels[0]
        top_score = float(ranked_scores[0])
        runner_up = float(ranked_scores[1]) if len(ranked_scores) > 1 else 0.0

        too_weak = top_score < min_conf
        too_close = (top_score - runner_up) < margin
        if too_weak or too_close:
            label = "other"
        elif "how" in top_label:
            label = "how"
        elif "what" in top_label:
            label = "what"
        else:
            label = "why"

        preds[pos] = label
        confs[pos] = top_score

    return preds, confs


# =========================
# 3) Hybrid decision: rules first, zero-shot for the rest
# =========================
# Chunk size used when slicing the undecided texts for zshot_classify.
BATCH_SIZE = 64 if device != -1 else 16

# Rule pass: a label where the rules are decisive, None otherwise.
final_preds = [
    rule_predict(text, strong_thresh=0.45, margin=0.10)[0] for text in texts
]

# Rows the rules could not decide, with their original positions.
needs_idx = [row for row, label in enumerate(final_preds) if label is None]
needs_texts = [texts[row] for row in needs_idx]

print("Total rows:", len(texts))
print("Sent to zero-shot:", len(needs_texts))

# Zero-shot pass over the undecided rows, one slice at a time.
z_preds_all = []
_batch_starts = range(0, len(needs_texts), BATCH_SIZE)
for _start in tqdm(_batch_starts, desc="Zero-shot batches"):
    _batch_preds, _ = zshot_classify(
        needs_texts[_start:_start + BATCH_SIZE], min_conf=0.30, margin=0.03
    )
    z_preds_all += _batch_preds

# zshot_classify returns one prediction per input, so the two lists line up.
for _row, _label in zip(needs_idx, z_preds_all):
    final_preds[_row] = _label

df["question_type"] = final_preds

# =========================
# 4) Summary + export
# =========================
# Fixed display order; categories with no rows are shown as 0.
_category_order = ["how", "what", "why", "other"]
_observed = df["question_type"].value_counts()
counts = _observed.reindex(_category_order).fillna(0).astype(int)

print("\nCounts by category:")
print(counts.to_string())

_other_pct = round(100 * counts["other"] / len(df), 2)
print("\nOther %:", _other_pct, "%")

OUT_PATH = "/content/labeled_T7.csv"
df.to_csv(OUT_PATH, index=False)
print("\nSaved:", OUT_PATH)

# Show up to 20 rows that ended up as "other" for manual inspection.
print("\nSample 'other':")
_is_other = df["question_type"] == "other"
display(df.loc[_is_other, [TEXT_COL, "question_type"]].head(20))


Columns in CSV:
['Unnamed: 0', 'Id', 'Title', 'Text', 'OriginalText', 'Document', 'Topic', 'Name', 'Representation', 'Representative_Docs', 'Top_n_words', 'Probability', 'Representative_document']
Using TEXT_COL = Text
CUDA available: True | device: gpu


Device set to use cuda:0


Total rows: 15
Sent to zero-shot: 14


Zero-shot batches:   0%|          | 0/1 [00:00<?, ?it/s]


Counts by category:
question_type
how      3
what     8
why      2
other    2

Other %: 13.33 %

Saved: /content/labeled_T7.csv

Sample 'other':


Unnamed: 0,Text,question_type
6,way simulate communication cost tensorflow - f...,other
12,TFF support deployment across different device...,other
