In [10]:
# Full quantum WSD / hybrid classifier - Colab friendly
# Paste this into a new cell after completing installs & runtime restart.

import os
import re
import argparse
from pathlib import Path
from typing import List, Optional, Tuple, Dict, Any

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Optional SBERT
# sentence-transformers is an optional dependency: when the import fails for any
# reason, SBERT_AVAILABLE is set to False so later code can check the flag
# instead of importing again.
try:
    from sentence_transformers import SentenceTransformer
    SBERT_AVAILABLE = True
except Exception:
    SBERT_AVAILABLE = False

# Qiskit imports (try to import AerSimulator; if not available, fall back to Statevector)
# Flags set by this section:
#   QISKIT_AVAILABLE - core Qiskit (QuantumCircuit, transpile, ParameterVector, Statevector) imported
#   AER_AVAILABLE    - an AerSimulator class was imported from either package layout
#   USE_STATEVECTOR  - core Qiskit imported but no AerSimulator was found
QISKIT_AVAILABLE = False
AER_AVAILABLE = False
USE_STATEVECTOR = False
try:
    from qiskit import QuantumCircuit, transpile
    from qiskit.circuit import ParameterVector
    # try AerSimulator (qiskit-aer)
    try:
        # NOTE(review): `Aer` is not referenced in the code visible here; if this
        # import fails the whole qiskit_aer branch is treated as unavailable.
        from qiskit_aer import Aer  # newer package layout
        from qiskit_aer import AerSimulator
        AER_AVAILABLE = True
    except Exception:
        try:
            # older namespace fallback
            from qiskit.providers.aer import AerSimulator
            AER_AVAILABLE = True
        except Exception:
            AER_AVAILABLE = False
    # Statevector fallback
    from qiskit.quantum_info import Statevector
    QISKIT_AVAILABLE = True
    if not AER_AVAILABLE:
        USE_STATEVECTOR = True
except Exception as e:
    # Any failure of the core imports above lands here; the Aer-specific
    # failures are already handled by the inner try/except blocks.
    QISKIT_AVAILABLE = False
    print("[WARN] Qiskit core not available:", e)

# Core Qiskit is mandatory for everything below, so stop the cell early with a
# readable message rather than failing later with a NameError.
if not QISKIT_AVAILABLE:
    raise SystemExit("Qiskit not available in this runtime. Make sure you installed qiskit and restarted the runtime.")

# -------------------------
# Utilities
# -------------------------
def clean_text(s: str) -> str:
    """Normalise a raw text snippet for embedding.

    Steps, in order: newlines/carriage returns become spaces and whitespace
    runs are collapsed; ``http...`` tokens are removed; every character outside
    the allowed set (ASCII letters, digits, whitespace and a small set of
    punctuation/currency symbols) is replaced by a space; remaining runs of two
    or more whitespace characters are collapsed and the result is stripped.

    Args:
        s: Text to clean, or ``None``.

    Returns:
        The cleaned string; ``""`` when ``s`` is ``None``.
    """
    if s is None:
        return ""
    flattened = s.replace("\n", " ").replace("\r", " ")
    cleaned = re.sub(r"\s+", " ", flattened).strip()
    # Applied sequentially: URL removal, disallowed characters, leftover gaps.
    substitutions = (
        (r"http\S+", ""),
        (r"[^A-Za-z0-9\s\.\,\-\$%€£:;()\/\#\@]", " "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def load_reuters_with_labels(base_path: str) -> pd.DataFrame:
    """Load every non-empty ``*.txt`` file under ``base_path`` as a labelled row.

    The label of a document is the name of the directory that directly contains
    it. Files are visited in sorted path order so that the resulting row order
    (and therefore any later ``[:n]`` truncation) is reproducible across
    filesystems.

    Args:
        base_path: Root directory to search recursively.

    Returns:
        A DataFrame with columns ``text`` and ``label``. It is empty (but still
        has both columns) when the path is missing or no readable, non-empty
        text file is found.
    """
    columns = ["text", "label"]
    base = Path(base_path)
    if not base.exists():
        print(f"[WARNING] Reuters path missing: {base_path}")
        return pd.DataFrame(columns=columns)
    rows = []
    # rglob yields entries in filesystem order, which is not stable; sort it.
    for fp in sorted(base.rglob("*.txt")):
        try:
            txt = fp.read_text(encoding="utf-8", errors="ignore").strip()
        except OSError as exc:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"[WARNING] Could not read {fp}: {exc}")
            continue
        if not txt:
            continue
        rows.append({"text": txt, "label": fp.parent.name})
    return pd.DataFrame(rows, columns=columns)

# -------------------------
# Embedding helpers
# -------------------------
def get_sentence_embeddings(texts: List[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
    """Embed ``texts`` as float32 vectors.

    When sentence-transformers was importable, the named SBERT model is loaded
    and used to encode the texts. If SBERT is unavailable, or loading/encoding
    raises, the function falls back to TF-IDF features capped at 512 terms.

    Args:
        texts: Documents to embed.
        model_name: SentenceTransformer model identifier.

    Returns:
        A 2-D float32 array with one row per input text.
    """
    if SBERT_AVAILABLE:
        try:
            encoder = SentenceTransformer(model_name)
            dense = encoder.encode(texts, convert_to_numpy=True, show_progress_bar=True)
            return np.array(dense, dtype=np.float32)
        except Exception as e:
            # Fall through to the TF-IDF path below.
            print("[WARN] SBERT encoding failed:", e)
    vectorizer = TfidfVectorizer(max_features=512)
    return vectorizer.fit_transform(texts).toarray().astype(np.float32)

# -------------------------
# Quantum circuit utilities
# -------------------------
def build_feature_map(n_qubits: int, feature_angles: np.ndarray) -> QuantumCircuit:
    """Build an angle-encoding circuit: one RY rotation per qubit.

    Args:
        n_qubits: Number of qubits in the circuit.
        feature_angles: Indexable sequence with at least ``n_qubits`` entries;
            entry ``i`` is the RY angle applied to qubit ``i``.

    Returns:
        A circuit on ``n_qubits`` qubits holding only the RY rotations.
    """
    rotations = [float(feature_angles[qubit]) for qubit in range(n_qubits)]
    circuit = QuantumCircuit(n_qubits)
    for qubit, angle in enumerate(rotations):
        circuit.ry(angle, qubit)
    return circuit

def build_variational_circuit(n_qubits: int, reps: int = 2) -> Tuple[QuantumCircuit, ParameterVector]:
    """Build the trainable ansatz: per-qubit RY/RZ layers plus a CX ring.

    Each of the ``reps`` repetitions applies a parameterised RY then RZ to
    every qubit, followed by CX gates between neighbouring qubits and one CX
    from the last qubit back to qubit 0.

    Args:
        n_qubits: Number of qubits.
        reps: Number of rotation + entanglement repetitions.

    Returns:
        ``(circuit, params)`` where ``params`` is a ParameterVector named
        ``theta`` of length ``n_qubits * reps * 2``, consumed in the order
        (repetition, qubit, RY-then-RZ).
    """
    param_len = n_qubits * reps * 2
    params = ParameterVector("theta", length=param_len)
    qc = QuantumCircuit(n_qubits)
    idx = 0
    for _ in range(reps):
        for q in range(n_qubits):
            qc.ry(params[idx], q)
            qc.rz(params[idx + 1], q)
            idx += 2
        # entangling ring
        for q in range(n_qubits - 1):
            qc.cx(q, q + 1)
        # Closing the ring needs two distinct qubits: on a single-qubit circuit
        # this would be cx(0, 0), which Qiskit rejects as duplicate arguments.
        if n_qubits > 1:
            qc.cx(n_qubits - 1, 0)
    return qc, params

def expectation_from_counts(counts: Dict[str, int]) -> float:
    """Return the fraction of shots in which the last bit of the key is ``"0"``.

    Qiskit bitstrings place qubit 0 as the rightmost character, so this is the
    estimated probability of measuring qubit 0 in state 0.

    Args:
        counts: Mapping from measured bitstring to number of shots.

    Returns:
        Matching shots divided by the total number of shots; ``0.0`` for an
        empty mapping. Empty-string keys never match but still count towards
        the total.
    """
    shots_total = sum(counts.values()) if len(counts) else 1
    zero_shots = sum(
        (hits for bits, hits in counts.items() if bits and bits[-1] == "0"),
        0.0,
    )
    return zero_shots / shots_total

# -------------------------
# Simulator wrappers: use Aer if available else Statevector
# -------------------------
class SimulatorRunner:
    """Runs a circuit and reports the probability that qubit 0 reads ``0``.

    When an AerSimulator class was importable (``AER_AVAILABLE``), circuits are
    sampled with ``shots`` shots on that backend; otherwise the probability is
    computed exactly from the circuit's statevector.
    """

    def __init__(self, n_qubits: int, shots: int = 512):
        """Store the settings and create the Aer backend if one is available.

        Args:
            n_qubits: Number of qubits of the circuits to be run (stored only).
            shots: Number of shots per run on the Aer backend.
        """
        self.n_qubits = n_qubits
        self.shots = shots
        if not AER_AVAILABLE:
            self.backend = None
            self.use_aer = False
            return
        try:
            # prefer the standalone qiskit_aer package layout
            from qiskit_aer import AerSimulator as _AS
        except Exception:
            from qiskit.providers.aer import AerSimulator as _AS
        self.backend = _AS()
        self.use_aer = True

    def run_and_get_prob0(self, qc: QuantumCircuit) -> float:
        """
        Return the probability of qubit-0 being '0'.

        With Aer the circuit is transpiled, run for ``self.shots`` shots and the
        probability is estimated from the counts. Without Aer, final
        measurements are stripped and the exact probability is read from the
        statevector.
        """
        if self.use_aer:
            compiled = transpile(qc, self.backend)
            counts = self.backend.run(compiled, shots=self.shots).result().get_counts()
            return expectation_from_counts(counts)

        # exact path: work on a measurement-free copy of the circuit
        unmeasured = qc.remove_final_measurements(inplace=False)
        distribution = Statevector.from_instruction(unmeasured).probabilities_dict()
        # keys are bitstrings with qubit-0 as the rightmost character
        zero_mass = sum(
            (p for bits, p in distribution.items() if bits and bits[-1] == '0'),
            0.0,
        )
        return float(zero_mass)

# -------------------------
# Hybrid prediction wrapper
# -------------------------
class QuantumClassifier:
    """Variational quantum classifier: RY feature map + trainable ansatz.

    A sample is encoded as RY angles, passed through the parameterised circuit
    from ``build_variational_circuit`` and measured; the model output is the
    probability that qubit 0 is measured as ``0``.
    """

    def __init__(self, n_qubits: int = 6, reps: int = 2, shots: int = 512, seed: Optional[int] = 42):
        """Create the ansatz template and the simulator runner.

        Args:
            n_qubits: Number of qubits (also the number of input angles used).
            reps: Number of ansatz repetitions.
            shots: Shots per circuit when sampling on Aer.
            seed: Seed for simulator sampling; ``None`` leaves it unseeded.
        """
        self.n_qubits = n_qubits
        self.reps = reps
        self.shots = shots
        self.seed = seed
        self.vqc_template, self.params = build_variational_circuit(n_qubits, reps=reps)
        self.runner = SimulatorRunner(n_qubits, shots=shots)
        if seed is not None:
            # Seed the Aer backend itself: shot sampling is what makes repeated
            # evaluations of the same circuit differ, and a deterministic
            # objective is what the classical optimizer needs.
            if self.runner.use_aer:
                self.runner.backend.set_options(seed_simulator=seed)
            # Legacy global seed; `algorithm_globals` is absent from recent
            # Qiskit releases, so this stays best-effort.
            try:
                from qiskit.utils import algorithm_globals
                algorithm_globals.random_seed = seed
            except Exception:
                pass

    def _construct_circuit(self, angles: np.ndarray, theta_values: np.ndarray) -> QuantumCircuit:
        """Build the fully bound, measured circuit for one sample.

        Args:
            angles: RY encoding angles, at least ``n_qubits`` entries.
            theta_values: Ansatz parameter values, at least ``len(self.params)``
                entries; extra entries are ignored.

        Returns:
            Feature map + ansatz with all parameters bound and every qubit
            measured into the classical bit of the same index.

        Raises:
            ValueError: If ``theta_values`` has fewer entries than parameters.
        """
        if len(theta_values) < len(self.params):
            raise ValueError(
                f"Expected at least {len(self.params)} parameter values, got {len(theta_values)}"
            )
        fmap = build_feature_map(self.n_qubits, angles)
        qc = QuantumCircuit(self.n_qubits, self.n_qubits)
        qc = qc.compose(fmap)
        qc = qc.compose(self.vqc_template)
        # bind params; assign_parameters replaces bind_parameters, which was
        # removed from QuantumCircuit in Qiskit 1.0
        bind_dict = {param: float(value) for param, value in zip(self.params, theta_values)}
        qc = qc.assign_parameters(bind_dict)
        qc.measure(range(self.n_qubits), range(self.n_qubits))
        return qc

    def predict_proba_single(self, angles: np.ndarray, theta_values: np.ndarray) -> float:
        """Return the probability that qubit 0 reads ``0`` for one sample."""
        qc = self._construct_circuit(angles, theta_values)
        return self.runner.run_and_get_prob0(qc)

    def batch_predict_proba(self, angles_batch: np.ndarray, theta_values: np.ndarray) -> np.ndarray:
        """Return ``predict_proba_single`` for every row of ``angles_batch``."""
        return np.array([self.predict_proba_single(angles, theta_values) for angles in angles_batch])

# -------------------------
# Loss + training helpers
# -------------------------
def binary_cross_entropy(probs: np.ndarray, labels: np.ndarray, eps: float = 1e-9) -> float:
    """Mean binary cross-entropy between predicted probabilities and labels.

    Args:
        probs: Predicted probabilities of the positive class.
        labels: Ground-truth labels (cast to float32), same shape as ``probs``.
        eps: Probabilities are clipped to ``[eps, 1 - eps]`` so that the
            logarithms stay finite.

    Returns:
        The mean over all samples of ``-(y*log(p) + (1-y)*log(1-p))``.
    """
    clipped = np.clip(probs, eps, 1 - eps)
    targets = labels.astype(np.float32)
    log_likelihood = targets * np.log(clipped) + (1 - targets) * np.log(1 - clipped)
    return -np.mean(log_likelihood)

def train_quantum_classifier(qc: QuantumClassifier,
                             X_angles_train: np.ndarray,
                             y_train: np.ndarray,
                             X_angles_val: np.ndarray,
                             y_val: np.ndarray,
                             epochs: int = 30,
                             optimizer_name: str = "COBYLA"):
    """Fit the classifier's circuit parameters with a SciPy optimizer.

    The objective is the binary cross-entropy of the classifier's predicted
    probabilities on the training angles. Parameters start from a fixed-seed
    normal draw (scale 0.1).

    Args:
        qc: Classifier providing ``params`` and ``batch_predict_proba``.
        X_angles_train: Training angle rows.
        y_train: Training labels.
        X_angles_val: Validation angle rows.
        y_val: Validation labels.
        epochs: Passed to the optimizer as ``maxiter``.
        optimizer_name: ``"COBYLA"`` (case-insensitive) selects COBYLA; any
            other value selects Powell.

    Returns:
        ``(theta_opt, accuracy, macro_f1, val_probs)`` where validation
        predictions are ``1`` when the probability is at least 0.5.
    """
    from scipy.optimize import minimize

    initial_theta = np.random.RandomState(42).normal(scale=0.1, size=len(qc.params))

    def training_loss(theta):
        train_probs = qc.batch_predict_proba(X_angles_train, theta)
        return float(binary_cross_entropy(train_probs, y_train))

    if optimizer_name.upper() == "COBYLA":
        scipy_method = "COBYLA"
    else:
        scipy_method = "Powell"
    outcome = minimize(
        training_loss,
        initial_theta,
        method=scipy_method,
        options={"maxiter": epochs, "disp": True},
    )
    theta_opt = outcome.x

    # Score the optimised parameters on the held-out split.
    val_probs = qc.batch_predict_proba(X_angles_val, theta_opt)
    hard_preds = (val_probs >= 0.5).astype(int)
    acc = accuracy_score(y_val, hard_preds)
    f1 = f1_score(y_val, hard_preds, average="macro")
    return theta_opt, acc, f1, val_probs

# -------------------------
# Data helpers
# -------------------------
def prepare_dataset_for_quantum(df: pd.DataFrame, text_col: str = "text", label_col: str = "label", n_samples: Optional[int] = None) -> Tuple[List[str], np.ndarray]:
    """Extract cleaned texts and integer labels from a DataFrame.

    Rows with a missing text are dropped and each text is passed through
    ``clean_text``. Labels come from ``label_col`` when present; otherwise a
    keyword heuristic assigns ``1`` to texts containing one of
    up/gain/rise/beat/surge/profit (case-insensitive) and ``0`` to the rest.

    Labels that can be cast to ``int`` are returned as such. Labels that cannot
    (for example category names) are encoded as integer ids ``0..k-1`` in
    sorted order of their string form, computed on the returned subset.

    Args:
        df: Source frame.
        text_col: Name of the text column.
        label_col: Name of the label column (optional in ``df``).
        n_samples: Keep only the first ``n_samples`` rows; ``None`` keeps all.

    Returns:
        ``(texts, labels)``: list of cleaned strings and an int array of the
        same length.
    """
    df = df.dropna(subset=[text_col])
    texts = df[text_col].astype(str).map(clean_text).tolist()
    if label_col in df.columns:
        labels = df[label_col].tolist()
    else:
        labels = [1 if re.search(r"\b(up|gain|rise|beat|surge|profit)\b", t, flags=re.I) else 0 for t in texts]
    if n_samples is not None:
        texts = texts[:n_samples]
        labels = labels[:n_samples]
    try:
        label_ids = np.array(labels, dtype=int)
    except (ValueError, TypeError):
        # Non-numeric labels (e.g. directory names) cannot be cast to int;
        # map each distinct label string to a dense integer id instead.
        _, inverse = np.unique(np.array([str(v) for v in labels]), return_inverse=True)
        label_ids = inverse.reshape(-1).astype(int)
    return texts, label_ids

def angles_from_features(X_reduced: np.ndarray, n_qubits: int) -> np.ndarray:
    """Map feature columns to rotation angles in ``[-pi/2, pi/2]``.

    Each used column is min-max normalised over the rows of ``X_reduced`` and
    then shifted/scaled to ``[-pi/2, pi/2]``. Columns beyond ``n_qubits`` are
    dropped; if there are fewer columns than qubits the remaining angles are
    zero. A constant column has its range treated as 1, so every row gets
    ``-pi/2`` for it.

    Args:
        X_reduced: 2-D array of shape ``(N, d)``.
        n_qubits: Number of angle columns to produce.

    Returns:
        Float array of shape ``(N, n_qubits)``.
    """
    n_rows, n_cols = X_reduced.shape
    features = X_reduced[:, :n_qubits] if n_cols > n_qubits else X_reduced
    n_used = min(n_cols, n_qubits)

    col_min = features.min(axis=0, keepdims=True)
    col_max = features.max(axis=0, keepdims=True)
    col_range = col_max - col_min
    col_range[col_range == 0] = 1.0  # avoid dividing by zero on constant columns

    out = np.zeros((n_rows, n_qubits), dtype=float)
    out[:, :n_used] = ((features - col_min) / col_range - 0.5) * np.pi
    return out

# -------------------------
# Demo runner
# -------------------------
def demo_quantum_wsd_pipeline(reuters_path: str,
                              n_qubits: int = 6,
                              n_pca: Optional[int] = None,
                              n_samples: Optional[int] = 120,
                              epochs: int = 30,
                              reps: int = 1):
    """Run the end-to-end demo: load, embed, reduce, encode, train, report.

    Steps:
        1. Load labelled documents from ``reuters_path``.
        2. Encode the label names as integer ids and, if more than two classes
           exist, keep only the documents of ids 0 and 1.
        3. Draw a fixed-seed random subset of at most ``n_samples`` documents.
        4. Embed the texts, reduce them with PCA and convert to RY angles.
        5. Split 80/20 (stratified), train the quantum classifier and print
           validation accuracy, macro F1 and a few sample predictions.

    Args:
        reuters_path: Root directory of the labelled text files.
        n_qubits: Number of qubits of the classifier.
        n_pca: Number of PCA components; defaults to (and is capped at)
            ``n_qubits``, the embedding width and the number of samples.
        n_samples: Maximum number of documents to use; ``None`` uses all.
        epochs: Optimizer iteration budget.
        reps: Number of ansatz repetitions.

    Returns:
        A dict with keys ``theta``, ``val_acc``, ``val_f1`` and ``val_probs``,
        or ``None`` when the data cannot support a binary classification run.
    """
    print("[DATA] Loading Reuters subset...")
    df = load_reuters_with_labels(reuters_path)
    if df.empty:
        print("[ERROR] Reuters dataset not found or empty. Exiting.")
        return

    # Label names are strings (directory names); turn them into integer ids
    # before anything tries to cast them to int.
    df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
    cat_ids = LabelEncoder().fit_transform(df["label"].astype(str))
    n_classes = len(np.unique(cat_ids))
    if n_classes < 2:
        print("[ERROR] Need at least two classes for binary classification. Exiting.")
        return
    # reduce to binary classification if many classes present
    reduced_to_binary = n_classes > 2
    if reduced_to_binary:
        keep = np.isin(cat_ids, [0, 1])
        df = df.loc[keep].reset_index(drop=True)
        cat_ids = cat_ids[keep]

    # Files arrive grouped by directory, so taking the first n_samples rows can
    # yield a single class. Shuffle with a fixed seed before truncating.
    order = np.random.RandomState(42).permutation(len(df))
    if n_samples is not None:
        order = order[:n_samples]
    subset = df.iloc[order].assign(label=cat_ids[order])
    texts, labels = prepare_dataset_for_quantum(subset)
    if reduced_to_binary:
        print(f"[DATA] Reduced to binary classes -> {len(texts)} samples")
    if np.bincount(labels, minlength=2).min() < 2:
        print("[ERROR] Each class needs at least two samples for a stratified split. Exiting.")
        return

    print("[EMB] Getting embeddings...")
    emb = get_sentence_embeddings(texts)
    # PCA cannot produce more components than samples or features.
    max_components = min(emb.shape[0], emb.shape[1], n_qubits)
    n_pca = max_components if n_pca is None else min(n_pca, max_components)
    print(f"[PCA] Reducing embeddings to {n_pca} dims (for {n_qubits} qubits)")
    pca = PCA(n_components=n_pca)
    X_reduced = pca.fit_transform(emb)

    # Scale to angles once, before splitting, so train and validation rows share
    # the same min/max and therefore the same angle encoding.
    X_angles = angles_from_features(X_reduced, n_qubits=n_qubits)
    X_angles_train, X_angles_val, y_train, y_val = train_test_split(
        X_angles, labels, test_size=0.2, random_state=42, stratify=labels
    )

    print("[Q] Building quantum classifier...")
    qc = QuantumClassifier(n_qubits=n_qubits, reps=reps, shots=512)
    print("[TRAIN] Starting hybrid training (COBYLA)...")
    theta_opt, acc_val, f1_val, val_probs = train_quantum_classifier(qc, X_angles_train, y_train, X_angles_val, y_val, epochs=epochs)
    print(f"[RESULT] Val Acc={acc_val:.4f}  Val F1={f1_val:.4f}")
    preds = (val_probs >= 0.5).astype(int)
    for i in range(min(8, len(preds))):
        print(f"Sample {i}: label={y_val[i]} prob={val_probs[i]:.3f} pred={preds[i]}")
    return {
        "theta": theta_opt,
        "val_acc": acc_val,
        "val_f1": f1_val,
        "val_probs": val_probs,
    }

# -------------------------
# Example usage (uncomment & set your path)
# -------------------------
# result = demo_quantum_wsd_pipeline("/content/reuters", n_qubits=4, n_samples=80, epochs=30, reps=1)
# print(result)


In [5]:
# Colab install cell (run this first)
!pip install --quiet qiskit qiskit-aer scipy sentence-transformers tqdm
# Note: if qiskit-aer cannot be built or installed in this runtime (e.g. on Python 3.12), pip may abort the whole command; in that case install `qiskit` on its own and the code falls back to Statevector.
