In [2]:
pip install river

Defaulting to user installation because normal site-packages is not writeable
Collecting river
  Downloading river-0.23.0-cp312-cp312-win_amd64.whl.metadata (9.2 kB)
Collecting pandas<3.0.0,>=2.2.3 (from river)
  Downloading pandas-2.3.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting scipy<2.0.0,>=1.14.1 (from river)
  Downloading scipy-1.16.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Downloading river-0.23.0-cp312-cp312-win_amd64.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ----------------- ---------------------- 0.8/1.8 MB 2.4 MB/s eta 0:00:01
   ---------------------------- ----------- 1.3/1.8 MB 2.4 MB/s eta 0:00:01
   ---------------------------------- ----- 1.6/1.8 MB 2.5 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 2.2 MB/s eta 0:00:00
Downloading pandas-2.3.3-cp312-cp312-win_amd64.whl (11.0 MB)
   --------------------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.16.3 which is incompatible.


In [3]:
"""
run_enhanced_v1.py
Enhanced pipeline for ReviewMirror – Milestone 3
Adds: (1) ADWIN change-point detection, (2) User clustering, (3) Optional DistilBERT sentiment
"""

import os, gzip, json, re, datetime as dt, warnings
from typing import Iterator, Dict, Any, List, Tuple, Optional
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Sentiment
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    _VADER = SentimentIntensityAnalyzer()
except:
    _VADER = None

# ADWIN for change-point detection
try:
    from river import drift
    _ADWIN_AVAILABLE = True
except ImportError:
    _ADWIN_AVAILABLE = False
    print("[WARNING] river not installed. Install with: pip install river")

# Regex helpers
_WORD = re.compile(r"\b\w+\b")
_FIRST_PERSON = re.compile(r"\b(i|i'm|i've|my|me)\b", re.IGNORECASE)


# ==============================================================
# SECTION 1 — Data Loading (same as baseline)
# ==============================================================

def _open_text(path: str):
    if path.endswith(".gz"):
        return gzip.open(path, "rt", encoding="utf-8", errors="ignore")
    return open(path, "rt", encoding="utf-8", errors="ignore")

def parse_amz_reviews(path: str, n_limit: Optional[int] = None) -> Iterator[Dict[str, Any]]:
    """Yield review records from a JSON-array or JSON-Lines file (optionally .gz).

    The format is sniffed from the first 2048 characters: content whose first
    non-whitespace character is ``[`` is parsed as one JSON array, anything
    else is treated as one JSON document per line.

    Args:
        path: File to read (opened via ``_open_text``).
        n_limit: Maximum number of records to yield; ``None`` means no limit.
            The limit counts *yielded records*, so blank or unparsable lines
            do not consume it. A limit <= 0 yields nothing.

    Yields:
        Parsed JSON objects in file order. In JSON-Lines mode, lines that are
        blank or fail to parse are skipped silently.
    """
    if n_limit is not None and n_limit <= 0:
        return

    # Peek at the start of the file to decide which format we are reading.
    with _open_text(path) as f:
        head = f.read(2048)
    if not head:
        return
    is_array = head.lstrip().startswith("[")

    def _iter_records(handle):
        """Yield every parsable record from an open text handle."""
        if is_array:
            yield from json.load(handle)
            return
        for line in handle:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except ValueError:
                # json.JSONDecodeError subclasses ValueError; skip bad lines.
                continue
            yield obj

    n_yielded = 0
    with _open_text(path) as f:
        for obj in _iter_records(f):
            yield obj
            n_yielded += 1
            if n_limit is not None and n_yielded >= n_limit:
                break


# ==============================================================
# SECTION 2 — Feature Helpers (same as baseline)
# ==============================================================

def _safe_vader_compound(text: str) -> float:
    """Return a sentiment score for ``text``.

    Uses VADER's ``compound`` score when the module-level ``_VADER`` analyzer
    is available. If it is missing or raises, falls back to a tiny keyword
    lexicon: (positive hits - negative hits) / (total hits), or 0.0 when no
    keyword is found.

    Non-string input (``None``, NaN, numbers) is scored as the empty string.
    """
    t = text if isinstance(text, str) else ""
    if _VADER is not None:
        try:
            return float(_VADER.polarity_scores(t)["compound"])
        except Exception:
            # Deliberate best effort: fall through to the lexicon scorer.
            pass

    # Keywords carry a leading space so they only match at the start of a word;
    # pad the text with one space so the very first word can match too.
    padded = " " + t.lower()
    positive = (" good", " great", " excellent", " amazing", " love", " perfect", " happy")
    negative = (" bad", " terrible", " awful", " hate", " poor", " broken", " angry")
    pos = sum(padded.count(w) for w in positive)
    neg = sum(padded.count(w) for w in negative)
    return 0.0 if (pos + neg) == 0 else (pos - neg) / (pos + neg)

def _stars_to_unit(stars: float) -> float:
    try:
        s = float(stars)
        return (s - 3.0) / 2.0
    except:
        return np.nan

def _month_floor(ts: pd.Timestamp) -> pd.Timestamp:
    return pd.Timestamp(year=ts.year, month=ts.month, day=1, tz="UTC")

def _is_ascii_major(text) -> bool:
    if not isinstance(text, str) or not text:
        return True
    ascii_chars = sum(1 for ch in text if ord(ch) < 128)
    return ascii_chars >= 0.9 * len(text)

def _to_int_or_nan(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan
    if isinstance(x, str):
        x = x.replace(",", "").strip()
    try:
        return int(x)
    except:
        return np.nan

def _style_feats(txt: str) -> Tuple[float, float, float]:
    """Compute three simple writing-style rates for a review text.

    Returns:
        ``(exclam_rate, first_person_rate, caps_rate)`` where
        - exclam_rate is ``!`` occurrences per character,
        - first_person_rate is ``_FIRST_PERSON`` matches per ``_WORD`` token,
        - caps_rate is uppercase characters per character.
        Input that is not a non-empty string yields ``(0.0, 0.0, 0.0)``.
    """
    if not (isinstance(txt, str) and txt):
        return (0.0, 0.0, 0.0)

    # Denominators are floored at 1 so a token-free text cannot divide by zero.
    char_total = max(1, len(txt))
    token_total = max(1, len(_WORD.findall(txt)))
    upper_total = sum(map(str.isupper, txt))

    return (
        txt.count("!") / char_total,
        len(_FIRST_PERSON.findall(txt)) / token_total,
        upper_total / char_total,
    )


# ==============================================================
# SECTION 3 — Build DataFrame (same as baseline)
# ==============================================================

def build_reviews_df(path: str, n_limit: Optional[int] = None) -> pd.DataFrame:
    """Load raw review records into a cleaned DataFrame.

    Each record is read with ``parse_amz_reviews`` and mapped onto a common
    schema, trying the first key and falling back to the second:
    ``reviewerID``/``user_id``, ``asin``/``business_id``,
    ``reviewText``/``text``, ``overall``/``stars``, ``vote``/``useful``.
    The timestamp comes from ``unixReviewTime`` (epoch seconds), else
    ``reviewTime``, else ``date``; all are parsed as UTC.

    Rows missing ``user_id``, ``item_id`` or ``ts`` are dropped, as are rows
    whose text fails ``_is_ascii_major``.

    Args:
        path: Input file passed to ``parse_amz_reviews``.
        n_limit: Maximum number of records to read (``None`` = all).

    Returns:
        DataFrame with columns ``user_id, item_id, text, stars, ts, summary,
        helpful_votes`` and a contiguous 0..n-1 index. The columns are present
        even when no record was read.
    """
    columns = ["user_id", "item_id", "text", "stars", "ts", "summary", "helpful_votes"]

    rows = []
    for rec in parse_amz_reviews(path, n_limit=n_limit):
        if rec.get("unixReviewTime") is not None:
            ts = pd.to_datetime(rec.get("unixReviewTime"), unit="s", utc=True)
        elif rec.get("reviewTime") is not None:
            ts = pd.to_datetime(rec.get("reviewTime"), utc=True, errors="coerce")
        else:
            ts = pd.to_datetime(rec.get("date"), utc=True, errors="coerce")

        rows.append({
            "user_id": rec.get("reviewerID") or rec.get("user_id"),
            "item_id": rec.get("asin") or rec.get("business_id"),
            "text": rec.get("reviewText") or rec.get("text"),
            "stars": rec.get("overall") or rec.get("stars"),
            "ts": ts,
            "summary": rec.get("summary"),
            "helpful_votes": _to_int_or_nan(rec.get("vote") or rec.get("useful")),
        })

    # Passing `columns` keeps the schema even when `rows` is empty, so the
    # dropna below cannot fail on missing column names.
    df = pd.DataFrame(rows, columns=columns)
    df = df.dropna(subset=["user_id", "item_id", "ts"])

    # astype(bool) guarantees a boolean mask even for an empty frame.
    ascii_mask = df["text"].map(_is_ascii_major).astype(bool)
    # Reset the index last so it is contiguous after both filters.
    return df[ascii_mask].reset_index(drop=True)

def compute_features(df: pd.DataFrame, alpha: float = 0.7) -> pd.DataFrame:
    """Add sentiment, style and month columns to a copy of ``df``.

    Added columns:
        sent_text: text sentiment from ``_safe_vader_compound``.
        sent_stars: star rating rescaled by ``_stars_to_unit``.
        sent_hybrid: ``alpha * sent_text + (1 - alpha) * sent_stars``.
        style_exclam_rate, style_firstperson_rate, style_caps_rate:
            the three values returned by ``_style_feats``.
        month: ``ts`` floored to the first of the month via ``_month_floor``.

    Args:
        df: Frame with ``text``, ``stars`` and ``ts`` columns.
        alpha: Weight of the text sentiment in the hybrid score.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()
    df["sent_text"] = df["text"].map(_safe_vader_compound).astype(float)
    df["sent_stars"] = df["stars"].map(_stars_to_unit).astype(float)
    df["sent_hybrid"] = alpha * df["sent_text"] + (1.0 - alpha) * df["sent_stars"]

    style_cols = ["style_exclam_rate", "style_firstperson_rate", "style_caps_rate"]
    feats = df["text"].map(_style_feats).tolist()
    # Naming the columns explicitly keeps the frame three columns wide even
    # when `feats` is empty, so the assignment also works on an empty input.
    df[style_cols] = pd.DataFrame(feats, index=df.index, columns=style_cols)

    df["month"] = df["ts"].map(_month_floor)
    return df


# ==============================================================
# SECTION 4 — ENHANCEMENT 1: ADWIN Change-Point Detection
# ==============================================================

def detect_changepoints_adwin(sentiment_series: np.ndarray, delta: float = 0.002) -> List[int]:
    """
    Locate change-points in a sentiment sequence with river's ADWIN detector.

    Values are fed to the detector in order; NaN entries are skipped (they are
    not passed to ADWIN) but still occupy an index position.

    Args:
        sentiment_series: Sentiment values in chronological order.
        delta: Significance value passed to ``drift.ADWIN``. Smaller values
            require stronger evidence before a drift is flagged, i.e. they
            produce fewer detections.

    Returns:
        Ascending list of indices at which a drift was flagged. A flagged index
        is dropped when it lies within 1 position of the previously kept one.
        Returns ``[]`` when river is unavailable or the series has fewer than
        3 elements.

    NOTE(review): ADWIN's other settings (e.g. its check interval and warm-up
    period) are left at river's defaults; on short monthly series these may
    suppress detections entirely -- confirm against the river documentation.
    """
    if not _ADWIN_AVAILABLE or len(sentiment_series) < 3:
        return []

    detector = drift.ADWIN(delta=delta)
    flagged = []
    for position, value in enumerate(sentiment_series):
        if np.isnan(value):
            continue
        detector.update(value)
        if detector.drift_detected:
            flagged.append(position)

    # Keep a flagged index only if it is more than 1 step past the last kept one.
    kept = []
    for position in flagged:
        if not kept or position - kept[-1] > 1:
            kept.append(position)
    return kept


def compute_changepoint_metrics(changepoints: List[int], total_length: int) -> Dict[str, float]:
    """Summarise a list of change-point indices.

    Returns a dict with:
        n_changepoints: number of change-points.
        time_to_first_cp: first index divided by ``max(1, total_length - 1)``;
            NaN when there are no change-points.
        mean_dwell_time: mean gap between consecutive change-points; NaN when
            there are fewer than two.
    """
    n_cp = len(changepoints)
    summary = {
        "n_changepoints": n_cp,
        "time_to_first_cp": np.nan,
        "mean_dwell_time": np.nan,
    }
    if n_cp == 0:
        return summary

    summary["time_to_first_cp"] = changepoints[0] / max(1, total_length - 1)
    if n_cp > 1:
        gaps = np.diff(changepoints)
        summary["mean_dwell_time"] = float(np.mean(gaps))
    return summary


# ==============================================================
# SECTION 5 — Build Trajectories with Change-Points
# ==============================================================

def _group_apply_include_groups(grouper, func):
    try:
        return grouper.apply(func, include_groups=False)
    except TypeError:
        return grouper.apply(func)

def build_user_trajectories_enhanced(df: pd.DataFrame, min_reviews: int = 5,
                                     adwin_delta: float = 0.002):
    """Build per-user monthly sentiment trajectories with drift and change-point metrics.

    Steps:
        1. Keep users with at least ``min_reviews`` reviews.
        2. Add ``text_z_user``: ``sent_text`` z-scored within each user.
        3. Aggregate to one row per (user, month).
        4. For each user compute, on the monthly ``sent_hybrid_mean`` series:
           least-squares slope against the month ordinal (``drift_slope``),
           last-minus-first value (``drift_delta``), total variation (``tv``),
           fraction of sign changes (``flip_rate``) and ADWIN change-point
           metrics via ``detect_changepoints_adwin`` /
           ``compute_changepoint_metrics``.

    Args:
        df: Review frame with ``user_id, ts, text, stars, month, sent_text,
            sent_hybrid`` columns (as produced by ``compute_features``).
        min_reviews: Minimum number of reviews a user needs to be kept.
        adwin_delta: ``delta`` forwarded to ``detect_changepoints_adwin``.

    Returns:
        ``(df, monthly, traj)``: the filtered review frame, the per
        (user, month) aggregates, and one row per user with the metrics plus
        the list-valued columns ``months, sent_hybrid_seq, stars_seq,
        counts_seq, changepoints``.
    """
    df = df.sort_values(["user_id", "ts"]).copy()

    # Keep users with >= min_reviews
    review_counts = df["user_id"].value_counts()
    keep = set(review_counts[review_counts >= min_reviews].index)
    df = df[df["user_id"].isin(keep)].copy()

    # Within-user normalization (epsilon guards users with constant sentiment)
    df["text_z_user"] = df.groupby("user_id")["sent_text"].transform(
        lambda s: (s - s.mean()) / (s.std(ddof=0) + 1e-8)
    )

    # Monthly aggregates
    monthly = (df.groupby(["user_id", "month"])
                 .agg(sent_text_mean=("sent_text", "mean"),
                      sent_hybrid_mean=("sent_hybrid", "mean"),
                      stars_mean=("stars", "mean"),
                      n_reviews=("text", "count"))
                 .reset_index())

    # Integer month index so gaps between active months count as elapsed time.
    monthly["month_ord"] = monthly["month"].dt.year * 12 + monthly["month"].dt.month

    def _slope_delta(x: np.ndarray, y: np.ndarray) -> Dict[str, float]:
        """Least-squares slope of y on x and last-minus-first delta."""
        if len(x) < 2 or np.isnan(y).all():
            return {"drift_slope": np.nan, "drift_delta": np.nan}
        xc = x - x.mean()
        denom = float(np.dot(xc, xc) + 1e-8)
        slope = float(np.dot(xc, (y - y.mean())) / denom)
        return {"drift_slope": slope, "drift_delta": float(y[-1] - y[0])}

    def _extra_metrics(y: np.ndarray) -> Dict[str, float]:
        """Total variation and sign-flip rate of consecutive monthly values."""
        if len(y) < 2 or np.isnan(y).all():
            return {"tv": np.nan, "flip_rate": np.nan}
        tv = float(np.nansum(np.abs(np.diff(y))))
        flips = float(np.nansum(np.sign(y[1:]) != np.sign(y[:-1])))
        return {"tv": tv, "flip_rate": flips / (len(y) - 1)}

    # One pass over the users computes every metric. When river is missing,
    # detect_changepoints_adwin returns [] and the change-point metrics come
    # out as 0 / NaN / NaN, so no separate fallback branch is needed.
    metric_cols = ["user_id", "drift_slope", "drift_delta", "tv", "flip_rate",
                   "n_changepoints", "time_to_first_cp", "mean_dwell_time",
                   "changepoints"]
    per_user_rows = []
    for uid, g in monthly.groupby("user_id"):
        g = g.sort_values("month_ord")
        x = g["month_ord"].to_numpy(dtype=float)
        y = g["sent_hybrid_mean"].to_numpy(dtype=float)

        row = {"user_id": uid}
        row.update(_slope_delta(x, y))
        row.update(_extra_metrics(y))
        cps = detect_changepoints_adwin(y, delta=adwin_delta)
        row.update(compute_changepoint_metrics(cps, len(y)))
        row["changepoints"] = cps
        per_user_rows.append(row)

    # Named `user_metrics` (not `drift`) so the imported river `drift` module
    # is not shadowed inside this function.
    user_metrics = pd.DataFrame(per_user_rows, columns=metric_cols)

    seqs = (monthly.groupby("user_id")
                  .agg(months=("month", list),
                       sent_hybrid_seq=("sent_hybrid_mean", list),
                       stars_seq=("stars_mean", list),
                       counts_seq=("n_reviews", list))
                  .reset_index())

    traj = pd.merge(user_metrics, seqs, on="user_id", how="left")
    # Column order matches the previous drift -> seqs -> extra -> cp merge chain.
    traj = traj[["user_id", "drift_slope", "drift_delta",
                 "months", "sent_hybrid_seq", "stars_seq", "counts_seq",
                 "tv", "flip_rate",
                 "n_changepoints", "time_to_first_cp", "mean_dwell_time",
                 "changepoints"]]

    return df, monthly, traj


# ==============================================================
# SECTION 6 — ENHANCEMENT 2: User Clustering
# ==============================================================

def cluster_users_by_drift(traj: pd.DataFrame, n_clusters: int = 4,
                           features: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Cluster users by drift patterns using k-means on standardized features.

    Args:
        traj: User trajectories dataframe (not modified).
        n_clusters: Number of clusters
        features: List of features to use (default: slope, tv, flip_rate)

    Returns:
        A copy of ``traj`` with an added 'cluster' column. Rows with a missing
        or infinite feature value get cluster -1; so does every row when there
        are fewer usable rows than ``n_clusters``.
    """
    if features is None:
        features = ["drift_slope", "tv", "flip_rate"]

    # Work on a copy so the caller's frame is left untouched.
    traj = traj.copy()
    traj["cluster"] = -1

    # Usable rows: every feature present and finite (inf would break scaling).
    X = traj[features].replace([np.inf, -np.inf], np.nan)
    valid_mask = X.notna().all(axis=1)
    X_valid = X[valid_mask]

    if len(X_valid) < n_clusters:
        print(f"[WARNING] Not enough valid samples for {n_clusters} clusters")
        return traj

    # Standardize so no single feature dominates the Euclidean distance.
    X_scaled = StandardScaler().fit_transform(X_valid)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)
    traj.loc[valid_mask, "cluster"] = clusters

    # The quality scores need at least 2 labels and fewer labels than samples.
    n_labels = len(np.unique(clusters))
    if 1 < n_labels < len(X_valid):
        sil_score = silhouette_score(X_scaled, clusters)
        db_score = davies_bouldin_score(X_scaled, clusters)
        print(f"[clustering] Silhouette: {sil_score:.3f}, Davies-Bouldin: {db_score:.3f}")

    return traj


def analyze_clusters(traj: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
    """Per-cluster mean, standard deviation and count of each feature.

    Args:
        traj: Frame containing a ``cluster`` column and the feature columns.
        features: Columns to summarise; defaults to drift_slope, drift_delta,
            tv, flip_rate and n_changepoints.

    Returns:
        Frame indexed by cluster with (feature, statistic) column pairs.
    """
    default_features = ["drift_slope", "drift_delta", "tv", "flip_rate", "n_changepoints"]
    selected = default_features if features is None else features
    statistics = ['mean', 'std', 'count']
    return traj.groupby("cluster")[selected].agg(statistics)


# ==============================================================
# SECTION 7 — Data Splits & Evaluation
# ==============================================================

def create_data_splits(traj: pd.DataFrame, monthly: pd.DataFrame,
                       split_ratio=(0.7, 0.15, 0.15), strategy="temporal"):
    """Partition user ids into train / val / test lists.

    With ``strategy="temporal"`` users are ordered by their earliest month in
    ``monthly``; with any other value the ``traj`` user ids are shuffled with
    a fixed seed (42). The first ``int(n * split_ratio[0])`` users form the
    train set, the next ``int(n * split_ratio[1])`` the validation set, and
    all remaining users the test set.

    Returns:
        Dict with keys ``"train"``, ``"val"`` and ``"test"`` mapping to lists
        of user ids.
    """
    if strategy != "temporal":
        ordered_users = traj["user_id"].sample(frac=1, random_state=42).tolist()
    else:
        first_month = monthly.groupby("user_id")["month"].min()
        ordered_users = first_month.sort_values().index.tolist()

    total = len(ordered_users)
    cut_train = int(total * split_ratio[0])
    cut_val = cut_train + int(total * split_ratio[1])

    windows = {
        "train": slice(None, cut_train),
        "val": slice(cut_train, cut_val),
        "test": slice(cut_val, None),
    }
    return {name: ordered_users[window] for name, window in windows.items()}

def compute_baseline_statistics(traj: pd.DataFrame, splits: Dict):
    """Mean, median and standard deviation of the drift metrics on the train users.

    Only rows whose ``user_id`` is listed in ``splits["train"]`` are used.

    Returns:
        Dict with keys ``{mean,median,std}_drift_{slope,delta}`` as floats.
    """
    train_rows = traj.loc[traj["user_id"].isin(splits["train"])]

    return {
        f"{stat}_{col}": float(getattr(train_rows[col], stat)())
        for stat in ("mean", "median", "std")
        for col in ("drift_slope", "drift_delta")
    }

def evaluate_drift_metrics(traj: pd.DataFrame, splits: Dict, baselines: Dict):
    """Summarise drift behaviour of the test-split users.

    Reports the test-set size, the mean absolute error of predicting each
    user's slope / delta with the train-set mean from ``baselines``, counts of
    high-drift (|slope| > 0.01), flat (|slope| < 0.001) and volatile (tv above
    the test-set 75th percentile) users, per-metric mean / median / std, and,
    when an ``n_changepoints`` column exists, change-point prevalence.

    Returns:
        Dict of plain ``int`` / ``float`` values.
    """
    held_out = traj.loc[traj["user_id"].isin(splits["test"])]
    slope = held_out["drift_slope"]
    delta = held_out["drift_delta"]
    tv = held_out["tv"]
    abs_slope = np.abs(slope)

    metrics = {
        "test_size": int(len(held_out)),
        "slope_mae": float(np.abs(slope - baselines["mean_drift_slope"]).mean()),
        "delta_mae": float(np.abs(delta - baselines["mean_drift_delta"]).mean()),
        "high_drift_users": int((abs_slope > 0.01).sum()),
        "volatile_users": int((tv > tv.quantile(0.75)).sum()),
        "flat_users": int((abs_slope < 0.001).sum()),
    }

    # Descriptive statistics for every drift metric present in the frame.
    for col in ("drift_slope", "drift_delta", "tv", "flip_rate"):
        if col not in held_out.columns:
            continue
        values = held_out[col]
        for stat in ("mean", "median", "std"):
            metrics[f"test_{col}_{stat}"] = float(getattr(values, stat)())

    # NEW: Change-point metrics
    if "n_changepoints" in held_out.columns:
        has_cp = held_out["n_changepoints"] > 0
        metrics["test_n_changepoints_mean"] = float(held_out["n_changepoints"].mean())
        metrics["test_users_with_cp"] = int(has_cp.sum())
        metrics["test_cp_rate"] = float(has_cp.mean())

    return metrics


# ==============================================================
# SECTION 8 — Enhanced Visualizations
# ==============================================================

def create_enhanced_plots(traj: pd.DataFrame, monthly: pd.DataFrame, fig_dir: str):
    """Generate all diagnostic plots including new change-point visualizations.

    Writes PNG files (dpi=220) into ``fig_dir``; the directory is not created
    here, so it must already exist. Figures produced:

        drift_distributions.png   histograms of drift_slope, drift_delta, tv, flip_rate
        changepoint_analysis.png  only if ``traj`` has an ``n_changepoints`` column
        cluster_analysis.png      only if ``traj`` has a ``cluster`` column whose max is > 0
        temporal_trends.png       mean hybrid sentiment and review volume per month
        slope_vs_volatility.png   scatter of drift_slope against tv

    Args:
        traj: Per-user frame with at least ``drift_slope, drift_delta, tv, flip_rate``.
        monthly: Per (user, month) frame with ``month, sent_hybrid_mean,
            stars_mean, n_reviews``.
        fig_dir: Existing output directory for the PNG files.

    Returns:
        None. Each figure is closed after it is saved.
    """

    # 1. Original drift distributions
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    for ax, col in zip(axes.flat, ["drift_slope", "drift_delta", "tv", "flip_rate"]):
        data = traj[col].dropna()
        ax.hist(data, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
        # Mean (red) and median (orange) reference lines.
        ax.axvline(data.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {data.mean():.3f}')
        ax.axvline(data.median(), color='orange', linestyle='--', linewidth=2, label=f'Median: {data.median():.3f}')
        ax.set_title(f"Distribution of {col}", fontsize=12, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel("Count")
        ax.legend()
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, "drift_distributions.png"), dpi=220)
    plt.close()

    # 2. NEW: Change-point distribution
    if "n_changepoints" in traj.columns:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

        # Histogram of number of changepoints
        cp_counts = traj["n_changepoints"].value_counts().sort_index()
        ax1.bar(cp_counts.index, cp_counts.values, color='coral', edgecolor='black', alpha=0.7)
        ax1.set_title("Distribution of Change-Points per User", fontsize=13, fontweight='bold')
        ax1.set_xlabel("Number of Change-Points")
        ax1.set_ylabel("Number of Users")
        ax1.grid(True, alpha=0.3, axis='y')

        # Time to first changepoint
        # (the right-hand panel stays empty when no user has a change-point)
        time_to_first = traj["time_to_first_cp"].dropna()
        if len(time_to_first) > 0:
            ax2.hist(time_to_first, bins=30, color='mediumseagreen', edgecolor='black', alpha=0.7)
            ax2.axvline(time_to_first.median(), color='red', linestyle='--', linewidth=2,
                       label=f'Median: {time_to_first.median():.2f}')
            ax2.set_title("Time to First Change-Point (Normalized)", fontsize=13, fontweight='bold')
            ax2.set_xlabel("Fraction of Timeline")
            ax2.set_ylabel("Number of Users")
            ax2.legend()
            ax2.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        plt.savefig(os.path.join(fig_dir, "changepoint_analysis.png"), dpi=220)
        plt.close()

    # 3. NEW: Cluster visualization
    # Skipped when the largest label is 0 or -1 (at most one real cluster).
    if "cluster" in traj.columns and traj["cluster"].max() > 0:
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # Scatter: slope vs TV colored by cluster
        # Rows labelled -1 (unclustered) are excluded.
        valid = traj[traj["cluster"] >= 0]
        scatter = axes[0].scatter(valid["drift_slope"], valid["tv"],
                                 c=valid["cluster"], cmap='tab10',
                                 s=30, alpha=0.6, edgecolor='black', linewidth=0.5)
        axes[0].axhline(0, color='gray', linestyle='--', alpha=0.5)
        axes[0].axvline(0, color='gray', linestyle='--', alpha=0.5)
        axes[0].set_xlabel("Drift Slope", fontsize=11)
        axes[0].set_ylabel("Total Variation", fontsize=11)
        axes[0].set_title("User Clusters: Slope vs. Volatility", fontsize=13, fontweight='bold')
        plt.colorbar(scatter, ax=axes[0], label='Cluster')
        axes[0].grid(True, alpha=0.3)

        # Cluster sizes
        cluster_sizes = valid["cluster"].value_counts().sort_index()
        axes[1].bar(cluster_sizes.index, cluster_sizes.values, color='skyblue', edgecolor='black', alpha=0.7)
        axes[1].set_title("Cluster Sizes", fontsize=13, fontweight='bold')
        axes[1].set_xlabel("Cluster ID")
        axes[1].set_ylabel("Number of Users")
        axes[1].grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        plt.savefig(os.path.join(fig_dir, "cluster_analysis.png"), dpi=220)
        plt.close()

    # 4. Temporal trends (same as baseline)
    # Collapse the per-user monthly rows to one row per calendar month.
    monthly_agg = monthly.groupby("month").agg({
        "sent_hybrid_mean": "mean",
        "stars_mean": "mean",
        "n_reviews": "sum"
    }).reset_index()

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
    ax1.plot(monthly_agg["month"], monthly_agg["sent_hybrid_mean"], marker='o', linewidth=2, markersize=4, color='darkgreen')
    ax1.axhline(0, color='gray', linestyle='--', alpha=0.5)
    ax1.set_title("Average Hybrid Sentiment Over Time", fontsize=14, fontweight='bold')
    ax1.set_ylabel("Mean Sentiment")
    ax1.grid(True, alpha=0.3)

    ax2.bar(monthly_agg["month"], monthly_agg["n_reviews"], alpha=0.7, color='coral')
    ax2.set_title("Review Volume Over Time", fontsize=14, fontweight='bold')
    ax2.set_xlabel("Month")
    ax2.set_ylabel("Number of Reviews")
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, "temporal_trends.png"), dpi=220)
    plt.close()

    # 5. Slope vs. volatility
    plt.figure(figsize=(10, 8))
    plt.scatter(traj["drift_slope"], traj["tv"], alpha=0.4, s=20)
    # Reference lines: zero slope (red) and median total variation (orange).
    plt.axvline(0, color='red', linestyle='--', alpha=0.5)
    plt.axhline(traj["tv"].median(), color='orange', linestyle='--', alpha=0.5)
    plt.xlabel("Drift Slope", fontsize=11)
    plt.ylabel("Total Variation", fontsize=11)
    plt.title("Drift Slope vs. Volatility", fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, "slope_vs_volatility.png"), dpi=220)
    plt.close()


def plot_user_trajectories_with_cp(traj: pd.DataFrame, monthly: pd.DataFrame,
                                   fig_dir: str, n_samples: int = 6, seed: int = 42):
    """Plot sample trajectories with change-points marked.

    Draws up to ``n_samples`` users at random (fewer when not enough users are
    available) and saves one figure per user as ``user_traj_<i>.png`` in
    ``fig_dir``. Each figure shows the user's monthly hybrid sentiment with a
    dotted red vertical line at every change-point index.

    Args:
        traj: Per-user frame with ``user_id``, ``sent_hybrid_seq``,
            ``drift_slope`` and optionally ``changepoints``.
        monthly: Per (user, month) frame with ``user_id``, ``month`` and
            ``sent_hybrid_mean``.
        fig_dir: Existing output directory.
        n_samples: Maximum number of users to plot.
        seed: Random state for the user sample.
    """
    candidates = traj.dropna(subset=["sent_hybrid_seq"])
    # DataFrame.sample raises if asked for more rows than exist, so cap it.
    n_draw = min(max(int(n_samples), 0), len(candidates))
    sample_users = candidates.sample(n_draw, random_state=seed)["user_id"].tolist()

    for i, uid in enumerate(sample_users, start=1):
        g = monthly[monthly["user_id"] == uid].sort_values("month")
        user_traj = traj[traj["user_id"] == uid].iloc[0]
        months = g["month"].values

        plt.figure(figsize=(10, 6))
        plt.plot(np.array(months), g["sent_hybrid_mean"],
                marker='o', linewidth=2, markersize=6, label='Sentiment')
        plt.axhline(0, color='gray', linestyle='--', alpha=0.5)

        # Mark change-points. The cell may be absent, None or NaN (e.g. after a
        # left merge); treat all of those as "no change-points".
        cps = user_traj["changepoints"] if "changepoints" in user_traj else None
        if cps is None or isinstance(cps, float):
            cps = []
        if len(cps) > 0:
            for cp_idx in cps:
                if cp_idx < len(months):
                    plt.axvline(months[cp_idx], color='red', linestyle=':', linewidth=2, alpha=0.7)
            # Empty scatter only adds a legend entry for the change-point lines.
            plt.scatter([], [], color='red', marker='|', s=100, linewidth=2, label='Change-Point')

        plt.title(f"User {uid} — Sentiment Trajectory (Slope: {user_traj['drift_slope']:.3f})",
                 fontsize=12, fontweight='bold')
        plt.xlabel("Month", fontsize=11)
        plt.ylabel("Hybrid Sentiment", fontsize=11)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(fig_dir, f"user_traj_{i}.png"), dpi=220, bbox_inches="tight")
        plt.close()


# ==============================================================
# SECTION 9 — Comparison Framework
# ==============================================================

def compare_baseline_vs_enhanced(baseline_metrics: Dict, enhanced_metrics: Dict) -> pd.DataFrame:
    """Generate comparison table.

    Metrics present in both dicts come first (sorted by name) with
    ``Change = enhanced - baseline`` when both values are numeric and
    ``"N/A"`` otherwise. Metrics found only in ``enhanced_metrics`` follow
    (sorted by name) with ``Baseline = "N/A"`` and ``Change = "NEW"``.

    Returns:
        DataFrame with columns ``Metric, Baseline, Enhanced, Change``; the
        columns are present even when both inputs are empty.
    """
    def _is_number(value) -> bool:
        # NumPy scalars are included because metrics often come from pandas.
        return isinstance(value, (int, float, np.integer, np.floating))

    comparison = []

    for key in sorted(set(baseline_metrics) & set(enhanced_metrics)):
        base, enh = baseline_metrics[key], enhanced_metrics[key]
        # Subtract only when BOTH sides are numeric; otherwise a string on the
        # enhanced side would raise TypeError.
        change = enh - base if _is_number(base) and _is_number(enh) else "N/A"
        comparison.append({
            "Metric": key,
            "Baseline": base,
            "Enhanced": enh,
            "Change": change,
        })

    # Add enhanced-only metrics
    for key in sorted(set(enhanced_metrics) - set(baseline_metrics)):
        comparison.append({
            "Metric": key,
            "Baseline": "N/A",
            "Enhanced": enhanced_metrics[key],
            "Change": "NEW",
        })

    return pd.DataFrame(comparison, columns=["Metric", "Baseline", "Enhanced", "Change"])


# ==============================================================
# SECTION 10 — Main Pipeline
# ==============================================================

if __name__ == "__main__":
    # Configuration
    CONFIG = {
        "dataset_name": "Amazon Electronics 5-core",
        # Set the REVIEWMIRROR_INPUT environment variable to run on another
        # machine; otherwise the original local path is used.
        "input_path": os.environ.get(
            "REVIEWMIRROR_INPUT",
            r"C:\Users\shubh\OneDrive\Desktop\nlp\Electronics_5.json\Electronics_5.json",
        ),
        "out_root": "runs",
        "run_name": "enhanced_v1",
        "N_LIMIT": 200000,
        "MIN_REVIEWS": 5,
        "ALPHA": 0.7,
        "seed": 42,
        "split_strategy": "temporal",
        "split_ratio": [0.7, 0.15, 0.15],
        # Enhancement parameters
        "adwin_delta": 0.002,  # ADWIN significance value for change-point detection
        "n_clusters": 4,       # Number of user clusters
        "clustering_features": ["drift_slope", "tv", "flip_rate"]
    }
    np.random.seed(CONFIG["seed"])

    # Create run directory (timestamped so repeated runs never overwrite each other)
    run_id = f'{CONFIG["run_name"]}_{dt.datetime.now().strftime("%Y%m%d_%H%M%S")}'
    RUN_DIR = os.path.join(CONFIG["out_root"], run_id)
    FIG_DIR = os.path.join(RUN_DIR, "figs")
    DATA_DIR = os.path.join(RUN_DIR, "data")
    os.makedirs(FIG_DIR, exist_ok=True)
    os.makedirs(DATA_DIR, exist_ok=True)
    print(f"[run] Created run directory: {RUN_DIR}")

    # Save config
    with open(os.path.join(RUN_DIR, "config.json"), "w") as f:
        json.dump(CONFIG, f, indent=2)

    # Load & preprocess
    print(f"[load] Reading: {CONFIG['input_path']}")
    df = build_reviews_df(CONFIG["input_path"], n_limit=CONFIG["N_LIMIT"])
    print(f"[rows] {len(df):,} reviews after cleaning")

    df = compute_features(df, alpha=CONFIG["ALPHA"])
    print("[features] Computed hybrid sentiment + style features")

    # Build enhanced trajectories
    print("[enhanced] Building trajectories with change-point detection...")
    df_clean, monthly, traj = build_user_trajectories_enhanced(
        df,
        min_reviews=CONFIG["MIN_REVIEWS"],
        adwin_delta=CONFIG["adwin_delta"]
    )
    print(f"[users] {traj.shape[0]:,} users with ≥ {CONFIG['MIN_REVIEWS']} reviews")

    # Clustering
    print("[clustering] Performing k-means clustering on drift patterns...")
    traj = cluster_users_by_drift(
        traj,
        n_clusters=CONFIG["n_clusters"],
        features=CONFIG["clustering_features"]
    )

    cluster_summary = analyze_clusters(traj, features=CONFIG["clustering_features"] + ["n_changepoints"])
    print("\n[clustering] Cluster summary:\n", cluster_summary)

    # Data splits
    splits = create_data_splits(traj, monthly,
                               split_ratio=CONFIG["split_ratio"],
                               strategy=CONFIG["split_strategy"])
    print(f"[splits] Train: {len(splits['train'])}, Val: {len(splits['val'])}, Test: {len(splits['test'])}")

    # Save splits
    with open(os.path.join(DATA_DIR, "splits.json"), "w") as f:
        json.dump({k: list(v) for k, v in splits.items()}, f, indent=2)

    # Baseline statistics (computed on the train split only)
    baselines = compute_baseline_statistics(traj, splits)
    with open(os.path.join(RUN_DIR, "baselines.json"), "w") as f:
        json.dump(baselines, f, indent=2)

    # Save data
    df_clean.to_parquet(os.path.join(DATA_DIR, "reviews.parquet"))
    monthly.to_parquet(os.path.join(DATA_DIR, "reviews_monthly.parquet"))
    traj.to_parquet(os.path.join(DATA_DIR, "user_trajectories.parquet"))
    print("[save] Saved data artifacts")

    # Compute metrics
    enhanced_metrics = {
        "n_rows": int(len(df_clean)),
        "n_users": int(traj["user_id"].nunique()),
        "n_items": int(df_clean["item_id"].nunique()),
    }

    for col in ["drift_slope", "drift_delta", "tv", "flip_rate", "n_changepoints"]:
        if col in traj.columns:
            enhanced_metrics[f"{col}_mean"] = float(traj[col].mean())
            enhanced_metrics[f"{col}_std"] = float(traj[col].std())
            enhanced_metrics[f"{col}_min"] = float(traj[col].min())
            enhanced_metrics[f"{col}_max"] = float(traj[col].max())

    eval_metrics = evaluate_drift_metrics(traj, splits, baselines)
    enhanced_metrics.update(eval_metrics)

    # Clustering metrics (label -1 marks unclustered users and is not a cluster)
    if "cluster" in traj.columns:
        enhanced_metrics["n_clusters_used"] = int(traj["cluster"].nunique() - (1 if -1 in traj["cluster"].values else 0))
        for cid in range(CONFIG["n_clusters"]):
            cluster_size = int((traj["cluster"] == cid).sum())
            enhanced_metrics[f"cluster_{cid}_size"] = cluster_size

    with open(os.path.join(RUN_DIR, "metrics_enhanced.json"), "w") as f:
        json.dump(enhanced_metrics, f, indent=2)
    print("[metrics] Saved enhanced metrics")

    # Visualizations
    print("[viz] Generating enhanced visualizations...")
    create_enhanced_plots(traj, monthly, FIG_DIR)
    plot_user_trajectories_with_cp(traj, monthly, FIG_DIR, n_samples=6, seed=CONFIG["seed"])

    # Save cluster summary
    cluster_summary.to_csv(os.path.join(RUN_DIR, "cluster_summary.csv"))

    print(f"\n{'='*60}")
    print(f"[DONE] Enhanced run complete!")
    print(f"[DONE] Outputs in: {RUN_DIR}")
    print(f"{'='*60}\n")

    # Print key findings
    print("\n=== KEY FINDINGS ===")
    print(f"Users with change-points: {enhanced_metrics.get('test_users_with_cp', 'N/A')}/{enhanced_metrics['test_size']}")
    # Format the mean only when it exists: applying ':.2f' to the 'N/A'
    # fallback string would raise ValueError.
    cp_mean = enhanced_metrics.get('test_n_changepoints_mean')
    cp_mean_text = f"{cp_mean:.2f}" if cp_mean is not None else "N/A"
    print(f"Average change-points per user: {cp_mean_text}")
    print(f"Clustering produced {enhanced_metrics.get('n_clusters_used', 'N/A')} distinct user groups")

[run] Created run directory: runs\enhanced_v1_20251119_001142
[load] Reading: C:\Users\shubh\OneDrive\Desktop\nlp\Electronics_5.json\Electronics_5.json
[rows] 200,000 reviews after cleaning
[features] Computed hybrid sentiment + style features
[enhanced] Building trajectories with change-point detection...
[users] 2,657 users with ≥ 5 reviews
[clustering] Performing k-means clustering on drift patterns...
[clustering] Silhouette: 0.603, Davies-Bouldin: 0.947

[clustering] Cluster summary:
         drift_slope                        tv                 flip_rate  \
               mean       std count      mean       std count      mean   
cluster                                                                   
-1              NaN       NaN     0       NaN       NaN     0       NaN   
 0        -0.000720  0.020315   301  3.527278  1.663896   301  0.349218   
 1        -0.003085  0.041444  1791  0.666282  0.549278  1791  0.012873   
 2        -0.019121  0.099315   233  1.751493  0.794389