In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

2,4,6

In [1]:


#!/usr/bin/env python3
# summarize_pairs_figs_per_pos_1015.py
#
# One figure per (pair, pos=5), each with 3 bins (K in {15,25,35}).
# Bars inside each bin: Original, n=2, n=4, n=6.
# Matches your file naming:
#   ORIGINAL_{K}recommendation.csv                          (under SVD_pair root)
#   fpair_<A>__<B>_{2|4|6}u_pos5_neg1_all_{K}recommendation.csv (under SVD_pair/5)
#
# Output:
#   /.../1015/result/figures/5/
#       - <pair_slug>__pos5.png
#       - summary_pos5.csv

from pathlib import Path
from typing import Iterable, Tuple, List, Set, Optional, Dict
import re
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # headless-safe
import matplotlib.pyplot as plt

# ======== PATHS (1015) ========
# Root folder that holds the recommendation CSVs read by this script.
BASE = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair")
ORIG_DIR = BASE                      # ORIGINAL_{K}recommendation.csv lives here
POS_DIRS = [BASE / "5"]              # poisoned branch /5 only (pos=5, neg=1)

# Figures and summary_pos5.csv are written below OUT_5.
OUT_ROOT = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/result/figures")
OUT_5 = OUT_ROOT / "5"

# Top-K cut-offs; each K becomes one bin in every figure.
K_LIST = [15, 25, 35]
# n values accepted from the "{n}u" part of the poisoned file names.
N_LIST = [2, 4, 6]  # << changed to 2,4,6

# ======== HELPERS ========
def slugify_pair(a: str, b: str) -> str:
    import re as _re
    def sg(x): return _re.sub(r"[^A-Za-z0-9]+", "_", x).strip("_").lower()
    a2, b2 = sorted([a, b], key=lambda x: x.lower())
    return f"{sg(a2)}__{sg(b2)}"

def normalize_tag(t: str) -> str:
    """Return the display form of a genre tag.

    The value is converted to ``str``, stripped, and underscores are turned
    into spaces. The exact text ``"Children s"`` is rewritten to
    ``"Children's"``. A tag whose lower-cased text is one of the known genres
    is returned in its canonical capitalisation; any other tag is returned as
    cleaned, with its case untouched.
    """
    canonical_by_lower = {
        "science fiction": "Science Fiction",
        "historical": "Historical",
        "nonfiction": "Nonfiction",
        "thriller": "Thriller",
        "drama": "Drama",
        "fantasy": "Fantasy",
        "mystery": "Mystery",
        "romance": "Romance",
        "horror": "Horror",
        "classics": "Classics",
        "adventure": "Adventure",
        "adult": "Adult",
    }
    tag = str(t).strip().replace("_", " ")
    if tag == "Children s":
        tag = "Children's"
    return canonical_by_lower.get(tag.lower(), tag)

def book_has_both(gen_all: str, A: str, B: str) -> bool:
    """Tell whether a comma-separated genre string contains both A and B.

    Each comma-separated piece is stripped and passed through
    ``normalize_tag`` before being compared with ``A`` and ``B``.
    Missing (NaN/None) or blank input yields ``False``.
    """
    if pd.isna(gen_all):
        return False
    text = str(gen_all)
    if not text.strip():
        return False
    tags = set()
    for piece in text.split(","):
        piece = piece.strip()
        if piece:
            tags.add(normalize_tag(piece))
    return A in tags and B in tags

def per_user_avg_pair_count(rec_df: pd.DataFrame, A: str, B: str) -> Tuple[float, int]:
    """Average, over all users in ``rec_df``, of the number of rows tagged with both A and B.

    ``rec_df`` must have the columns ``user_id``, ``book_id`` and
    ``genres_all``; otherwise ``ValueError`` is raised. Users that have no
    matching row count as 0 in the average.

    Returns ``(average_count_per_user, number_of_distinct_user_ids)``.
    """
    missing = {"user_id", "book_id", "genres_all"} - set(rec_df.columns)
    if missing:
        raise ValueError(f"CSV missing columns: {missing}")

    users = rec_df["user_id"].drop_duplicates().sort_values()
    users_count = int(users.shape[0])

    is_pair_row = rec_df["genres_all"].apply(lambda s: book_has_both(s, A, B))
    pair_rows = rec_df[is_pair_row].copy()
    if pair_rows.empty:
        return (0.0, users_count)

    # Count matching rows per user, then left-join onto the full user list so
    # that users without any matching row contribute a 0 to the mean.
    hits = pair_rows.groupby("user_id", as_index=False)["book_id"].count()
    hits = hits.rename(columns={"book_id": "count"})
    per_user = pd.DataFrame({"user_id": users}).merge(hits, on="user_id", how="left")
    per_user = per_user.fillna({"count": 0})
    return (float(per_user["count"].mean()), users_count)

def injected_files_for_pair_k_n(pos_dir: Path, A: str, B: str, k: int, n: int) -> List[Path]:
    """Return the poisoned recommendation files in ``pos_dir`` for one (pair, k, n).

    A file matches when its name is exactly
    ``fpair_<A>__<B>_{n}u_pos5_neg1_all_{k}recommendation.csv`` with the two
    tags in either order. Tags are converted to file-name tokens by turning
    spaces and apostrophes into underscores, collapsing underscore runs and
    trimming leading/trailing underscores.

    The tokens are regex-escaped before being placed in the pattern, so a tag
    containing a regex metacharacter (e.g. ``"C++"``) is matched literally
    instead of raising ``re.error`` or matching unrelated names.

    Returns the matching paths in sorted order (empty list if none).
    """
    def to_token(tag: str) -> str:
        return re.sub(r"_+", "_", tag.replace(" ", "_").replace("'", "_")).strip("_")

    a_tok = re.escape(to_token(A))
    b_tok = re.escape(to_token(B))
    # One anchored pattern covering both tag orders.
    pattern = re.compile(
        rf"^fpair_(?:{a_tok}__{b_tok}|{b_tok}__{a_tok})"
        rf"_{n}u_pos5_neg1_all_{k}recommendation\.csv$"
    )
    # The glob is only a coarse pre-filter; the regex decides.
    candidates = pos_dir.glob(f"*neg1_all_{k}recommendation.csv")
    return sorted(p for p in candidates if pattern.match(p.name))

def discover_pairs_from_dirs(pos_dirs: Iterable[Path],
                             k_list: Iterable[int],
                             n_list: Iterable[int]) -> List[Tuple[str, str]]:
    """List the unordered (A, B) tag pairs that have poisoned files in ``pos_dirs``.

    Only file names of the form
    ``fpair_<A>__<B>_<N>u_pos5_neg1_all_<K>recommendation.csv`` whose K is in
    ``k_list`` and whose N is in ``n_list`` are considered. Tags are passed
    through ``normalize_tag``; each pair is ordered case-insensitively and the
    result is sorted case-insensitively, without duplicates.
    """
    name_re = re.compile(
        r"^fpair_(?P<A>[A-Za-z0-9_']+)__(?P<B>[A-Za-z0-9_']+)_(?P<N>\d+)u_pos5_neg1_all_(?P<K>\d+)recommendation\.csv$"
    )
    allowed_k = {int(k) for k in k_list}
    allowed_n = {int(n) for n in n_list}

    found: Set[Tuple[str, str]] = set()
    for folder in pos_dirs:
        for path in folder.glob("fpair_*u_pos5_neg1_all_*recommendation.csv"):
            hit = name_re.match(path.name)
            if hit is None:
                continue
            if int(hit.group("K")) not in allowed_k or int(hit.group("N")) not in allowed_n:
                continue
            tags = [normalize_tag(hit.group(part).replace("_", " ")) for part in ("A", "B")]
            first, second = sorted(tags, key=lambda tag: tag.lower())
            found.add((first, second))
    return sorted(found, key=lambda pair: (pair[0].lower(), pair[1].lower()))

def original_file_for_k(orig_dir: Path, k: int) -> Optional[Path]:
    """Return ``orig_dir / "ORIGINAL_{k}recommendation.csv"`` if it exists, else ``None``."""
    candidate = orig_dir / f"ORIGINAL_{k}recommendation.csv"
    if candidate.exists():
        return candidate
    return None

def load_rec_csv(fp: Path) -> pd.DataFrame:
    """Read a recommendation CSV and make sure it has a ``genres_all`` column.

    If the column is absent it is built from ``genre_g1`` and ``genre_g2``
    (non-blank values joined with ``", "``) when both exist; otherwise it is
    filled with empty strings.
    """
    frame = pd.read_csv(fp, low_memory=False)
    if "genres_all" in frame.columns:
        return frame

    if "genre_g1" in frame.columns and "genre_g2" in frame.columns:
        def join_genres(row):
            kept = [g for g in [row["genre_g1"], row["genre_g2"]] if str(g).strip()]
            return ", ".join(kept)

        genre_cols = frame[["genre_g1", "genre_g2"]].fillna("")
        frame["genres_all"] = genre_cols.agg(join_genres, axis=1)
    else:
        frame["genres_all"] = ""
    return frame

def plot_pair_pos_three_bins(A: str, B: str, pos_label: str,
                             data_by_k: Dict[int, Dict[str, float]],
                             out_png: Path):
    """Save a grouped bar chart for one pair: one bin per K, four bars per bin.

    data_by_k: {K: {"Original": v0, "2": v2, "4": v4, "6": v6}}; a missing
    entry is drawn as 0.0. ``pos_label`` is only used in the title. The parent
    folder of ``out_png`` is created if needed.
    """
    top_ks = sorted(data_by_k.keys())
    series = ["Original", "2", "4", "6"]
    bar_width = 0.2
    centers = list(range(len(top_ks)))

    plt.figure(figsize=(8.4, 4.4), dpi=160)
    for idx, name in enumerate(series):
        # Four bars of width 0.2 are centred on each bin: shifts are -1.5w .. +1.5w.
        shift = (idx - 1.5)*bar_width
        heights = [data_by_k.get(k, {}).get(name, 0.0) for k in top_ks]
        if name != "Original":
            label = "n="+name
        else:
            label = "Original"
        plt.bar([c + shift for c in centers], heights, width=bar_width, label=label)

    plt.xticks(centers, [f"Top-{k}" for k in top_ks])
    plt.ylabel("Avg # of pair-books in Top-K per user")
    plt.title(f"{A} + {B} — POS={pos_label}")
    plt.legend(ncol=4, fontsize=9)
    plt.tight_layout()
    out_png.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_png)
    plt.close()

# ======== MAIN ========
def main():
    """Build summary_pos5.csv and one bar chart per discovered pair.

    Steps:
      1. Discover the (A, B) pairs from the file names in POS_DIRS.
      2. Load ORIGINAL_{K}recommendation.csv once per K.
      3. For every pair compute the per-user average of pair-books for the
         ORIGINAL file and for every poisoned file (per K and n).
      4. Write all rows to OUT_5 / "summary_pos5.csv".
      5. Write one figure per pair to OUT_5 / "<pair_slug>__pos5.png".

    Returns early (after printing a warning) when no pairs or no rows exist.
    """
    OUT_5.mkdir(parents=True, exist_ok=True)

    # Discover all pairs from poisoned dirs
    PAIRS = discover_pairs_from_dirs(POS_DIRS, K_LIST, N_LIST)
    if not PAIRS:
        print("[WARN] No pairs found in /5.")
        return
    print(f"[OK] Found {len(PAIRS)} pairs across poisoned dir /5.")

    # Preload ORIGINAL per K (same files for all pairs; per-pair counts differ)
    original_df_by_k: Dict[int, Optional[pd.DataFrame]] = {}
    for k in K_LIST:
        fp = original_file_for_k(ORIG_DIR, k)
        if fp is None:
            print(f"[WARN] Missing ORIGINAL_{k}recommendation.csv in {ORIG_DIR}")
            original_df_by_k[k] = None
        else:
            original_df_by_k[k] = load_rec_csv(fp)

    # Collect tall rows for pos=5 + Original
    # One dict per (pair, pos, K, n); the "source" key is only set for poisoned rows.
    tall_rows = []  # columns: A,B,pair,pos,K,n,avg_count,users,source?

    # ORIGINAL tall rows (n="ORIGINAL")
    for (A, B) in PAIRS:
        for k in K_LIST:
            dfO = original_df_by_k.get(k)
            if dfO is None:
                # No baseline file for this K: the pair simply gets no ORIGINAL row.
                continue
            avgc, users_cnt = per_user_avg_pair_count(dfO, A, B)
            tall_rows.append({
                "A": A, "B": B, "pair": slugify_pair(A,B),
                "pos": "ORIGINAL", "K": k, "n": "ORIGINAL",
                "avg_count": avgc, "users": users_cnt
            })

    # Poisoned tall rows (per N separately; no averaging across files)
    for pos_dir in POS_DIRS:
        pos_label = pos_dir.name  # "5"
        for (A, B) in PAIRS:
            for k in K_LIST:
                for n in N_LIST:
                    files = injected_files_for_pair_k_n(pos_dir, A, B, k, n)
                    if not files:
                        continue
                    for f in files:
                        # Best effort: a file that cannot be read or processed is
                        # reported and skipped so the remaining files still run.
                        try:
                            df = load_rec_csv(f)
                            avgc, users_cnt = per_user_avg_pair_count(df, A, B)
                            tall_rows.append({
                                "A": A, "B": B, "pair": slugify_pair(A,B),
                                "pos": pos_label, "K": k, "n": str(n),
                                "avg_count": avgc, "users": users_cnt,
                                "source": f.name
                            })
                        except Exception as e:
                            print(f"[ERROR] Reading {f}: {e}")

    if not tall_rows:
        print("[WARN] No rows computed. Exiting.")
        return

    dft = pd.DataFrame(tall_rows)
    # "n" mixes "ORIGINAL" with numeric labels, so it is kept (and sorted) as text.
    dft["n"] = dft["n"].astype(str)
    dft.sort_values(by=["pair","pos","K","n"], inplace=True)

    # --- Save CSV (pos=5 + Original) ---
    dft_pos5 = dft[dft["pos"].isin(["ORIGINAL","5"])].copy()
    dft_pos5.to_csv(OUT_5 / "summary_pos5.csv", index=False)
    print(f"[OK] Saved CSV: {OUT_5/'summary_pos5.csv'}")

    # --- Make figures: one per (pair, pos=5) ---
    for (A, B) in PAIRS:
        pair_slug = slugify_pair(A, B)

        sub5 = dft_pos5[dft_pos5["pair"] == pair_slug]
        if sub5.empty:
            continue

        data_by_k_5: Dict[int, Dict[str, float]] = {}
        for k in K_LIST:
            # Every bar defaults to 0.0 and is overwritten when a row exists.
            data_by_k_5[k] = {"Original": 0.0, "2": 0.0, "4": 0.0, "6": 0.0}
            sO = sub5[(sub5["K"] == k) & (sub5["pos"] == "ORIGINAL") & (sub5["n"] == "ORIGINAL")]
            if not sO.empty:
                data_by_k_5[k]["Original"] = float(sO.iloc[0]["avg_count"])
            for n_str in ["2","4","6"]:
                sN = sub5[(sub5["K"] == k) & (sub5["pos"] == "5") & (sub5["n"] == n_str)]
                if not sN.empty:
                    # If several rows match, only the first one is plotted.
                    data_by_k_5[k][n_str] = float(sN.iloc[0]["avg_count"])

        out_png_5 = OUT_5 / f"{pair_slug}__pos5.png"
        plot_pair_pos_three_bins(A, B, "5", data_by_k_5, out_png_5)

    print(f"[OK] Figures written to:\n  {OUT_5}")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()



[OK] Found 72 pairs across poisoned dir /5.


KeyboardInterrupt: 

In [None]:
#!/usr/bin/env python3
# make_all_pairs_explanations_and_figs_1015.py
#
# Merged workflow for PAIRS (pos=5 branch):
#  • For each discovered (A,B) pair:
#      - reads ORIGINAL_{K}recommendation.csv (BASE)
#      - finds fpair_<A>__<B>_{N}u_pos5_neg1_all_{K}recommendation.csv (under BASE/5)
#      - computes per-dataset metrics + per-book rankings (books containing BOTH A and B)
#      - saves in per-pair folder:
#           <OUT_ROOT>/5/<pair_slug>_explanation/explanation.txt
#           <OUT_ROOT>/5/<pair_slug>_explanation/per_book_ranking.csv
#           <OUT_ROOT>/5/<pair_slug>_explanation/<pair_slug>__pos5.png
#
#  • Global rollups:
#      <OUT_ROOT>/5/_all_pairs/summary_master.txt
#      <OUT_ROOT>/5/_all_pairs/per_book_ranking_all.csv
#
# Python 3.8+

from pathlib import Path
from typing import Iterable, Tuple, List, Set, Optional, Dict
import re
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # headless-safe
import matplotlib.pyplot as plt

# ========= PATHS / CONFIG (edit if needed) ===================================
# Root folder that holds the recommendation CSVs read by this script.
BASE      = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair")
ORIG_DIR  = BASE                    # ORIGINAL_{K}recommendation.csv lives here
POS_DIRS  = [BASE / "5"]            # pos=5 / neg=1 branch only
# Per-pair folders and the "_all_pairs" rollups are written below OUT_5.
OUT_ROOT  = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/result/figures")
OUT_5     = OUT_ROOT / "5"

# Top-K cut-offs; each K becomes one bin in every figure.
K_LIST = [15, 25, 35]
N_LIST = [2, 4, 6]   # synthetic user counts

# ========= Normalization helpers ============================================
def normalize_tag(t: str) -> str:
    """Clean a genre tag and give known genres their canonical capitalisation.

    The value is stringified, stripped, and underscores become spaces. The
    exact text ``"Children s"`` (a file-name artifact) becomes
    ``"Children's"``. Known genres are matched case-insensitively; unknown
    tags are returned as cleaned, case unchanged.
    """
    known = (
        "Science Fiction", "Historical", "Nonfiction", "Thriller", "Drama",
        "Fantasy", "Mystery", "Romance", "Horror", "Classics", "Adventure",
        "Adult",
    )
    cleaned = str(t).strip().replace("_", " ")
    if cleaned == "Children s":  # filename artifact
        cleaned = "Children's"
    lowered = cleaned.lower()
    for name in known:
        if lowered == name.lower():
            return name
    return cleaned

def slugify_token(x: str) -> str:
    """Return a lowercase slug of ``x`` using only letters, digits and single underscores."""
    # Every run of non-alphanumeric characters (underscores included) becomes
    # exactly one "_", so no consecutive underscores can remain afterwards.
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", str(x))
    return collapsed.strip("_").lower()

def slugify_pair(a: str, b: str) -> str:
    """Return ``<slug>__<slug>`` for two tags, ordered case-insensitively."""
    ordered = sorted([a, b], key=lambda tag: tag.lower())
    return "__".join(slugify_token(tag) for tag in ordered)

# ========= CSV loaders & basic checks =======================================
def load_rec_csv(fp: Path) -> pd.DataFrame:
    """Read a recommendation CSV and guarantee the columns used downstream.

    * ``genres_all`` is built from ``genre_g1``/``genre_g2`` (non-blank values
      joined with ``", "``) when missing and both exist, else set to ``""``.
    * ``user_id``, ``book_id``, ``rank``, ``est_score`` and
      ``original_title`` are added as all-``pd.NA`` columns when missing.
    """
    frame = pd.read_csv(fp, low_memory=False)

    if "genres_all" not in frame.columns:
        have_parts = "genre_g1" in frame.columns and "genre_g2" in frame.columns
        if have_parts:
            def join_genres(row):
                kept = [g for g in [row["genre_g1"], row["genre_g2"]] if str(g).strip()]
                return ", ".join(kept)

            parts = frame[["genre_g1", "genre_g2"]].fillna("")
            frame["genres_all"] = parts.agg(join_genres, axis=1)
        else:
            frame["genres_all"] = ""

    # Robust to minimal files: add any column the later steps index by name.
    required = ["user_id", "book_id", "rank", "est_score", "original_title"]
    for name in required:
        if name not in frame.columns:
            frame[name] = pd.NA
    return frame

def original_file_for_k(orig_dir: Path, k: int) -> Optional[Path]:
    """Locate the baseline file ``ORIGINAL_{k}recommendation.csv`` in ``orig_dir``.

    Returns its path, or ``None`` when no such path exists.
    """
    candidate = orig_dir / f"ORIGINAL_{k}recommendation.csv"
    if not candidate.exists():
        return None
    return candidate

# ========= Pair logic ========================================================
def book_has_both(gen_all: str, A: str, B: str) -> bool:
    """Return True when the comma-separated genre string lists both A and B.

    Pieces are stripped and normalised with ``normalize_tag`` before the
    comparison. NaN/None and blank strings give ``False``.
    """
    if pd.isna(gen_all):
        return False
    raw = str(gen_all)
    if not raw.strip():
        return False
    stripped = (piece.strip() for piece in raw.split(","))
    tags = {normalize_tag(piece) for piece in stripped if piece}
    return A in tags and B in tags

def metrics_for_file_pair(df: pd.DataFrame, A: str, B: str):
    """Compute the A&B summary numbers for one recommendation table.

    Returns a 6-tuple:
      unique_books_in_file - distinct ``book_id`` values (0 without that column),
      avg_per_user         - mean number of A&B rows per ``user_id``
                             (0.0 without that column or without users),
      unique_ab_books      - distinct ``book_id`` values among A&B rows,
      freq                 - total number of A&B rows,
      users_with_ab        - users having at least one A&B row,
      is_ab                - boolean Series marking the A&B rows of ``df``.
    """
    has_book_id = "book_id" in df.columns
    is_ab = df["genres_all"].apply(lambda s: book_has_both(s, A, B))

    if has_book_id:
        unique_books_in_file = df["book_id"].nunique()
        unique_ab_books = df.loc[is_ab, "book_id"].nunique()
    else:
        unique_books_in_file = 0
        unique_ab_books = 0

    avg_per_user = 0.0
    users_with_ab = 0
    if "user_id" in df.columns:
        hits_by_user = df.assign(is_ab=is_ab).groupby("user_id")["is_ab"].sum()
        if not hits_by_user.empty:
            avg_per_user = float(hits_by_user.mean())
        users_with_ab = int((hits_by_user > 0).sum())

    freq = int(is_ab.sum())
    return unique_books_in_file, avg_per_user, unique_ab_books, freq, users_with_ab, is_ab

def per_book_ranking(df_ab_rows: pd.DataFrame) -> pd.DataFrame:
    """Aggregate A&B recommendation rows into one ranked row per book.

    For every ``book_id`` the table holds the row count (``freq``), distinct
    users, mean rank, mean estimated score, how often the book was ranked 1,
    and the first non-null title / genre string. Books are ordered by
    freq desc, rank1_count desc, avg_rank asc and numbered from 1 in the
    ``rank`` column. Empty input gives an empty table with the same columns.
    """
    if df_ab_rows.empty:
        header = [
            "book_id","rank","freq","users_n","avg_rank","avg_est_score","rank1_count","original_title","genres_all"
        ]
        return pd.DataFrame(columns=header)

    table = df_ab_rows.groupby("book_id", as_index=False).agg(
        freq=("book_id", "size"),
        users_n=("user_id", "nunique"),
        avg_rank=("rank", "mean"),
        avg_est_score=("est_score", "mean"),
        rank1_count=("rank", lambda s: int((s == 1).sum())),
    )

    # Attach one representative title / genre string per book.
    for col in ("original_title", "genres_all"):
        first_seen = (
            df_ab_rows.groupby("book_id")[col]
            .apply(lambda s: s.dropna().iloc[0] if s.notna().any() else pd.NA)
            .reset_index(name=col)
        )
        table = table.merge(first_seen, on="book_id", how="left")

    table = table.sort_values(
        ["freq","rank1_count","avg_rank"], ascending=[False, False, True]
    ).reset_index(drop=True)
    table.insert(1, "rank", table.index + 1)
    return table

def injected_files_for_pair_k_n(pos_dir: Path, A: str, B: str, k: int, n: int) -> List[Path]:
    """Find the injected recommendation files in ``pos_dir`` for one (pair, k, n).

    Matches names of the exact form
    ``fpair_<A>__<B>_{n}u_pos5_neg1_all_{k}recommendation.csv`` with A and B in
    either order. A tag becomes its file-name token by replacing spaces and
    apostrophes with underscores, collapsing underscore runs and trimming
    underscores at both ends.

    Tokens are passed through ``re.escape`` so that tags containing regex
    metacharacters are matched literally rather than breaking the pattern.

    Returns the matching paths, sorted; an empty list when nothing matches.
    """
    tokens = []
    for tag in (A, B):
        token = re.sub(r"_+", "_", tag.replace(" ", "_").replace("'", "_")).strip("_")
        tokens.append(re.escape(token))
    a_tok, b_tok = tokens

    tail = rf"_{n}u_pos5_neg1_all_{k}recommendation\.csv$"
    forward = re.compile(rf"^fpair_{a_tok}__{b_tok}" + tail)
    backward = re.compile(rf"^fpair_{b_tok}__{a_tok}" + tail)

    # The glob only narrows the candidates; the anchored regexes decide.
    matches = [
        p for p in pos_dir.glob(f"*neg1_all_{k}recommendation.csv")
        if forward.match(p.name) or backward.match(p.name)
    ]
    return sorted(matches)

def discover_pairs_from_dirs(pos_dirs: Iterable[Path],
                             k_list: Iterable[int],
                             n_list: Iterable[int]) -> List[Tuple[str, str]]:
    """Collect the unordered (A, B) tag pairs present as files in ``pos_dirs``.

    A file contributes a pair when its name has the form
    ``fpair_<A>__<B>_<N>u_pos5_neg1_all_<K>recommendation.csv`` and its K and N
    are listed in ``k_list`` / ``n_list``. Tags go through ``normalize_tag``;
    each pair and the returned list are ordered case-insensitively.
    """
    file_re = re.compile(
        r"^fpair_(?P<A>[A-Za-z0-9_']+)__(?P<B>[A-Za-z0-9_']+)_(?P<N>\d+)u_pos5_neg1_all_(?P<K>\d+)recommendation\.csv$"
    )
    wanted_k = set(map(int, k_list))
    wanted_n = set(map(int, n_list))

    def lower(tag):
        return tag.lower()

    pairs: Set[Tuple[str, str]] = set()
    for directory in pos_dirs:
        for path in directory.glob("fpair_*u_pos5_neg1_all_*recommendation.csv"):
            parsed = file_re.match(path.name)
            if not parsed:
                continue
            fields = parsed.groupdict()
            if int(fields["K"]) in wanted_k and int(fields["N"]) in wanted_n:
                tag_a = normalize_tag(fields["A"].replace("_", " "))
                tag_b = normalize_tag(fields["B"].replace("_", " "))
                pairs.add(tuple(sorted([tag_a, tag_b], key=lower)))
    return sorted(pairs, key=lambda ab: (ab[0].lower(), ab[1].lower()))

# ========= Plotting ==========================================================
def plot_pair_pos_three_bins(A: str, B: str,
                             data_by_k: Dict[int, Dict[str, float]],
                             out_png: Path):
    """Save a grouped bar chart for one pair: one bin per K, one bar per series.

    data_by_k: {K: {"Original": v0, "2": v2, "4": v4, "6": v6}}

    The candidate series are "Original" plus one per entry of ``N_LIST``; only
    series that appear for at least one K are drawn, and a value missing for a
    particular K is drawn as 0.0. Nothing is written when ``data_by_k`` is
    empty. The parent folder of ``out_png`` is created if needed.
    """
    ks = sorted(data_by_k.keys())
    if not ks:
        return
    groups = ["Original"] + [str(x) for x in N_LIST]
    present_groups = [g for g in groups if any(g in data_by_k.get(k, {}) for k in ks)]
    n_groups = len(present_groups)

    # All bars of a bin share 0.8 of the bin width; max() avoids dividing by 0.
    width = 0.8 / max(1, n_groups)
    x = list(range(len(ks)))
    plt.figure(figsize=(8.4, 4.4), dpi=160)
    try:
        for j, g in enumerate(present_groups):
            # Shift each series so the whole group stays centred on the bin.
            shift = (j - (n_groups - 1) / 2) * width
            vals = [float(data_by_k.get(k, {}).get(g, 0.0)) for k in ks]
            plt.bar([i + shift for i in x], vals, width=width,
                    label=("n=" + g if g != "Original" else "Original"))

        plt.xticks(x, [f"Top-{k}" for k in ks])
        plt.ylabel("Avg # of A&B books in Top-K per user")
        plt.title(f"{A} + {B} — POS=5")
        # With no series there is nothing to label, and ncol would be 0.
        if present_groups:
            plt.legend(ncol=min(4, n_groups), fontsize=9)
        plt.tight_layout()
        out_png.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(out_png)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close()

# ========= Per-pair processing ===============================================
def _ranking_rows_for_dataset(ranked: pd.DataFrame, pair_slug: str,
                              A: str, B: str, dataset: str) -> List[Dict]:
    """Turn one per-book ranking table into plain dict rows tagged with pair and dataset."""
    rows: List[Dict] = []
    for _, r in ranked.iterrows():
        rows.append({
            "pair": pair_slug,
            "A": A, "B": B,
            "dataset": dataset,
            "book_id": int(r["book_id"]),
            "rank": int(r["rank"]),
            "freq": int(r["freq"]),
            "users_n": int(r["users_n"]),
            # Means are NaN when the source column had no usable values.
            "avg_rank": float(r["avg_rank"]) if pd.notna(r["avg_rank"]) else None,
            "avg_est_score": float(r["avg_est_score"]) if pd.notna(r["avg_est_score"]) else None,
            "rank1_count": int(r["rank1_count"]),
            "original_title": r.get("original_title", pd.NA),
            "genres_all": r.get("genres_all", pd.NA),
        })
    return rows


def process_one_pair(A: str, B: str,
                     original_df_by_k: Dict[int, Optional[pd.DataFrame]]) -> Dict:
    """Compute and save everything for one (A, B) pair.

    For each K in ``K_LIST`` the ORIGINAL table (if loaded) and every injected
    file found for each N in ``N_LIST`` are reduced to the books tagged with
    both A and B. Files written into ``OUT_5/<pair_slug>_explanation``:
      - explanation.txt       unique A&B book counts per dataset
      - per_book_ranking.csv  per-book ranking rows of all datasets
      - <pair_slug>__pos5.png bar chart of the per-user averages

    Args:
        A, B: the two tags of the pair.
        original_df_by_k: ORIGINAL table per K; ``None`` marks a missing file.

    Returns:
        Dict with keys ``pair``, ``A``, ``B``, ``original_uniques`` ({K: n}),
        ``injection_uniques`` ({(N, K): n}), ``out_dir`` and ``per_book_rows``.
    """
    pair_slug = slugify_pair(A, B)
    OUT_DIR = OUT_5 / f"{pair_slug}_explanation"
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    original_uniques: Dict[int, int] = {}
    injection_uniques: Dict[Tuple[int,int], int] = {}
    per_book_rows: List[Dict] = []
    data_by_k: Dict[int, Dict[str, float]] = {k: {} for k in K_LIST}

    # ORIGINAL
    for K in K_LIST:
        dfO = original_df_by_k.get(K)
        if dfO is None:
            print(f"[warn][{pair_slug}] Missing ORIGINAL for K={K}")
            continue
        _, avg_user, uniq_ab, _, _, is_ab = metrics_for_file_pair(dfO, A, B)
        original_uniques[K] = uniq_ab
        data_by_k[K]["Original"] = avg_user

        ranked = per_book_ranking(dfO.loc[is_ab].copy())
        per_book_rows.extend(
            _ranking_rows_for_dataset(ranked, pair_slug, A, B, f"original{K}")
        )

    # INJECTIONS (pos=5 only)
    for K in K_LIST:
        for N in N_LIST:
            found_any = False
            for pos_dir in POS_DIRS:
                files = injected_files_for_pair_k_n(pos_dir, A, B, K, N)
                for f in files:
                    df = load_rec_csv(f)
                    _, avg_user, uniq_ab, _, _, is_ab = metrics_for_file_pair(df, A, B)
                    # If several files match the same (N, K), the last one wins.
                    injection_uniques[(N, K)] = uniq_ab
                    data_by_k[K][str(N)] = avg_user
                    ranked = per_book_ranking(df.loc[is_ab].copy())
                    per_book_rows.extend(
                        _ranking_rows_for_dataset(ranked, pair_slug, A, B, f"{N}u_{K}")
                    )
                    found_any = True
            if not found_any:
                print(f"[warn][{pair_slug}] Missing injection for N={N}, K={K}")

    # Write per-pair text summary
    text_path = OUT_DIR / "explanation.txt"
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(f"{slugify_token(A)}__{slugify_token(B)}:\n")
        for K in K_LIST:
            if K in original_uniques:
                f.write(f"original {K}: number_of_unique_books: {original_uniques[K]}\n")
        for K in K_LIST:
            for N in N_LIST:
                val = injection_uniques.get((N, K))
                if val is not None:
                    f.write(f"{N}u, {K}, number_of_unique_books: {val}\n")

    # Write per-pair ranking table
    table_df = pd.DataFrame(per_book_rows)
    table_path = OUT_DIR / "per_book_ranking.csv"
    table_df.to_csv(table_path, index=False)

    # Make & save per-pair figure
    fig_path = OUT_DIR / f"{pair_slug}__pos5.png"
    plot_pair_pos_three_bins(A, B, data_by_k, fig_path)

    print(f"[OK][{pair_slug}] Saved text:   {text_path}")
    print(f"[OK][{pair_slug}] Saved table:  {table_path}")
    print(f"[OK][{pair_slug}] Saved figure: {fig_path}")

    return {
        "pair": pair_slug, "A": A, "B": B,
        "original_uniques": original_uniques,
        "injection_uniques": injection_uniques,
        "out_dir": OUT_DIR,
        "per_book_rows": per_book_rows,
    }

# ========= MAIN ==============================================================
def main():
    """Process every discovered pair and write the global rollups.

    Per-pair outputs are produced by ``process_one_pair``. Afterwards
    ``summary_master.txt`` (unique-book counts of all pairs) and, if any rows
    exist, ``per_book_ranking_all.csv`` are written to ``OUT_5/_all_pairs``.
    Returns early with a warning when no pairs are found.
    """
    OUT_5.mkdir(parents=True, exist_ok=True)

    # Discover all pairs present under /5
    pairs = discover_pairs_from_dirs(POS_DIRS, K_LIST, N_LIST)
    if not pairs:
        print("[WARN] No pairs found in /5.")
        return
    print(f"[OK] Found {len(pairs)} pairs.")

    # Load each ORIGINAL table once; None marks a missing file.
    originals: Dict[int, Optional[pd.DataFrame]] = {}
    for k in K_LIST:
        path = original_file_for_k(ORIG_DIR, k)
        if path is not None:
            originals[k] = load_rec_csv(path)
        else:
            print(f"[WARN] Missing ORIGINAL_{k}recommendation.csv in {ORIG_DIR}")
            originals[k] = None

    # Process each pair, accumulating the master summary and all ranking rows.
    all_rows: List[Dict] = []
    master_lines: List[str] = []
    for A, B in pairs:
        res = process_one_pair(A, B, originals)
        orig_counts = res["original_uniques"]
        inj_counts = res["injection_uniques"]

        master_lines.append(f"{res['pair']}:")
        master_lines.extend(
            f"original {K}: number_of_unique_books: {orig_counts[K]}"
            for K in K_LIST if K in orig_counts
        )
        for K in K_LIST:
            for N in N_LIST:
                count = inj_counts.get((N, K))
                if count is None:
                    continue
                master_lines.append(f"{N}u, {K}, number_of_unique_books: {count}")
        master_lines.append("")  # blank line between pairs
        all_rows.extend(res["per_book_rows"])

    # Global rollups
    out_all = OUT_5 / "_all_pairs"
    out_all.mkdir(parents=True, exist_ok=True)

    master_txt = out_all / "summary_master.txt"
    with open(master_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(master_lines))
    print(f"[OK] Wrote master summary: {master_txt}")

    if not all_rows:
        print("[warn] No per-book rows produced. Check inputs / paths.")
    else:
        pd.DataFrame(all_rows).to_csv(out_all / "per_book_ranking_all.csv", index=False)
        print(f"[OK] Wrote global per-book ranking CSV: {out_all / 'per_book_ranking_all.csv'}")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


[OK] Found 72 pairs.
