In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

## analysis (also saved as .py)

In [4]:
#!/usr/bin/env python3
# count_pairs_pos5and7_with_original.py
#
# Scans SVD recommendation outputs for:
#   - ORIGINAL_{K}recommendation.csv (unpoisoned baseline)
#   - fpair_<A>__<B>_<n>u_pos{5|7}_neg0_all_{K}recommendation.csv (poisoned)
#
# Computes per-user average count of books recommended that contain BOTH genres
# for each unordered pair and K in {15,25,35}. Aggregates across n ∈ {25,50,100,200}.
# Saves inventories, detailed per-file stats, aggregated summaries,
# and grouped bar charts (Original vs POS=5 vs POS=7) per pair per K.

from pathlib import Path
from typing import Iterable, Tuple, List, Set, Optional, Dict
import re
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # headless
import matplotlib.pyplot as plt

# ======== CONFIG (adjust paths if needed) ========
# Root directory of the SVD pair-experiment outputs; every other path below is
# derived from it, so this is the only line to edit when the data moves.
BASE = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair")

ORIG_DIR = BASE                      # ORIGINAL_{K}recommendation.csv lives directly here
POS_DIRS = [BASE / "5", BASE / "7"]  # poisoned branches; main() uses the folder name ("5"/"7") as the branch label

# Output locations (created by main() if missing).
OUT_ROOT = BASE / "result" / "pair_summary" / "with_original"  # detailed + summary CSVs
INV_DIR  = OUT_ROOT / "_inventory"                              # list of discovered genre pairs
FIG_DIR  = OUT_ROOT / "figures"                                 # one PNG per (pair, K)

# Top-K list sizes and the <n> values (the "<n>u" part of the poisoned file
# names; presumably the number of injected users -- confirm) that are scanned.
K_LIST = [15, 25, 35]
N_LIST = [25, 50, 100, 200]

# ======== HELPERS ========
def slugify_pair(a: str, b: str) -> str:
    """Build an order-independent, filesystem-friendly slug for a genre pair.

    The two names are ordered case-insensitively; each one is lower-cased,
    every run of non-alphanumeric characters is collapsed to a single ``_``
    and leading/trailing ``_`` are removed. The two slugs are then joined
    with a double underscore, e.g. ``("Science Fiction", "Children's")``
    becomes ``"children_s__science_fiction"``.
    """
    import re as _re

    ordered = sorted((a, b), key=str.lower)
    slugs = [
        _re.sub(r"[^A-Za-z0-9]+", "_", name).strip("_").lower()
        for name in ordered
    ]
    return "__".join(slugs)

# Canonical spellings, keyed by their lower-cased form.
_CANONICAL_TAGS = {
    name.lower(): name
    for name in (
        "Science Fiction",
        "Historical",
        "Nonfiction",
        "Thriller",
        "Drama",
        "Fantasy",
        "Mystery",
        "Romance",
        "Horror",
        "Classics",
        "Adventure",
        "Adult",
    )
}


def normalize_tag(t: str) -> str:
    """Map a raw genre tag (from a CSV cell or a file name) to its canonical name.

    The value is stringified, stripped, and underscores are turned into
    spaces. The exact spelling ``"Children s"`` (what ``"Children_s"`` becomes)
    is restored to ``"Children's"``. Known genres are then matched
    case-insensitively and returned in canonical capitalisation; anything
    else is returned as cleaned, with its original casing.
    """
    cleaned = str(t).strip().replace("_", " ")
    if cleaned == "Children s":
        cleaned = "Children's"
    return _CANONICAL_TAGS.get(cleaned.lower(), cleaned)

def book_has_both(gen_all: str, A: str, B: str) -> bool:
    """Tell whether a comma-separated genre string contains both A and B.

    ``gen_all`` is split on commas, each non-blank piece is stripped and
    passed through ``normalize_tag``; ``A`` and ``B`` are compared against
    those normalized tags as given (they are not normalized here).
    Missing (NaN/None) or blank input yields ``False``.
    """
    if pd.isna(gen_all):
        return False
    raw = str(gen_all)
    if not raw.strip():
        return False
    normalized = [
        normalize_tag(piece.strip())
        for piece in raw.split(",")
        if piece.strip()
    ]
    return A in normalized and B in normalized

def per_user_avg_pair_count(rec_df: pd.DataFrame, A: str, B: str) -> Tuple[float, int]:
    """Average, over all users in ``rec_df``, of recommended books tagged with both A and B.

    Args:
        rec_df: recommendation rows; must have the columns ``user_id``,
            ``book_id`` and ``genres_all``.
        A, B: genre names, compared via ``book_has_both``.

    Returns:
        ``(average_count_per_user, num_distinct_users)``. Users with no
        matching book contribute a count of 0 to the average.

    Raises:
        ValueError: if any of the required columns is absent.
    """
    missing = {"user_id", "book_id", "genres_all"} - set(rec_df.columns)
    if missing:
        raise ValueError(f"CSV missing columns: {missing}")

    users = rec_df["user_id"].drop_duplicates().sort_values()
    users_count = int(users.shape[0])

    def _is_pair_book(genres):
        return book_has_both(genres, A, B)

    pair_df = rec_df[rec_df["genres_all"].apply(_is_pair_book)].copy()
    if pair_df.empty:
        return (0.0, users_count)

    # Count matching books per user, then give every other user an explicit 0
    # so the mean is taken over all users rather than only those with a hit.
    hits_per_user = pair_df.groupby("user_id")["book_id"].count()
    counts_all_users = hits_per_user.reindex(users, fill_value=0)
    return (float(counts_all_users.mean()), users_count)

def injected_files_for_pair_k_n(pos_dir: Path, A: str, B: str, k: int, n: int) -> List[Path]:
    """Find the poisoned recommendation files for one genre pair, K and n.

    Matches names of the form
    ``fpair_<A>__<B>_<n>u_pos{5|7}_neg0_all_{k}recommendation.csv`` inside
    ``pos_dir``, in either A/B order.

    Genre names are compared through a canonical key (lower-cased, every run
    of non-alphanumeric characters collapsed to ``_``) instead of being
    re-interpolated into a regex. ``A``/``B`` arrive as normalized display
    names, and rebuilding the file token from them is lossy: a file spelled
    ``science_fiction`` or ``Children's`` would be discovered but never
    matched. Comparing keys also avoids feeding unescaped genre text to the
    regex engine.

    Args:
        pos_dir: directory holding the poisoned CSVs of one branch.
        A, B: genre names of the pair (order irrelevant).
        k: top-K list size encoded in the file name.
        n: the ``<n>`` value encoded before ``u_pos`` in the file name.

    Returns:
        Sorted list of matching paths (empty if none).
    """
    def genre_key(name: str) -> str:
        return re.sub(r"[^A-Za-z0-9]+", "_", name).strip("_").lower()

    name_re = re.compile(
        r"^fpair_(?P<A>.+?)__(?P<B>.+)_(?P<N>\d+)u_pos[57]_neg0_all_(?P<K>\d+)recommendation\.csv$"
    )
    wanted = sorted((genre_key(A), genre_key(B)))
    want_k, want_n = int(k), int(n)

    out: List[Path] = []
    # The glob is only a cheap pre-filter on K; the regex does the real parsing.
    for p in pos_dir.glob(f"*neg0_all_{k}recommendation.csv"):
        m = name_re.match(p.name)
        if m is None:
            continue
        if int(m.group("K")) != want_k or int(m.group("N")) != want_n:
            continue
        if sorted((genre_key(m.group("A")), genre_key(m.group("B")))) == wanted:
            out.append(p)
    return sorted(out)

def discover_pairs_from_dirs(pos_dirs: Iterable[Path], k_list: Iterable[int], n_list: Iterable[int]) -> List[Tuple[str, str]]:
    """Collect the distinct genre pairs encoded in the poisoned file names.

    Every ``fpair_<A>__<B>_<n>u_pos{5|7}_neg0_all_<K>recommendation.csv``
    found directly inside one of ``pos_dirs`` contributes its pair, provided
    its K is in ``k_list`` and its n is in ``n_list``. Genre tokens are turned
    into display names via ``normalize_tag`` and each pair is stored with its
    two names in case-insensitive alphabetical order.

    Returns:
        The unique pairs, sorted case-insensitively.
    """
    name_re = re.compile(
        r"^fpair_(?P<A>[A-Za-z0-9_']+)__(?P<B>[A-Za-z0-9_']+)_(?P<N>\d+)u_pos[57]_neg0_all_(?P<K>\d+)recommendation\.csv$"
    )
    allowed_k = {int(k) for k in k_list}
    allowed_n = {int(n) for n in n_list}

    found: Set[Tuple[str, str]] = set()
    for directory in pos_dirs:
        for path in directory.glob("fpair_*u_pos*_neg0_all_*recommendation.csv"):
            hit = name_re.match(path.name)
            if hit is None:
                continue
            if int(hit.group("K")) in allowed_k and int(hit.group("N")) in allowed_n:
                names = [
                    normalize_tag(hit.group(side).replace("_", " "))
                    for side in ("A", "B")
                ]
                first, second = sorted(names, key=lambda x: x.lower())
                found.add((first, second))

    return sorted(found, key=lambda pair: (pair[0].lower(), pair[1].lower()))

def original_file_for_k(orig_dir: Path, k: int) -> Optional[Path]:
    """Locate the unpoisoned baseline file for a given K.

    Returns the path ``orig_dir / "ORIGINAL_{k}recommendation.csv"`` when it
    exists on disk, otherwise ``None``.
    """
    candidate = orig_dir / f"ORIGINAL_{k}recommendation.csv"
    if candidate.exists():
        return candidate
    return None

def load_rec_csv(fp: Path) -> pd.DataFrame:
    """Read a recommendation CSV and guarantee a ``genres_all`` column.

    Columns are kept as read. When ``genres_all`` is absent:

    * if both ``genre_g1`` and ``genre_g2`` exist, ``genres_all`` is built
      per row as the non-missing, non-blank values joined with ``", "``;
    * otherwise it is filled with empty strings (best-effort fallback, so
      every book simply counts as matching no pair).

    The column is assembled with a plain row-wise join rather than
    ``DataFrame.agg(axis=1)``: on a CSV with a header but no rows the latter
    returns a two-column frame that cannot be assigned to a single column,
    and it also fails on non-string genre values.

    Args:
        fp: path of the CSV file.

    Returns:
        The loaded frame, always containing ``genres_all``.
    """
    df = pd.read_csv(fp, low_memory=False)
    if "genres_all" in df.columns:
        return df

    if "genre_g1" in df.columns and "genre_g2" in df.columns:
        df["genres_all"] = [
            ", ".join(
                str(tag)
                for tag in (g1, g2)
                if not pd.isna(tag) and str(tag).strip()
            )
            for g1, g2 in zip(df["genre_g1"], df["genre_g2"])
        ]
    else:
        # best-effort fallback
        df["genres_all"] = ""
    return df

def grouped_bar_chart(triple: Dict[str, float], title: str, out_png: Path):
    """Save a three-bar chart comparing Original, POS=5 and POS=7.

    Args:
        triple: mapping with the keys ``"ORIGINAL"``, ``"POS5"`` and
            ``"POS7"``; a missing key is drawn as 0.0.
        title: chart title.
        out_png: destination image path; its parent directory is created
            if needed.
    """
    bar_spec = (("Original", "ORIGINAL"), ("POS=5", "POS5"), ("POS=7", "POS7"))
    labels = [label for label, _ in bar_spec]
    heights = [triple.get(key, 0.0) for _, key in bar_spec]

    fig, ax = plt.subplots(figsize=(5.5, 3.8), dpi=160)
    ax.bar(labels, heights)
    ax.set_ylabel("Avg # of pair-books in Top-K per user")
    ax.set_title(title)
    fig.tight_layout()

    out_png.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_png)
    # Close explicitly: one figure is created per (pair, K).
    plt.close(fig)

# ======== MAIN ========
def _collect_original_baselines(pairs: List[Tuple[str, str]]) -> Dict[Tuple[str, str, int], float]:
    """Return {(A, B, K): per-user average} from every readable ORIGINAL_{K} file."""
    baselines: Dict[Tuple[str, str, int], float] = {}
    for k in K_LIST:
        orig_fp = original_file_for_k(ORIG_DIR, k)
        if orig_fp is None:
            print(f"[WARN] ORIGINAL_{k}recommendation.csv not found in {ORIG_DIR}")
            continue
        try:
            df_orig = load_rec_csv(orig_fp)
            # All pairs are evaluated on the same baseline file.
            for (A, B) in pairs:
                avgc, _ = per_user_avg_pair_count(df_orig, A, B)
                baselines[(A, B, k)] = avgc
        except Exception as e:
            # Best-effort: a broken baseline for one K must not stop the run.
            print(f"[ERROR] Reading ORIGINAL {k}: {e}")
    return baselines


def _collect_poisoned_rows(pairs: List[Tuple[str, str]]) -> Tuple[List[dict], List[dict]]:
    """Return (per-file rows, per-(branch, pair, K) rows averaged over the files for all n)."""
    detailed_rows: List[dict] = []
    agg_rows: List[dict] = []
    for pos_dir in POS_DIRS:
        pos_label = pos_dir.name  # "5" or "7"
        for (A, B) in pairs:
            pair_slug = slugify_pair(A, B)
            for k in K_LIST:
                matched_files = [
                    f
                    for n in N_LIST
                    for f in injected_files_for_pair_k_n(pos_dir, A, B, k, n)
                ]
                per_file_avgs: List[float] = []
                per_file_users: List[int] = []
                for f in matched_files:
                    try:
                        df = load_rec_csv(f)
                        avgc, users_cnt = per_user_avg_pair_count(df, A, B)
                    except Exception as e:
                        # Skip unreadable files but keep processing the rest.
                        print(f"[ERROR] Reading {f}: {e}")
                        continue
                    per_file_avgs.append(avgc)
                    per_file_users.append(users_cnt)
                    detailed_rows.append({
                        "pos_branch": pos_label,
                        "pair": pair_slug,
                        "A": A, "B": B,
                        "K": k,
                        "file": f.name,
                        "avg_count": avgc,
                        "users_counted": users_cnt
                    })

                # aggregated (unweighted mean over the files that could be read)
                if per_file_avgs:
                    agg_rows.append({
                        "pos_branch": pos_label,
                        "pair": pair_slug,
                        "A": A, "B": B,
                        "K": k,
                        "avg_count_over_N": float(sum(per_file_avgs) / len(per_file_avgs)),
                        "users_counted_max": max(per_file_users)
                    })
    return detailed_rows, agg_rows


# ======== MAIN ========
def main():
    """Run the whole analysis: inventory, per-file stats, summary CSV and figures.

    Steps:
      1. Discover the genre pairs present under the poisoned branches and
         write them to the inventory directory (text and CSV).
      2. Compute the baseline average for each pair and K from the ORIGINAL files.
      3. Compute the per-file averages of the poisoned files and their mean
         over n for each (branch, pair, K); save the per-file table.
      4. Write one summary row per (pair, K) with Original / POS=5 / POS=7
         averages and the two deltas, then one bar chart per row.

    A missing baseline or branch is reported as 0.0 in the summary, so a
    delta computed against it reflects absence of data, not a measured zero.
    """
    for out_dir in (OUT_ROOT, INV_DIR, FIG_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    # -------- Discover pairs from BOTH pos5 and pos7 --------
    PAIRS = discover_pairs_from_dirs(POS_DIRS, K_LIST, N_LIST)
    if not PAIRS:
        print("[WARN] No pairs found in either /5 or /7 directories.")
        return

    print(f"[INFO] Found {len(PAIRS)} unique unordered pairs across /5 and /7")
    with open(INV_DIR / "discovered_pairs.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{a},{b}\n" for a, b in PAIRS)
    pd.DataFrame(PAIRS, columns=["A", "B"]).to_csv(INV_DIR / "discovered_pairs.csv", index=False)
    print(f"[OK] Inventory saved in {INV_DIR}")

    # -------- Baselines, per-file rows, aggregated rows --------
    original_avgs_by_pair_k = _collect_original_baselines(PAIRS)
    detailed_rows, agg_rows = _collect_poisoned_rows(PAIRS)

    # Save detailed rows
    if detailed_rows:
        dfd = pd.DataFrame(detailed_rows)
        dfd.sort_values(by=["pair", "pos_branch", "K", "file"], inplace=True)
        dfd_out = OUT_ROOT / "DETAILED_per_file_pair_counts.csv"
        dfd.to_csv(dfd_out, index=False)
        print(f"[OK] Saved per-file details: {dfd_out}")
    else:
        print("[WARN] No detailed rows collected.")

    if not agg_rows:
        print("[WARN] No aggregated rows collected; skipping summary and figures.")
        return

    # -------- Combined summary: Original vs POS=5 vs POS=7 per pair per K --------
    branch_avg = {
        (r["pos_branch"], r["A"], r["B"], r["K"]): r["avg_count_over_N"]
        for r in agg_rows
    }
    summary_rows = []
    for (A, B) in PAIRS:
        pair_slug = slugify_pair(A, B)
        for k in K_LIST:
            # Missing branch / baseline is reported as 0.0.
            avg5 = float(branch_avg.get(("5", A, B, k), 0.0))
            avg7 = float(branch_avg.get(("7", A, B, k), 0.0))
            avg0 = float(original_avgs_by_pair_k.get((A, B, k), 0.0))
            summary_rows.append({
                "pair": pair_slug,
                "A": A, "B": B,
                "K": k,
                "avg_pos5": avg5,
                "avg_pos7": avg7,
                "avg_original": avg0,
                "delta_5_minus_orig": avg5 - avg0,
                "delta_7_minus_orig": avg7 - avg0,
            })
    comb = pd.DataFrame(
        summary_rows,
        columns=["pair", "A", "B", "K", "avg_pos5", "avg_pos7", "avg_original",
                 "delta_5_minus_orig", "delta_7_minus_orig"],
    )
    comb.sort_values(by=["pair", "K"], inplace=True)
    out_all = OUT_ROOT / "SUMMARY_pair_avg_counts_with_original.csv"
    comb.to_csv(out_all, index=False)
    print(f"[OK] Saved combined summary: {out_all}")

    # -------- Figures: 3-bar charts per pair per K --------
    for row in comb.itertuples(index=False):
        k = int(row.K)
        triple = {
            "ORIGINAL": float(row.avg_original),
            "POS5": float(row.avg_pos5),
            "POS7": float(row.avg_pos7),
        }
        title = f"{row.A} + {row.B} (Top-{k})"
        out_png = FIG_DIR / f"{row.pair}_K{k}.png"
        grouped_bar_chart(triple, title, out_png)
    print(f"[OK] Figures written to: {FIG_DIR}")


if __name__ == "__main__":
    main()


[INFO] Found 1 unique unordered pairs across /5 and /7
[OK] Inventory saved in /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair/result/pair_summary/with_original/_inventory
[OK] Saved per-file details: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair/result/pair_summary/with_original/DETAILED_per_file_pair_counts.csv
[OK] Saved combined summary: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair/result/pair_summary/with_original/SUMMARY_pair_avg_counts_with_original.csv
[OK] Figures written to: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair/result/pair_summary/with_original/figures
