In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

## Analysis (also saved as .py)

In [9]:
#!/usr/bin/env python3
# summarize_pairs_figs_per_pos_1015.py
#
# One figure per (pair, pos), each with 3 bins (K in {15,25,35}).
# Bars inside each bin: Original, n=25, n=50, n=100, n=200.
# Matches your file naming:
#   ORIGINAL_{K}recommendation.csv                (under SVD_pair root)
#   fpair_<A>__<B>_<n>u_pos{5|7}_neg0_all_{K}recommendation.csv (under SVD_pair/{5,7})
#
# Output (no inventory):
#   /.../1015/result/figures/5/
#       - <pair_slug>__pos5.png   (figure)
#       - summary_pos5.csv        (all values for pos=5 incl. Original)
#   /.../1015/result/figures/7/
#       - <pair_slug>__pos7.png
#       - summary_pos7.csv

from pathlib import Path
from typing import Iterable, Tuple, List, Set, Optional, Dict
import re
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # headless-safe
import matplotlib.pyplot as plt

# ======== PATHS (1015) ========
# Folder holding the recommendation CSVs that this script reads.
BASE = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair")
# ORIGINAL_{K}recommendation.csv files sit directly in BASE.
ORIG_DIR = BASE
# Poisoned branches: sub-folders "5" and "7" of BASE.
POS_DIRS = [BASE / branch for branch in ("5", "7")]

# Figures and summary CSVs are written below OUT_ROOT, one sub-folder per branch.
OUT_ROOT = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/result/figures")
OUT_5, OUT_7 = OUT_ROOT / "5", OUT_ROOT / "7"

# K values (the {K} in the file names) that get one bin each in every figure.
K_LIST = [15, 25, 35]
# n values (the <n>u part of the poisoned file names) that get one bar each.
# NOTE(review): presumably the number of injected users - confirm.
N_LIST = [25, 50, 100, 200]

# ======== HELPERS ========
def slugify_pair(a: str, b: str) -> str:
    import re as _re
    def sg(x): return _re.sub(r"[^A-Za-z0-9]+", "_", x).strip("_").lower()
    a2, b2 = sorted([a, b], key=lambda x: x.lower())
    return f"{sg(a2)}__{sg(b2)}"

def normalize_tag(t: str) -> str:
    """Return the canonical display form of a genre tag.

    The input is converted to ``str``, stripped, and underscores become
    spaces. The exact text "Children s" is mapped to "Children's". A fixed set
    of genre names is then re-cased regardless of the input's casing; any other
    tag is returned as cleaned, with its casing untouched.
    """
    tag = str(t).strip().replace("_", " ")
    if tag == "Children s":
        tag = "Children's"

    # Lower-cased spelling -> canonical spelling.
    canonical = {
        "science fiction": "Science Fiction",
        "historical": "Historical",
        "nonfiction": "Nonfiction",
        "thriller": "Thriller",
        "drama": "Drama",
        "fantasy": "Fantasy",
        "mystery": "Mystery",
        "romance": "Romance",
        "horror": "Horror",
        "classics": "Classics",
        "adventure": "Adventure",
        "adult": "Adult",
    }
    return canonical.get(tag.lower(), tag)

def book_has_both(gen_all: str, A: str, B: str) -> bool:
    """Tell whether a comma-separated genre string contains both tags A and B.

    Missing values (per ``pd.isna``) and blank strings yield False. Each
    comma-separated piece is stripped and passed through ``normalize_tag``
    before being compared with A and B, so A and B are expected to already be
    in normalized form.
    """
    if pd.isna(gen_all):
        return False
    text = str(gen_all)
    if not text.strip():
        return False
    found = {
        normalize_tag(piece.strip())
        for piece in text.split(",")
        if piece.strip()
    }
    return A in found and B in found

def per_user_avg_pair_count(rec_df: pd.DataFrame, A: str, B: str) -> Tuple[float, int]:
    """Average, over all users in ``rec_df``, the number of rows tagged with both A and B.

    ``rec_df`` must have the columns user_id, book_id and genres_all.
    Users with no matching row contribute a count of zero to the mean.

    Returns:
        (average_count_per_user, number_of_distinct_user_ids_in_rec_df)

    Raises:
        ValueError: if any of the required columns is absent.
    """
    missing = {"user_id", "book_id", "genres_all"} - set(rec_df.columns)
    if missing:
        raise ValueError(f"CSV missing columns: {missing}")

    unique_users = rec_df["user_id"].drop_duplicates().sort_values()
    n_users = int(unique_users.shape[0])

    # Keep only the rows whose genre list holds both tags.
    has_pair = rec_df["genres_all"].apply(lambda s: book_has_both(s, A, B))
    hits = rec_df[has_pair].copy()
    if hits.empty:
        return (0.0, n_users)

    counts = hits.groupby("user_id", as_index=False)["book_id"].count()
    counts = counts.rename(columns={"book_id": "count"})

    # Left-join onto the full user roster so users without hits count as 0.
    roster = pd.DataFrame({"user_id": unique_users})
    roster = roster.merge(counts, on="user_id", how="left").fillna({"count": 0})
    return (float(roster["count"].mean()), n_users)

def injected_files_for_pair_k_n(pos_dir: Path, A: str, B: str, k: int, n: int) -> List[Path]:
    """List the poisoned recommendation CSVs in ``pos_dir`` for one (pair, K, n).

    Matches ``fpair_<A>__<B>_<n>u_pos{5|7}_neg0_all_{k}recommendation.csv`` with
    A and B in either order. In each tag, spaces and apostrophes are turned
    into underscores (runs collapsed, ends trimmed) to form the file-name token.

    Tag tokens are regex-escaped, so tags containing regex metacharacters are
    matched literally. Matching is case-insensitive because
    ``discover_pairs_from_dirs`` re-cases tags via ``normalize_tag``; a
    case-sensitive match would fail to find files whose names use a different
    casing than the normalized tag.

    Returns:
        The matching paths in sorted order (empty list when nothing matches).
    """
    def file_token(tag: str) -> str:
        collapsed = re.sub(r"_+", "_", tag.replace(" ", "_").replace("'", "_")).strip("_")
        return re.escape(collapsed)

    a_tok, b_tok = file_token(A), file_token(B)
    pattern = re.compile(
        rf"^fpair_(?:{a_tok}__{b_tok}|{b_tok}__{a_tok})"
        rf"_{n}u_pos[57]_neg0_all_{k}recommendation\.csv$",
        re.IGNORECASE,
    )
    # The glob narrows candidates by K; the regex pins the pair and n.
    return sorted(
        p for p in pos_dir.glob(f"*neg0_all_{k}recommendation.csv")
        if pattern.match(p.name)
    )

def discover_pairs_from_dirs(pos_dirs: Iterable[Path],
                             k_list: Iterable[int],
                             n_list: Iterable[int]) -> List[Tuple[str, str]]:
    """Collect the distinct unordered tag pairs named by files in ``pos_dirs``.

    Only files named
    ``fpair_<A>__<B>_<n>u_pos{5|7}_neg0_all_<K>recommendation.csv`` whose K is
    in ``k_list`` and whose n is in ``n_list`` are considered. Tags are turned
    into display form with ``normalize_tag``; each pair is ordered
    case-insensitively, and the result list is sorted the same way.
    """
    name_re = re.compile(
        r"^fpair_(?P<A>[A-Za-z0-9_']+)__(?P<B>[A-Za-z0-9_']+)_(?P<N>\d+)u_pos[57]_neg0_all_(?P<K>\d+)recommendation\.csv$"
    )
    allowed_k = {int(k) for k in k_list}
    allowed_n = {int(n) for n in n_list}

    found: Set[Tuple[str, str]] = set()
    for folder in pos_dirs:
        for path in folder.glob("fpair_*u_pos*_neg0_all_*recommendation.csv"):
            hit = name_re.match(path.name)
            if hit is None:
                continue
            if int(hit.group("K")) not in allowed_k or int(hit.group("N")) not in allowed_n:
                continue
            tags = [
                normalize_tag(hit.group(part).replace("_", " "))
                for part in ("A", "B")
            ]
            low_tag, high_tag = sorted(tags, key=lambda x: x.lower())
            found.add((low_tag, high_tag))

    return sorted(found, key=lambda ab: (ab[0].lower(), ab[1].lower()))

def original_file_for_k(orig_dir: Path, k: int) -> Optional[Path]:
    """Return the path of ``ORIGINAL_{k}recommendation.csv`` in ``orig_dir``, or None if it does not exist."""
    candidate = orig_dir / f"ORIGINAL_{k}recommendation.csv"
    if candidate.exists():
        return candidate
    return None

def load_rec_csv(fp: Path) -> pd.DataFrame:
    """Read a recommendation CSV and guarantee it has a ``genres_all`` column.

    When the file lacks ``genres_all``:
      * if both ``genre_g1`` and ``genre_g2`` exist, their non-blank values are
        joined with ", " per row (missing values are treated as blank);
      * otherwise ``genres_all`` is filled with empty strings.
    """
    frame = pd.read_csv(fp, low_memory=False)
    if "genres_all" in frame.columns:
        return frame

    if "genre_g1" in frame.columns and "genre_g2" in frame.columns:
        genre_cols = frame[["genre_g1", "genre_g2"]].fillna("")
        frame["genres_all"] = genre_cols.agg(
            lambda row: ", ".join(
                [t for t in [row["genre_g1"], row["genre_g2"]] if str(t).strip()]
            ),
            axis=1,
        )
    else:
        frame["genres_all"] = ""
    return frame

def plot_pair_pos_three_bins(A: str, B: str, pos_label: str,
                             data_by_k: Dict[int, Dict[str, float]],
                             out_png: Path):
    """Draw a grouped bar chart for one (pair, pos) and save it to ``out_png``.

    ``data_by_k`` has the shape
    ``{K: {"Original": v0, "25": v25, "50": v50, "100": v100, "200": v200}}``.
    Each K (in ascending order) becomes one bin on the x-axis holding five
    bars; a series missing from a K's dict is drawn with height 0.0. The
    parent folder of ``out_png`` is created if needed.
    """
    bar_width = 0.16
    series = ["Original", "25", "50", "100", "200"]
    top_ks = sorted(data_by_k.keys())
    centers = list(range(len(top_ks)))

    plt.figure(figsize=(8.4, 4.4), dpi=160)

    for idx, name in enumerate(series):
        heights = [data_by_k.get(k, {}).get(name, 0.0) for k in top_ks]
        # Shift by (idx - 2) widths so the five bars are centred on the bin.
        positions = [c + (idx - 2) * bar_width for c in centers]
        legend_label = "Original" if name == "Original" else "n=" + name
        plt.bar(positions, heights, width=bar_width, label=legend_label)

    plt.xticks(centers, [f"Top-{k}" for k in top_ks])
    plt.ylabel("Avg # of pair-books in Top-K per user")
    plt.title(f"{A} + {B} — POS={pos_label}")
    plt.legend(ncol=3, fontsize=9)
    plt.tight_layout()

    out_png.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_png)
    plt.close()

# ======== MAIN ========
def _bars_by_k(sub: pd.DataFrame, pos_label: str) -> Dict[int, Dict[str, float]]:
    """Turn one pair's tall rows into ``{K: {"Original": v, "<n>": v, ...}}`` for plotting.

    ``sub`` holds the rows of a single pair (ORIGINAL rows plus the rows of the
    ``pos_label`` branch). Every K in K_LIST gets an entry; a series without a
    row stays at 0.0. If several rows exist for one (K, n), the first is used.
    """
    data_by_k: Dict[int, Dict[str, float]] = {}
    for k in K_LIST:
        bars: Dict[str, float] = {"Original": 0.0}
        bars.update({str(n): 0.0 for n in N_LIST})

        at_k = sub[sub["K"] == k]
        orig_rows = at_k[(at_k["pos"] == "ORIGINAL") & (at_k["n"] == "ORIGINAL")]
        if not orig_rows.empty:
            bars["Original"] = float(orig_rows.iloc[0]["avg_count"])

        for n in N_LIST:
            n_rows = at_k[(at_k["pos"] == pos_label) & (at_k["n"] == str(n))]
            if not n_rows.empty:
                bars[str(n)] = float(n_rows.iloc[0]["avg_count"])

        data_by_k[k] = bars
    return data_by_k


def main():
    """Compute per-pair averages for ORIGINAL and poisoned runs, then write CSVs and figures.

    Steps:
      1. Discover the tag pairs from the file names in POS_DIRS.
      2. Load ORIGINAL_{K}recommendation.csv for each K (missing files are
         reported and skipped).
      3. Build one "tall" row per (pair, pos, K, n) with the per-user average
         count of books carrying both tags.
      4. Save summary_pos5.csv / summary_pos7.csv (each includes the ORIGINAL
         rows) and one figure per (pair, pos) under OUT_5 / OUT_7.
    """
    OUT_5.mkdir(parents=True, exist_ok=True)
    OUT_7.mkdir(parents=True, exist_ok=True)

    # Discover all pairs from poisoned dirs
    PAIRS = discover_pairs_from_dirs(POS_DIRS, K_LIST, N_LIST)
    if not PAIRS:
        print("[WARN] No pairs found in /5 or /7.")
        return
    print(f"[OK] Found {len(PAIRS)} pairs across poisoned dirs.")

    # Preload ORIGINAL per K (same files for all pairs; per-pair counts differ)
    original_df_by_k: Dict[int, Optional[pd.DataFrame]] = {}
    for k in K_LIST:
        fp = original_file_for_k(ORIG_DIR, k)
        if fp is None:
            print(f"[WARN] Missing ORIGINAL_{k}recommendation.csv in {ORIG_DIR}")
            original_df_by_k[k] = None
        else:
            original_df_by_k[k] = load_rec_csv(fp)

    # Tall rows for both pos branches (5 & 7) + Original.
    # Columns: A, B, pair, pos, K, n, avg_count, users (+ source for poisoned rows)
    tall_rows = []

    # ORIGINAL tall rows (n="ORIGINAL")
    for (A, B) in PAIRS:
        for k in K_LIST:
            dfO = original_df_by_k.get(k)
            if dfO is None:
                continue
            avgc, users_cnt = per_user_avg_pair_count(dfO, A, B)
            tall_rows.append({
                "A": A, "B": B, "pair": slugify_pair(A, B),
                "pos": "ORIGINAL", "K": k, "n": "ORIGINAL",
                "avg_count": avgc, "users": users_cnt
            })

    # Poisoned tall rows (per N separately; no averaging across files)
    for pos_dir in POS_DIRS:
        pos_label = pos_dir.name  # "5" or "7"
        for (A, B) in PAIRS:
            for k in K_LIST:
                for n in N_LIST:
                    for f in injected_files_for_pair_k_n(pos_dir, A, B, k, n):
                        # Best effort: one unreadable/malformed file must not
                        # abort the whole summary, so report it and move on.
                        try:
                            df = load_rec_csv(f)
                            avgc, users_cnt = per_user_avg_pair_count(df, A, B)
                            tall_rows.append({
                                "A": A, "B": B, "pair": slugify_pair(A, B),
                                "pos": pos_label, "K": k, "n": str(n),
                                "avg_count": avgc, "users": users_cnt,
                                "source": f.name
                            })
                        except Exception as e:
                            print(f"[ERROR] Reading {f}: {e}")

    if not tall_rows:
        print("[WARN] No rows computed. Exiting.")
        return

    dft = pd.DataFrame(tall_rows)
    dft["n"] = dft["n"].astype(str)
    dft.sort_values(by=["pair", "pos", "K", "n"], inplace=True)

    # --- Save per-pos CSVs (numbers live next to figures) ---
    dft_pos5 = dft[dft["pos"].isin(["ORIGINAL", "5"])].copy()
    dft_pos7 = dft[dft["pos"].isin(["ORIGINAL", "7"])].copy()
    dft_pos5.to_csv(OUT_5 / "summary_pos5.csv", index=False)
    dft_pos7.to_csv(OUT_7 / "summary_pos7.csv", index=False)
    print(f"[OK] Saved CSVs: {OUT_5/'summary_pos5.csv'} , {OUT_7/'summary_pos7.csv'}")

    # --- Make figures: one per (pair, pos) ---
    branches = (("5", dft_pos5, OUT_5), ("7", dft_pos7, OUT_7))
    for (A, B) in PAIRS:
        pair_slug = slugify_pair(A, B)
        for pos_label, dft_pos, out_dir in branches:
            sub = dft_pos[dft_pos["pair"] == pair_slug]
            if sub.empty:
                continue
            out_png = out_dir / f"{pair_slug}__pos{pos_label}.png"
            plot_pair_pos_three_bins(A, B, pos_label, _bars_by_k(sub, pos_label), out_png)

    print(f"[OK] Figures written to:\n  {OUT_5}\n  {OUT_7}")

if __name__ == "__main__":
    main()


[OK] Found 2 pairs across poisoned dirs.
[OK] Saved CSVs: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/result/figures/5/summary_pos5.csv , /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/result/figures/7/summary_pos7.csv
[OK] Figures written to:
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/result/figures/5
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/result/figures/7
