In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

## analysisis(also saved as .py)

In [3]:
#!/usr/bin/env python3
# compare_pairs_original_vs_pos5_pos7_neg0_all.py
#
# Purpose:
#   - Discover all unordered genre pairs from SVD outputs in /5 and /7 (neg0_all only)
#   - For K in {15,25,35}:
#       * ORIGINAL: per-user avg # of recs whose genres_all contains both tags
#       * POS=5:    mean of the same per-user avg across all N and files for that pair
#       * POS=7:    mean of the same per-user avg across all N and files for that pair
#   - Save inventory, a wide comparison CSV, and bar charts (three bins) per pair & K
#
# Inputs (directories of recommendation CSVs):
#   /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair/5
#   /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair/7
#   /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair     (for ORIGINAL_{K}recommendation.csv)
#
# Outputs:
#   /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair/result/pair_summary/all/
#     ├─ _inventory/discovered_pairs.txt
#     ├─ _inventory/discovered_pairs.csv
#     ├─ COMPARE_original_vs_pos5_pos7_k15_25_35.csv
#     └─ figures/<pair_slug>_K<k>.png   (bar chart with 3 bins)

from pathlib import Path
import re
import pandas as pd
import numpy as np
from typing import Iterable, Tuple, List, Set, Dict

# ======== CONFIG ========
BASE = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/1015/SVD_pair")
DIR_5 = BASE / "5"   # pos=5 branch
DIR_7 = BASE / "7"   # pos=7 branch
DIR_ORIG = BASE      # ORIGINAL_{K}recommendation.csv is saved at the root

OUT_ROOT = BASE / "result" / "pair_summary" / "all"
INV_DIR = OUT_ROOT / "_inventory"
FIG_DIR = OUT_ROOT / "figures"

# K & N
K_LIST = [15, 25, 35]
N_LIST = [25, 50, 100, 200]

# ======== HELPERS ========
def slugify_pair(a: str, b: str) -> str:
    import re as _re
    def sg(x): return _re.sub(r"[^A-Za-z0-9]+", "_", x).strip("_").lower()
    a, b = a.strip(), b.strip()
    return f"{sg(a)}__{sg(b)}"

def normalize_tag(t: str) -> str:
    t = str(t).strip().replace("_", " ")
    if t == "Children s":
        t = "Children's"
    if t.lower() == "science fiction":
        t = "Science Fiction"
    if t.lower() == "historical":
        t = "Historical"
    if t.lower() == "nonfiction":
        t = "Nonfiction"
    return t

def book_has_both(gen_all: str, A: str, B: str) -> bool:
    if pd.isna(gen_all) or not str(gen_all).strip():
        return False
    parts = [normalize_tag(x) for x in str(gen_all).split(",") if str(x).strip()]
    return (A in parts) and (B in parts)

def per_user_avg_pair_count(rec_df: pd.DataFrame, A: str, B: str) -> tuple[float, int]:
    need = {"user_id", "book_id", "genres_all"}
    missing = need - set(rec_df.columns)
    if missing:
        raise ValueError(f"CSV missing columns: {missing}")

    users = rec_df["user_id"].drop_duplicates().sort_values()
    users_count = int(users.shape[0])

    mask = rec_df["genres_all"].apply(lambda s: book_has_both(s, A, B))
    pair_df = rec_df[mask].copy()
    if pair_df.empty:
        return (0.0, users_count)

    per_user = (
        pair_df.groupby("user_id", as_index=False)["book_id"]
        .count()
        .rename(columns={"book_id": "count"})
    )

    all_users = pd.DataFrame({"user_id": users})
    all_users = all_users.merge(per_user, on="user_id", how="left").fillna({"count": 0})
    return (float(all_users["count"].mean()), users_count)

def discover_pairs_from_dirs(pos_dirs: Iterable[Path]) -> List[Tuple[str, str]]:
    """
    Parse poisoned filenames to discover unordered pairs present in /5 and /7.
    Only considers *_neg0_all_* files.
    """
    pair_set: Set[Tuple[str, str]] = set()
    regex = re.compile(
        r"^fpair_(?P<A>.+?)__(?P<B>.+?)_"
        r"(?P<N>\d+)u_pos(?P<POS>[57])_neg0_all_"
        r"(?P<K>\d+)recommendation\.csv$"
    )

    for pos_dir in pos_dirs:
        if not pos_dir.exists():
            continue
        for p in pos_dir.glob("fpair_*_neg0_all_*recommendation.csv"):
            m = regex.match(p.name)
            if not m:
                continue
            A_disp = normalize_tag(m.group("A").replace("_", " "))
            B_disp = normalize_tag(m.group("B").replace("_", " "))
            a_c, b_c = sorted([A_disp, B_disp], key=lambda x: x.lower())
            pair_set.add((a_c, b_c))

    return sorted(pair_set, key=lambda ab: (ab[0].lower(), ab[1].lower()))

def injected_files_for_pair_k_n(pos_dir: Path, A: str, B: str, k: int, n: int) -> list[Path]:
    """
    Find poisoned recommendation CSVs for (A,B), K, N inside a specific pos_dir (5 or 7).
    Works for either ordering and only matches *_neg0_all_* files.
    """
    aT = re.sub(r"_+", "_", A.replace(" ", "_").replace("'", "_")).strip("_")
    bT = re.sub(r"_+", "_", B.replace(" ", "_").replace("'", "_")).strip("_")

    pat1 = re.compile(
        rf"^fpair_{re.escape(aT)}__{re.escape(bT)}_{n}u_pos[57]_neg0_all_{k}recommendation\.csv$"
    )
    pat2 = re.compile(
        rf"^fpair_{re.escape(bT)}__{re.escape(aT)}_{n}u_pos[57]_neg0_all_{k}recommendation\.csv$"
    )

    out = []
    for p in pos_dir.glob(f"*neg0_all_{k}recommendation.csv"):
        name = p.name
        if pat1.match(name) or pat2.match(name):
            out.append(p)
    return sorted(out)

def original_file_for_k(orig_dir: Path, k: int) -> Path | None:
    """
    Return ORIGINAL_{K}recommendation.csv if it exists, else None.
    """
    p = orig_dir / f"ORIGINAL_{k}recommendation.csv"
    return p if p.exists() else None

def safe_read_csv(path: Path) -> pd.DataFrame | None:
    try:
        return pd.read_csv(path)
    except Exception as e:
        print(f"[ERROR] Reading {path}: {e}")
        return None

def plot_three_bin_bar(out_png: Path, title: str, values: Dict[str, float]):
    """
    Create a simple 3-bin bar chart (ORIGINAL, POS=5, POS=7).
    """
    import matplotlib.pyplot as plt

    labels = ["ORIGINAL", "POS=5", "POS=7"]
    xs = np.arange(len(labels))
    ys = [values.get("ORIGINAL", 0.0), values.get("5", 0.0), values.get("7", 0.0)]

    plt.figure(figsize=(6, 4))
    plt.bar(xs, ys)  # (no explicit colors or styles per instruction)
    plt.xticks(xs, labels, rotation=0)
    plt.ylabel("Per-user avg #pair books in Top-K")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    plt.close()

# ======== MAIN ========
def main():
    OUT_ROOT.mkdir(parents=True, exist_ok=True)
    INV_DIR.mkdir(parents=True, exist_ok=True)
    FIG_DIR.mkdir(parents=True, exist_ok=True)

    # ---- Discover pairs from poisoned branches ----
    PAIRS = discover_pairs_from_dirs([DIR_5, DIR_7])
    if not PAIRS:
        print("[WARN] No pairs found in /5 or /7 (neg0_all). Nothing to compare.")
        return

    print(f"[INFO] Found {len(PAIRS)} unique unordered pairs across /5 and /7 (neg0_all).")
    with open(INV_DIR / "discovered_pairs.txt", "w", encoding="utf-8") as f:
        for a, b in PAIRS:
            f.write(f"{a},{b}\n")
    pd.DataFrame(PAIRS, columns=["A", "B"]).to_csv(INV_DIR / "discovered_pairs.csv", index=False)
    print(f"[OK] Inventory saved in {INV_DIR}")

    # ---- Build comparison rows ----
    rows = []  # wide format (one row per pair & K)
    for (A, B) in PAIRS:
        pair_slug = slugify_pair(A, B)

        for k in K_LIST:
            # ORIGINAL
            orig_csv = original_file_for_k(DIR_ORIG, k)
            orig_avg, orig_users = (np.nan, 0)
            if orig_csv is not None:
                dfO = safe_read_csv(orig_csv)
                if dfO is not None:
                    try:
                        orig_avg, orig_users = per_user_avg_pair_count(dfO, A, B)
                    except Exception as e:
                        print(f"[ERROR] ORIGINAL {k}, pair {A},{B}: {e}")

            # POS=5 (mean across all N and files)
            vals5, users5 = [], []
            if DIR_5.exists():
                for n in N_LIST:
                    files5 = injected_files_for_pair_k_n(DIR_5, A, B, k, n)
                    for f in files5:
                        df5 = safe_read_csv(f)
                        if df5 is not None:
                            try:
                                avgc, ucnt = per_user_avg_pair_count(df5, A, B)
                                vals5.append(avgc); users5.append(ucnt)
                            except Exception as e:
                                print(f"[ERROR] POS=5 {f.name}: {e}")
            pos5_avg = float(np.mean(vals5)) if vals5 else np.nan
            pos5_users = max(users5) if users5 else 0

            # POS=7 (mean across all N and files)
            vals7, users7 = [], []
            if DIR_7.exists():
                for n in N_LIST:
                    files7 = injected_files_for_pair_k_n(DIR_7, A, B, k, n)
                    for f in files7:
                        df7 = safe_read_csv(f)
                        if df7 is not None:
                            try:
                                avgc, ucnt = per_user_avg_pair_count(df7, A, B)
                                vals7.append(avgc); users7.append(ucnt)
                            except Exception as e:
                                print(f"[ERROR] POS=7 {f.name}: {e}")
            pos7_avg = float(np.mean(vals7)) if vals7 else np.nan
            pos7_users = max(users7) if users7 else 0

            rows.append({
                "pair_slug": pair_slug,
                "A": A, "B": B,
                "K": k,
                "avg_ORIGINAL": orig_avg,
                "users_ORIGINAL": orig_users,
                "avg_pos5": pos5_avg,
                "users_pos5": pos5_users,
                "avg_pos7": pos7_avg,
                "users_pos7": pos7_users,
            })

            # ---- Figure (three bins) ----
            title = f"{A} + {B} — Top-{k}"
            out_png = FIG_DIR / f"{pair_slug}_K{k}.png"
            vals_for_plot = {
                "ORIGINAL": 0.0 if np.isnan(orig_avg) else float(orig_avg),
                "5": 0.0 if np.isnan(pos5_avg) else float(pos5_avg),
                "7": 0.0 if np.isnan(pos7_avg) else float(pos7_avg),
            }
            try:
                plot_three_bin_bar(out_png, title, vals_for_plot)
            except Exception as e:
                print(f"[ERROR] Plotting {title}: {e}")

            print(f"[OK] {A},{B}  K={k}  →  ORIG={orig_avg:.4f}  pos5={pos5_avg:.4f}  pos7={pos7_avg:.4f}")

    # ---- Save combined comparison CSV ----
    if rows:
        dfw = pd.DataFrame(rows)
        dfw = dfw.sort_values(by=["A", "B", "K"]).reset_index(drop=True)
        out_csv = OUT_ROOT / "COMPARE_original_vs_pos5_pos7_k15_25_35.csv"
        dfw.to_csv(out_csv, index=False)
        print(f"[OK] Saved comparison CSV: {out_csv}")
        print(f"[OK] Figures saved in: {FIG_DIR}")
    else:
        print("[WARN] No rows to save — check your inputs.")

if __name__ == "__main__":
    main()


TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'