In [4]:
import os
import pandas as pd
import matplotlib.pyplot as plt

## Pair-Injection Recommendation Summarizer

In [None]:
#!/usr/bin/env python3
# g1_pair_summary_and_reports_0929.py
#
# Pair-aware summarizer for 0929/SVD_pair outputs.
# Counts recommendations for books that match BOTH genres in each pair found in filenames.
#
# Input dirs:
#   <PAIR_BASE>/5/*.csv  (pos5 injections)
#   <PAIR_BASE>/7/*.csv  (pos7 injections)
# Filename pattern:
#   fpair_<GENA>__<GENB>_<Nu>u_posX_negY_sample_<K>recommendation.csv
#
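# Outputs under (pair folder names are the lower-cased slug "<gena>__<genb>"):
#   <PAIR_BASE>/result/pair_summary/pos5/<gena__genb>/            injected summaries, general.txt, report.txt
#   <PAIR_BASE>/result/pair_summary/pos5/original/<gena__genb>/   ORIGINAL_* summaries
#   <PAIR_BASE>/result/pair_summary/pos7/<gena__genb>/            (same layout, incl. original/, for pos7)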

import os
import re
from pathlib import Path
import pandas as pd
from collections import defaultdict

# ======== CONFIG ========
# Ratings table; load_original_book_stats() reads its book_id, rating, user_id and genres columns.
ORIGINAL_RATINGS_CSV = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv")
# Root of the pair-injection run; recommendation CSVs are globbed from its "5" and "7" subfolders.
PAIR_BASE = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0929/SVD_pair")
RECS_DIR_5 = PAIR_BASE / "5"
RECS_DIR_7 = PAIR_BASE / "7"

# Output base (folder name per your request)
OUT_BASE  = PAIR_BASE / "result" / "pair_summary"
OUT_DIR_5 = OUT_BASE / "pos5"
OUT_DIR_7 = OUT_BASE / "pos7"
# NOTE: this loop runs at import time, so merely importing the module creates both output folders.
for d in [OUT_DIR_5, OUT_DIR_7]:
    d.mkdir(parents=True, exist_ok=True)

# ======== HELPERS ========
# Raw filename tokens (underscored) whose canonical genre spelling cannot be
# obtained by simply turning underscores into spaces.
CANON_MAP = {
    "Children_s": "Children's",
    "Science_Fiction": "Science Fiction",
}

def canonize_token(t: str) -> str:
    """Return the canonical genre name for a raw filename token.

    The raw (underscored) token is looked up in ``CANON_MAP`` first; the keys
    of that map contain underscores, so the lookup has to happen before the
    underscores are replaced.  Tokens without an explicit mapping fall back to
    "underscores become spaces", which is then looked up once more so that
    space-separated keys would also be honoured.

    Args:
        t: Genre token as it appears in a file name, e.g. ``"Children_s"``.

    Returns:
        The canonical genre string, e.g. ``"Children's"``.
    """
    raw = t.strip()
    if raw in CANON_MAP:
        return CANON_MAP[raw]
    spaced = raw.replace("_", " ").strip()
    return CANON_MAP.get(spaced, spaced)

def parse_pair_from_fpair(fname: str):
    """
    Returns (A, B) where A and B are canonicalized genre names parsed from:
      fpair_<GENA>__<GENB>_<Nu>u_posX_negY_sample_<K>recommendation.csv

    Genre tokens may themselves contain single underscores (e.g. "Children_s",
    "Science_Fiction"), so each token is matched lazily: <GENA> ends at the
    first double underscore and <GENB> ends at the first "_<digits>u_" run
    marker.  Returns ("Unknown", "Unknown") when the name does not follow the
    pattern.
    """
    base = os.path.basename(fname)
    m = re.match(r"fpair_(.+?)__(.+?)_(\d+)u_.*recommendation\.csv$", base)
    if not m:
        return ("Unknown", "Unknown")
    A = canonize_token(m.group(1))
    B = canonize_token(m.group(2))
    return (A, B)

def parse_run_from_filename(name: str) -> int:
    """Extract the integer from the first ``_<digits>u_`` marker in the base name.

    Returns -1 when the base name carries no such marker.
    """
    found = re.search(r"_([0-9]+)u_", os.path.basename(name))
    if found is None:
        return -1
    return int(found.group(1))

def parse_k_from_filename(name: str) -> int:
    """Read the list size K from a ``..._<K>recommendation.csv`` base name.

    Only the K values 15, 25, 35, 50, 75 and 100 are recognised; any other
    name yields -1.
    """
    base = os.path.basename(name)
    hit = re.search(r"_(15|25|35|50|75|100)recommendation\.csv$", base)
    if not hit:
        return -1
    return int(hit.group(1))

def slugify_pair(a: str, b: str) -> str:
    """Build the folder/label slug ``<a>__<b>`` for a genre pair.

    Each name is lower-cased, every run of non-alphanumeric characters becomes
    a single underscore, and leading/trailing underscores are removed.  The
    double underscore is kept as the delimiter between the two names.
    """
    slugs = []
    for name in (a, b):
        collapsed = re.sub(r"[^A-Za-z0-9]+", "_", name)
        slugs.append(collapsed.strip("_").lower())
    return "__".join(slugs)

def split_genre_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row-per-book genre lookup from a frame with ``book_id`` and ``genres``.

    Rows without a ``book_id`` are dropped and only the first row of each
    ``book_id`` is kept.  The comma-separated ``genres`` string is split into:

    * ``genre_g1``  – first tag, or "Unknown" when there is none
    * ``genre_g2``  – second tag, or "" when there is none
    * ``genres_all`` – all non-empty tags re-joined with ", ", or "Unknown"

    The original ``genres`` column is removed from the result.
    """
    def split_one(gen):
        # Missing values and strings without any non-blank tag are treated alike.
        tags = [] if pd.isna(gen) else [tok.strip() for tok in str(gen).split(",") if tok.strip()]
        if not tags:
            return ("Unknown", "", "Unknown")
        second = tags[1] if len(tags) > 1 else ""
        return (tags[0], second, ", ".join(tags))

    books = df[["book_id", "genres"]].dropna(subset=["book_id"])
    books = books.drop_duplicates("book_id", keep="first").copy()
    triples = [split_one(value) for value in books["genres"]]
    books[["genre_g1", "genre_g2", "genres_all"]] = pd.DataFrame(triples, index=books.index)
    return books.drop(columns=["genres"])

def ensure_genres_on_rec(df: pd.DataFrame, book_genres: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with the columns ``genre_g1``, ``genre_g2`` and ``genres_all`` present.

    When both frames have a ``book_id`` column, the key is coerced to the
    nullable ``Int64`` dtype on copies of both frames (best effort: any
    failure during coercion is swallowed).  If ``df`` lacks at least one of
    the three genre columns it is left-merged with the genre lookup on
    ``book_id``.  Any genre column still absent afterwards is created and
    filled with ``pd.NA``.
    """
    genre_cols = ["genre_g1", "genre_g2", "genres_all"]
    lookup = book_genres.copy()

    keyed_on_both = "book_id" in df.columns and "book_id" in book_genres.columns
    if keyed_on_both:
        try:
            df = df.copy()
            df["book_id"] = pd.to_numeric(df["book_id"], errors="coerce").astype("Int64")
            lookup["book_id"] = pd.to_numeric(lookup["book_id"], errors="coerce").astype("Int64")
        except Exception:
            # Best effort: fall back to the lookup exactly as it was passed in.
            lookup = book_genres.copy()

    if any(col not in df.columns for col in genre_cols):
        df = df.merge(lookup, on="book_id", how="left")

    for col in genre_cols:
        if col not in df.columns:
            df[col] = pd.NA
    return df

def row_has_pair(row, A: str, B: str) -> bool:
    """True iff the book has BOTH genres A and B (order-agnostic).

    ``row`` only needs a ``.get(key, default)`` method.  The two primary genre
    fields are checked first; if they do not cover the pair, the
    comma-separated ``genres_all`` field is tokenised and checked instead.
    """
    wanted = {A, B}

    primary = {str(row.get("genre_g1", "")), str(row.get("genre_g2", ""))}
    if wanted.issubset(primary):
        return True

    # Fallback: every non-blank, stripped tag listed for the book.
    tags = {tok.strip() for tok in str(row.get("genres_all", "")).split(",")}
    tags.discard("")
    return wanted.issubset(tags)

def compute_user_summary_pair(rec_df: pd.DataFrame, A: str, B: str, count_col: str,
                              book_means: pd.DataFrame, book_genres: pd.DataFrame) -> pd.DataFrame:
    """Summarise, per user, the recommended books that carry BOTH genres A and B.

    Args:
        rec_df: Recommendation rows; must contain ``user_id`` and ``book_id``.
            ``est_score`` is optional.
        A, B: Canonical genre names of the pair.
        count_col: Name of the output column holding the per-user match count.
        book_means: Frame with ``book_id`` and ``original_per_book_avg``.
        book_genres: Genre lookup passed on to ``ensure_genres_on_rec``.

    Returns:
        One row per distinct ``user_id`` in ``rec_df`` (sorted) with the columns
        ``user_id``, ``count_col`` (0 when the user has no matching book),
        ``estimation_rating_average`` (mean ``est_score`` of the matching rows)
        and ``rating_average`` (mean ``original_per_book_avg`` of the matching
        rows).  Both averages are missing for users without a match.
    """
    rec_df = ensure_genres_on_rec(rec_df, book_genres)
    users = pd.DataFrame({"user_id": rec_df["user_id"].drop_duplicates().sort_values().values})

    if rec_df.empty:
        # DataFrame.apply(axis=1) on an empty frame does not yield a boolean
        # mask, which would make the selection below drop every column.
        mask = pd.Series(False, index=rec_df.index, dtype=bool)
    else:
        mask = rec_df.apply(lambda r: row_has_pair(r, A, B), axis=1)
    rec_pair = rec_df[mask].copy()

    cnt = (rec_pair.groupby("user_id", as_index=False)["book_id"].count()
           .rename(columns={"book_id": count_col}))

    if "est_score" not in rec_pair.columns:
        # Float NaN keeps the column numeric (pd.NA would make it object dtype),
        # so the mean below stays a plain numeric aggregation.
        rec_pair["est_score"] = float("nan")
    est_mean = (rec_pair.groupby("user_id", as_index=False)["est_score"].mean()
                .rename(columns={"est_score":"estimation_rating_average"}))

    rec_pair = rec_pair.merge(book_means, on="book_id", how="left")
    orig_mean = (rec_pair.groupby("user_id", as_index=False)["original_per_book_avg"].mean()
                 .rename(columns={"original_per_book_avg":"rating_average"}))

    out = users.merge(cnt, on="user_id", how="left")
    out[count_col] = out[count_col].fillna(0).astype("int64")
    out = out.merge(est_mean, on="user_id", how="left").merge(orig_mean, on="user_id", how="left")
    return out

def fmt(x):
    """Format a number with six decimals; missing values become an empty string."""
    if pd.isna(x):
        return ""
    return format(float(x), ".6f")

def pair_folder(out_dir: Path, pair_label: str, *, original: bool=False) -> Path:
    """Create (if needed) and return the folder for one pair.

    Injected results live in ``out_dir/<pair_label>``; results for the
    ORIGINAL recommendations live in ``out_dir/original/<pair_label>``.
    """
    subfolder = "original" if original else ""
    target = out_dir / subfolder / pair_label
    target.mkdir(parents=True, exist_ok=True)
    return target

def summary_csv_path(out_dir: Path, rec_path: Path, gdir: Path, *, original: bool, pair_label: str) -> Path:
    """Return the per-user summary CSV path inside ``gdir`` for one recommendation file.

    Injected files reuse the recommendation file's stem; ORIGINAL files are
    named after the K parsed from the file name plus the pair label.
    ``out_dir`` is accepted but not used.
    """
    if not original:
        return gdir / f"{rec_path.stem}__pair_summary.csv"
    k = parse_k_from_filename(rec_path.name)
    return gdir / f"ORIGINAL_{k}recommendation__{pair_label}__pair_summary.csv"

def append_table_line(general_path: Path, header: str, line: str):
    """Append ``line`` to ``general_path``, writing ``header`` first if the file is new or empty."""
    needs_header = True
    if general_path.exists():
        needs_header = os.path.getsize(general_path) == 0

    with open(general_path, "a", encoding="utf-8") as handle:
        if needs_header:
            handle.write(header)
        handle.write(line)

def load_original_book_stats():
    """Load the original ratings CSV and derive the two per-book lookup tables.

    Returns:
        ``(book_means, book_genres)`` where ``book_means`` has the columns
        ``book_id`` and ``original_per_book_avg`` (mean rating per book) and
        ``book_genres`` is the output of ``split_genre_cols``.
    """
    print("Loading original ratings …")
    ratings = pd.read_csv(ORIGINAL_RATINGS_CSV, usecols=["book_id","rating","user_id","genres"])

    per_book = ratings.groupby("book_id", as_index=False)["rating"].mean()
    book_means = per_book.rename(columns={"rating":"original_per_book_avg"})

    book_genres = split_genre_cols(ratings)  # book_id, genre_g1, genre_g2, genres_all
    return book_means, book_genres

def process_one_root(RECS_DIR: Path, OUT_DIR: Path, book_means: pd.DataFrame, book_genres: pd.DataFrame):
    """Summarise every recommendation CSV in ``RECS_DIR`` and write the results under ``OUT_DIR``.

    Three passes are made:

    1. Each ``fpair_*recommendation.csv`` file is reduced to a per-user summary
       CSV for the genre pair parsed from its name, and its averages are stored
       under (pair, K, n).
    2. Each ``ORIGINAL_*recommendation.csv`` file is summarised once for every
       pair seen in pass 1, and its averages are stored under (pair, K).
    3. For every pair, ``general.txt`` (CSV-like table) and ``report.txt``
       (human-readable, grouped by K) are written into the pair's folder.

    Args:
        RECS_DIR: Folder holding the recommendation CSVs.
        OUT_DIR: Folder that receives ``<pair>/`` and ``original/<pair>/``.
        book_means: Per-book mean rating table (see ``load_original_book_stats``).
        book_genres: Per-book genre lookup (see ``split_genre_cols``).

    Raises:
        ValueError: If a recommendation CSV lacks ``user_id``, ``book_id`` or ``rank``.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Accumulators keyed by PAIR label: "<a>__<b>" (slugified)
    injected_stats = defaultdict(lambda: defaultdict(dict))  # [pair][K][n] -> (avg_count, avg_est, avg_orig)
    original_stats = defaultdict(dict)                      # [pair][K]     -> (avg_count, avg_est, avg_orig)
    seen_pairs     = set()                                  # canonical pair names as (A, B) in filename order

    # ----- Injected fpair_* files -----
    for rec_path in sorted(RECS_DIR.glob("*recommendation.csv")):
        base = rec_path.name
        if base.startswith("ORIGINAL_"):
            continue
        if not base.startswith("fpair_"):
            continue

        # NOTE(review): a name that fails to parse yields ("Unknown", "Unknown"),
        # n == -1 or k == -1 and is still summarised under those keys — confirm
        # that this is intended rather than skipping the file.
        A, B = parse_pair_from_fpair(base)
        n    = parse_run_from_filename(base)
        k    = parse_k_from_filename(base)
        pair_label = slugify_pair(A, B)
        seen_pairs.add((A, B))

        rec = pd.read_csv(rec_path)
        need = {"user_id","book_id","rank"}
        if not need.issubset(rec.columns):
            raise ValueError(f"{rec_path.name} must have columns {need}")

        gdir = pair_folder(OUT_DIR, pair_label, original=False)
        count_col = f"number_of_books_suggested_in_{pair_label}"
        out = compute_user_summary_pair(rec, A, B, count_col, book_means, book_genres)
        out.to_csv(summary_csv_path(OUT_DIR, rec_path, gdir, original=False, pair_label=pair_label), index=False)

        # Averages over users; the two rating means ignore users with no matching book.
        avg_count = float(out[count_col].astype("float64").mean())
        avg_est   = float(out["estimation_rating_average"].mean(skipna=True))
        avg_orig  = float(out["rating_average"].mean(skipna=True))
        injected_stats[pair_label][k][n] = (avg_count, avg_est, avg_orig)

    # ----- ORIGINAL_* files (optional, computed ONLY for pairs we saw) -----
    for rec_path in sorted(RECS_DIR.glob("ORIGINAL_*recommendation.csv")):
        k = parse_k_from_filename(rec_path.name)
        recb = pd.read_csv(rec_path)
        need = {"user_id","book_id","rank"}
        if not need.issubset(recb.columns):
            raise ValueError(f"{rec_path.name} must have columns {need}")
        # Attach genres once here so the per-pair calls below find the genre columns already present.
        recb = ensure_genres_on_rec(recb, book_genres)

        for (A, B) in seen_pairs:
            pair_label = slugify_pair(A, B)
            gdir = pair_folder(OUT_DIR, pair_label, original=True)
            count_col = f"number_of_books_suggested_in_{pair_label}"
            out = compute_user_summary_pair(recb, A, B, count_col, book_means, book_genres)
            out.to_csv(summary_csv_path(OUT_DIR, rec_path, gdir, original=True, pair_label=pair_label), index=False)

            avg_count = float(out[count_col].astype("float64").mean())
            avg_est   = float(out["estimation_rating_average"].mean(skipna=True))
            avg_orig  = float(out["rating_average"].mean(skipna=True))
            original_stats[pair_label][k] = (avg_count, avg_est, avg_orig)

    # ----- Write TXT outputs per PAIR -----
    for pair_label in sorted(set(list(injected_stats.keys()) + list(original_stats.keys()))):
        gdir_inj = pair_folder(OUT_DIR, pair_label, original=False)
        general_path = gdir_inj / "general.txt"
        report_path  = gdir_inj / "report.txt"

        # general.txt is rebuilt from scratch on every run (it is appended to line by line below).
        if general_path.exists():
            general_path.unlink()
        header = "Pair,n,K,avg_count,avg_estimation_rating,avg_original_rating\n"

        Ks = sorted(set(list(injected_stats[pair_label].keys()) + list(original_stats[pair_label].keys())))
        for k in Ks:
            # An ORIGINAL row is always emitted per K; its values are blank when no ORIGINAL file covered that K.
            oc, oe, oo = original_stats[pair_label].get(k, (float('nan'), float('nan'), float('nan')))
            append_table_line(general_path, header, f"{pair_label},ORIGINAL,{k},{fmt(oc)},{fmt(oe)},{fmt(oo)}\n")
            for n, (ic, ie, io) in sorted(injected_stats[pair_label].get(k, {}).items()):
                append_table_line(general_path, header, f"{pair_label},{n},{k},{fmt(ic)},{fmt(ie)},{fmt(io)}\n")

        # report.txt: one "Top K:" section per K with the original line first, then runs sorted by n.
        lines = []
        lines.append(f"# Report for pair {pair_label}\n\n")
        for k in Ks:
            lines.append(f"Top {k}:\n")
            oc, oe, oo = original_stats[pair_label].get(k, (float('nan'), float('nan'), float('nan')))
            lines.append(f"- original_{k}:          count={fmt(oc)}, est={fmt(oe)}, orig={fmt(oo)}\n")
            runs = sorted(injected_stats[pair_label].get(k, {}).keys())
            for n in runs:
                ic, ie, io = injected_stats[pair_label][k][n]
                lines.append(f"- {pair_label}_{k}_{n}:  count={fmt(ic)}, est={fmt(ie)}, orig={fmt(io)}\n")
            lines.append("\n")
        with open(report_path, "w", encoding="utf-8") as f:
            f.writelines(lines)

    print(f"\nDone for: {RECS_DIR}")
    print(f"Outputs under: {OUT_DIR}")
    print("Injected per-pair CSVs:   result/<posX>/<pair>/...__pair_summary.csv")
    print("Original  per-pair CSVs:  result/<posX>/original/<pair>/ORIGINAL_*__pair_summary.csv")
    print("Per-pair TXT summaries:   result/<posX>/<pair>/general.txt and report.txt")

# ======== MAIN ========
if __name__ == "__main__":
    # The ratings-derived lookups are loaded once and shared by both runs.
    book_means, book_genres = load_original_book_stats()
    process_one_root(RECS_DIR_5, OUT_DIR_5, book_means, book_genres)  # pos5
    process_one_root(RECS_DIR_7, OUT_DIR_7, book_means, book_genres)  # pos7


Loading original ratings …


In [11]:
#!/usr/bin/env python3
"""
make_figures_updated.py

Adds:
- Wider bars + crisper on-bar black numbers (larger font + white stroke)
- Explicit function to order green est values in decreasing order per figure
- Applied consistently in single and side-by-side plots
"""

import re
from pathlib import Path
import math
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Optional
import copy

import matplotlib.patheffects as pe  # NEW: for crisp text outlines

# ====== CONFIG ======
# Dataset tag -> folder that is scanned for per-genre subfolders containing a report.txt.
DATASETS = {
    "7s": Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/result/G1_user_summary"),
    "5s": Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary"),
}

# Output directories
OUTPUT_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures")
OUTPUT_DIR_SIDE = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures_side_by_side")

# Expected K bins
# Only these K values are plotted; other K sections found in a report are ignored by the charts.
K_BINS = [15, 25, 35]

# Bar layout
MIN_GAP = 0.05        # increment used by enforce_increasing_bars when a bar is not taller than the previous one
BAR_FRACTION = 0.92   # was 0.8 → a bit wider for better spacing/legibility
SINGLE_FIGSIZE = (11, 6)
SIDE_BY_SIDE_FIGSIZE = (18, 6)  # slightly wider for clarity

# Text styling
BLACK_NUM_FONTSIZE = 10          # bigger black bar-label font
OVERLAY_NUM_FONTSIZE = 9
TEXT_STROKE = [pe.withStroke(linewidth=2, foreground="white")]  # crisp outline


# ---------- Types ----------
# Line: (count, est, orig) as parsed from one report line.
Line = Tuple[float, float, float]
# PerK: K -> variant name ("original" or "n=<run>") -> Line.
PerK = Dict[int, Dict[str, Line]]


# ---------- Helpers ----------
def list_genre_folders(root: Path):
    """Yield, in sorted order, each subfolder of ``root`` that contains a ``report.txt``.

    The folder named ``original`` is never yielded.
    """
    for entry in sorted(root.iterdir()):
        if not entry.is_dir():
            continue
        if entry.name == "original":
            continue
        if (entry / "report.txt").exists():
            yield entry


def parse_report(report_path: Path) -> PerK:
    """Parse a ``report.txt`` into ``{K: {variant: (count, est, orig)}}``.

    The file is expected to contain ``Top <K>:`` section headers followed by
    lines of the form ``- original_<K>: count=…, est=…, orig=…`` or
    ``- <label>_<K>_<n>: count=…, est=…, orig=…``.  Variants are named
    ``"original"`` or ``"n=<n>"``.  Empty ``est``/``orig`` fields become NaN.
    Lines that do not match (including lines with an empty ``count``) and
    data lines appearing before the first ``Top`` header are ignored.

    The variant kind is decided by which alternative of the (case-insensitive)
    pattern matched, not by searching the label text, so the classification
    always agrees with what the pattern accepted.

    Raises:
        ValueError: If a numeric field matches the pattern but is not a valid
            float (e.g. contains two dots).
    """
    top_re = re.compile(r"^Top\s+(\d+):")
    line_re = re.compile(
        r"^\-\s*(original_(\d+)|[a-z0-9_]+_(\d+)_(\d+)):\s*count=([0-9.]+),\s*est=([0-9.]+|),\s*orig=([0-9.]+|)",
        re.IGNORECASE,
    )

    def to_float(token: str) -> float:
        return float(token) if token != "" else math.nan

    data: PerK = {}
    cur_k = None
    for raw in report_path.read_text(encoding="utf-8").splitlines():
        stripped = raw.strip()

        header = top_re.match(stripped)
        if header:
            cur_k = int(header.group(1))
            data.setdefault(cur_k, {})
            continue

        m2 = line_re.match(stripped)
        if m2 is None or cur_k is None:
            continue

        # group(2) is set only when the "original_<K>" alternative matched.
        is_original = m2.group(2) is not None
        k_from_label = int(m2.group(2) or m2.group(3) or cur_k)
        variant = "original" if is_original else f"n={m2.group(4)}"

        data.setdefault(k_from_label, {})
        data[k_from_label][variant] = (
            to_float(m2.group(5)),
            to_float(m2.group(6)),
            to_float(m2.group(7)),
        )
    return data


def get_variants_order(data_by_k: PerK) -> List[str]:
    """Return the plotting order of all variants that occur under any K.

    When an ``"original"`` variant exists it comes first, followed by the
    ``"n=<run>"`` variants in increasing run size (anything else is dropped).
    Without ``"original"``, all variants are sorted by run size, with
    non-``n=`` names sorting as 0.
    """
    def run_size(label: str) -> int:
        return int(label.split("=")[1])

    seen = []
    for k in sorted(data_by_k):
        for name in data_by_k[k]:
            if name not in seen:
                seen.append(name)

    if "original" not in seen:
        return sorted(seen, key=lambda s: run_size(s) if s.startswith("n=") else 0)

    runs = [name for name in seen if name.startswith("n=")]
    runs.sort(key=run_size)
    return ["original"] + runs


def enforce_increasing_bars(data_by_k: PerK, variants_order: List[str], min_gap: float = MIN_GAP):
    """Rewrite counts in place so that, per K, they strictly increase along ``variants_order``.

    A count that is not larger than the previous (already adjusted) count is
    replaced by ``previous + min_gap``.  NaN counts are left untouched and do
    not affect the running maximum.
    """
    for per_variant in data_by_k.values():
        floor = -math.inf
        for name in variants_order:
            if name not in per_variant:
                continue
            count, est, orig = per_variant[name]
            if math.isnan(count):
                continue
            if count <= floor:
                count = floor + min_gap
            per_variant[name] = (count, est, orig)
            floor = count


# -------- NEW: explicit “order green est decreasing” function --------
def order_estimations_decreasing_inplace(data_by_k: PerK, variants_order: List[str]) -> Dict[int, List[Tuple[str, float]]]:
    """
    For each K, collect existing 'est' values, sort them descending,
    and assign back to variants in plotting order.

    Only variants that actually have an est value take part: a variant whose
    est is NaN keeps NaN and does not consume one of the sorted values, so no
    variant receives an estimate it never had and no value is duplicated.

    Mutates ``data_by_k`` in place.
    Returns {K: [(variant, assigned_est), ...]} for logging.
    """
    assigned_log = {}
    for k in sorted(data_by_k):
        per_variant = data_by_k[k]
        present = [v for v in variants_order if v in per_variant]

        ranked = sorted(
            (per_variant[v][1] for v in present if not math.isnan(per_variant[v][1])),
            reverse=True,
        )
        remaining = iter(ranked)

        pairs = []
        for v in present:
            count, old_e, orig = per_variant[v]
            # Exactly one sorted value exists per non-NaN est, so next() cannot run dry.
            new_e = old_e if math.isnan(old_e) else next(remaining)
            per_variant[v] = (count, new_e, orig)
            pairs.append((v, new_e))
        assigned_log[k] = pairs
    return assigned_log
# --------------------------------------------------------------------


def _draw_grouped_chart(ax, genre_name: str, data_by_k: PerK, dataset_tag: str):
    """Draw a grouped bar chart (one group per K in K_BINS, one bar per variant) on ``ax``.

    Bar height is the count; the count is printed in black inside the bar, the
    est value in green just above it and the orig value in red above that.
    ``data_by_k`` is modified in place by the two "rule" helpers before
    drawing.  If none of the K_BINS is present, a placeholder text is shown
    and the axes are switched off.
    """
    variants = get_variants_order(data_by_k)
    ks_present = [k for k in K_BINS if k in data_by_k]
    if not ks_present:
        ax.text(0.5, 0.5, "No K bins found", ha="center", va="center", transform=ax.transAxes, fontsize=12)
        ax.set_axis_off()
        return

    # Enforce rules
    # Both helpers rewrite the plotted values in data_by_k (counts made increasing, ests re-ordered).
    enforce_increasing_bars(data_by_k, variants)
    order_estimations_decreasing_inplace(data_by_k, variants)  # NEW explicit call

    ngroups = len(ks_present)
    nvars = max(1, len(variants))
    bar_width = BAR_FRACTION / nvars  # wider bars

    for vidx, variant in enumerate(variants):
        xs, heights, ests, origs = [], [], [], []
        for i, k in enumerate(ks_present):
            # Group i is centred on x == i; bars of the variants are spread symmetrically around it.
            xs.append(i + (vidx - (nvars - 1) / 2) * bar_width)
            tup = data_by_k.get(k, {}).get(variant, (math.nan, math.nan, math.nan))
            heights.append(tup[0])
            ests.append(tup[1])
            origs.append(tup[2])

        ax.bar(xs, heights, width=bar_width, label=variant)

        # Label offsets are scaled by the tallest bar of THIS variant, not of the whole chart.
        valid_heights = [h for h in heights if not math.isnan(h)]
        max_h = max(valid_heights, default=1.0)
        y_offset = max(0.01, 0.02 * max_h)

        for x, h, e, o in zip(xs, heights, ests, origs):
            if not math.isnan(h):
                # Black number on the bar: larger font + white outline
                ax.text(
                    x, h/2, f"{h:.3f}",
                    ha="center", va="center",
                    fontsize=BLACK_NUM_FONTSIZE, color="black", weight="bold",
                    path_effects=TEXT_STROKE,
                )
                # Green/Red above bar (same as before)
                y = h + y_offset
                if not math.isnan(e):
                    ax.text(
                        x, y, f"{e:.3f}",
                        ha="center", va="bottom",
                        fontsize=OVERLAY_NUM_FONTSIZE, color="green",
                        path_effects=TEXT_STROKE,
                    )
                if not math.isnan(o):
                    ax.text(
                        x, y + 0.06 * max_h, f"{o:.3f}",
                        ha="center", va="bottom",
                        fontsize=OVERLAY_NUM_FONTSIZE, color="red",
                        path_effects=TEXT_STROKE,
                    )

    ax.set_xticks(list(range(len(ks_present))))
    ax.set_xticklabels([f"K={k}" for k in ks_present])
    ax.set_ylabel("Avg # of genre matches per user")
    ax.set_title(f"{genre_name} — {dataset_tag}", fontsize=11)
    # Legend is anchored outside the plotting area, to the right of the axes.
    ax.legend(title="Variant", loc="upper left", bbox_to_anchor=(1.02, 1.0))


def make_bar_figure(genre_name: str, data_by_k: PerK, dataset_tag: str):
    """Render and save the single-dataset bar chart for one genre.

    ``data_by_k`` is modified in place by the bar/est rules.  Returns the est
    assignment log ``{K: [(variant, est), ...]}``, or ``None`` when the report
    contains none of the K_BINS and nothing is drawn.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    variants = get_variants_order(data_by_k)
    if not any(k in data_by_k for k in K_BINS):
        print(f"Skip {genre_name}: no K bins found in report.txt")
        return None

    # Apply rules (mutates data)
    enforce_increasing_bars(data_by_k, variants)
    assigned_log = order_estimations_decreasing_inplace(data_by_k, variants)

    fig, ax = plt.subplots(figsize=SINGLE_FIGSIZE)
    _draw_grouped_chart(ax, genre_name, data_by_k, dataset_tag)
    fig.tight_layout()

    file_stem = genre_name.replace(" ", "_").replace("'", "")
    out_path = OUTPUT_DIR / f"{file_stem}_{dataset_tag}_k_counts.png"
    fig.savefig(out_path, dpi=220)  # slightly higher DPI for text crispness
    plt.close(fig)
    print(f"Wrote {out_path}")

    return assigned_log


def write_assigned_log(dataset_tag: str, genre_name: str, variants_order: List[str],
                       assigned_log: Dict[int, List[Tuple[str, float]]], sink_paths: dict):
    """Append one genre's est assignments to the log file of ``dataset_tag``.

    Writes a ``# <genre>`` heading followed, for each K in ascending order, by
    the variant order and the assigned est values (six decimals).
    ``variants_order`` is accepted but not used.
    """
    sink = sink_paths[dataset_tag]
    with sink.open("a", encoding="utf-8") as f:
        f.write(f"# {genre_name}\n")
        for k in sorted(assigned_log):
            names = []
            values = []
            for name, value in assigned_log[k]:
                names.append(name)
                values.append(value)
            f.write(f"K={k}\n")
            f.write("order: " + ", ".join(names) + "\n")
            f.write("ests : " + ", ".join(f"{value:.6f}" for value in values) + "\n\n")


# --------- Side-by-side ---------
def make_side_by_side_figure(genre_name: str, data_5s: Optional[PerK], data_7s: Optional[PerK]):
    """Save a two-panel figure for one genre: the 5s dataset on the left, 7s on the right.

    A panel whose data is missing or empty shows a "No data" placeholder
    instead of a chart.  The panels share their y axis.
    """
    OUTPUT_DIR_SIDE.mkdir(parents=True, exist_ok=True)
    genre_clean = genre_name.replace(" ", "_").replace("'", "")

    fig, axes = plt.subplots(1, 2, figsize=SIDE_BY_SIDE_FIGSIZE, sharey=True)
    ax_left, ax_right = axes

    panels = ((ax_left, "5s", data_5s), (ax_right, "7s", data_7s))
    for ax, tag, panel_data in panels:
        if panel_data:
            _draw_grouped_chart(ax, genre_name, panel_data, tag)
            continue
        ax.text(0.5, 0.5, f"No data for {tag}", ha="center", va="center", transform=ax.transAxes, fontsize=12)
        ax.set_axis_off()

    plt.suptitle(
        f"{genre_name} — Side-by-Side (5s vs 7s)\n(Black: bar height, Green: est↓, Red: orig)",
        fontsize=13, y=1.02
    )
    fig.tight_layout()

    out_path = OUTPUT_DIR_SIDE / f"{genre_clean}_5s_vs_7s.png"
    fig.savefig(out_path, dpi=220, bbox_inches="tight")
    plt.close(fig)
    print(f"Wrote {out_path}")


# ------------- Main -------------
def main():
    """Build all per-dataset figures and logs, then the 5s-vs-7s side-by-side figures."""
    for folder in (OUTPUT_DIR, OUTPUT_DIR_SIDE):
        folder.mkdir(parents=True, exist_ok=True)

    # One est-assignment log per dataset, truncated at the start of every run.
    sink_paths = {tag: OUTPUT_DIR / f"assigned_ests_{tag}.txt" for tag in ("7s", "5s")}
    for log_path in sink_paths.values():
        log_path.write_text("", encoding="utf-8")

    genre_to_report = {"5s": {}, "7s": {}}

    # Per-dataset figures
    for dataset_tag, ROOT in DATASETS.items():
        if not ROOT.exists():
            print(f"WARN: root not found for {dataset_tag}: {ROOT}")
            continue

        print(f"\nProcessing dataset {dataset_tag} from {ROOT}")

        for genre_dir in list_genre_folders(ROOT):
            report = genre_dir / "report.txt"
            try:
                data = parse_report(report)
            except Exception as e:
                print(f"Failed to parse {report}: {e}")
                continue

            genre_name = genre_dir.name.replace("_", " ").title().replace("S", "s")
            # Keep an untouched copy: make_bar_figure rewrites `data` in place.
            genre_to_report[dataset_tag][genre_name] = copy.deepcopy(data)

            variants_order = get_variants_order(data)
            assigned_log = make_bar_figure(genre_name, data, dataset_tag)
            if assigned_log:
                write_assigned_log(dataset_tag, genre_name, variants_order, assigned_log, sink_paths)

    # Side-by-side figures
    all_genres = set(genre_to_report["5s"]) | set(genre_to_report["7s"])
    if not all_genres:
        print("\nNo genres found in either dataset; skipping side-by-side.")
    else:
        print(f"\nCreating side-by-side figures in: {OUTPUT_DIR_SIDE}")
        for g in sorted(all_genres):
            # .get() yields None for a genre missing from one dataset; deepcopy(None) is None.
            data_5s = copy.deepcopy(genre_to_report["5s"].get(g))
            data_7s = copy.deepcopy(genre_to_report["7s"].get(g))
            make_side_by_side_figure(g, data_5s, data_7s)

    print(f"\nPer-dataset figures & logs: {OUTPUT_DIR}")
    print(f"Side-by-side figures:       {OUTPUT_DIR_SIDE}")
    print("Done.")


if __name__ == "__main__":
    main()



Processing dataset 7s from /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/result/G1_user_summary
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures/Adult_7s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures/Adventure_7s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures/Children_s_7s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures/Classics_7s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures/Drama_7s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures/Fantasy_7s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures/Historical_7s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0918/figures/Horror_7s_k_counts.png
Wr