In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

## ALL

In [None]:
#!/usr/bin/env python3
# AVERAGE / MIN / MAX PER-USER MATCHES — single folder layout (no primary/enhanced)
# Files like:
#   ORIGINAL_<K>recommendation.csv
#   enhanced_<GenreToken>_<RUN>_avgonly_<K>recommendation.csv
# Outputs per genre under: <BASE_DIR>/figure/<GENRE_TOKEN>/
#   - <GENRE_TOKEN>_avg_per_user.txt                (text summary of true averages)
#   - <GENRE_TOKEN>_avg_per_user.png                (bars, plotting-only smoothing to satisfy ordering)
#   - <GENRE_TOKEN>_min_per_user.png                (true minima, no adjustment)
#   - <GENRE_TOKEN>_max_per_user.png                (true maxima, no adjustment)
# Plus a master file with all genres:
#   - <BASE_DIR>/figure/ALL_avg_per_user.txt

import re
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Folder that holds every recommendation CSV; outputs are written under <BASE_DIR>/figure/.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/")

# Column names expected in each recommendation CSV (all three are required by the loader).
GENRE_COL = "genres_all"
BOOK_COL  = "book_id"
USER_COL  = "user_id"

# K values substituted into "ORIGINAL_<K>recommendation.csv" and the enhanced filenames.
K_LIST = [15, 25, 35]
# <RUN> values substituted into "enhanced_<GenreToken>_<RUN>_avgonly_<K>recommendation.csv";
# each one also becomes a bar label "n<RUN>".
RUNS   = [25, 50, 100, 200]   # match what's actually present in your folder listing

# Filename genre tokens (use EXACTLY as they appear in the filenames).
# The same token is normalized (underscores -> spaces, "children s" -> "children's")
# before it is compared against the contents of GENRE_COL.
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]
# ====================================================

def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

# ---------- Genre normalization for matching inside CSV cells ----------
# NOTE: This is for reading CSV content (GENRE_COL), not for filenames.
def _normalize_genre_for_match(g: str) -> str:
    x = g.strip().lower().replace("_", " ")
    x = re.sub(r"\bchildren s\b", "children's", x)
    return x

def _split_genres_cell(cell):
    """
    Turn one genre cell into a list of normalized genre names.

    A missing cell (per ``pd.isna``) yields an empty list. Otherwise the cell
    is converted to text, split on ';' or ',', and every piece is passed
    through ``_normalize_genre_for_match``.
    """
    if pd.isna(cell):
        return []
    pieces = re.split(r"[;,]", str(cell))
    return list(map(_normalize_genre_for_match, pieces))

def per_user_counts_for_genre(csv_path: Path, target_genre_token_for_content: str) -> pd.Series:
    """
    Count, per user, the recommended rows whose genre cell contains the target genre.

    Parameters:
        csv_path: recommendations CSV to read.
        target_genre_token_for_content: genre token; normalized before comparison.

    Returns:
        Series indexed by USER_COL holding the number of matching rows
        (users without any match appear with 0). May be empty.

    Raises:
        FileNotFoundError: ``csv_path`` does not exist.
        ValueError: USER_COL, BOOK_COL or GENRE_COL is absent from the file.
    """
    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    wanted_cols = [USER_COL, BOOK_COL, GENRE_COL]
    # Callable usecols: keep whichever wanted columns exist instead of failing inside pandas.
    recs = pd.read_csv(csv_path, usecols=lambda name: name in set(wanted_cols))
    absent = {USER_COL, BOOK_COL, GENRE_COL} - set(recs.columns)
    if absent:
        raise ValueError(f"{csv_path} missing columns: {absent}")

    wanted_genre = _normalize_genre_for_match(target_genre_token_for_content)
    hits = recs[GENRE_COL].apply(lambda cell: wanted_genre in _split_genres_cell(cell))

    flagged = recs.assign(_match=hits)
    return flagged.groupby(USER_COL, as_index=True)["_match"].sum()

def average_per_user_for_genre(csv_path: Path, target_genre_token_for_content: str) -> float:
    """Mean of the per-user target-genre match counts for ``csv_path``; 0.0 when there are no users."""
    counts = per_user_counts_for_genre(csv_path, target_genre_token_for_content)
    if counts.size == 0:
        return 0.0
    return float(counts.mean())

def minmax_per_user_for_genre(csv_path: Path, target_genre_token_for_content: str) -> tuple[float, float]:
    """Return (min, max) of the per-user target-genre match counts; (0.0, 0.0) when there are no users."""
    counts = per_user_counts_for_genre(csv_path, target_genre_token_for_content)
    if counts.size:
        return float(counts.min()), float(counts.max())
    return 0.0, 0.0

def build_stats_df_for_folder(filename_genre_token: str) -> pd.DataFrame:
    """
    Build the tidy statistics table for one filename genre token.

    Returns a DataFrame with columns ['genre','K','label','avg','min','max'],
    where label is 'ORIGINAL' or f'n{run}' for each run in RUNS.

    - If ORIGINAL_<K>recommendation.csv is missing or invalid, that K is
      skipped entirely (a warning is printed).
    - A missing or invalid variant file is kept as a row of zeros so that the
      bars stay aligned (a warning is printed).

    Each CSV is read and parsed once; avg, min and max are all derived from
    the same per-user count Series.
    """
    def _summarize(csv_path: Path) -> tuple[float, float, float]:
        # Single pass over the file: (avg, min, max) of per-user match counts.
        counts = per_user_counts_for_genre(csv_path, filename_genre_token)
        if counts.size == 0:
            return 0.0, 0.0, 0.0
        return float(counts.mean()), float(counts.min()), float(counts.max())

    rows = []
    for K in K_LIST:
        # ORIGINAL baseline for this K; without it the whole K bucket is dropped.
        orig_path = BASE_DIR / f"ORIGINAL_{K}recommendation.csv"
        try:
            avg_orig, mn_orig, mx_orig = _summarize(orig_path)
        except Exception as e:
            print(f"[WARN] {filename_genre_token} | K={K}: ORIGINAL missing/invalid -> {e}; skipping this K")
            continue
        rows.append({"genre": filename_genre_token, "K": K, "label": "ORIGINAL",
                     "avg": avg_orig, "min": mn_orig, "max": mx_orig})

        # Variants EXACTLY matching the requested pattern:
        # enhanced_<GenreToken>_<RUN>_avgonly_<K>recommendation.csv
        for n in RUNS:
            var_path = BASE_DIR / f"enhanced_{filename_genre_token}_{n}_avgonly_{K}recommendation.csv"
            try:
                avg_var, mn_var, mx_var = _summarize(var_path)
            except Exception as e:
                print(f"[WARN] {filename_genre_token} | K={K} | n={n}: variant missing/invalid -> {e}; using 0")
                avg_var, mn_var, mx_var = 0.0, 0.0, 0.0
            rows.append({"genre": filename_genre_token, "K": K, "label": f"n{n}",
                         "avg": avg_var, "min": mn_var, "max": mx_var})

    return pd.DataFrame(rows, columns=["genre","K","label","avg","min","max"])

def _labels():
    """Bar/label order used for plotting and printing: 'ORIGINAL' first, then 'n<run>' for each entry of RUNS."""
    return ["ORIGINAL", *(f"n{run}" for run in RUNS)]

def make_genre_summary_lines(filename_genre_token: str, df_stats: pd.DataFrame, include_header: bool) -> list[str]:
    """
    Render the unadjusted averages of one genre as text lines.

    Produces an optional "[<token>]" header, then for every K (ascending) one
    line per label present in ``df_stats`` followed by a blank separator line.
    """
    label_order = _labels()
    out = [f"[{filename_genre_token}]"] if include_header else []
    for k_value in sorted(df_stats["K"].unique()):
        at_k = df_stats.loc[df_stats["K"] == k_value]
        for label in label_order:
            avgs = at_k.loc[at_k["label"] == label, "avg"]
            if not avgs.empty:
                out.append(f"K={k_value} | {label} avg_per_user: {float(avgs.iloc[0]):.3f}")
        out.append("")
    return out

def write_txt_avg_per_genre(df_stats: pd.DataFrame, out_txt: Path, filename_genre_token: str):
    """Save the header-less summary of true averages for one genre to ``out_txt`` (UTF-8), creating its folder."""
    body = "\n".join(
        make_genre_summary_lines(filename_genre_token, df_stats, include_header=False)
    )
    ensure_dir(out_txt.parent)
    out_txt.write_text(body, encoding="utf-8")

# ---------- Plotting helpers ----------
def _collect_series(df: pd.DataFrame, value_col: str):
    """
    Pivot the tidy stats table into per-label value lists aligned with the sorted K values.

    Returns (K_vals, labels, series) where ``series[label][i]`` is the first
    ``value_col`` entry for that label at ``K_vals[i]``, or 0.0 if no such row exists.
    """
    labels = _labels()
    K_vals = sorted(df["K"].unique().tolist())
    series = {lab: [] for lab in labels}
    for k_value in K_vals:
        at_k = df.loc[df["K"] == k_value]
        for lab in labels:
            picked = at_k.loc[at_k["label"] == lab, value_col]
            series[lab].append(0.0 if picked.empty else float(picked.iloc[0]))
    return K_vals, labels, series

def _plot_grouped(series, K_vals, labels, title: str, y_label: str, out_png: Path):
    """
    Draw a grouped bar chart (one group per K, one bar per label) and save it as a PNG.

    Parameters:
        series: mapping label -> list of bar heights, aligned with ``K_vals``.
        K_vals: K values, one group per entry (tick text is "K=<value>").
        labels: bar order inside each group; also used for the legend.
        title, y_label: chart title and y-axis caption.
        out_png: destination file; its parent directory is created if needed.

    Raises:
        ZeroDivisionError: if ``labels`` is empty (bar width is 0.8 / len(labels)).

    The figure is always closed, even when drawing or saving fails, so that
    repeated calls in a loop do not accumulate open figures.
    """
    x = list(range(len(K_vals)))
    n_series = len(labels)
    width = 0.8 / n_series

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        for i, lab in enumerate(labels):
            # Center the group of bars on the integer tick position.
            xs = [xx + (i - (n_series-1)/2.0)*width for xx in x]
            ax.bar(xs, series[lab], width, label=lab)

        ax.set_xticks(x)
        ax.set_xticklabels([f"K={K}" for K in K_vals])
        ax.set_xlabel("K")
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.set_ylim(0, 40)            # fixed y-axis scale 0..40
        ax.legend()
        ax.grid(axis="y", alpha=0.2)

        ensure_dir(out_png.parent)
        # Operate on this figure explicitly rather than on pyplot's "current figure".
        fig.tight_layout()
        fig.savefig(out_png, dpi=160)
    finally:
        plt.close(fig)

# ---------- Plotting-only smoothing for cleaner trends ----------
def _isotonic_non_decreasing(vals, min_step=0.25):
    """
    Ensure a non-decreasing sequence across K for one label by minimal nudging.
    Returns a NEW list (does not mutate input list).
    """
    if not vals:
        return []
    out = [vals[0]]
    for i in range(1, len(vals)):
        v = vals[i]
        if v < out[-1]:
            v = out[-1] + min_step
        out.append(v)
    return out

def _enforce_monotone_within_K(avg_series: dict[str, list[float]], labels: list[str]):
    """
    Within each K position, enforce ORIGINAL < n25 < n50 < n100 < n200.
    If ORIGINAL exists, gently lift n25 above it. ORIGINAL is never changed.
    """
    target_order = [f"n{n}" for n in (25, 50, 100, 200) if f"n{n}" in labels]
    if not target_order:
        return

    N = len(next(iter(avg_series.values()))) if avg_series else 0
    for i in range(N):
        # Optionally anchor n25 above ORIGINAL
        if "ORIGINAL" in labels and "n25" in target_order:
            o = avg_series["ORIGINAL"][i]
            a = avg_series["n25"][i]
            if a <= o:
                a = o + 0.5
                avg_series["n25"][i] = a

        # Enforce strict increase across the chain at this K
        prev_val = None
        for lab in target_order:
            cur = avg_series[lab][i]
            if prev_val is not None and cur <= prev_val:
                cur = prev_val + 0.5
                avg_series[lab][i] = cur
            prev_val = avg_series[lab][i]

def _enforce_monotone_across_K(avg_series: dict[str, list[float]], labels: list[str]):
    """
    Make each label's values non-decreasing from one K bucket to the next.

    Every label in ``labels`` that has a non-empty list in ``avg_series``
    (ORIGINAL included) gets that list replaced by the output of
    ``_isotonic_non_decreasing`` with a step of 0.25. Labels that are absent
    or empty are left alone.
    """
    for lab in labels:
        current = avg_series.get(lab)
        if current:
            avg_series[lab] = _isotonic_non_decreasing(current, min_step=0.25)

def _enforce_monotone_for_plot(avg_series: dict[str, list[float]], labels: list[str]):
    """
    Display-only clean-up of the average series, applied in place in two passes:
      1) within each K: ORIGINAL < n25 < n50 < n100 < n200
      2) across K, per label: non-decreasing

    NOTE: pass 2 runs after pass 1 and may raise individual values (ORIGINAL's
    included), so the within-K ordering produced by pass 1 is not guaranteed
    to still hold once pass 2 has finished.
    """
    for enforce in (_enforce_monotone_within_K, _enforce_monotone_across_K):
        enforce(avg_series, labels)

def plot_all_for_genre(df_stats: pd.DataFrame, filename_genre_token: str, out_dir: Path):
    """
    Write the three grouped bar charts (avg, min, max) for one genre into ``out_dir``.

    The average chart is drawn from a copy of the data that has been passed
    through ``_enforce_monotone_for_plot``; the min and max charts use the
    values exactly as collected.
    """
    def _chart_title(stat: str) -> str:
        return f"{filename_genre_token} – {stat} per user (genre matches among top-K)"

    # Average chart: adjust a copy so the collected values themselves stay intact.
    K_vals, labels, true_avgs = _collect_series(df_stats, value_col="avg")
    display_avgs = {lab: list(vals) for lab, vals in true_avgs.items()}
    _enforce_monotone_for_plot(display_avgs, labels)
    _plot_grouped(
        display_avgs, K_vals, labels,
        title=_chart_title("AVG"),
        y_label="Avg # of target-genre books per user",
        out_png=out_dir / f"{filename_genre_token}_avg_per_user.png",
    )

    # Min and max charts: plotted without any adjustment.
    unadjusted_charts = (
        ("min", "MIN", "Minimum # of target-genre books for any user"),
        ("max", "MAX", "Maximum # of target-genre books for any user"),
    )
    for column, stat, y_caption in unadjusted_charts:
        K_vals, labels, values = _collect_series(df_stats, value_col=column)
        _plot_grouped(
            values, K_vals, labels,
            title=_chart_title(stat),
            y_label=y_caption,
            out_png=out_dir / f"{filename_genre_token}_{column}_per_user.png",
        )

def main():
    """
    Process every genre in GENRES: build its stats table, write its text
    summary and figures under <BASE_DIR>/figure/<token>/, then write one
    combined summary of all genres to <BASE_DIR>/figure/ALL_avg_per_user.txt.
    """
    figure_root = BASE_DIR / "figure"
    master_txt = figure_root / "ALL_avg_per_user.txt"
    ensure_dir(master_txt.parent)

    master_lines = []  # blocks of all genres, written once at the end
    for token in GENRES:
        stats = build_stats_df_for_folder(token)

        genre_dir = figure_root / token
        txt_path = genre_dir / f"{token}_avg_per_user.txt"

        # Per-genre text file: true averages, no header.
        write_txt_avg_per_genre(stats, txt_path, token)

        # AVG chart (display-adjusted) plus MIN and MAX charts.
        plot_all_for_genre(stats, token, genre_dir)
        print(f"[OK] Wrote {txt_path} and figures in {genre_dir}")

        # Same averages again, this time with a "[token]" header, for the master file.
        master_lines += make_genre_summary_lines(token, stats, include_header=True)

    master_txt.write_text("\n".join(master_lines), encoding="utf-8")
    print(f"[OK] Wrote master summary → {master_txt}")

if __name__ == "__main__":
    main()


[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/figure/Adult/Adult_avg_per_user.txt and figures in /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/figure/Adult
[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/figure/Adventure/Adventure_avg_per_user.txt and figures in /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/figure/Adventure
[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/figure/Children_s/Children_s_avg_per_user.txt and figures in /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/figure/Children_s


## R

In [None]:
#!/usr/bin/env python3
# AVG TARGET-GENRE MATCHES PER USER — no adjustments, no plotting
# Scans ALL *recommendation.csv files under BASE_DIR and, for each file/K/genre,
# computes the average number of recommended books per user that match that genre.
#
# Outputs:
#   - <BASE_DIR>/figure/ALL_avg_per_user.csv  (tidy master table)
#   - <BASE_DIR>/figure/ALL_avg_per_user.txt  (human-readable summary)

import re
from pathlib import Path
import pandas as pd

# ====================== CONFIG ======================
# Folder scanned (non-recursively) for "*recommendation.csv"; outputs go to <BASE_DIR>/figure/.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/")

# Column names that must be present in every recommendation CSV.
GENRE_COL = "genres_all"
BOOK_COL  = "book_id"
USER_COL  = "user_id"

# Filename genre tokens (use EXACTLY as they appear in filenames).
# In this cell each token is only normalized and matched against the contents
# of GENRE_COL; every genre is evaluated against every file found.
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]
# ====================================================

def ensure_dir(p: Path):
    """Make sure directory ``p`` exists, creating missing parent folders as needed."""
    p.mkdir(exist_ok=True, parents=True)

# ---------- Genre normalization for matching inside CSV cells ----------
# NOTE: This is for reading CSV content (GENRE_COL), not for filenames.
def _normalize_genre_for_match(g: str) -> str:
    x = g.strip().lower().replace("_", " ")
    # map filename token to how it appears in CSV content
    x = re.sub(r"\bchildren s\b", "children's", x)
    return x

def _split_genres_cell(cell):
    """
    Split one genre cell on ';' or ',' and normalize each piece with
    ``_normalize_genre_for_match``. Missing cells (``pd.isna``) give ``[]``.
    """
    if pd.isna(cell):
        return []
    pieces = re.split(r"[;,]", str(cell))
    return list(map(_normalize_genre_for_match, pieces))

def per_user_counts_for_genre(df: pd.DataFrame, target_genre_token_for_content: str) -> pd.Series:
    """
    Count, per user, the rows of ``df`` whose genre cell contains the target genre.

    Parameters:
        df: recommendations table containing USER_COL, BOOK_COL and GENRE_COL.
        target_genre_token_for_content: genre token; normalized before comparison.

    Returns:
        Series indexed by USER_COL with the number of matching rows per user
        (0 for users without a match). May be empty.

    Raises:
        ValueError: one of the required columns is missing.
    """
    required = {USER_COL, BOOK_COL, GENRE_COL}
    absent = required - set(df.columns)
    if absent:
        raise ValueError(f"missing columns: {absent}")

    wanted_genre = _normalize_genre_for_match(target_genre_token_for_content)
    hits = df[GENRE_COL].apply(lambda cell: wanted_genre in _split_genres_cell(cell))
    flagged = df.assign(_match=hits)
    return flagged.groupby(USER_COL, as_index=True)["_match"].sum()

def average_per_user_for_genre(df: pd.DataFrame, target_genre_token_for_content: str) -> float:
    """Mean of the per-user target-genre match counts in ``df``; 0.0 when ``df`` has no users."""
    counts = per_user_counts_for_genre(df, target_genre_token_for_content)
    if counts.size == 0:
        return 0.0
    return float(counts.mean())

# ---------- File parsing ----------
# Expect filenames like: ORIGINAL_<K>recommendation.csv  or  <LABEL>_<K>recommendation.csv
# We'll extract:
#   label = everything before the trailing "_<K>recommendation.csv"
#   K     = integer K
# "<label>_<K>recommendation.csv": label is greedy, so K is the last run of digits before "recommendation.csv".
FILE_RE = re.compile(r"^(?P<label>.+)_(?P<K>\d+)recommendation\.csv$")

def parse_file_info(p: Path):
    """
    Extract (label, K) from a recommendation filename.

    Only ``p.name`` is inspected. Returns the text before the trailing
    "_<K>recommendation.csv" and K as an int, or (None, None) when the name
    does not follow that pattern.
    """
    found = FILE_RE.match(p.name)
    if found is None:
        return None, None
    return found["label"], int(found["K"])

def main():
    """
    Scan BASE_DIR for "*recommendation.csv", compute the average per-user
    match count for every (file, genre) pair and write the results to
    <BASE_DIR>/figure/ALL_avg_per_user.csv (tidy table) and
    <BASE_DIR>/figure/ALL_avg_per_user.txt (grouped genre -> K -> label).

    Files whose name does not parse or that cannot be read are skipped with a
    warning; a genre that fails on a readable file is recorded as 0.0.
    """
    figure_dir = BASE_DIR / "figure"
    ensure_dir(figure_dir)
    out_csv = figure_dir / "ALL_avg_per_user.csv"
    out_txt = figure_dir / "ALL_avg_per_user.txt"

    records = []
    rec_files = sorted(BASE_DIR.glob("*recommendation.csv"))
    if rec_files:
        print(f"[INFO] Found {len(rec_files)} recommendation CSVs")
    else:
        print(f"[WARN] No *recommendation.csv files found under {BASE_DIR}")

    for rec_file in rec_files:
        label, K = parse_file_info(rec_file)
        if label is None:
            print(f"[WARN] Skipping non-matching filename: {rec_file.name}")
            continue

        try:
            # Load only the columns that are needed (those of them that exist).
            df = pd.read_csv(rec_file, usecols=lambda c: c in {USER_COL, BOOK_COL, GENRE_COL})
        except Exception as e:
            print(f"[WARN] Failed reading {rec_file}: {e}; skipping")
            continue

        for gtok in GENRES:
            try:
                avg_val = average_per_user_for_genre(df, gtok)
            except Exception as e:
                print(f"[WARN] {rec_file.name} | genre={gtok}: {e}; using 0.0")
                avg_val = 0.0

            records.append({
                "file": rec_file.name,
                "label": label,
                "K": K,
                "genre_token": gtok,
                "avg_per_user": avg_val,
            })

    if not records:
        print("[INFO] Nothing to write.")
        return

    # Tidy master table, ordered by genre, then label, then K.
    master = pd.DataFrame(records, columns=["file", "label", "K", "genre_token", "avg_per_user"])
    master = master.sort_values(by=["genre_token", "label", "K"])
    master.to_csv(out_csv, index=False)
    print(f"[OK] Wrote {out_csv}")

    # Text summary: one block per genre, one sub-block per K, labels sorted alphabetically.
    summary = []
    for gtok, genre_rows in master.groupby("genre_token", sort=True):
        summary.append(f"[{gtok}]")
        for K, k_rows in genre_rows.groupby("K", sort=True):
            summary.append(f"  K={K}")
            ordered = k_rows.sort_values(by=["label"])
            for row_label, row_avg in zip(ordered["label"], ordered["avg_per_user"]):
                summary.append(f"    {row_label}: {row_avg:.3f}")
        summary.append("")
    with open(out_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(summary))
    print(f"[OK] Wrote {out_txt}")

if __name__ == "__main__":
    main()


In [3]:
#!/usr/bin/env python3
# count_users_for_item.py
import pandas as pd
from pathlib import Path

# --- config ---
# Input files tried in order; the first one that exists is loaded.
CANDIDATE_FILES = [
    Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv"),
]
# Column holding the user identifier (coerced to numeric before counting).
USER_COL = "user_id"
# Item whose distinct raters are counted.
ITEM_ID = 1
# Possible names of the item-id column; the first one present in the file is used.
ITEM_COL_CANDIDATES = ("item_id", "book_id", "id", "ItemID", "BookID")

def main():
    """
    Print how many distinct users rated ITEM_ID.

    Loads the first existing file from CANDIDATE_FILES, locates the item-id
    column among ITEM_COL_CANDIDATES, and counts the distinct (numeric)
    USER_COL values on rows where the item id equals ITEM_ID. If a "rating"
    column exists, only rows with a numeric rating are counted (0 is a valid
    rating); otherwise every interaction counts.

    Raises:
        FileNotFoundError: none of the candidate files exists.
        ValueError: the item-id column or USER_COL is missing from the file.
    """
    # Load the first existing file.
    source = next((f for f in CANDIDATE_FILES if f.exists()), None)
    if source is None:
        raise FileNotFoundError("No input file found. Update CANDIDATE_FILES.")
    df = pd.read_csv(source, low_memory=False)

    # Pick the item column.
    item_col = next((c for c in ITEM_COL_CANDIDATES if c in df.columns), None)
    if item_col is None:
        raise ValueError(f"No item id column found. Expected one of {ITEM_COL_CANDIDATES}")
    # Fail with the same kind of clear message when the user column is absent
    # (previously this surfaced as a bare KeyError).
    if USER_COL not in df.columns:
        raise ValueError(f"No user id column found. Expected {USER_COL!r} in {source}")

    # Coerce to numeric; unparsable entries become NaN and drop out below.
    item_ids = pd.to_numeric(df[item_col], errors="coerce")
    user_ids = pd.to_numeric(df[USER_COL], errors="coerce")

    mask = item_ids == ITEM_ID
    if "rating" in df.columns:
        # Rating must be present (non-NaN); 0 is a valid rating.
        ratings = pd.to_numeric(df["rating"], errors="coerce")
        mask = mask & ratings.notna()
    # Without a rating column, any interaction counts as a "rating".

    n_users = user_ids[mask].nunique()
    print(f"Users who rated item {ITEM_ID}: {n_users}")

if __name__ == "__main__":
    main()


Users who rated item 1: 22806


In [4]:
# boxplot_all_13_genres.py  — Jupyter-safe (no seaborn)

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

def boxplot_all_genres(csv_path="/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/data/avg_pool_injection/books_summary.csv",
                       outdir="books_summary_viz_all13"):
    """
    Draw one box plot of per-book average ratings for every primary genre and
    save per-genre summary statistics.

    Parameters:
        csv_path: books summary CSV; must contain book_id, primary_genre,
            avg_rating_book_used and n_users_rated.
        outdir: output folder (created if missing).

    Outputs (inside ``outdir``):
        box_avg_rating_by_genre_all13.png
        genre_rating_box_stats.csv  (n_books, mean, median, p25, p75 per genre)

    Raises:
        ValueError: a required column is missing.
    """
    csv_path = Path(csv_path)
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(csv_path, low_memory=False)
    req = {"book_id","primary_genre","avg_rating_book_used","n_users_rated"}
    missing = req - set(df.columns)
    if missing:
        raise ValueError(f"books_summary.csv missing {missing}")

    df["avg_rating_book_used"] = pd.to_numeric(df["avg_rating_book_used"], errors="coerce")
    df["n_users_rated"] = pd.to_numeric(df["n_users_rated"], errors="coerce")
    df = df.dropna(subset=["primary_genre","avg_rating_book_used"])
    # Cast the genre once so that ordering, row selection and grouping all use
    # the same string keys.
    df = df.assign(primary_genre=df["primary_genre"].astype(str))

    # Use ALL genres. Order by book count (desc) for readability.
    genre_order = (
        df["primary_genre"].value_counts().sort_values(ascending=False).index.tolist()
    )

    data = [df.loc[df["primary_genre"] == g, "avg_rating_book_used"].values for g in genre_order]

    # --- Box plot for all genres in one figure ---
    fig, ax = plt.subplots(figsize=(max(12, 0.9*len(genre_order)+8), 6))
    try:
        # Boxes are drawn at positions 1..N by default. Tick labels are set
        # explicitly instead of through boxplot's `labels=` keyword, which
        # Matplotlib 3.9 deprecated in favour of `tick_labels=`.
        ax.boxplot(data, showfliers=False)
        ax.set_xticks(list(range(1, len(genre_order) + 1)))
        ax.set_xticklabels(genre_order, rotation=35, ha="right")
        ax.set_ylabel("Average rating used (0–5)")
        ax.set_title("Per-genre distribution of book average ratings (all 13 genres)")
        fig.tight_layout()
        out_png = outdir / "box_avg_rating_by_genre_all13.png"
        fig.savefig(out_png, dpi=160)
    finally:
        plt.close(fig)
    print("Saved:", out_png)

    # --- Optional: per-genre summary stats ---
    stats = (
        df.groupby("primary_genre")["avg_rating_book_used"]
          .agg(n_books="count", mean="mean", median="median", p25=lambda s: s.quantile(0.25), p75=lambda s: s.quantile(0.75))
          .reindex(genre_order)
          .reset_index()
    )
    out_csv = outdir / "genre_rating_box_stats.csv"
    stats.to_csv(out_csv, index=False)
    print("Saved:", out_csv)

# Run with the default CSV path and output folder; executes every time this cell runs (no __main__ guard).
boxplot_all_genres()


Saved: books_summary_viz_all13/box_avg_rating_by_genre_all13.png
Saved: books_summary_viz_all13/genre_rating_box_stats.csv


In [5]:
#!/usr/bin/env python3
# viz_books_summary.py  (Jupyter-safe)

from pathlib import Path
import sys
import argparse
import pandas as pd
import matplotlib.pyplot as plt

def viz_books_summary(csv_path, outdir="books_summary_viz", support=50):
    """
    Produce overview figures and CSV snapshots for a books summary table.

    Parameters:
        csv_path: CSV with columns book_id, primary_genre,
            avg_rating_book_used and n_users_rated.
        outdir: output folder (created if missing).
        support: minimum n_users_rated for a book to be eligible for the
            "highest rated" snapshot.

    Outputs (inside ``outdir``):
        hist_avg_rating.png, hist_n_users_rated_logy.png,
        scatter_rating_vs_popularity.png, bar_books_per_genre_top12.png,
        box_avg_rating_by_genre_top8.png, top20_most_rated.csv,
        top20_highest_rated_min<support>.csv

    Raises:
        ValueError: a required column is missing.
    """
    SUMMARY_CSV = Path(csv_path)
    OUT_DIR = Path(outdir)
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(SUMMARY_CSV)
    expected = {"book_id", "primary_genre", "avg_rating_book_used", "n_users_rated"}
    missing = expected - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns {missing} in {SUMMARY_CSV}")

    df["book_id"] = pd.to_numeric(df["book_id"], errors="coerce")
    df["avg_rating_book_used"] = pd.to_numeric(df["avg_rating_book_used"], errors="coerce")
    df["n_users_rated"] = pd.to_numeric(df["n_users_rated"], errors="coerce")
    df = df.dropna(subset=["book_id", "avg_rating_book_used", "n_users_rated"]).copy()

    # 1) Histogram: avg ratings (0..5)
    plt.figure(figsize=(8,5))
    plt.hist(df["avg_rating_book_used"].values, bins=30, range=(0,5))
    plt.xlabel("Average rating used (0–5)")
    plt.ylabel("Number of books")
    plt.title("Distribution of per-book average ratings")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "hist_avg_rating.png", dpi=160)
    plt.close()

    # 2) Histogram: #users rated (log-y)
    plt.figure(figsize=(8,5))
    plt.hist(df["n_users_rated"].values, bins=50)
    plt.yscale("log")
    plt.xlabel("# of users who rated the book")
    plt.ylabel("Number of books (log scale)")
    plt.title("Distribution of #users rated per book")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "hist_n_users_rated_logy.png", dpi=160)
    plt.close()

    # 3) Scatter: popularity vs rating (log-y)
    plt.figure(figsize=(8,6))
    plt.scatter(df["avg_rating_book_used"].values, df["n_users_rated"].values, s=6, alpha=0.5)
    plt.yscale("log")
    plt.xlabel("Average rating used")
    plt.ylabel("# of users rated (log scale)")
    plt.title("Popularity vs. average rating (each dot = one book)")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "scatter_rating_vs_popularity.png", dpi=160)
    plt.close()

    # String genre keys, computed once and shared by the bar chart and the box
    # plot. (Missing genres become the key "nan"; selecting rows through this
    # same Series keeps the box for such a key in sync with its bar instead of
    # coming out empty.)
    genre_keys = df["primary_genre"].astype(str)

    # 4) Bar: books per genre (top-12)
    genre_counts = genre_keys.value_counts().head(12)
    plt.figure(figsize=(10,5))
    plt.bar(genre_counts.index.astype(str), genre_counts.values)
    plt.xticks(rotation=45, ha="right")
    plt.xlabel("Primary genre (top 12)")
    plt.ylabel("# of books")
    plt.title("Book counts by primary genre (top 12)")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "bar_books_per_genre_top12.png", dpi=160)
    plt.close()

    # 5) Boxplot: avg ratings per genre (top-8 by count)
    top_genres = list(genre_counts.index[:8])
    data_for_box = [df.loc[genre_keys == g, "avg_rating_book_used"].values for g in top_genres]
    plt.figure(figsize=(10,5))
    # Boxes sit at positions 1..N by default. Tick labels are set through
    # xticks rather than boxplot's `labels=` keyword, which Matplotlib 3.9
    # deprecated in favour of `tick_labels=`.
    plt.boxplot(data_for_box, showfliers=False)
    plt.xticks(list(range(1, len(top_genres) + 1)), top_genres, rotation=30, ha="right")
    plt.ylabel("Average rating used")
    plt.title("Average ratings by genre (top 8 by book count)")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "box_avg_rating_by_genre_top8.png", dpi=160)
    plt.close()

    # 6) CSV snapshots: extremes
    snapshot_cols = ["book_id", "primary_genre", "avg_rating_book_used", "n_users_rated"]
    df.sort_values("n_users_rated", ascending=False).head(20)[snapshot_cols].to_csv(
        OUT_DIR / "top20_most_rated.csv", index=False
    )

    df[df["n_users_rated"] >= support].sort_values(
        ["avg_rating_book_used", "n_users_rated"], ascending=[False, False]
    ).head(20)[snapshot_cols].to_csv(
        OUT_DIR / f"top20_highest_rated_min{support}.csv", index=False
    )

    print("Saved to:", OUT_DIR.resolve())

if __name__ == "__main__":
    # parse_known_args keeps this usable inside a notebook kernel, where
    # sys.argv carries extra launcher arguments (e.g. --f=...) that are ignored here.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "--csv",
        default="/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/data/avg_pool_injection/books_summary.csv",
    )
    parser.add_argument("--outdir", default="books_summary_viz")
    parser.add_argument("--support", type=int, default=50)
    args, _unknown = parser.parse_known_args(sys.argv[1:])
    viz_books_summary(csv_path=args.csv, outdir=args.outdir, support=args.support)


Saved to: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/notebook/0904/books_summary_viz
