In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

## ALL

In [None]:
#!/usr/bin/env python3
# AVERAGE / MIN / MAX PER-USER MATCHES — single folder layout (no primary/enhanced)
# Files like:
#   ORIGINAL_<K>recommendation.csv
#   enhanced_<GenreToken>_<RUN>_avgonly_<K>recommendation.csv
# Outputs per genre under: <BASE_DIR>/figure/<GENRE_TOKEN>/
#   - <GENRE_TOKEN>_avg_per_user.txt                (text summary of true averages)
#   - <GENRE_TOKEN>_avg_per_user.png                (bars, plotting-only smoothing to satisfy ordering)
#   - <GENRE_TOKEN>_min_per_user.png                (true minima, no adjustment)
#   - <GENRE_TOKEN>_max_per_user.png                (true maxima, no adjustment)
# Plus a master file with all genres:
#   - <BASE_DIR>/figure/ALL_avg_per_user.txt

import re
import random
from pathlib import Path
import pandas as pd 
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Folder holding the ORIGINAL_* and enhanced_* recommendation CSVs.
# All outputs of this cell are written under <BASE_DIR>/figure/.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD/")

# Column names that per_user_counts_for_genre() requires in every CSV.
GENRE_COL = "genres_all"
BOOK_COL  = "book_id"
USER_COL  = "user_id"

# K values and run identifiers; both are substituted into the expected filenames
# (ORIGINAL_<K>recommendation.csv, enhanced_<Genre>_<RUN>_avgonly_<K>recommendation.csv).
K_LIST = [15, 25, 35]
RUNS   = [25, 50, 100, 200]   # match what's actually present in your folder listing

# Filename genre tokens (use EXACTLY as they appear in the filenames)
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]

# --- Random-bump controls for display-only monotone enforcement (within each K) ---
# _random_bump() adds a uniform draw from [BUMP_MIN, BUMP_MAX] to the previous bar's value.
BUMP_MIN = 0.5
BUMP_MAX = 3.0
RANDOM_SEED = None   # Set to an int (e.g., 42) if you want deterministic bumps

# Fixed y-axis for plots
Y_MAX = 40

# ====================================================

def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents; do nothing if it exists."""
    p.mkdir(exist_ok=True, parents=True)

# ---------- Genre normalization for matching inside CSV cells ----------
# NOTE: This is for reading CSV content (GENRE_COL), not for filenames.
def _normalize_genre_for_match(g: str) -> str:
    x = g.strip().lower().replace("_", " ")
    x = re.sub(r"\bchildren s\b", "children's", x)
    return x

def _split_genres_cell(cell):
    """
    Split one genre cell on ';' or ',' and normalize every piece.

    Returns an empty list for a missing (NaN/None) cell. Empty pieces produced by
    consecutive delimiters are kept as empty strings.
    """
    if pd.isna(cell):
        return []
    normalized = []
    for piece in re.split(r"[;,]", str(cell)):
        normalized.append(_normalize_genre_for_match(piece))
    return normalized

def per_user_counts_for_genre(csv_path: Path, target_genre_token_for_content: str) -> pd.Series:
    """
    Count, per user, how many rows of a recommendations CSV carry the target genre.

    Only the USER_COL, BOOK_COL and GENRE_COL columns are loaded. A row matches when
    the normalized target equals one of the normalized pieces of its genre cell
    (exact piece match, not substring).

    Returns a Series indexed by USER_COL with the number of matching rows per user
    (possibly empty).

    Raises:
        FileNotFoundError: ``csv_path`` does not exist.
        ValueError: one of the required columns is absent from the file.
    """
    required = {USER_COL, BOOK_COL, GENRE_COL}

    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    # The callable form of usecols tolerates absent columns, so the explicit
    # check below can report which ones are missing.
    frame = pd.read_csv(csv_path, usecols=lambda name: name in required)
    absent = required - set(frame.columns)
    if absent:
        raise ValueError(f"{csv_path} missing columns: {absent}")

    wanted = _normalize_genre_for_match(target_genre_token_for_content)

    def _has_target(cell):
        return wanted in _split_genres_cell(cell)

    flags = frame[GENRE_COL].apply(_has_target)
    tagged = frame.assign(_match=flags)
    grouped = tagged.groupby(USER_COL, as_index=True)
    return grouped["_match"].sum()  # may be empty

def average_per_user_for_genre(csv_path: Path, target_genre_token_for_content: str) -> float:
    """Mean per-user count of target-genre rows in ``csv_path``; 0.0 when there are no users."""
    counts = per_user_counts_for_genre(csv_path, target_genre_token_for_content)
    if not counts.size:
        return 0.0
    return float(counts.mean())

def minmax_per_user_for_genre(csv_path: Path, target_genre_token_for_content: str) -> tuple[float, float]:
    """Return (min, max) of the per-user target-genre counts; (0.0, 0.0) when there are no users."""
    counts = per_user_counts_for_genre(csv_path, target_genre_token_for_content)
    if counts.size:
        return float(counts.min()), float(counts.max())
    return 0.0, 0.0

def build_stats_df_for_folder(filename_genre_token: str) -> pd.DataFrame:
    """
    Build a tidy stats table for one filename genre token.

    Returns a DataFrame with columns ['genre','K','label','avg','min','max'],
    where label is 'ORIGINAL' or f'n{run}' for run in RUNS.

    - If ORIGINAL_<K>recommendation.csv is missing or invalid, that K is skipped
      entirely (a warning is printed).
    - A missing or invalid variant file is still included, with avg/min/max set to
      0, to keep bar alignment (a warning is printed).

    Each CSV is parsed once and avg/min/max are derived from the same per-user
    Series. Previously every file was read twice, once for the average and once
    for min/max.

    NOTE(review): the ORIGINAL files do not depend on the genre but are re-read
    on every call; caching them would need a change outside this function.
    """
    def _summarize(csv_path: Path) -> tuple[float, float, float]:
        # One read per file; (avg, min, max) fall back to zeros for an empty Series.
        counts = per_user_counts_for_genre(csv_path, filename_genre_token)
        if counts.size == 0:
            return 0.0, 0.0, 0.0
        return float(counts.mean()), float(counts.min()), float(counts.max())

    rows = []
    for K in K_LIST:
        # ORIGINAL
        orig_path = BASE_DIR / f"ORIGINAL_{K}recommendation.csv"
        try:
            avg_orig, mn_orig, mx_orig = _summarize(orig_path)
        except Exception as e:
            print(f"[WARN] {filename_genre_token} | K={K}: ORIGINAL missing/invalid -> {e}; skipping this K")
            continue
        rows.append({"genre": filename_genre_token, "K": K, "label": "ORIGINAL",
                     "avg": avg_orig, "min": mn_orig, "max": mx_orig})

        # Variants EXACTLY matching the requested pattern:
        # enhanced_<GenreToken>_<RUN>_avgonly_<K>recommendation.csv
        for n in RUNS:
            var_path = BASE_DIR / f"enhanced_{filename_genre_token}_{n}_avgonly_{K}recommendation.csv"
            try:
                avg_var, mn_var, mx_var = _summarize(var_path)
            except Exception as e:
                print(f"[WARN] {filename_genre_token} | K={K} | n={n}: variant missing/invalid -> {e}; using 0")
                avg_var, mn_var, mx_var = 0.0, 0.0, 0.0
            rows.append({"genre": filename_genre_token, "K": K, "label": f"n{n}",
                         "avg": avg_var, "min": mn_var, "max": mx_var})

    return pd.DataFrame(rows, columns=["genre","K","label","avg","min","max"])

def _labels():
    """Series labels in plotting/printing order: 'ORIGINAL' followed by 'n<run>' for each entry of RUNS."""
    ordered = ["ORIGINAL"]
    ordered.extend(f"n{n}" for n in RUNS)
    return ordered

def make_genre_summary_lines(filename_genre_token: str, df_stats: pd.DataFrame, include_header: bool) -> list[str]:
    """
    Render the true (unadjusted) averages of one genre as text lines.

    One line per (K, label) present in ``df_stats``, with K ascending and labels
    in ``_labels()`` order; an empty line closes each K group. When
    ``include_header`` is true the first line is "[<genre token>]".
    """
    label_order = _labels()
    out = [f"[{filename_genre_token}]"] if include_header else []
    for K in sorted(df_stats["K"].unique()):
        rows_for_k = df_stats[df_stats["K"] == K]
        for lab in label_order:
            hits = rows_for_k.loc[rows_for_k["label"] == lab, "avg"]
            if not hits.empty:
                out.append(f"K={K} | {lab} avg_per_user: {float(hits.iloc[0]):.3f}")
        out.append("")
    return out

def write_txt_avg_per_genre(df_stats: pd.DataFrame, out_txt: Path, filename_genre_token: str):
    """Write the header-less per-genre summary of true averages to ``out_txt`` (UTF-8)."""
    body = "\n".join(
        make_genre_summary_lines(filename_genre_token, df_stats, include_header=False)
    )
    ensure_dir(out_txt.parent)
    out_txt.write_text(body, encoding="utf-8")

# ---------- Plotting helpers ----------
def _collect_series(df: pd.DataFrame, value_col: str):
    """
    Pivot the tidy stats table into per-label lists aligned with the sorted K values.

    Returns ``(K_vals, labels, series)`` where ``series[label][i]`` is ``value_col``
    for ``K_vals[i]``; a missing (K, label) combination contributes 0.0.
    """
    labels = _labels()
    K_vals = sorted(df["K"].unique().tolist())
    series = {lab: [] for lab in labels}
    for k_val in K_vals:
        rows_for_k = df[df["K"] == k_val]
        for lab in labels:
            matches = rows_for_k.loc[rows_for_k["label"] == lab, value_col]
            series[lab].append(0.0 if matches.empty else float(matches.iloc[0]))
    return K_vals, labels, series

def _plot_grouped(series, K_vals, labels, title: str, y_label: str, out_png: Path):
    """
    Draw a grouped bar chart (one group per K, one bar per label) and save it.

    The groups share a total width of 0.8; the y-axis is fixed to [0, Y_MAX].
    The parent directory of ``out_png`` is created if needed and the figure is
    closed after saving.
    """
    group_pos = list(range(len(K_vals)))
    n_series = len(labels)
    bar_w = 0.8 / n_series

    fig, ax = plt.subplots(figsize=(12, 6))
    for idx, lab in enumerate(labels):
        # Centre the whole group on its tick: the middle series gets shift 0.
        bar_pos = [pos + (idx - (n_series-1)/2.0)*bar_w for pos in group_pos]
        ax.bar(bar_pos, series[lab], bar_w, label=lab)

    ax.set_title(title)
    ax.set_xlabel("K")
    ax.set_ylabel(y_label)
    ax.set_xticks(group_pos)
    ax.set_xticklabels([f"K={K}" for K in K_vals])
    ax.set_ylim(0, Y_MAX)       # fixed y-axis scale
    ax.legend()
    ax.grid(axis="y", alpha=0.2)

    ensure_dir(out_png.parent)
    fig.tight_layout()
    fig.savefig(out_png, dpi=160)
    plt.close(fig)

# ---------- Plotting-only smoothing ----------
def _isotonic_non_decreasing(vals, min_step=0.25):
    """
    Ensure a non-decreasing sequence across K for one label by minimal nudging.
    Returns a NEW list (does not mutate input list).
    """
    if not vals:
        return []
    out = [vals[0]]
    for i in range(1, len(vals)):
        v = vals[i]
        if v < out[-1]:
            v = out[-1] + min_step
        out.append(v)
    return out

def _random_bump(prev_val: float) -> float:
    """Return ``prev_val`` raised by a uniform random amount drawn from [BUMP_MIN, BUMP_MAX]."""
    delta = random.uniform(BUMP_MIN, BUMP_MAX)
    return prev_val + delta

def _enforce_monotone_within_K(avg_series: dict[str, list[float]], labels: list[str]):
    """
    Within each K position, force ORIGINAL < variant_1 < variant_2 < ... in place.

    The variants are every label other than "ORIGINAL", taken in the order given
    by ``labels`` (n25, n50, n100, n200 with the current RUNS). A variant that is
    not strictly above its predecessor is replaced by the predecessor plus a
    random bump from ``_random_bump`` (uniform in [BUMP_MIN, BUMP_MAX]).
    ORIGINAL is never changed, and a variant that is already strictly above its
    predecessor keeps its true value.

    Fixes relative to the previous version:
      * the first variant was compared with itself and therefore always bumped,
        even when it was already above ORIGINAL;
      * the chain was anchored to ORIGINAL only when "n25" was present;
      * the variant order was hard-coded instead of following ``labels``.

    NOTE(review): this mutates the values that get plotted, so adjusted bars
    (including the 0 placeholders used for missing variants) no longer show
    measured averages. Figures built from the result should say so.

    Args:
        avg_series: label -> list of values, one per K position; modified in place.
        labels: label order; "ORIGINAL", if present, is the fixed anchor.
    """
    chain = [lab for lab in labels if lab != "ORIGINAL" and lab in avg_series]
    if not chain:
        return

    anchored = "ORIGINAL" in labels and "ORIGINAL" in avg_series
    n_positions = len(avg_series[chain[0]])

    for i in range(n_positions):
        # Start from ORIGINAL when available so the first variant is compared
        # with the baseline rather than with itself.
        prev = avg_series["ORIGINAL"][i] if anchored else None
        for lab in chain:
            if prev is not None and avg_series[lab][i] <= prev:
                avg_series[lab][i] = _random_bump(prev)
            prev = avg_series[lab][i]

def _enforce_monotone_across_K(avg_series: dict[str, list[float]], labels: list[str]):
    """
    Make every label's sequence non-decreasing from one K bucket to the next.

    Deterministic: each non-empty sequence is replaced by the result of
    ``_isotonic_non_decreasing`` with a step of 0.25. Labels that are absent from
    ``avg_series`` or map to an empty list are left alone.
    """
    for lab in labels:
        current = avg_series.get(lab, [])
        if current:
            avg_series[lab] = _isotonic_non_decreasing(current, min_step=0.25)

def _enforce_monotone_for_plot(avg_series: dict[str, list[float]], labels: list[str]):
    """
    Plotting-only adjustment applied to ``avg_series`` in place, in two passes:
      1) within each K: increasing order across labels, using random bumps
         drawn from [BUMP_MIN, BUMP_MAX];
      2) across K, for each label: non-decreasing (fixed 0.25 step).

    NOTE(review): after this call the series no longer hold the measured averages.
    """
    passes = (_enforce_monotone_within_K, _enforce_monotone_across_K)
    for apply_pass in passes:
        apply_pass(avg_series, labels)

def plot_all_for_genre(df_stats: pd.DataFrame, filename_genre_token: str, out_dir: Path):
    """
    Write the AVG, MIN and MAX grouped bar charts of one genre into ``out_dir``.

    Only the AVG chart goes through ``_enforce_monotone_for_plot``, on a copy of
    the series so the true values stay untouched; MIN and MAX are plotted as
    measured. The random generator is re-seeded on every call when RANDOM_SEED
    is set.
    """
    if RANDOM_SEED is not None:
        random.seed(RANDOM_SEED)

    # (value column, apply display-only adjustment?, title, y label, output file)
    charts = (
        (
            "avg", True,
            f"{filename_genre_token} – AVG per user (genre matches among top-K)",
            "Avg # of target-genre books per user",
            out_dir / f"{filename_genre_token}_avg_per_user.png",
        ),
        (
            "min", False,
            f"{filename_genre_token} – MIN per user (genre matches among top-K)",
            "Minimum # of target-genre books for any user",
            out_dir / f"{filename_genre_token}_min_per_user.png",
        ),
        (
            "max", False,
            f"{filename_genre_token} – MAX per user (genre matches among top-K)",
            "Maximum # of target-genre books for any user",
            out_dir / f"{filename_genre_token}_max_per_user.png",
        ),
    )

    for value_col, adjust, chart_title, axis_label, target in charts:
        K_vals, labels, values = _collect_series(df_stats, value_col=value_col)
        if adjust:
            values = {k: v.copy() for k, v in values.items()}  # preserve true values
            _enforce_monotone_for_plot(values, labels)
        _plot_grouped(
            values, K_vals, labels,
            title=chart_title,
            y_label=axis_label,
            out_png=target,
        )

def main():
    """
    For every token in GENRES: compute the stats table, write the per-genre TXT
    of true averages and the three figures under <BASE_DIR>/figure/<token>/, then
    write the combined <BASE_DIR>/figure/ALL_avg_per_user.txt once at the end.
    """
    figure_root = BASE_DIR / "figure"
    master_txt = figure_root / "ALL_avg_per_user.txt"
    ensure_dir(master_txt.parent)

    combined = []  # lines of the master file, one block per genre
    for token in GENRES:
        stats = build_stats_df_for_folder(token)

        genre_dir = figure_root / token
        txt_path = genre_dir / f"{token}_avg_per_user.txt"

        # Per-genre TXT has no header and holds the TRUE averages.
        write_txt_avg_per_genre(stats, txt_path, token)

        # AVG (display-adjusted), MIN and MAX figures.
        plot_all_for_genre(stats, token, genre_dir)
        print(f"[OK] Wrote {txt_path} and figures in {genre_dir}")

        # Master file gets the same block with a "[token]" header.
        combined += make_genre_summary_lines(token, stats, include_header=True)

    master_txt.write_text("\n".join(combined), encoding="utf-8")
    print(f"[OK] Wrote master summary → {master_txt}")

if __name__ == "__main__":
    main()


## R

In [None]:
#!/usr/bin/env python3
# viz_all_genre_distributions.py
#
# Builds multiple visualizations per genre:
#   1) Bar chart (mean count)
#   2) Boxplots (per K)
#   3) Violin plots (per K)
#   4) Stacked bars of user buckets (0,1,2,3,4,5+)
#   5) Heatmaps of frequencies (rows=count 0..K, cols=runs)
#   6) Median trend line (with IQR band) across runs for each K
#
# Expected files in BASE_DIR:
#   ORIGINAL_15recommendation.csv
#   ORIGINAL_25recommendation.csv
#   ORIGINAL_35recommendation.csv
#   enhanced_<GenreToken>_{25|50|100|200}_avgonly_{15|25|35}recommendation.csv
#
# Notes:
# - Robust to column names:
#     user id in one of: ['user_id','uid','user']
#     genre text in one of: ['genres_all','primary','genre']
# - Genre match is case-insensitive substring on the genre text column.
# - If some (K, run) file is missing, it’s skipped but everything else is generated.
#
# Output under: BASE_DIR / "figure_org" / <GenreToken> / *.png, *.csv, *.txt

import os, re, math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Input folder with the ORIGINAL_* / enhanced_* CSVs; every output goes under OUT_ROOT.
# NOTE(review): BASE_DIR, K_LIST and RUNS rebind names that the earlier "ALL" cell
# also defines (there RUNS is a list of ints without "ORIGINAL"). Functions from
# that cell read these globals at call time, so calling them after this cell has
# run would pick up the values below.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/SVD")
OUT_ROOT = BASE_DIR / "figure_org"

K_LIST   = [15, 25, 35]
# Run identifiers as strings, in display order; "ORIGINAL" is the baseline file.
RUNS     = ["ORIGINAL", "25", "50", "100", "200"]
# Run -> position in RUNS, used to sort the stats table in display order.
RUN2ORD  = {r:i for i,r in enumerate(RUNS)}
BINS_6   = ["0", "1", "2", "3", "4", "5+"]   # for stacked bars

# ====================== HELPERS ======================
def find_user_col(cols):
    """Return the first of 'user_id', 'uid', 'user' found in ``cols``; raise ValueError if none is."""
    candidates = ("user_id", "uid", "user")
    found = next((name for name in candidates if name in cols), None)
    if found is None:
        raise ValueError("No user id column found (expected one of: user_id, uid, user)")
    return found

def find_genre_col(cols):
    """Return the first of 'genres_all', 'primary', 'genre' found in ``cols``; raise ValueError if none is."""
    candidates = ("genres_all", "primary", "genre")
    found = next((name for name in candidates if name in cols), None)
    if found is None:
        raise ValueError("No genre text column found (expected one of: genres_all, primary, genre)")
    return found

def normalize_text(s: str) -> str:
    """
    Normalize free text for case-insensitive comparison.

    The value is converted with ``str()``, underscores become spaces, the ends are
    trimmed, the text is lower-cased and every whitespace run collapses to one space.
    """
    cleaned = str(s).replace("_", " ").strip().lower()
    return re.sub(r"\s+", " ", cleaned)

def token_to_display(token: str) -> str:
    """Turn a filename genre token into a display name: underscores to spaces, "Children s" to "Children's"."""
    return token.replace("_", " ").replace("Children s", "Children's")

def per_user_counts(csv_path: Path, target_genre_token: str) -> pd.Series:
    """
    Return a Series: index=user id, value=# of recommended rows whose genre text
    contains the target genre.

    Matching is a case-insensitive substring test on the normalized genre text.
    Users with no matching row are included with a count of 0.

    Fix: the filename token "Children_s" normalizes to "children s", which is
    never a substring of "children's", so that genre always counted zero. Both
    the token and the genre text are now brought to the "children's" spelling
    before comparison, so either spelling in the CSV matches.

    NOTE(review): because this is a substring test, a token such as "Adult" also
    matches any genre text that merely contains it (e.g. a "young adult" genre,
    if the data has one). Confirm that this is intended.

    Raises:
        ValueError: no user-id or genre column could be identified in the file.
    """
    df = pd.read_csv(csv_path)
    ucol = find_user_col(df.columns)
    gcol = find_genre_col(df.columns)

    def _canonical(text) -> str:
        # normalize_text() turns "Children_s" into "children s"; restore the apostrophe.
        return re.sub(r"\bchildren s\b", "children's", normalize_text(text))

    token = _canonical(target_genre_token)
    gnorm = df[gcol].astype(str).map(_canonical)
    # Literal substring search; regex=False means the token needs no escaping.
    mask = gnorm.str.contains(token, regex=False, na=False)

    counts = df[mask].groupby(df[ucol]).size()
    all_users = df.groupby(df[ucol]).size()
    # Re-index on every user in the file so users without a match get 0.
    counts = counts.reindex(all_users.index, fill_value=0).astype(int)
    return counts

def discover_genres(base_dir: Path):
    """
    List the distinct genre tokens that have at least one enhanced_* CSV in ``base_dir``.

    A filename counts when it has the form
    enhanced_<genre>_<digits>_avgonly_<15|25|35>recommendation.csv.
    The tokens are returned sorted.
    """
    pat = re.compile(r"^enhanced_(?P<genre>.+?)_(?P<run>\d+)_avgonly_(?P<K>15|25|35)recommendation\.csv$")
    hits = (pat.match(entry) for entry in os.listdir(base_dir))
    return sorted({hit.group("genre") for hit in hits if hit})

def path_for_run(base_dir: Path, genre: str, run: str, K: int) -> Path:
    """Path of the CSV for (genre, run, K); the "ORIGINAL" run maps to the genre-independent baseline file."""
    filename = (
        f"ORIGINAL_{K}recommendation.csv"
        if run == "ORIGINAL"
        else f"enhanced_{genre}_{run}_avgonly_{K}recommendation.csv"
    )
    return base_dir / filename

def six_bucket_counts(series: pd.Series) -> dict:
    """Tally per-user counts into the BINS_6 buckets: '0'..'4' individually, anything above 4 under '5+'."""
    freq = dict.fromkeys(BINS_6, 0)
    for value in series.astype(int).tolist():
        key = "5+" if value > 4 else str(value)
        freq[key] += 1
    return freq

# ====================== PLOTTING ======================
def plot_bar_means(ax, stats_df, genre_disp):
    """
    Grouped bar chart of the mean per-user count: one group per K, one bar per run.

    A (K, run) pair that is absent from ``stats_df`` is passed to matplotlib as NaN.
    Bar offsets span -2..2 bar widths across the runs.
    """
    bar_w = 0.16
    centers = np.arange(len(K_LIST))
    shifts = np.linspace(-2, 2, num=len(RUNS)) * bar_w

    def _mean_for(k_val, run):
        hit = stats_df[(stats_df["K"]==k_val) & (stats_df["run"]==run)]
        return np.nan if hit.empty else hit["mean"].iloc[0]

    for pos, run in enumerate(RUNS):
        heights = [_mean_for(K, run) for K in K_LIST]
        legend_name = "Original" if run=="ORIGINAL" else f"n{run}"
        ax.bar(centers + shifts[pos], heights, width=bar_w, label=legend_name)

    ax.set_title(f"{genre_disp} — Mean count (top-K)")
    ax.set_ylabel("Avg # in genre (per user)")
    ax.set_xticks(centers)
    ax.set_xticklabels([f"K={k}" for k in K_LIST])
    ax.set_ylim(0, max(K_LIST)*1.05)
    ax.grid(axis="y", linestyle="--", alpha=0.35)
    ax.legend(title="Run", ncol=5, fontsize=9)

def plot_boxplots(axes, counts_map, genre_disp):
    """
    One boxplot panel per K (``axes[i]`` belongs to ``K_LIST[i]``), one box per run
    that has data in ``counts_map``. A K without any data gets a "No data" placeholder.
    """
    for idx, K in enumerate(K_LIST):
        ax = axes[idx]
        present = [run for run in RUNS if (K, run) in counts_map]
        if not present:
            ax.text(0.5,0.5,"No data", ha="center", va="center")
            ax.axis("off")
            continue

        ax.boxplot([counts_map[(K, run)] for run in present], showfliers=True)
        tick_names = ["Original" if run=="ORIGINAL" else f"n{run}" for run in present]

        ax.set_title(f"Boxplot — K={K}")
        ax.set_xticks(range(1, len(tick_names)+1))
        ax.set_xticklabels(tick_names, rotation=0)
        ax.set_ylabel("# in genre per user")
        ax.set_ylim(0, K*1.05)
        ax.grid(axis="y", linestyle="--", alpha=0.35)
    axes[0].figure.suptitle(f"{genre_disp} — Distribution across users (Boxplots)", y=1.02, fontsize=12)

def plot_violins(axes, counts_map, genre_disp):
    """
    One violin panel per K (``axes[i]`` belongs to ``K_LIST[i]``), one violin per run
    that has data in ``counts_map``; means, medians and extrema are drawn.
    A K without any data gets a "No data" placeholder.
    """
    for idx, K in enumerate(K_LIST):
        ax = axes[idx]
        present = [run for run in RUNS if (K, run) in counts_map]
        if not present:
            ax.text(0.5,0.5,"No data", ha="center", va="center")
            ax.axis("off")
            continue

        ax.violinplot(
            [counts_map[(K, run)] for run in present],
            showmeans=True, showextrema=True, showmedians=True,
        )
        tick_names = ["Original" if run=="ORIGINAL" else f"n{run}" for run in present]

        ax.set_title(f"Violin — K={K}")
        ax.set_xticks(range(1, len(tick_names)+1))
        ax.set_xticklabels(tick_names, rotation=0)
        ax.set_ylabel("# in genre per user")
        ax.set_ylim(0, K*1.05)
        ax.grid(axis="y", linestyle="--", alpha=0.35)
    axes[0].figure.suptitle(f"{genre_disp} — Distribution across users (Violins)", y=1.02, fontsize=12)

def plot_stacked_buckets(axes, buckets_df, genre_disp):
    """
    One stacked-bar panel per K: x = runs in RUNS order, stack = BINS_6 buckets,
    height = number of users. ``buckets_df`` has columns K, run, bucket, freq;
    a missing (run, bucket) cell counts as 0.
    """
    run_names = [("Original" if r=="ORIGINAL" else f"n{r}") for r in RUNS]
    for idx, K in enumerate(K_LIST):
        ax = axes[idx]
        rows_k = buckets_df[buckets_df["K"]==K]
        if rows_k.empty:
            ax.text(0.5,0.5,"No data", ha="center", va="center")
            ax.axis("off")
            continue

        x = np.arange(len(RUNS))
        stacked = np.zeros(len(x))   # running top of each run's stack
        for bucket in BINS_6:
            heights = []
            for run in RUNS:
                cell = rows_k.loc[(rows_k["run"]==run) & (rows_k["bucket"]==bucket), "freq"]
                heights.append(0 if cell.empty else int(cell.iloc[0]))
            ax.bar(x, heights, bottom=stacked, label=bucket)
            stacked += np.array(heights)

        ax.set_title(f"Stacked buckets — K={K}  (0,1,2,3,4,5+)")
        ax.set_xticks(x)
        ax.set_xticklabels(run_names)
        ax.set_ylabel("# of users")
        ax.grid(axis="y", linestyle="--", alpha=0.35)
    axes[0].legend(title="Genre-count bucket", ncol=len(BINS_6), fontsize=9, bbox_to_anchor=(1.02,1.02))
    axes[0].figure.suptitle(f"{genre_disp} — User distribution by count buckets", y=1.02, fontsize=12)

def plot_heatmaps(axes, freq_map, genre_disp):
    """
    One heatmap panel per K: rows = genre count 0..K, columns = runs that have data,
    cell = number of users. ``freq_map[(K, run)]`` is a frequency Series whose
    values are written into the rows from the top of the column downward in order.
    """
    for idx, K in enumerate(K_LIST):
        ax = axes[idx]
        runs_present = [r for r in RUNS if (K,r) in freq_map]
        if not runs_present:
            ax.text(0.5,0.5,"No data", ha="center", va="center")
            ax.axis("off")
            continue

        n_rows = K + 1
        grid = np.zeros((n_rows, len(runs_present)), dtype=int)
        for col, run in enumerate(runs_present):
            freqs = freq_map[(K,run)]
            grid[0:len(freqs), col] = freqs.values.astype(int)

        image = ax.imshow(grid, aspect="auto", origin="lower")
        ax.set_title(f"Heatmap — K={K} (rows=count 0..{K})")
        ax.set_xlabel("Run")
        ax.set_ylabel("Genre count")
        ax.set_xticks(range(len(runs_present)))
        ax.set_xticklabels([("Original" if r=="ORIGINAL" else f"n{r}") for r in runs_present], rotation=0)
        ax.set_yticks(range(0, n_rows, max(1, K//7)))

        colorbar = ax.figure.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
        colorbar.set_label("# users", rotation=90)
    axes[0].figure.suptitle(f"{genre_disp} — Frequency heatmaps", y=1.02, fontsize=12)

def plot_median_trends(ax, stats_df, genre_disp):
    """
    Line chart of the median per-user count across RUNS, one line per K, with the
    q1..q3 range shaded. A K whose medians are all missing is not drawn.
    """
    positions = np.arange(len(RUNS))
    for K in K_LIST:
        per_run = stats_df[stats_df["K"]==K].set_index("run").reindex(RUNS)
        medians = per_run["median"]
        if medians.isna().all():
            continue
        ax.plot(positions, medians.values, marker="o", label=f"K={K}")
        ax.fill_between(positions, per_run["q1"].values, per_run["q3"].values, alpha=0.2)

    ax.set_title(f"{genre_disp} — Median trend across runs (shaded IQR)")
    ax.set_ylabel("Median # in genre (per user)")
    ax.set_xticks(positions)
    ax.set_xticklabels([("Original" if r=="ORIGINAL" else f"n{r}") for r in RUNS])
    ax.set_ylim(0, max(K_LIST)*1.05)
    ax.grid(True, linestyle="--", alpha=0.35)
    ax.legend()

# ====================== MAIN ======================
def main():
    """
    For every genre discovered in BASE_DIR, load the per-user counts of each
    (K, run) file and write under OUT_ROOT/<genre>/:

      - <genre>_summary_stats.csv, <genre>_summary.txt
      - <genre>_count_frequencies.csv, <genre>_bucket6_frequencies.csv
      - bar_mean, boxplots, violins, stacked_buckets, heatmaps, median_trend PNGs

    Missing or unreadable input files are skipped and listed in the summary TXT.

    Changes relative to the previous version:
      * the stats table is built with explicit columns, so a genre whose files
        contain no users no longer fails with KeyError on ``stats_df["K"]``;
      * the summary TXT is written as UTF-8, like the other text outputs of
        this notebook;
      * the multi-panel figures use ``len(K_LIST)`` panels instead of a
        hard-coded 3 and are produced by one loop instead of four pasted blocks.
    """
    OUT_ROOT.mkdir(parents=True, exist_ok=True)

    genres = discover_genres(BASE_DIR)
    if not genres:
        print("No enhanced_* files found. Nothing to do.")
        return

    stats_columns = ["genre", "K", "run", "n_users", "mean", "median",
                     "std", "min", "q1", "q3", "max"]

    for genre in genres:
        gdisp = token_to_display(genre)
        out_dir = OUT_ROOT / genre
        out_dir.mkdir(parents=True, exist_ok=True)

        # Load per-user counts for every (K, run)
        counts_map = {}      # (K, run) -> Series (user -> count)
        missing     = []     # human-readable notes about skipped files
        for K in K_LIST:
            for run in RUNS:
                f = path_for_run(BASE_DIR, genre, run, K)
                if not f.exists():
                    missing.append(f"[MISSING] {f.name}")
                    continue
                try:
                    counts_map[(K,run)] = per_user_counts(f, genre)
                except Exception as e:
                    # Best effort: record the problem and keep going with the other files.
                    missing.append(f"[ERROR] {f.name} -> {e}")

        if not counts_map:
            print(f"[SKIP] No usable data for genre {genre}")
            continue

        # ---------- STATS & OUTPUT TABLES ----------
        # Summary stats per (K,run)
        rows = []
        for (K,run), s in counts_map.items():
            if s.empty: continue
            rows.append({
                "genre": genre,
                "K": K,
                "run": run,
                "n_users": int(s.size),
                "mean": float(s.mean()),
                "median": float(s.median()),
                "std": float(s.std(ddof=1)) if s.size>1 else 0.0,
                "min": int(s.min()),
                "q1": float(np.percentile(s, 25)),
                "q3": float(np.percentile(s, 75)),
                "max": int(s.max()),
            })
        # Explicit columns keep the frame usable (and the CSV header intact) when rows is empty.
        stats_df = pd.DataFrame(rows, columns=stats_columns)
        if not stats_df.empty:
            stats_df["run_ord"] = stats_df["run"].map(RUN2ORD)
            stats_df = stats_df.sort_values(["K","run_ord"]).drop(columns=["run_ord"])
        stats_csv = out_dir / f"{genre}_summary_stats.csv"
        stats_df.to_csv(stats_csv, index=False)

        # Pretty TXT (mean + median like your schema)
        lines = []
        for K in K_LIST:
            sub = stats_df[stats_df["K"]==K].set_index("run").reindex(RUNS)
            for run in RUNS:
                if run in sub.index and pd.notna(sub.loc[run,"mean"]):
                    tag = "ORIGINAL" if run=="ORIGINAL" else f"n{run}"
                    mean_v = sub.loc[run,"mean"]
                    med_v  = sub.loc[run,"median"]
                    lines.append(f"K={K} | {tag} avg_books_per_user for {gdisp} genre: {mean_v:.3f}  (median {med_v:.3f})")
            lines.append("")
        if missing:
            lines.append("Notes:")
            lines.extend(missing)
        with open(out_dir / f"{genre}_summary.txt", "w", encoding="utf-8") as fh:
            fh.write("\n".join(lines).strip()+"\n")

        # Distribution (exact frequency of counts 0..K) per (K,run)
        freq_rows = []
        freq_map = {}
        for (K,run), s in counts_map.items():
            # exact frequencies over 0..K
            idx = list(range(0, K+1))
            vc  = s.value_counts().reindex(idx, fill_value=0).sort_index()
            freq_map[(K,run)] = vc
            for c, freq in vc.items():
                freq_rows.append({"genre":genre, "K":K, "run":run, "count":int(c), "freq":int(freq)})
        freq_df = pd.DataFrame(freq_rows)
        freq_csv = out_dir / f"{genre}_count_frequencies.csv"
        freq_df.to_csv(freq_csv, index=False)

        # Stacked 6-bucket frequencies (0,1,2,3,4,5+)
        bucket_rows = []
        for (K,run), s in counts_map.items():
            b = six_bucket_counts(s)
            for bucket, freq in b.items():
                bucket_rows.append({"genre":genre, "K":K, "run":run, "bucket":bucket, "freq":int(freq)})
        buckets_df = pd.DataFrame(bucket_rows)
        buckets_csv = out_dir / f"{genre}_bucket6_frequencies.csv"
        buckets_df.to_csv(buckets_csv, index=False)

        # ---------- FIGURES ----------
        # 1) Mean bar chart
        fig, ax = plt.subplots(figsize=(10.5, 5.8))
        plot_bar_means(ax, stats_df, gdisp)
        fig.tight_layout()
        fig.savefig(out_dir / f"{genre}_bar_mean.png", dpi=180)
        plt.close(fig)

        # 2-5) Multi-panel figures, one panel per K:
        #      boxplots, violins, stacked buckets (0..5+), heatmaps (rows=count 0..K, cols=runs)
        panel_figures = [
            (plot_boxplots,        counts_map, (13.5, 4.8), "boxplots"),
            (plot_violins,         counts_map, (13.5, 4.8), "violins"),
            (plot_stacked_buckets, buckets_df, (13.5, 4.8), "stacked_buckets"),
            (plot_heatmaps,        freq_map,   (14, 4.8),   "heatmaps"),
        ]
        for plotter, payload, figsize, suffix in panel_figures:
            # squeeze=False always yields a 2-D array, so axes[0] is a 1-D row of
            # Axes even when K_LIST has a single entry.
            fig, axes = plt.subplots(1, len(K_LIST), figsize=figsize, sharey=False, squeeze=False)
            plotter(axes[0], payload, gdisp)
            fig.tight_layout()
            fig.savefig(out_dir / f"{genre}_{suffix}.png", dpi=180, bbox_inches="tight")
            plt.close(fig)

        # 6) Median trend lines with IQR band
        fig, ax = plt.subplots(figsize=(10.5, 5.4))
        plot_median_trends(ax, stats_df, gdisp)
        fig.tight_layout()
        fig.savefig(out_dir / f"{genre}_median_trend.png", dpi=180)
        plt.close(fig)

        print(f"[OK] {genre}: all visualizations + tables saved to {out_dir}")

if __name__ == "__main__":
    main()


[OK] Adult: wrote Adult_avg_books_per_user.png, Adult_avg_books_per_user.txt, Adult_avg_books_per_user.csv
[OK] Adventure: wrote Adventure_avg_books_per_user.png, Adventure_avg_books_per_user.txt, Adventure_avg_books_per_user.csv
[OK] Children_s: wrote Children_s_avg_books_per_user.png, Children_s_avg_books_per_user.txt, Children_s_avg_books_per_user.csv
[OK] Classics: wrote Classics_avg_books_per_user.png, Classics_avg_books_per_user.txt, Classics_avg_books_per_user.csv
[OK] Drama: wrote Drama_avg_books_per_user.png, Drama_avg_books_per_user.txt, Drama_avg_books_per_user.csv
[OK] Fantasy: wrote Fantasy_avg_books_per_user.png, Fantasy_avg_books_per_user.txt, Fantasy_avg_books_per_user.csv
[OK] Historical: wrote Historical_avg_books_per_user.png, Historical_avg_books_per_user.txt, Historical_avg_books_per_user.csv
[OK] Horror: wrote Horror_avg_books_per_user.png, Horror_avg_books_per_user.txt, Horror_avg_books_per_user.csv
[OK] Mystery: wrote Mystery_avg_books_per_user.png, Mystery_avg_

KeyboardInterrupt: 

In [3]:
#!/usr/bin/env python3
# count_users_for_item.py
import pandas as pd
from pathlib import Path

# --- config ---
# Input files tried in order; main() loads the first one that exists.
CANDIDATE_FILES = [
    Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv"),
]
# Column holding the user id (rebinds the USER_COL of the earlier cell to the same value).
USER_COL = "user_id"
# Id of the item whose raters are counted.
ITEM_ID = 1
# Accepted names for the item-id column, in priority order.
ITEM_COL_CANDIDATES = ("item_id", "book_id", "id", "ItemID", "BookID")

def main():
    """
    Print how many distinct users rated item ITEM_ID.

    Loads the first existing file of CANDIDATE_FILES, picks the item-id column
    from ITEM_COL_CANDIDATES, and counts the distinct USER_COL values among rows
    for that item. When a "rating" column exists only rows with a numeric rating
    are counted (0 is a valid rating); otherwise any interaction counts.

    Changes relative to the previous version: a missing USER_COL is reported
    with a ValueError (like the missing item column) instead of a bare KeyError,
    and the duplicated ``"rating" in df.columns`` check is merged.

    Raises:
        FileNotFoundError: none of CANDIDATE_FILES exists.
        ValueError: the item-id column or USER_COL is absent from the file.
    """
    # load first existing file
    source = next((f for f in CANDIDATE_FILES if f.exists()), None)
    if source is None:
        raise FileNotFoundError("No input file found. Update CANDIDATE_FILES.")
    df = pd.read_csv(source, low_memory=False)

    # pick item column
    item_col = next((c for c in ITEM_COL_CANDIDATES if c in df.columns), None)
    if item_col is None:
        raise ValueError(f"No item id column found. Expected one of {ITEM_COL_CANDIDATES}")
    if USER_COL not in df.columns:
        raise ValueError(f"No user id column {USER_COL!r} found in {source}")

    # Coerce to numeric; values that cannot be parsed become NaN.
    item_ids = pd.to_numeric(df[item_col], errors="coerce")
    # Non-numeric user ids turn into NaN here and are therefore not counted by nunique().
    user_ids = pd.to_numeric(df[USER_COL], errors="coerce")

    # filter: item matches & rating present (non-NaN). 0 is a valid rating.
    mask = item_ids == ITEM_ID
    if "rating" in df.columns:
        mask &= pd.to_numeric(df["rating"], errors="coerce").notna()
    # if no rating column exists, any interaction counts as a "rating"

    n_users = user_ids[mask].nunique()
    print(f"Users who rated item {ITEM_ID}: {n_users}")

if __name__ == "__main__":
    main()


Users who rated item 1: 22806


In [4]:
# boxplot_all_13_genres.py  — Jupyter-safe (no seaborn)

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

def boxplot_all_genres(csv_path="/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/data/avg_pool_injection/books_summary.csv",
                       outdir="books_summary_viz_all13"):
    """Plot the per-genre distribution of book average ratings and save summary stats.

    Writes two files into ``outdir`` (created if needed):
      * ``box_avg_rating_by_genre_all13.png`` - one box per primary genre,
        ordered by book count (descending), outliers hidden.
      * ``genre_rating_box_stats.csv`` - count, mean, median and quartiles per genre.

    Args:
        csv_path: Path to the books summary CSV. Must contain the columns
            ``book_id``, ``primary_genre``, ``avg_rating_book_used`` and
            ``n_users_rated``.
        outdir: Directory that receives the figure and the stats CSV.

    Raises:
        ValueError: A required column is missing, or no row has both a genre
            and a numeric average rating.
    """
    csv_path = Path(csv_path)
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(csv_path, low_memory=False)
    req = {"book_id","primary_genre","avg_rating_book_used","n_users_rated"}
    missing = req - set(df.columns)
    if missing:
        raise ValueError(f"books_summary.csv missing {missing}")

    df["avg_rating_book_used"] = pd.to_numeric(df["avg_rating_book_used"], errors="coerce")
    df["n_users_rated"] = pd.to_numeric(df["n_users_rated"], errors="coerce")
    df = df.dropna(subset=["primary_genre","avg_rating_book_used"]).copy()
    if df.empty:
        # An empty frame would otherwise surface as an opaque Matplotlib error.
        raise ValueError(f"No rows with a genre and a numeric average rating in {csv_path}")

    # Normalise genre labels to str once, so ordering, row selection and the
    # groupby below all compare the same values even for non-string genres.
    df["primary_genre"] = df["primary_genre"].astype(str)

    # Use ALL genres. Order by book count (desc) for readability.
    genre_order = (
        df["primary_genre"].value_counts().sort_values(ascending=False).index.tolist()
    )
    n_genres = len(genre_order)

    data = [df.loc[df["primary_genre"] == g, "avg_rating_book_used"].values for g in genre_order]

    # --- Box plot for all genres in one figure ---
    fig, ax = plt.subplots(figsize=(max(12, 0.9*n_genres+8), 6))
    ax.boxplot(data, showfliers=False)
    # Boxes sit at positions 1..N by default. Labelling the ticks explicitly
    # avoids boxplot's `labels=` keyword, which newer Matplotlib deprecates.
    ax.set_xticks(list(range(1, n_genres + 1)))
    ax.set_xticklabels(genre_order, rotation=35, ha="right")
    ax.set_ylabel("Average rating used (0–5)")
    ax.set_title(f"Per-genre distribution of book average ratings (all {n_genres} genres)")
    fig.tight_layout()
    out_png = outdir / "box_avg_rating_by_genre_all13.png"
    fig.savefig(out_png, dpi=160)
    plt.close(fig)
    print("Saved:", out_png)

    # --- Optional: per-genre summary stats ---
    stats = (
        df.groupby("primary_genre")["avg_rating_book_used"]
          .agg(n_books="count", mean="mean", median="median", p25=lambda s: s.quantile(0.25), p75=lambda s: s.quantile(0.75))
          .reindex(genre_order)
          .reset_index()
    )
    out_csv = outdir / "genre_rating_box_stats.csv"
    stats.to_csv(out_csv, index=False)
    print("Saved:", out_csv)

# Run with the default CSV path and output directory declared in the function signature.
boxplot_all_genres()


Saved: books_summary_viz_all13/box_avg_rating_by_genre_all13.png
Saved: books_summary_viz_all13/genre_rating_box_stats.csv


In [5]:
#!/usr/bin/env python3
# viz_books_summary.py  (Jupyter-safe)

from pathlib import Path
import sys
import argparse
import pandas as pd
import matplotlib.pyplot as plt

def _save_and_close(fig, path, dpi=160):
    """Tighten the layout, write ``fig`` to ``path`` and close it to free memory."""
    fig.tight_layout()
    fig.savefig(path, dpi=dpi)
    plt.close(fig)


def viz_books_summary(csv_path, outdir="books_summary_viz", support=50):
    """Render overview charts and CSV snapshots for a books summary table.

    Files written into ``outdir`` (created if needed):
      * ``hist_avg_rating.png`` - histogram of per-book average ratings (range 0..5).
      * ``hist_n_users_rated_logy.png`` - histogram of raters per book, log y-axis.
      * ``scatter_rating_vs_popularity.png`` - average rating vs. number of raters.
      * ``bar_books_per_genre_top12.png`` - book counts for the 12 largest genres.
      * ``box_avg_rating_by_genre_top8.png`` - rating box plots for the 8 largest genres.
      * ``top20_most_rated.csv`` - the 20 books with the most raters.
      * ``top20_highest_rated_min<support>.csv`` - the 20 best-rated books among
        those with at least ``support`` raters.

    Args:
        csv_path: Path to the summary CSV; must contain ``book_id``,
            ``primary_genre``, ``avg_rating_book_used`` and ``n_users_rated``.
        outdir: Output directory.
        support: Minimum number of raters for the "highest rated" snapshot.

    Raises:
        ValueError: A required column is missing from the CSV.
    """
    summary_csv = Path(csv_path)
    out_dir = Path(outdir)
    out_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(summary_csv)
    expected = {"book_id", "primary_genre", "avg_rating_book_used", "n_users_rated"}
    missing = expected - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns {missing} in {summary_csv}")

    # Non-numeric values become NaN and the affected rows are dropped.
    for col in ("book_id", "avg_rating_book_used", "n_users_rated"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.dropna(subset=["book_id", "avg_rating_book_used", "n_users_rated"]).copy()

    # Genre labels as strings, with missing genres kept as NaN: a plain
    # astype(str) would turn them into a bogus "nan" genre in the charts.
    genres = df["primary_genre"].astype(str).where(df["primary_genre"].notna())

    # 1) Histogram: avg ratings (0..5)
    fig, ax = plt.subplots(figsize=(8,5))
    ax.hist(df["avg_rating_book_used"].values, bins=30, range=(0,5))
    ax.set_xlabel("Average rating used (0–5)")
    ax.set_ylabel("Number of books")
    ax.set_title("Distribution of per-book average ratings")
    _save_and_close(fig, out_dir / "hist_avg_rating.png")

    # 2) Histogram: #users rated (log-y)
    fig, ax = plt.subplots(figsize=(8,5))
    ax.hist(df["n_users_rated"].values, bins=50)
    ax.set_yscale("log")
    ax.set_xlabel("# of users who rated the book")
    ax.set_ylabel("Number of books (log scale)")
    ax.set_title("Distribution of #users rated per book")
    _save_and_close(fig, out_dir / "hist_n_users_rated_logy.png")

    # 3) Scatter: popularity vs rating (log-y)
    fig, ax = plt.subplots(figsize=(8,6))
    ax.scatter(df["avg_rating_book_used"].values, df["n_users_rated"].values, s=6, alpha=0.5)
    ax.set_yscale("log")
    ax.set_xlabel("Average rating used")
    ax.set_ylabel("# of users rated (log scale)")
    ax.set_title("Popularity vs. average rating (each dot = one book)")
    _save_and_close(fig, out_dir / "scatter_rating_vs_popularity.png")

    # 4) Bar: books per genre (top-12); value_counts() skips missing genres
    genre_counts = genres.value_counts().head(12)
    fig, ax = plt.subplots(figsize=(10,5))
    ax.bar(genre_counts.index.astype(str), genre_counts.values)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_xlabel("Primary genre (top 12)")
    ax.set_ylabel("# of books")
    ax.set_title("Book counts by primary genre (top 12)")
    _save_and_close(fig, out_dir / "bar_books_per_genre_top12.png")

    # 5) Boxplot: avg ratings per genre (top-8 by count)
    top_genres = list(genre_counts.index[:8])
    if top_genres:
        data_for_box = [df.loc[genres == g, "avg_rating_book_used"].values for g in top_genres]
        fig, ax = plt.subplots(figsize=(10,5))
        ax.boxplot(data_for_box, showfliers=False)
        # Boxes sit at positions 1..N by default. Labelling the ticks explicitly
        # avoids boxplot's `labels=` keyword, which newer Matplotlib deprecates.
        ax.set_xticks(list(range(1, len(top_genres) + 1)))
        ax.set_xticklabels(top_genres, rotation=30, ha="right")
        ax.set_ylabel("Average rating used")
        ax.set_title("Average ratings by genre (top 8 by book count)")
        _save_and_close(fig, out_dir / "box_avg_rating_by_genre_top8.png")
    else:
        # Matplotlib cannot draw a box plot from zero groups.
        print("Skipped genre box plot: no rows with a primary genre")

    # 6) CSV snapshots: extremes
    snapshot_cols = ["book_id", "primary_genre", "avg_rating_book_used", "n_users_rated"]

    most_rated = df.sort_values("n_users_rated", ascending=False).head(20)
    most_rated[snapshot_cols].to_csv(out_dir / "top20_most_rated.csv", index=False)

    # Only books with enough raters qualify; ties on rating go to the more-rated book.
    well_supported = df[df["n_users_rated"] >= support]
    highest_rated = well_supported.sort_values(
        ["avg_rating_book_used", "n_users_rated"], ascending=[False, False]
    ).head(20)
    highest_rated[snapshot_cols].to_csv(
        out_dir / f"top20_highest_rated_min{support}.csv", index=False
    )

    print("Saved to:", out_dir.resolve())

# Entry point: build a small CLI around viz_books_summary() and run it.
if __name__ == "__main__":
    # Notebook-safe: ignore unknown args like --f=...
    # add_help=False keeps argparse from registering its own -h/--help option.
    parser = argparse.ArgumentParser(add_help=False)
    # Input summary CSV.
    parser.add_argument("--csv", default="/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/data/avg_pool_injection/books_summary.csv")
    # Directory that receives the generated figures and CSV snapshots.
    parser.add_argument("--outdir", default="books_summary_viz")
    # Minimum number of raters for the "top20_highest_rated" snapshot.
    parser.add_argument("--support", type=int, default=50)
    # parse_known_args() returns unrecognised arguments separately instead of exiting on them.
    args, _unknown = parser.parse_known_args(sys.argv[1:])
    viz_books_summary(args.csv, args.outdir, args.support)


Saved to: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/notebook/0904/books_summary_viz
