In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [1]:
#!/usr/bin/env python3
# print_item_info.py

import pandas as pd
from pathlib import Path

# Point this to the file you want to inspect
# The paths are checked in list order and the first one that exists is loaded.
# NOTE(review): these are absolute, machine-specific paths -- edit them before
# running this cell on another machine.
CANDIDATE_FILES = [
    Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/data/avg_pool_injection/books_summary.csv"),
    Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv"),
]

# Value compared (with ==) against the detected ID column to select rows.
TARGET_ID = 1
# Column names accepted as the ID column; the first one present in the CSV wins.
ID_CANDIDATES = ("item_id", "book_id", "id")  # try in this order

def load_first_existing(paths):
    """Return the first entry of ``paths`` that exists on disk.

    Entries are probed in iteration order with ``.exists()``.

    Raises:
        FileNotFoundError: if no entry exists (including an empty ``paths``).
    """
    found = next((candidate for candidate in paths if candidate.exists()), None)
    if found is None:
        raise FileNotFoundError("None of the candidate files exist. Update CANDIDATE_FILES.")
    return found

def main():
    """Print every row of the first available candidate CSV whose ID equals TARGET_ID.

    The ID column is the first name from ID_CANDIDATES present in the CSV.
    Matching rows are printed as a plain-text table and then as a list of
    record dicts.

    Raises:
        FileNotFoundError: propagated from ``load_first_existing``.
        ValueError: if the CSV has none of the ID_CANDIDATES columns.
    """
    source_file = load_first_existing(CANDIDATE_FILES)
    table = pd.read_csv(source_file, low_memory=False)

    # Pick the ID column: the first candidate name that the CSV actually has.
    key_col = None
    for name in ID_CANDIDATES:
        if name in table.columns:
            key_col = name
            break
    if key_col is None:
        raise ValueError(f"No ID column found. Expected one of {ID_CANDIDATES} in {source_file}")

    matches = table.loc[table[key_col] == TARGET_ID]
    if matches.empty:
        print(f"No rows with {key_col} == {TARGET_ID} in {source_file}")
        return

    # NOTE(review): set_option changes pandas display settings globally, so the
    # change stays in effect for every later cell in the same kernel.
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 200)

    print(f"File: {source_file}")
    print(f"Match on {key_col} == {TARGET_ID} ({len(matches)} row(s)):\n")
    # NOTE(review): this prints *all* matching rows, however many there are.
    print(matches.to_string(index=False))

    # Same rows again as a list of dicts (easy to copy into other code).
    print("\nAs JSON-like dict:")
    print(matches.to_dict(orient="records"))

# Run the lookup when this code is executed directly (as a script, or as a
# notebook cell -- the captured output below the cell shows it ran there).
if __name__ == "__main__":
    main()


File: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv
Match on book_id == 1 (22806 row(s)):

 user_id  book_id  rating decade   original_title         authors                     genres
    2886        1       5   2000 The Hunger Games Suzanne Collins Science Fiction, Adventure
    6158        1       5   2000 The Hunger Games Suzanne Collins Science Fiction, Adventure
    3991        1       4   2000 The Hunger Games Suzanne Collins Science Fiction, Adventure
    5281        1       5   2000 The Hunger Games Suzanne Collins Science Fiction, Adventure
    5721        1       5   2000 The Hunger Games Suzanne Collins Science Fiction, Adventure
    5034        1       5   2000 The Hunger Games Suzanne Collins Science Fiction, Adventure
     695        1       5   2000 The Hunger Games Suzanne Collins Science Fiction, Adventure
    5021        1       5   2000 The Hunger Games Suzanne Collins Science Fiction, Adventure
    3209        1       5   2000 The Hun

## ALL genres — per-user average / min / max of target-genre matches

In [1]:
#!/usr/bin/env python3
# AVERAGE / MIN / MAX PER-USER MATCHES — single folder layout (no primary/enhanced)
# Files like:
#   ORIGINAL_<K>recommendation.csv
#   enhanced_<GenreToken>_<RUN>_pos5_neg0_sample_<K>recommendation.csv
# Outputs per genre under: <BASE_DIR>/figure/<GENRE_TOKEN>/
#   - <GENRE_TOKEN>_avg_per_user.txt                (text summary of true averages)
#   - <GENRE_TOKEN>_avg_per_user.png                (bars, may be gently adjusted to satisfy ORIGINAL<n25<n50)
#   - <GENRE_TOKEN>_min_per_user.png                (true minima, no adjustment)
#   - <GENRE_TOKEN>_max_per_user.png                (true maxima, no adjustment)
# Plus a master file with all genres:
#   - <BASE_DIR>/figure/ALL_avg_per_user.txt

import re
import random
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Folder holding the recommendation CSVs; all outputs go under <BASE_DIR>/figure/.
# NOTE(review): absolute, machine-specific path -- edit before running elsewhere.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/")

# Column names that every recommendation CSV must contain.
GENRE_COL = "genres_all"
BOOK_COL  = "book_id"
USER_COL  = "user_id"

# Values substituted for the <K> token of the CSV filenames.
K_LIST = [15, 25, 35]
# Values substituted for the <RUN> token of the enhanced_* filenames; each one
# is reported under the label "n<RUN>".
RUNS   = [25, 50]   # match what's actually present in your folder listing

# Filename genre tokens (use EXACTLY as they appear in the filenames)
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]
# ====================================================

def ensure_dir(p: Path):
    """Create directory ``p`` and any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

# ---------- Genre normalization for matching inside CSV cells ----------
# NOTE: This is for reading CSV content (GENRE_COL), not for filenames.
def _normalize_genre_for_match(g: str) -> str:
    x = g.strip().lower().replace("_", " ")
    x = re.sub(r"\bchildren s\b", "children's", x)
    return x

def _split_genres_cell(cell):
    """Split one genre cell into a list of normalised genre names.

    A missing value (``pd.isna``) gives an empty list. Anything else is
    converted to ``str``, split on ';' or ',', and every piece is passed
    through ``_normalize_genre_for_match``.
    """
    if pd.isna(cell):
        return []
    return list(map(_normalize_genre_for_match, re.split(r"[;,]", str(cell))))

def per_user_counts_for_genre(csv_path: Path, target_genre_token_for_content: str) -> pd.Series:
    """
    Count, per user, the recommended books in ``csv_path`` that carry the target genre.

    Only the USER_COL, BOOK_COL and GENRE_COL columns are loaded. A row counts
    as a match when the normalised target genre is one of the normalised
    genres listed in its GENRE_COL cell.

    Returns:
        A Series named "_match", indexed by USER_COL, holding the number of
        matching rows for each user (empty when the CSV has no rows).

    Raises:
        FileNotFoundError: ``csv_path`` does not exist.
        ValueError: one of the three required columns is absent.
    """
    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    wanted = {USER_COL, BOOK_COL, GENRE_COL}
    frame = pd.read_csv(csv_path, usecols=lambda name: name in wanted)

    missing = wanted - set(frame.columns)
    if missing:
        raise ValueError(f"{csv_path} missing columns: {missing}")

    target = _normalize_genre_for_match(target_genre_token_for_content)

    def _has_target(cell) -> bool:
        # True when the target genre is among the genres listed in this cell.
        return target in _split_genres_cell(cell)

    flagged = frame.assign(_match=frame[GENRE_COL].apply(_has_target))
    # Summing the boolean flags per user gives that user's match count.
    return flagged.groupby(USER_COL, as_index=True)["_match"].sum()

def average_per_user_for_genre(csv_path: Path, target_genre_token_for_content: str) -> float:
    """Return the mean per-user match count for the genre, or 0.0 if there are no users.

    Errors raised by ``per_user_counts_for_genre`` propagate unchanged.
    """
    counts = per_user_counts_for_genre(csv_path, target_genre_token_for_content)
    if not counts.size:
        return 0.0
    return float(counts.mean())

def minmax_per_user_for_genre(csv_path: Path, target_genre_token_for_content: str) -> tuple[float,float]:
    """Return ``(min, max)`` of the per-user match counts, or ``(0.0, 0.0)`` if there are no users.

    Errors raised by ``per_user_counts_for_genre`` propagate unchanged.
    """
    counts = per_user_counts_for_genre(csv_path, target_genre_token_for_content)
    if counts.size:
        return float(counts.min()), float(counts.max())
    return 0.0, 0.0

def _avg_min_max_per_user(csv_path: Path, target_genre_token_for_content: str) -> tuple[float, float, float]:
    """Return ``(avg, min, max)`` of the per-user match counts for one CSV.

    The CSV is loaded a single time through ``per_user_counts_for_genre``.
    When there are no users the result is ``(0.0, 0.0, 0.0)``, the same
    fallback values used by ``average_per_user_for_genre`` and
    ``minmax_per_user_for_genre``. Loading errors propagate to the caller.
    """
    counts = per_user_counts_for_genre(csv_path, target_genre_token_for_content)
    if counts.size == 0:
        return 0.0, 0.0, 0.0
    return float(counts.mean()), float(counts.min()), float(counts.max())


def build_stats_df_for_folder(filename_genre_token: str) -> pd.DataFrame:
    """
    Returns tidy DF for this filename genre token:
      columns = ['genre','K','label','avg','min','max']
      label ∈ {'ORIGINAL', f'n{run}' for run in RUNS}
    If ORIGINAL_<K> is missing, that K is skipped entirely.
    Missing variants are included with 0 to keep bar alignment.

    Each CSV is read once; avg, min and max are all derived from the same
    per-user counts. Failures are reported as ``[WARN]`` lines on stdout and
    never raised.
    """
    rows = []
    for K in K_LIST:
        # ORIGINAL (baseline) file for this K
        orig_path = BASE_DIR / f"ORIGINAL_{K}recommendation.csv"
        try:
            avg_orig, mn_orig, mx_orig = _avg_min_max_per_user(orig_path, filename_genre_token)
        except Exception as e:
            # Deliberately broad: any unreadable/invalid baseline just drops this K.
            print(f"[WARN] {filename_genre_token} | K={K}: ORIGINAL missing/invalid -> {e}; skipping this K")
            continue
        rows.append({"genre": filename_genre_token, "K": K, "label": "ORIGINAL",
                     "avg": avg_orig, "min": mn_orig, "max": mx_orig})

        # Variants EXACTLY matching the requested pattern:
        # enhanced_<GenreToken>_<RUN>_pos5_neg0_sample_<K>recommendation.csv
        for n in RUNS:
            var_name = f"enhanced_{filename_genre_token}_{n}_pos5_neg0_sample_{K}recommendation.csv"
            var_path = BASE_DIR / var_name
            try:
                avg_var, mn_var, mx_var = _avg_min_max_per_user(var_path, filename_genre_token)
            except Exception as e:
                # A missing/invalid variant is kept as an all-zero row so every
                # K still has one entry per label.
                print(f"[WARN] {filename_genre_token} | K={K} | n={n}: variant missing/invalid -> {e}; using 0")
                avg_var, mn_var, mx_var = 0.0, 0.0, 0.0
            rows.append({"genre": filename_genre_token, "K": K, "label": f"n{n}",
                         "avg": avg_var, "min": mn_var, "max": mx_var})

    return pd.DataFrame(rows, columns=["genre","K","label","avg","min","max"])

def _labels():
    """Return the series labels in display order: "ORIGINAL", then "n<run>" for each entry of RUNS."""
    return ["ORIGINAL", *(f"n{n}" for n in RUNS)]

def make_genre_summary_lines(filename_genre_token: str, df_stats: pd.DataFrame, include_header: bool) -> list[str]:
    """Render the averages stored in ``df_stats`` as text lines for one genre.

    With ``include_header`` the first line is ``[<genre token>]``. For each K
    (ascending) there is one line per label that has a row, formatted to
    three decimals, followed by an empty separator line. Values are taken
    from the 'avg' column as-is.
    """
    out = [f"[{filename_genre_token}]"] if include_header else []
    label_order = _labels()
    for K in sorted(df_stats["K"].unique()):
        at_k = df_stats[df_stats["K"] == K]
        for name in label_order:
            avg_vals = at_k[at_k["label"] == name]["avg"]
            if not avg_vals.empty:
                out.append(f"K={K} | {name} avg_per_user: {float(avg_vals.iloc[0]):.3f}")
        out.append("")  # blank line between K groups
    return out

def write_txt_avg_per_genre(df_stats: pd.DataFrame, out_txt: Path, filename_genre_token: str):
    """Write the header-less average summary for one genre to ``out_txt`` (UTF-8).

    The parent directory is created first if needed; an existing file is
    overwritten.
    """
    body = "\n".join(
        make_genre_summary_lines(filename_genre_token, df_stats, include_header=False)
    )
    ensure_dir(out_txt.parent)
    with open(out_txt, "w", encoding="utf-8") as handle:
        handle.write(body)

# ---------- Plotting helpers ----------
def _collect_series(df: pd.DataFrame, value_col: str):
    """Pivot the tidy stats frame into one list of values per label.

    Returns ``(K_vals, labels, series)`` where ``K_vals`` is the sorted list
    of distinct K values, ``labels`` is the label order from ``_labels()``,
    and ``series[label][i]`` is the ``value_col`` value for ``K_vals[i]``
    (0.0 when that K/label combination has no row).
    """
    label_order = _labels()
    k_sorted = sorted(df["K"].unique().tolist())
    collected = {name: [] for name in label_order}
    for k in k_sorted:
        at_k = df[df["K"] == k]
        for name in label_order:
            rows = at_k[at_k["label"] == name]
            if rows.empty:
                value = 0.0
            else:
                value = float(rows[value_col].iloc[0])
            collected[name].append(value)
    return k_sorted, label_order, collected

def _plot_grouped(series, K_vals, labels, title: str, y_label: str, out_png: Path):
    """Save a grouped bar chart (one group per K, one bar per label) to ``out_png``.

    ``series[label]`` must hold one value per entry of ``K_vals``. The bars of
    a group share a total width of 0.8 and are centred on the group's tick.
    The y-axis is fixed to 0..40. The output directory is created if missing
    and the figure is closed after saving.
    """
    positions = list(range(len(K_vals)))
    count = len(labels)
    bar_w = 0.8 / count

    fig, ax = plt.subplots(figsize=(12, 6))
    for idx, name in enumerate(labels):
        # Horizontal offset of this label's bar relative to the group centre.
        shift = (idx - (count - 1) / 2.0) * bar_w
        ax.bar([pos + shift for pos in positions], series[name], bar_w, label=name)

    ax.set_xticks(positions)
    ax.set_xticklabels([f"K={K}" for K in K_vals])
    ax.set_xlabel("K")
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.set_ylim(0, 40)  # fixed y-axis scale 0..40
    ax.legend()
    ax.grid(axis="y", alpha=0.2)

    ensure_dir(out_png.parent)
    plt.tight_layout()
    plt.savefig(out_png, dpi=160)
    plt.close(fig)  # release the figure; many are produced in a loop

# --- Adjustment to enforce ORIGINAL < n25 < n50 for plotting averages only ---
# NOTE(review): this helper overwrites measured values with invented ones.
# Anything plotted from its output no longer shows the data.
def _enforce_monotone_for_plot(avg_series: dict[str, list[float]], labels: list[str]) -> None:
    """
    Modify avg_series IN-PLACE for plotting only:
    For each K position, ensure ORIGINAL < n25 < n50 by adding a random 0.5–1.0
    to n25 and/or n50 as needed. ORIGINAL is never changed.

    Args:
        avg_series: maps each label to its list of values (one per K position);
            the "n25" and "n50" lists are mutated.
        labels: labels present in the chart; nothing happens unless
            "ORIGINAL", "n25" and "n50" are all among them.

    Returns:
        None. The adjustment is applied to ``avg_series`` itself.

    NOTE(review), points to resolve before using the output in a figure:
      * Whenever n25 <= ORIGINAL (or n50 <= n25) the measured value is
        replaced by ``max(...) + random``, so the result is not the data.
      * ``random.uniform`` is called without any seed being set in this
        function, so the adjusted values differ from run to run.
      * A value of 0.0 in n25/n50 (the placeholder ``build_stats_df_for_folder``
        uses for a missing/invalid file) is never greater than ORIGINAL, so it
        is always replaced by a positive number.
    """
    if not {"ORIGINAL", "n25", "n50"}.issubset(set(labels)):
        return  # nothing to enforce if we don't have these exact three

    L_orig = "ORIGINAL"; L25 = "n25"; L50 = "n50"
    N = len(avg_series[L_orig])  # number of K positions
    for i in range(N):
        o = avg_series[L_orig][i]  # ORIGINAL value (read-only)
        a = avg_series[L25][i]     # n25 value, possibly replaced below
        b = avg_series[L50][i]     # n50 value, possibly replaced below

        # Ensure o < a
        if not (o < a):
            # a is discarded: the new value is o (the larger one) plus 0.5..1.0
            a = max(a, o) + random.uniform(0.5, 1.0)

        # Ensure a < b
        if not (a < b):
            # compared against the possibly already-adjusted a
            b = max(b, a) + random.uniform(0.5, 1.0)

        avg_series[L25][i] = a
        avg_series[L50][i] = b

def plot_all_for_genre(df_stats: pd.DataFrame, filename_genre_token: str, out_dir: Path):
    """Save the AVG, MIN and MAX per-user bar charts for one genre.

    Three PNG files are written to ``out_dir``:
    ``<token>_avg_per_user.png``, ``<token>_min_per_user.png`` and
    ``<token>_max_per_user.png``.

    All three charts show the values stored in ``df_stats`` unchanged. In
    particular the AVG chart is NOT passed through
    ``_enforce_monotone_for_plot``: that helper replaces measured averages
    with random numbers, which would make the figure disagree with the TXT
    summaries written from the same ``df_stats``, change between runs, and
    turn the 0.0 placeholder of a missing variant into a positive bar.

    Args:
        df_stats: tidy stats frame with columns K, label, avg, min, max.
        filename_genre_token: genre token used in titles and file names.
        out_dir: directory that receives the PNG files.
    """
    # (value column, tag shown in the title, y-axis label)
    charts = (
        ("avg", "AVG", "Avg # of target-genre books per user"),
        ("min", "MIN", "Minimum # of target-genre books for any user"),
        ("max", "MAX", "Maximum # of target-genre books for any user"),
    )
    for value_col, tag, y_label in charts:
        K_vals, labels, series = _collect_series(df_stats, value_col=value_col)
        _plot_grouped(
            series, K_vals, labels,
            title=f"{filename_genre_token} – {tag} per user (genre matches among top-K)",
            y_label=y_label,
            out_png=out_dir / f"{filename_genre_token}_{value_col}_per_user.png",
        )

def main():
    """Build the per-genre statistics, TXT summaries and charts for every genre in GENRES.

    For each genre a TXT summary and the charts are written under
    ``<BASE_DIR>/figure/<genre token>/``; afterwards one combined summary
    (each genre block with its ``[token]`` header) is written to
    ``<BASE_DIR>/figure/ALL_avg_per_user.txt``.

    NOTE(review): an earlier cell of this notebook also defines ``main``;
    running this cell rebinds the name to this function.
    """
    figure_root = BASE_DIR / "figure"
    master_txt = figure_root / "ALL_avg_per_user.txt"
    ensure_dir(master_txt.parent)

    combined = []  # lines of the master file, filled genre by genre
    for token in GENRES:
        stats = build_stats_df_for_folder(token)

        genre_dir = figure_root / token
        genre_txt = genre_dir / f"{token}_avg_per_user.txt"

        # Per-genre text summary (no header line), straight from the stats frame.
        write_txt_avg_per_genre(stats, genre_txt, token)

        # Per-genre AVG / MIN / MAX charts.
        plot_all_for_genre(stats, token, genre_dir)
        print(f"[OK] Wrote {genre_txt} and figures in {genre_dir}")

        # Same summary, this time with its header, for the master file.
        combined += make_genre_summary_lines(token, stats, include_header=True)

    # The master file is written once, after all genres were processed.
    with open(master_txt, "w", encoding="utf-8") as handle:
        handle.write("\n".join(combined))
    print(f"[OK] Wrote master summary → {master_txt}")

# Generate all summaries and figures when this code is executed directly
# (as a script, or as a notebook cell -- the "[OK] Wrote ..." lines captured
# below the cell show it ran there).
if __name__ == "__main__":
    main()


[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/figure/Adult/Adult_avg_per_user.txt and figures in /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/figure/Adult
[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/figure/Adventure/Adventure_avg_per_user.txt and figures in /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/figure/Adventure
[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/figure/Children_s/Children_s_avg_per_user.txt and figures in /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/figure/Children_s
[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/figure/Classics/Classics_avg_per_user.txt and figures in /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/figure/Classics
[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/resul