In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

## ALL

In [None]:
#!/usr/bin/env python3
# AVERAGE PER-USER MATCHES — single folder layout (no primary/enhanced)
# Files like:
#   ORIGINAL_<K>recommendation.csv
#   enhanced_<GenreToken>_<RUN>_pos5_neg0_sample_<K>recommendation.csv
# Output per genre under: <BASE_DIR>/figure/<GENRE_TOKEN>/
#   - <GENRE_TOKEN>_avg_per_user.txt
#   - <GENRE_TOKEN>_avg_per_user.png
# Plus a master file with all genres:
#   - <BASE_DIR>/figure/ALL_avg_per_user.txt

import re
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Folder that holds every recommendation CSV; outputs are written below
# <BASE_DIR>/figure/.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD/")

# Column names read from each recommendation CSV.
USER_COL = "user_id"
BOOK_COL = "book_id"
GENRE_COL = "genres_all"

# <K> and <RUN> values that are substituted into the CSV filenames.
K_LIST = [15, 25, 35]
RUNS = [25, 50]  # match what's actually present in your folder listing

# Filename genre tokens (use EXACTLY as they appear in the filenames)
GENRES = [
    "Adult",
    "Adventure",
    "Children_s",
    "Classics",
    "Drama",
    "Fantasy",
    "Historical",
    "Horror",
    "Mystery",
    "Nonfiction",
    "Romance",
    "Science_Fiction",
    "Thriller",
]
# ====================================================

def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents.

    Does nothing when the directory already exists.
    """
    p.mkdir(exist_ok=True, parents=True)

# ---------- Genre normalization for matching inside CSV cells ----------
# NOTE: This is for reading CSV content (GENRE_COL), not for filenames.
def _normalize_genre_for_match(g: str) -> str:
    x = g.strip().lower().replace("_", " ")
    x = re.sub(r"\bchildren s\b", "children's", x)
    return x

def _split_genres_cell(cell):
    if pd.isna(cell):
        return []
    parts = re.split(r"[;,]", str(cell))
    return [_normalize_genre_for_match(p) for p in parts]

def average_per_user_for_genre(csv_path: Path, target_genre_token_for_content: str) -> float:
    """Return the mean number of target-genre recommendations per user.

    The CSV at ``csv_path`` is read (only the user, book and genre columns are
    loaded), each row is flagged when the normalized target genre appears
    among the genres of its genre cell, the flags are summed per user, and
    the mean of those per-user sums is returned.

    Rows whose genre cell is missing count as non-matches. Users are only
    those that appear in the file; a file with no users yields ``0.0``.

    Raises:
        FileNotFoundError: ``csv_path`` does not exist.
        ValueError: one of the required columns is absent from the CSV.
    """
    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    required = {USER_COL, BOOK_COL, GENRE_COL}
    # A callable ``usecols`` keeps pandas from raising on absent columns, so
    # the explicit check below can report exactly which ones are missing.
    frame = pd.read_csv(csv_path, usecols=lambda name: name in required)
    absent = required - set(frame.columns)
    if absent:
        raise ValueError(f"{csv_path} missing columns: {absent}")

    wanted = _normalize_genre_for_match(target_genre_token_for_content)
    hits = frame[GENRE_COL].apply(lambda cell: wanted in _split_genres_cell(cell))

    # One row per user holding the number of matching recommendations.
    flagged = frame.assign(_match=hits)
    counts = flagged.groupby(USER_COL, as_index=False)["_match"].sum()

    return 0.0 if counts.empty else float(counts["_match"].mean())

def build_avg_df_for_folder(filename_genre_token: str) -> pd.DataFrame:
    """Collect the per-user averages for one genre across all K and runs.

    The result is a tidy frame with columns
    ``['genre', 'K', 'label', 'avg_per_user']`` where ``label`` is
    ``'ORIGINAL'`` or ``f'n{run}'`` for each run in ``RUNS``.

    For every K in ``K_LIST``:
      * if the ``ORIGINAL_<K>`` file cannot be processed, a warning is printed
        and that K contributes no rows at all;
      * each enhanced variant that cannot be processed is recorded with an
        average of 0 (after a warning) so the bars stay aligned.
    """
    records = []

    def _record(k, label, value):
        records.append(
            {"genre": filename_genre_token, "K": k, "label": label, "avg_per_user": value}
        )

    for K in K_LIST:
        # Baseline file: ORIGINAL_<K>recommendation.csv
        baseline_csv = BASE_DIR / f"ORIGINAL_{K}recommendation.csv"
        try:
            baseline_avg = float(average_per_user_for_genre(baseline_csv, filename_genre_token))
        except Exception as e:
            print(f"[WARN] {filename_genre_token} | K={K}: ORIGINAL missing/invalid -> {e}; skipping this K")
            continue
        _record(K, "ORIGINAL", baseline_avg)

        # Variant files:
        # enhanced_<GenreToken>_<RUN>_pos5_neg0_sample_<K>recommendation.csv
        for n in RUNS:
            variant_csv = BASE_DIR / f"enhanced_{filename_genre_token}_{n}_pos5_neg0_sample_{K}recommendation.csv"
            try:
                variant_avg = float(average_per_user_for_genre(variant_csv, filename_genre_token))
            except Exception as e:
                print(f"[WARN] {filename_genre_token} | K={K} | n={n}: variant missing/invalid -> {e}; using 0")
                variant_avg = 0.0
            _record(K, f"n{n}", variant_avg)

    return pd.DataFrame(records, columns=["genre", "K", "label", "avg_per_user"])

def _labels():
    """Return the series labels in display order: 'ORIGINAL', then one 'n<run>' per entry of RUNS."""
    run_labels = [f"n{run}" for run in RUNS]
    return ["ORIGINAL", *run_labels]

def make_genre_summary_lines(filename_genre_token: str, df_avg: pd.DataFrame, include_header: bool) -> list[str]:
    """Render one genre's averages as text lines.

    When ``include_header`` is true the first line is ``[<genre token>]``.
    Then, for each K present in ``df_avg`` (ascending), one line is emitted
    per label that has a row for that K, followed by an empty separator line.
    """
    label_order = _labels()
    out = [f"[{filename_genre_token}]"] if include_header else []

    for k_value in sorted(df_avg["K"].unique()):
        at_k = df_avg[df_avg["K"] == k_value]
        for label in label_order:
            values = at_k.loc[at_k["label"] == label, "avg_per_user"]
            if not values.empty:
                # Only the first row for a (K, label) pair is reported.
                out.append(f"K={k_value} | {label} avg_per_user: {float(values.iloc[0]):.3f}")
        out.append("")
    return out

def write_txt_avg_per_genre(df_avg: pd.DataFrame, out_txt: Path, filename_genre_token: str):
    """Write one genre's summary (without the ``[genre]`` header) to ``out_txt``.

    The parent directory of ``out_txt`` is created if needed and the file is
    written as UTF-8, replacing any existing content.
    """
    body = "\n".join(
        make_genre_summary_lines(filename_genre_token, df_avg, include_header=False)
    )
    ensure_dir(out_txt.parent)
    out_txt.write_text(body, encoding="utf-8")

def plot_grouped_avg(df_avg: pd.DataFrame, title: str, out_png: Path):
    """Save a grouped bar chart of average target-genre matches per user.

    One group per K on the x axis and one bar per label (ORIGINAL plus the
    run labels) inside each group. A (K, label) pair without a row is drawn
    as 0. The y axis is fixed to the range 0..40.

    When ``df_avg`` is empty an info message is printed and nothing is
    written; otherwise the parent directory of ``out_png`` is created if
    needed and the figure is saved there.
    """
    if df_avg.empty:
        print(f"[INFO] Nothing to plot for {title}")
        return

    labels = _labels()
    k_values = sorted(df_avg["K"].unique().tolist())

    def _value(k, label):
        # First matching average for this (K, label), or 0.0 if there is none.
        picked = df_avg.loc[(df_avg["K"] == k) & (df_avg["label"] == label), "avg_per_user"]
        return 0.0 if picked.empty else float(picked.iloc[0])

    series = {label: [_value(k, label) for k in k_values] for label in labels}

    positions = list(range(len(k_values)))
    n_series = len(labels)
    bar_width = 0.8 / n_series          # all bars of a group share 0.8 units
    center = (n_series - 1) / 2.0       # centers each group on its tick

    fig, ax = plt.subplots(figsize=(12, 6))
    for idx, label in enumerate(labels):
        offset = (idx - center) * bar_width
        ax.bar([pos + offset for pos in positions], series[label], bar_width, label=label)

    ax.set_xticks(positions)
    ax.set_xticklabels([f"K={k}" for k in k_values])
    ax.set_xlabel("K")
    ax.set_ylabel("Avg # of target-genre books per user")
    ax.set_title(title)
    ax.set_ylim(0, 40)  # fixed y-axis scale 0..40
    ax.legend()
    ax.grid(axis="y", alpha=0.2)

    ensure_dir(out_png.parent)
    plt.tight_layout()
    plt.savefig(out_png, dpi=160)
    plt.close(fig)

def main():
    """Generate the per-genre summaries and plots, plus one combined summary.

    For every token in ``GENRES`` this builds the averages frame and writes
    ``<BASE_DIR>/figure/<token>/<token>_avg_per_user.txt`` and the matching
    ``.png`` chart. Each genre's lines (with a ``[token]`` header) are also
    accumulated and written once at the end to
    ``<BASE_DIR>/figure/ALL_avg_per_user.txt``.
    """
    figure_root = BASE_DIR / "figure"
    master_txt = figure_root / "ALL_avg_per_user.txt"
    ensure_dir(master_txt.parent)

    all_lines = []  # accumulated for the master file
    for filename_genre_token in GENRES:
        df_avg = build_avg_df_for_folder(filename_genre_token)

        # Per-genre outputs live under figure/<GENRE_TOKEN>/
        out_dir = figure_root / filename_genre_token
        txt_path = out_dir / f"{filename_genre_token}_avg_per_user.txt"
        png_path = out_dir / f"{filename_genre_token}_avg_per_user.png"

        # Individual TXT (no header) and PNG
        write_txt_avg_per_genre(df_avg, txt_path, filename_genre_token)
        plot_grouped_avg(df_avg, title=f"{filename_genre_token} – AVG per user (genre matches among top-K)", out_png=png_path)

        if df_avg.empty:
            # plot_grouped_avg writes nothing for an empty frame, so do not
            # claim that a PNG was produced.
            print(f"[WARN] {filename_genre_token}: no data; wrote {txt_path} only (no plot)")
        else:
            print(f"[OK] Wrote {txt_path} and {png_path}")

        # This genre's block (with header) goes into the master list
        all_lines.extend(make_genre_summary_lines(filename_genre_token, df_avg, include_header=True))

    # Write the combined master TXT once at the end
    with open(master_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(all_lines))
    print(f"[OK] Wrote master summary → {master_txt}")

# Run the whole analysis only when this code is executed as the main program.
if __name__ == "__main__":
    main()
