In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
#!/usr/bin/env python3
# Merge of: (1) avg summary builder + (2) per-genre logging + bin plots
# Single dataset: PRIMARY only

import re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Directory that holds the input recommendation CSVs. The outputs written by
# this cell (figure/, genre_counts_summary.txt, avg_counts_summary.csv) are
# created under PRIMARY_DIR as well.
BASE_DIR   = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD")
PRIMARY_DIR = BASE_DIR  # files live directly here

# Column names every input CSV must contain (checked in count_genre_per_user).
USER_COL  = "user_id"
GENRE_COL = "genres_all"

# K values that appear in the file names, e.g. ORIGINAL_<K>recommendation.csv.
K_LIST   = [15, 25, 35]
# Run identifiers that appear in the variant file names (<prefix><Genre>_<RUN>_<K>recommendation.csv).
RUNS     = [25, 50, 100, 200]
# Number of user bins used for the grouped bar charts.
NUM_BINS = 10

# Genre tokens as spelled in file names. For matching against CSV cells they are
# lower-cased, "_" becomes a space and "children s" becomes "children's"
# (see normalize_token).
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]
# ====================================================

def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents.

    Calling it on a directory that already exists is a no-op.
    """
    mkdir_options = {"parents": True, "exist_ok": True}
    p.mkdir(**mkdir_options)

def normalize_token(s: str) -> str:
    """Return the canonical form of a genre token used for matching CSV cells.

    Surrounding whitespace is stripped, the text is lower-cased, underscores
    become spaces, and the file-name spelling "children s" is mapped to
    "children's".
    """
    lowered = s.strip().lower()
    spaced = lowered.replace("_", " ")
    return re.sub(r"\bchildren s\b", "children's", spaced)

def split_genres_cell(cell) -> list:
    """Split one genres cell on ';' or ',' and normalize every piece.

    A missing cell (as judged by ``pd.isna``) yields an empty list.
    """
    if pd.isna(cell):
        return []
    tokens = []
    for raw_piece in re.split(r"[;,]", str(cell)):
        tokens.append(normalize_token(raw_piece))
    return tokens

def count_genre_per_user(csv_path: Path, target_genre_token: str) -> pd.Series:
    """
    Count, per user, the rows of a CSV whose genre list contains the target genre.

    Returns a Series indexed by the USER_COL values; each value is the number
    of that user's rows whose GENRE_COL cell includes the (normalized) target.

    Raises FileNotFoundError when the file does not exist and ValueError when
    either required column is absent.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing file: {csv_path}")

    frame = pd.read_csv(csv_path)
    has_required = USER_COL in frame.columns and GENRE_COL in frame.columns
    if not has_required:
        raise ValueError(f"{csv_path} must contain '{USER_COL}' and '{GENRE_COL}'.")

    wanted = normalize_token(target_genre_token)

    def _is_hit(cell) -> int:
        # 1 when the normalized target is one of the cell's genre tokens.
        return int(wanted in split_genres_cell(cell))

    frame["_hit"] = frame[GENRE_COL].apply(_is_hit)
    per_user = frame.groupby(USER_COL)["_hit"].sum()
    return per_user

def summarize_one(file_path: Path, genre: str):
    """
    Summarize one CSV for one genre as (users, total_hits, mean_per_user).

    Returns None when count_genre_per_user raises FileNotFoundError or
    ValueError (missing file, or a file it rejects as invalid).
    """
    try:
        per_user = count_genre_per_user(file_path, genre)
    except (FileNotFoundError, ValueError):
        return None

    n_users = int(per_user.shape[0])
    hits = int(per_user.sum())
    if n_users:
        average = hits / n_users
    else:
        average = 0.0
    return n_users, hits, float(average)

def intersect_users(series_list):
    """Return the index shared by all usable series.

    Entries that are None or have length zero are ignored. When nothing usable
    remains, an empty ``pd.Index`` is returned.
    """
    common = None
    for candidate in series_list:
        if candidate is None or len(candidate) == 0:
            continue
        if common is None:
            common = candidate.index
        else:
            common = common.intersection(candidate.index)
    if common is None:
        return pd.Index([])
    return common

def compute_bin_means(original_s: pd.Series, variant_series: dict, num_bins=10) -> dict:
    """Average each series over user bins ranked by the ORIGINAL counts.

    Users are ordered by ``original_s`` from high to low and cut into
    ``num_bins`` consecutive groups whose sizes differ by at most one (the
    earlier groups receive the extra users). The result maps "ORIGINAL" and
    every key of ``variant_series`` to a list of per-bin means; a bin with no
    users contributes 0.0.
    """
    ranked_users = original_s.sort_values(ascending=False).index.tolist()
    n_users = len(ranked_users)
    if n_users == 0:
        return {lab: [0.0] * num_bins for lab in ["ORIGINAL", *variant_series.keys()]}

    per_bin, leftover = divmod(n_users, num_bins)
    groups = []
    cursor = 0
    for idx in range(num_bins):
        stop = cursor + per_bin + (1 if idx < leftover else 0)
        groups.append(ranked_users[cursor:stop])
        cursor = stop

    def _bin_means(series: pd.Series) -> list:
        # Mean of the series within each user group; empty groups count as 0.0.
        return [float(series.loc[members].mean()) if members else 0.0 for members in groups]

    result = {"ORIGINAL": _bin_means(original_s)}
    for lab, series in variant_series.items():
        result[lab] = _bin_means(series)
    return result

def plot_grouped_bars(bin_stats: dict, title: str, out_path: Path):
    """Save a grouped bar chart of per-bin means, one bar series per label.

    Parameters
    ----------
    bin_stats : dict
        Maps a series label (e.g. "ORIGINAL", "25", ...) to a list with one
        value per user bin. All lists are expected to have the same length.
    title : str
        Figure title.
    out_path : Path
        Destination image file; its parent directory is created if needed.

    The figure is always closed, even when drawing or saving raises, so a
    caller that catches the error and keeps looping does not accumulate open
    figures.
    """
    labels = list(bin_stats.keys())  # ["ORIGINAL", "25", "50", "100", "200"]
    n_series = len(labels)
    # Use the bin count of the data itself so the chart stays correct when the
    # statistics were computed with a bin count other than NUM_BINS.
    n_bins = len(bin_stats[labels[0]]) if labels else NUM_BINS
    x = np.arange(n_bins)
    width = 0.8 / n_series

    fig, ax = plt.subplots(figsize=(14, 6))
    try:
        for i, lab in enumerate(labels):
            # Centre the group of bars on each tick.
            offset = (i - (n_series - 1) / 2.0) * width
            ax.bar(x + offset, bin_stats[lab], width, label=lab)
        ax.set_xlabel("User bins (sorted by ORIGINAL genre count, high → low)")
        ax.set_ylabel("Avg count of target-genre items per user")
        ax.set_title(title)
        ax.set_xticks(x)
        ax.set_xticklabels([f"Bin {i+1}" for i in range(n_bins)], rotation=0)
        ax.legend()
        ax.grid(axis="y", alpha=0.2)
        ensure_dir(out_path.parent)
        # Operate on this figure explicitly instead of pyplot's "current figure".
        fig.tight_layout()
        fig.savefig(out_path, dpi=160)
    finally:
        plt.close(fig)

def log_genre_counts(folder_dir: Path, genre: str, K: int,
                     total_original: int, totals_variants: dict,
                     n_users_original: int, n_users_variants: dict):
    """
    Append one block of per-genre totals (and means) to
    ``<folder_dir>/genre_counts_summary.txt``.

    Parameters
    ----------
    folder_dir : Path
        Existing directory that holds the summary file.
    genre : str
        Genre token as used in file names ("_" is shown as a space and
        "Children s" as "Children's").
    K : int
        The K value the counts belong to.
    total_original, n_users_original : int
        Total hits and number of users for the ORIGINAL file.
    totals_variants, n_users_variants : dict
        Map a variant label to its total hits / number of users. A label
        missing from one of the two dicts is reported with 0 for that value.

    One line is written for ORIGINAL and one per variant label, in the order
    the labels appear in ``totals_variants`` (then any labels that only occur
    in ``n_users_variants``). If both dicts are empty, the historical labels
    25/50/100/200 are logged with zeros. Every block ends with a blank line so
    that consecutive appends stay visually separated.
    """
    log_path = folder_dir / "genre_counts_summary.txt"
    pretty_g = genre.replace("_", " ").replace("Children s", "Children's")
    line_prefix = f"primary_ {pretty_g} | K={K} | "

    # Log whatever variants the caller supplied rather than a fixed list, so the
    # summary follows the configured runs; dict.fromkeys de-duplicates in order.
    labels = list(dict.fromkeys([*totals_variants, *n_users_variants]))
    if not labels:
        labels = ["25", "50", "100", "200"]

    mean_o = total_original / n_users_original if n_users_original else 0.0
    lines = [
        f"{line_prefix}ORIGINAL: total={int(total_original)}, users={n_users_original}, mean_per_user={mean_o:.4f}"
    ]
    for lab in labels:
        tot = int(totals_variants.get(lab, 0))
        n_u = int(n_users_variants.get(lab, 0))
        mean_v = (tot / n_u) if n_u else 0.0
        lines.append(
            f"{line_prefix}{lab}: total={tot}, users={n_u}, mean_per_user={mean_v:.4f}"
        )

    with open(log_path, "a", encoding="utf-8") as f:
        # Terminate the last line and add a real blank spacer line; joining with
        # a trailing "" alone only produced the line terminator.
        f.write("\n".join(lines) + "\n\n")

def build_primary_outputs():
    """
    For each genre and K:
      - Load ORIGINAL_K and the primary variants listed in RUNS
      - Log totals to genre_counts_summary.txt
      - Plot NUM_BINS-bin grouped bar charts into ./figure

    A genre/K pair is skipped entirely when its ORIGINAL file cannot be
    loaded. Variants that cannot be loaded are logged and plotted as zeros.
    The plot is restricted to users present in ORIGINAL and in every non-empty
    variant.
    """
    prefix = "p_"
    out_dir = PRIMARY_DIR / "figure"
    ensure_dir(out_dir)
    # Single source of truth for variant labels, derived from the configured runs.
    run_labels = [str(r) for r in RUNS]

    for g in GENRES:
        for K in K_LIST:
            # ORIGINAL
            original_fp = PRIMARY_DIR / f"ORIGINAL_{K}recommendation.csv"
            try:
                s_original = count_genre_per_user(original_fp, g)
            except Exception as e:
                print(f"[WARN] Skip {g} K={K}: cannot load ORIGINAL -> {e}")
                continue

            # Variants (load what exists; if missing, we'll log zeros and plot zeros)
            loaded_variants = {}
            users_variants = {}
            totals_variants = {}
            for r, lab in zip(RUNS, run_labels):
                fp = PRIMARY_DIR / f"{prefix}{g}_{r}_{K}recommendation.csv"
                try:
                    s = count_genre_per_user(fp, g)
                except Exception as e:
                    print(f"[WARN] Missing/invalid: {fp} -> {e}")
                    users_variants[lab] = 0
                    totals_variants[lab] = 0
                else:
                    loaded_variants[lab] = s
                    users_variants[lab] = int(s.shape[0])
                    totals_variants[lab] = int(s.sum())

            # ---- Logging (totals; no intersection) ----
            total_original = int(s_original.sum())
            n_users_original = int(s_original.shape[0])
            log_genre_counts(PRIMARY_DIR, g, K, total_original, totals_variants,
                             n_users_original, users_variants)

            # ---- Plotting with aligned users across ORIGINAL + available variants ----
            inter = intersect_users([s_original] + list(loaded_variants.values()))
            # intersect_users ignores empty series, so an empty ORIGINAL must be
            # rejected here: there is nothing to rank the users by.
            if s_original.empty or len(inter) == 0:
                print(f"[WARN] No common users for {g} K={K}; skip plot.")
                continue

            s_orig_aligned = s_original.loc[inter]
            # Ensure we have a series for every label (zeros if missing)
            aligned_variants = {}
            for lab in run_labels:
                if lab in loaded_variants:
                    # reindex instead of .loc: a variant loaded from a CSV with
                    # no rows is excluded from the intersection and would make
                    # .loc raise KeyError; here it is plotted as zeros.
                    aligned_variants[lab] = loaded_variants[lab].reindex(inter, fill_value=0)
                else:
                    aligned_variants[lab] = pd.Series(0, index=inter)

            bin_stats = compute_bin_means(s_orig_aligned, aligned_variants, num_bins=NUM_BINS)

            pretty_g = g.replace("_", " ").replace("Children s", "Children's")
            title = f"Primary – {pretty_g} – K={K}"
            out_path = out_dir / f"{g}_K{K}_primary.png"
            try:
                plot_grouped_bars(bin_stats, title, out_path)
                print(f"[OK] Saved: {out_path}")
            except Exception as e:
                # Best effort: one failed figure must not stop the remaining ones.
                print(f"[ERR] Plot fail for {g} K={K}: {e}")

def collect_summary_primary():
    """
    Build avg_counts_summary.csv in PRIMARY_DIR with:
      analysis, genre, K, variant, users, total_hits, mean_per_user

    One ORIGINAL row plus one row per run in RUNS is written for every
    genre/K pair whose ORIGINAL file can be summarized. A variant with no
    usable file is recorded with zeros and reported with a warning.
    """
    columns = ["analysis", "genre", "K", "variant", "users", "total_hits", "mean_per_user"]
    # This function historically looked for "primary_p_<Genre>_..." while
    # build_primary_outputs reads "p_<Genre>_..." from the same directory.
    # Try the historical name first and fall back to the other spelling so both
    # outputs describe the same files instead of one of them being all zeros.
    variant_prefixes = ("primary_p_", "p_")

    def _row(genre, k, variant, stats):
        # One tidy output record from a (users, total_hits, mean) triple.
        users, total, mean = stats
        return {
            "analysis": "primary",
            "genre": genre,
            "K": k,
            "variant": variant,
            "users": users,
            "total_hits": total,
            "mean_per_user": round(mean, 6),
        }

    rows = []
    for g in GENRES:
        for K in K_LIST:
            # ORIGINAL
            original_fp = PRIMARY_DIR / f"ORIGINAL_{K}recommendation.csv"
            orig_stats = summarize_one(original_fp, g)
            if orig_stats is None:
                print(f"[WARN] Missing ORIGINAL_{K} for {g}; skip row group.")
                continue
            rows.append(_row(g, K, "ORIGINAL", orig_stats))

            # Variants
            for r in RUNS:
                stats = None
                for prefix in variant_prefixes:
                    fp = PRIMARY_DIR / f"{prefix}{g}_{r}_{K}recommendation.csv"
                    stats = summarize_one(fp, g)
                    if stats is not None:
                        break
                if stats is None:
                    # Keep the row so the table stays rectangular, but say so.
                    print(f"[WARN] No usable variant file for {g} run={r} K={K}; recording zeros.")
                    stats = (0, 0, 0.0)
                rows.append(_row(g, K, str(r), stats))

    out_csv = PRIMARY_DIR / "avg_counts_summary.csv"
    # Explicit columns keep the header even when no rows were collected.
    pd.DataFrame(rows, columns=columns).to_csv(out_csv, index=False)
    print(f"[OK] Wrote {out_csv} with {len(rows)} rows.")

def main():
    """Run the two stages of this cell in order."""
    stages = (
        build_primary_outputs,    # logs + figures
        collect_summary_primary,  # tidy CSV with averages
    )
    for stage in stages:
        stage()


if __name__ == "__main__":
    main()


[WARN] Missing/invalid: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_25_15recommendation.csv -> Missing file: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_25_15recommendation.csv
[WARN] Missing/invalid: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_50_15recommendation.csv -> Missing file: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_50_15recommendation.csv
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/figure/Adult_K15_primary.png
[WARN] Missing/invalid: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_25_25recommendation.csv -> Missing file: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_25_25recommendation.csv
[WARN] Missing/invalid: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_50_25recomme

KeyboardInterrupt: 

## ALL

In [None]:
#!/usr/bin/env python3
# UNIQUE BOOKS ONLY — single folder layout (no primary/enhanced)
# Files like:
#   ORIGINAL_<K>recommendation.csv
#   enhanced_<Genre>_<RUN>_<K>recommendation.csv
# Output per genre under: <BASE_DIR>/figure/<GENRE>/
#   - <GENRE>_unique_totals.txt
#   - <GENRE>_unique_totals.png
# Plus a master file with all genres:
#   - <BASE_DIR>/figure/ALL_unique_totals.txt

import re
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Directory holding the input CSVs; outputs are written under <BASE_DIR>/figure.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD")

# Column names every input CSV must contain (checked in count_unique_books_for_genre).
GENRE_COL = "genres_all"
BOOK_COL  = "book_id"

# K values that appear in the file names, e.g. ORIGINAL_<K>recommendation.csv.
K_LIST = [15, 25, 35]
# Run identifiers used in enhanced_<Genre>_<RUN>_<K>recommendation.csv; each becomes the label "n<RUN>".
RUNS   = [25, 400, 1000]  # match what's actually present in your folder listing
# Genres as they appear in filenames (underscores ok)
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]
# ====================================================

def ensure_dir(p: Path):
    """Make sure directory ``p`` exists, creating missing parents as needed."""
    create_parents = True
    tolerate_existing = True
    p.mkdir(parents=create_parents, exist_ok=tolerate_existing)

def _normalize_genre_for_match(g: str) -> str:
    x = g.strip().lower().replace("_", " ")
    x = re.sub(r"\bchildren s\b", "children's", x)
    return x

def _split_genres_cell(cell):
    """Break a genres cell on ';' or ',' into normalized tokens.

    Returns an empty list for a missing cell (as judged by ``pd.isna``).
    """
    if pd.isna(cell):
        return []
    normalized = []
    for piece in re.split(r"[;,]", str(cell)):
        normalized.append(_normalize_genre_for_match(piece))
    return normalized

def count_unique_books_for_genre(csv_path: Path, target_genre_token: str) -> int:
    """Return how many distinct BOOK_COL values have the target genre in this file.

    Only the BOOK_COL and GENRE_COL columns are read. Raises FileNotFoundError
    when the file does not exist and ValueError when either column is absent.
    """
    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    needed = {BOOK_COL, GENRE_COL}
    df = pd.read_csv(csv_path, usecols=lambda c: c in needed)
    missing = needed - set(df.columns)
    if missing:
        raise ValueError(f"{csv_path} missing columns: {missing}")

    wanted = _normalize_genre_for_match(target_genre_token)

    def _has_genre(cell) -> bool:
        # True when the normalized target is one of the cell's genre tokens.
        return wanted in _split_genres_cell(cell)

    matches = df[GENRE_COL].apply(_has_genre)
    matching_books = df.loc[matches, BOOK_COL]
    return matching_books.nunique()

def build_unique_df_for_folder(genre: str) -> pd.DataFrame:
    """
    Collect unique-book counts for one genre across every K and run.

    The returned tidy DataFrame has the columns
      ['genre','K','label','unique_books']
    where label is 'ORIGINAL' or f'n{run}' for each run in RUNS.
    A K whose ORIGINAL_<K> file is missing or invalid is left out completely;
    a missing or invalid variant is kept with a count of 0 so bars stay aligned.
    """
    column_order = ["genre", "K", "label", "unique_books"]
    records = []

    for K in K_LIST:
        # ORIGINAL
        baseline_file = BASE_DIR / f"ORIGINAL_{K}recommendation.csv"
        try:
            baseline_count = int(count_unique_books_for_genre(baseline_file, genre))
        except Exception as e:
            print(f"[WARN] {genre} | K={K}: ORIGINAL missing/invalid -> {e}; skipping this K")
            continue
        records.append(dict(genre=genre, K=K, label="ORIGINAL", unique_books=baseline_count))

        # Variants enhanced_<Genre>_<RUN>_<K>
        for n in RUNS:
            variant_file = BASE_DIR / f"enhanced_{genre}_{n}_{K}recommendation.csv"
            try:
                variant_count = int(count_unique_books_for_genre(variant_file, genre))
            except Exception as e:
                print(f"[WARN] {genre} | K={K} | n={n}: variant missing/invalid -> {e}; using 0")
                variant_count = 0
            records.append(dict(genre=genre, K=K, label=f"n{n}", unique_books=variant_count))

    return pd.DataFrame(records, columns=column_order)

def _labels():
    """Return the series labels in display order: ORIGINAL, then n<run> per RUNS entry."""
    ordered = ["ORIGINAL"]
    ordered.extend(f"n{n}" for n in RUNS)
    return ordered

def make_genre_summary_lines(genre: str, df_uni: pd.DataFrame, include_header: bool) -> list[str]:
    """Render one genre's unique-book totals as text lines.

    With ``include_header`` the block starts with "[<genre>]". For every K in
    ``df_uni`` (ascending) one line is produced per label that has a row,
    followed by an empty string that separates the K groups.
    """
    out = [f"[{genre}]"] if include_header else []
    label_order = _labels()
    for K in sorted(df_uni["K"].unique()):
        rows_for_k = df_uni[df_uni["K"] == K]
        for lab in label_order:
            counts = rows_for_k.loc[rows_for_k["label"] == lab, "unique_books"]
            if not counts.empty:
                out.append(f"K={K} | {lab} unique_books: {int(counts.iloc[0])}")
        out.append("")
    return out

def write_txt_unique_per_genre(df_uni: pd.DataFrame, out_txt: Path, genre: str):
    """Write the header-less per-genre summary to ``out_txt`` (overwriting it).

    The parent directory is created when it does not exist yet.
    """
    ensure_dir(out_txt.parent)
    body = "\n".join(make_genre_summary_lines(genre, df_uni, include_header=False))
    out_txt.write_text(body, encoding="utf-8")

def plot_grouped_unique(df_uni: pd.DataFrame, title: str, out_png: Path):
    """Grouped bar chart: x=K, bars=ORIGINAL + dynamic RUN labels; y=unique book count.

    Parameters
    ----------
    df_uni : pd.DataFrame
        Tidy table with the columns 'K', 'label' and 'unique_books'.
    title : str
        Figure title.
    out_png : Path
        Destination image file; its parent directory is created if needed.

    Nothing is written when ``df_uni`` is empty (an info line is printed
    instead). A label without a row for some K is drawn as 0. The figure is
    always closed, even when drawing or saving raises.
    """
    if df_uni.empty:
        print(f"[INFO] Nothing to plot for {title}")
        return
    labels = _labels()
    K_vals = sorted(df_uni["K"].unique().tolist())
    series = {lab: [] for lab in labels}
    for K in K_vals:
        sub = df_uni[df_uni["K"] == K]
        for lab in labels:
            row = sub[sub["label"] == lab]
            series[lab].append(int(row["unique_books"].iloc[0]) if not row.empty else 0)

    x = list(range(len(K_vals)))
    n_series = len(labels)
    width = 0.8 / n_series

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        for i, lab in enumerate(labels):
            # Centre the group of bars on each K tick.
            xs = [xx + (i - (n_series - 1) / 2.0) * width for xx in x]
            ax.bar(xs, series[lab], width, label=lab)

        ax.set_xticks(x)
        ax.set_xticklabels([f"K={K}" for K in K_vals])
        ax.set_xlabel("K")
        ax.set_ylabel("Unique books with target genre")
        ax.set_title(title)
        ax.legend()
        ax.grid(axis="y", alpha=0.2)

        ensure_dir(out_png.parent)
        # Operate on this figure explicitly instead of pyplot's "current figure".
        fig.tight_layout()
        fig.savefig(out_png, dpi=160)
    finally:
        plt.close(fig)

def main():
    """Produce the per-genre TXT/PNG outputs and the combined master TXT.

    For every genre in GENRES the unique-book table is built, written to
    <BASE_DIR>/figure/<GENRE>/<GENRE>_unique_totals.txt and plotted to the
    matching .png. All genre blocks (with headers) are then written once to
    <BASE_DIR>/figure/ALL_unique_totals.txt.
    """
    all_lines = []  # accumulate for master file
    master_txt = BASE_DIR / "figure" / "ALL_unique_totals.txt"
    ensure_dir(master_txt.parent)

    for g in GENRES:
        df_uni = build_unique_df_for_folder(g)

        # save per-genre outputs under figure/<GENRE>/
        out_dir = BASE_DIR / "figure" / g
        txt_path = out_dir / f"{g}_unique_totals.txt"
        png_path = out_dir / f"{g}_unique_totals.png"

        # Write individual TXT (no header) and PNG
        write_txt_unique_per_genre(df_uni, txt_path, g)
        plot_grouped_unique(df_uni, title=f"{g} – UNIQUE books (no clustering)", out_png=png_path)
        if df_uni.empty:
            # plot_grouped_unique writes nothing for an empty table, so do not
            # claim that a PNG was produced.
            print(f"[OK] Wrote {txt_path} (no PNG: no data for {g})")
        else:
            print(f"[OK] Wrote {txt_path} and {png_path}")

        # Append this genre's block (with header) to the master list
        all_lines.extend(make_genre_summary_lines(g, df_uni, include_header=True))

    # Write the combined master TXT once at the end
    with open(master_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(all_lines))
    print(f"[OK] Wrote master summary → {master_txt}")

if __name__ == "__main__":
    main()
