In [5]:
import re
import pandas as pd
from pathlib import Path

# ====================== CONFIG ======================
# Base directory under which both analysis folders live.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922")
# Folders scanned by collect_summary(); each one also receives its own
# avg_counts_summary.csv. The ORIGINAL_{K} baseline files are always read
# from PRIMARY_DIR.
PRIMARY_DIR  = BASE_DIR / "primary_analysis"
ENHANCED_DIR = BASE_DIR / "enhanced_analysis"

# Columns that every recommendation CSV must contain.
USER_COL  = "user_id"
GENRE_COL = "genres_all"

# K values and run labels that are embedded in the CSV file names
# (ORIGINAL_{K}recommendation.csv, {prefix}{genre}_{run}_{K}recommendation.csv).
K_LIST   = [15, 25, 35]
RUNS     = [25, 50, 100, 200]

# The 13 genres as they appear in filenames
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]
# ====================================================

def normalize_token(s: str) -> str:
    """Return a canonical form of a genre token for comparisons.

    The token is stripped, lower-cased, and underscores become spaces; the
    file-name spelling "children s" is then mapped to "children's".
    """
    lowered = s.strip().lower()
    spaced = " ".join(lowered.split("_"))
    return re.sub(r"\bchildren s\b", "children's", spaced)

def split_genres_cell(cell) -> list:
    """Split one genre cell on ';' or ',' and normalize every piece.

    A missing value (as judged by ``pd.isna``) yields an empty list.
    """
    if pd.isna(cell):
        return []
    raw_tokens = re.split(r"[;,]", str(cell))
    return list(map(normalize_token, raw_tokens))

def count_genre_per_user(csv_path: Path, target_genre_token: str) -> pd.Series:
    """
    Count, for every user in the CSV, the rows whose genre cell contains the
    target genre.

    The CSV must provide the USER_COL and GENRE_COL columns; otherwise a
    ValueError is raised. The result is a Series indexed by USER_COL.
    """
    frame = pd.read_csv(csv_path)
    absent = [col for col in (USER_COL, GENRE_COL) if col not in frame.columns]
    if absent:
        raise ValueError(f"{csv_path} must contain '{USER_COL}' and '{GENRE_COL}'.")

    wanted = normalize_token(target_genre_token)

    # 1 when the normalized target is one of the cell's tokens, else 0.
    hits = frame[GENRE_COL].apply(
        lambda cell: int(wanted in split_genres_cell(cell))
    )
    frame["_hit"] = hits
    return frame.groupby(USER_COL)["_hit"].sum()

def summarize_one(file_path: Path, genre: str):
    """
    Summarize one CSV for one genre.

    Returns None when the file does not exist; otherwise a tuple
    (number of users, total hits as int, mean hits per user as float).
    The mean is 0.0 when the file has no users.
    """
    if not file_path.exists():
        return None

    per_user = count_genre_per_user(file_path, genre)
    n_users = int(per_user.shape[0])
    hit_sum = float(per_user.sum())
    if n_users:
        average = hit_sum / n_users
    else:
        average = 0.0
    return n_users, int(hit_sum), float(average)

def collect_summary(kind: str):
    """
    kind: 'primary' or 'enhanced'
    For each genre and K:
      - Compare ORIGINAL_K (from PRIMARY_DIR) vs the folder’s 25/50/100/200.
    Writes a CSV into that folder:
      - primary_analysis/avg_counts_summary.csv
      - enhanced_analysis/avg_counts_summary.csv
    """
    folder = PRIMARY_DIR if kind == "primary" else ENHANCED_DIR
    prefix = "primary_p_" if kind == "primary" else "improved_"

    rows = []
    for g in GENRES:
        for K in K_LIST:
            # ORIGINAL comes from the primary folder
            original_fp = PRIMARY_DIR / f"ORIGINAL_{K}recommendation.csv"
            orig_stats = summarize_one(original_fp, g)
            if orig_stats is not None:
                users, total, mean = orig_stats
                rows.append({
                    "analysis": kind,
                    "genre": g,
                    "K": K,
                    "variant": "ORIGINAL",
                    "users": users,
                    "total_hits": total,
                    "mean_per_user": round(mean, 6),
                })
            else:
                # If ORIGINAL missing, skip this (genre, K) entirely
                continue

            # Folder variants: 25, 50, 100, 200
            for r in RUNS:
                fp = folder / f"{prefix}{g}_{r}_{K}recommendation.csv"
                stats = summarize_one(fp, g)
                if stats is None:
                    # If a variant is missing, just note it and continue
                    rows.append({
                        "analysis": kind,
                        "genre": g,
                        "K": K,
                        "variant": str(r),
                        "users": 0,
                        "total_hits": 0,
                        "mean_per_user": 0.0,
                    })
                else:
                    users, total, mean = stats
                    rows.append({
                        "analysis": kind,
                        "genre": g,
                        "K": K,
                        "variant": str(r),
                        "users": users,
                        "total_hits": total,
                        "mean_per_user": round(mean, 6),
                    })

    # Save
    out_csv = folder / "avg_counts_summary.csv"
    pd.DataFrame(rows).to_csv(out_csv, index=False)
    print(f"[OK] Wrote {out_csv} with {len(rows)} rows.")

def main():
    """Write the summary CSV for the primary folder, then the enhanced one."""
    for kind in ("primary", "enhanced"):
        collect_summary(kind)


# Run only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()


[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/primary_analysis/avg_counts_summary.csv with 195 rows.
[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/enhanced_analysis/avg_counts_summary.csv with 195 rows.


In [3]:
import os
import re
import math
import glob
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict

# ====================== CONFIG ======================
# Base directory under which both analysis folders live.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922")

# Folders processed by main(); the ORIGINAL_{K} baseline files are always
# read from PRIMARY_DIR.
PRIMARY_DIR  = BASE_DIR / "primary_analysis"
ENHANCED_DIR = BASE_DIR / "enhanced_analysis"

# Columns that every recommendation CSV must contain.
USER_COL  = "user_id"
GENRE_COL = "genres_all"

# Run labels and K values that are embedded in the CSV file names.
RUNS     = [25, 50, 100, 200]
K_LIST   = [15, 25, 35]
# Number of user bins passed to compute_bin_means() when building the plots.
NUM_BINS = 10

# Genre tokens in their file-name spelling; main() hands a copy of this list
# to build_for_folder().
FIXED_GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance", "Science_Fiction", "Thriller"
]
# ===================================================

def ensure_dir(p: Path):
    """Create directory ``p`` (and any missing parents) if it is not there yet."""
    if p.is_dir():
        return
    p.mkdir(parents=True, exist_ok=True)

def normalize_genre_for_match(g: str) -> str:
    """Canonicalize a genre token so file-name and CSV spellings compare equal.

    Strips, lower-cases, turns underscores into spaces, and rewrites the
    file-name form "children s" as "children's".
    """
    cleaned = g.strip().lower()
    cleaned = cleaned.replace("_", " ")
    return re.sub(r"\bchildren s\b", "children's", cleaned)

def split_genres_cell(cell: str):
    """Return the normalized genre tokens of one cell.

    The cell is split on ';' and ','. A missing value (per ``pd.isna``)
    gives an empty list.
    """
    if pd.isna(cell):
        return []
    tokens = []
    for piece in re.split(r"[;,]", str(cell)):
        tokens.append(normalize_genre_for_match(piece))
    return tokens

def count_genre_per_user(csv_path: Path, target_genre_token: str) -> pd.Series:
    """Count per user the rows whose genre cell contains the target genre.

    Raises FileNotFoundError when ``csv_path`` does not exist and ValueError
    when the CSV lacks the USER_COL or GENRE_COL column. The returned Series
    is indexed by USER_COL.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing file: {csv_path}")

    table = pd.read_csv(csv_path)
    if any(col not in table.columns for col in (USER_COL, GENRE_COL)):
        raise ValueError(f"{csv_path} must contain '{USER_COL}' and '{GENRE_COL}'.")

    wanted = normalize_genre_for_match(target_genre_token)

    # 1 when the normalized target is among the cell's tokens, else 0.
    table["_hit"] = table[GENRE_COL].apply(
        lambda cell: int(wanted in split_genres_cell(cell))
    )
    return table.groupby(USER_COL)["_hit"].sum()

def intersect_users(series_list):
    """Return the index labels shared by every Series in ``series_list``.

    An empty list gives an empty Index; a single Series gives its own index.
    """
    if not series_list:
        return pd.Index([])
    common = None
    for series in series_list:
        if common is None:
            common = series.index
        else:
            common = common.intersection(series.index)
    return common

def compute_bin_means(original_s: pd.Series, variant_series: dict, num_bins=10) -> dict:
    """Average each series over user bins ranked by the ORIGINAL values.

    Users are ordered by descending ``original_s`` and cut into ``num_bins``
    contiguous groups; the first ``n % num_bins`` groups hold one extra user.
    Returns ``{"ORIGINAL": [...], <label>: [...], ...}`` with one mean per
    bin; an empty bin (or an empty ``original_s``) contributes 0.0. Every
    variant series is looked up with ``.loc``, so it must contain all users
    of ``original_s``.
    """
    ranked = original_s.sort_values(ascending=False).index.tolist()
    total = len(ranked)
    if not total:
        return {name: [0.0] * num_bins for name in ["ORIGINAL", *variant_series.keys()]}

    quotient, extra = divmod(total, num_bins)
    groups = []
    cursor = 0
    for pos in range(num_bins):
        stop = cursor + quotient + (1 if pos < extra else 0)
        groups.append(ranked[cursor:stop])
        cursor = stop

    def _bin_means(series):
        return [float(series.loc[grp].mean()) if grp else 0.0 for grp in groups]

    result = {"ORIGINAL": _bin_means(original_s)}
    for name, series in variant_series.items():
        result[name] = _bin_means(series)
    return result

def plot_grouped_bars(bin_stats: dict, title: str, out_path: Path):
    """Draw one grouped bar chart (one bar group per bin) and save it as an image.

    Args:
        bin_stats: mapping of series label -> list of per-bin values; every
            list must have the same length. Labels are drawn in dict order.
        title: figure title.
        out_path: destination image path; its parent directory is created
            when missing.

    Raises:
        ValueError: if ``bin_stats`` is empty (there is nothing to draw).
    """
    labels = list(bin_stats)
    if not labels:
        raise ValueError("bin_stats is empty; nothing to plot.")

    # Take the bin count from the data rather than a module constant so the
    # x axis always matches the lists that are actually plotted.
    n_bins = len(bin_stats[labels[0]])
    x = np.arange(n_bins)
    n_series = len(labels)
    width = 0.8 / n_series  # all bars of one bin share 80% of the slot

    fig, ax = plt.subplots(figsize=(14, 6))
    try:
        for i, lab in enumerate(labels):
            # Centre the group of bars on the tick position.
            offset = (i - (n_series - 1) / 2.0) * width
            ax.bar(x + offset, bin_stats[lab], width, label=lab)
        ax.set_xlabel("User bins (sorted by ORIGINAL genre count, high → low)")
        ax.set_ylabel("Avg count of target-genre items per user")
        ax.set_title(title)
        ax.set_xticks(x)
        ax.set_xticklabels([f"Bin {i+1}" for i in range(n_bins)], rotation=0)
        ax.legend()
        ax.grid(axis="y", alpha=0.2)
        ensure_dir(out_path.parent)
        # Operate on this figure explicitly instead of pyplot's current figure.
        fig.tight_layout()
        fig.savefig(out_path, dpi=160)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

# ============ NEW: logging helper to a single text file ============
def log_genre_counts(kind: str, folder_dir: Path, genre: str, K: int,
                     total_original: float, totals_variants: dict,
                     n_users_original: int, n_users_variants: dict):
    """
    Append one block of per-genre totals (and quick means) to
    ``folder_dir / "genre_counts_summary.txt"``.

    Each block holds one ORIGINAL line, one line per variant label, and a
    trailing blank line that separates it from the next appended block.

    kind: 'primary' or 'enhanced' (used verbatim as the line prefix)
    folder_dir: directory that holds the log file
    genre: file-name spelling, e.g. 'Adult', 'Children_s'
    K: 15/25/35
    total_original: sum of ORIGINAL genre hits across all users
    totals_variants: dict like {'25': sum, '50': sum, '100': sum, '200': sum};
        the labels '25', '50', '100', '200' are always logged (0 when absent),
        followed by any additional labels found in this dict
    n_users_original: number of users in the ORIGINAL series
    n_users_variants: dict with user counts per variant label (0 when absent)
    """
    log_path = folder_dir / "genre_counts_summary.txt"
    pretty_g = genre.replace("_", " ").replace("Children s", "Children's")

    # Standard labels first (always present in the log), then any extra runs
    # the caller supplied, so that new run sizes are not silently dropped.
    variant_labels = ["25", "50", "100", "200"]
    variant_labels += [lab for lab in totals_variants if lab not in variant_labels]

    # Original line
    mean_o = total_original / n_users_original if n_users_original else 0.0
    lines = [
        f"{kind}_ {pretty_g} | K={K} | ORIGINAL: total={int(total_original)}, users={n_users_original}, mean_per_user={mean_o:.4f}"
    ]
    # Variants
    for lab in variant_labels:
        tot = totals_variants.get(lab, 0.0)
        n_u = n_users_variants.get(lab, 0)
        mean_v = tot / n_u if n_u else 0.0
        lines.append(
            f"{kind}_ {pretty_g} | K={K} | {lab}: total={int(tot)}, users={n_u}, mean_per_user={mean_v:.4f}"
        )

    # One newline ends the last line, the second one is the blank separator.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n\n")

# ================================================================

def _log_and_plot(kind, base_dir, figure_dir, g, K, baseline, per_run):
    """Log totals for one (genre, K) and save its binned bar chart."""
    # Totals are taken over all users of each file (no user intersection).
    log_genre_counts(
        kind, base_dir, g, K,
        float(baseline.sum()),
        {lab: float(s.sum()) for lab, s in per_run.items()},
        int(baseline.shape[0]),
        {lab: int(s.shape[0]) for lab, s in per_run.items()},
    )

    # Bin and plot only the users present in ORIGINAL and in every variant.
    shared = intersect_users([baseline, *per_run.values()])
    if len(shared) == 0:
        print(f"[WARN] No common users for {kind} {g} K={K}; skipping plot.")
        return

    bin_stats = compute_bin_means(
        baseline.loc[shared],
        {lab: s.loc[shared] for lab, s in per_run.items()},
        num_bins=NUM_BINS,
    )

    display_name = g.replace("_", " ").replace("Children s", "Children's")
    title = f"{kind.capitalize()} – {display_name} – K={K}"
    target = figure_dir / f"{g}_K{K}_{kind}.png"
    try:
        plot_grouped_bars(bin_stats, title, target)
        print(f"[OK] Saved: {target}")
    except Exception as e:
        print(f"[ERR] Plot fail for {kind} {g} K={K}: {e}")


def build_for_folder(kind: str, base_dir: Path, original_dir: Path, genres: list):
    """Log genre totals and save one bar chart per (genre, K) for a folder.

    For every genre in ``genres`` and every K in K_LIST the ORIGINAL file is
    read from ``original_dir`` and one variant file per entry of RUNS from
    ``base_dir``. If the ORIGINAL or any variant fails to load, a warning is
    printed and that (genre, K) is skipped without logging or plotting.
    Figures go to ``base_dir / "figure"``.
    """
    file_prefix = "improved_" if kind != "primary" else "primary_p_"
    figure_dir = base_dir / "figure"
    ensure_dir(figure_dir)

    for g, K in itertools.product(genres, K_LIST):
        # Baseline always comes from the ORIGINAL file.
        baseline_path = original_dir / f"ORIGINAL_{K}recommendation.csv"
        try:
            baseline = count_genre_per_user(baseline_path, g)
        except Exception as e:
            print(f"[WARN] Skipping {g} K={K} (original load failed): {e}")
            continue

        # Load every run; the first failure abandons this (genre, K).
        per_run = {}
        for r in RUNS:
            run_path = base_dir / f"{file_prefix}{g}_{r}_{K}recommendation.csv"
            try:
                per_run[str(r)] = count_genre_per_user(run_path, g)
            except Exception as e:
                print(f"[WARN] Missing/invalid file for {kind} {g} run={r} K={K}: {e}")
                break
        else:
            # Reached only when all runs loaded.
            _log_and_plot(kind, base_dir, figure_dir, g, K, baseline, per_run)

def main():
    """Process the primary folder, then the enhanced folder.

    Both runs read their ORIGINAL baseline from PRIMARY_DIR and share one
    copy of FIXED_GENRES.
    """
    genres = list(FIXED_GENRES)
    for kind, folder in (("primary", PRIMARY_DIR), ("enhanced", ENHANCED_DIR)):
        build_for_folder(kind, folder, PRIMARY_DIR, genres)


# Run only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()


[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/primary_analysis/figure/Adult_K15_primary.png
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/primary_analysis/figure/Adult_K25_primary.png
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/primary_analysis/figure/Adult_K35_primary.png
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/primary_analysis/figure/Adventure_K15_primary.png
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/primary_analysis/figure/Adventure_K25_primary.png
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/primary_analysis/figure/Adventure_K35_primary.png
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0922/primary_analysis/figure/Children_s_K15_primary.png
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/09