In [3]:

# build_avg_pool_injection.py
# For each primary genre G:
#   - Compute per-book average rating (w/ fallbacks)
#   - For each size N in RUNS, add N synthetic users who each rate every book
#     whose primary genre is G at that book's average rating
#
# NOTE: No negative (0) ratings are added in this scheme.

import os
import re
import math
import random
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# Root directory that the input and output paths below are resolved against.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
BASE_DIR    = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
INPUT_CSV   = BASE_DIR / "data/df_final_with_genres.csv"   # requires: user_id, book_id, rating, genres
OUT_DIR     = BASE_DIR / "result/rec/top_re/0904/data/avg_pool_injection"
SUMMARY_TXT = OUT_DIR / "summary.txt"         # human-readable run log
SUMMARY_CSV = OUT_DIR / "books_summary.csv"   # per-book average used + number of ratings

# Column names expected in INPUT_CSV.
GENRE_COL   = "genres"
USER_COL    = "user_id"
BOOK_COL    = "book_id"
RATING_COL  = "rating"

RUNS = [25, 50, 100 , 200]  # synthetic-user counts; one output dataset per (genre, count)

# Stride between the synthetic user_id ranges assigned to each (genre, run) pair.
BLOCK      = 1_000_000
# Seed passed to random.seed() in main() and recorded in the summary log.
RNG_SEED   = 42
# ==========================

def sanitize_fn(s: str) -> str:
    """Turn an arbitrary label into a filename-safe token.

    Surrounding whitespace is trimmed, spaces become underscores, and every
    run of characters outside ``[0-9A-Za-z_]`` is collapsed into a single
    underscore. A falsy input (``None`` or ``""``), or one that is empty after
    trimming, yields ``"UNK"``.
    """
    text = s if s else ""
    underscored = text.strip().replace(" ", "_")
    cleaned = re.sub(r"[^0-9A-Za-z_]+", "_", underscored)
    if not cleaned:
        return "UNK"
    return cleaned

def primary_genre(cell: str) -> str:
    """Extract the primary genre: the text before the first comma, trimmed.

    Non-string values and blank strings produce ``""``. A string whose first
    comma-separated token is blank also produces ``""``.
    """
    if isinstance(cell, str) and cell.strip():
        head, _sep, _rest = cell.partition(",")
        return head.strip()
    return ""

def main() -> None:
    """Build one "average-pool injected" ratings dataset per (primary genre, run size).

    Steps:
      1. Load INPUT_CSV and coerce the id/rating columns to numeric.
      2. Assign every book a single primary genre (first token of its genres
         string); books without one are left out of the genre pools.
      3. Compute each book's mean rating, falling back to the mean of its
         primary genre and then to the global mean when the book mean is NaN.
      4. Write the per-book summary to SUMMARY_CSV.
      5. For every primary genre and every N in RUNS, create N synthetic users
         who each rate every book of that genre at the book's average, and
         write original + synthetic rows to
         ``OUT_DIR / enhanced_<genre>_<N>_avgonly.csv``.

    A running log and the grand total of injected rows go to SUMMARY_TXT.

    Raises:
        ValueError: if INPUT_CSV lacks a required column, or a value in the
            user/book/rating columns cannot be parsed as a number.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    # Nothing below draws random numbers; the seed is still applied so the
    # RNG state matches the RNG_SEED value reported in the log.
    random.seed(RNG_SEED)

    def append_log(text: str) -> None:
        # Re-open in append mode for each write so the log on disk is current
        # even if a later (long) dataset write fails.
        with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
            log.write(text)

    # ---------- Load ----------
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Input must contain columns {required}. Missing: {missing}")

    # hygiene
    df[USER_COL] = pd.to_numeric(df[USER_COL], errors="raise", downcast="integer")
    df[BOOK_COL] = pd.to_numeric(df[BOOK_COL], errors="raise")
    df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors="raise")
    df[GENRE_COL] = df[GENRE_COL].fillna("").astype(str)

    # baseline stats
    baseline_users = df[USER_COL].nunique()
    baseline_rows = len(df)
    # max() is NaN for an empty (or all-NaN) column; start at 0 in that case
    # instead of crashing on int(NaN).
    max_uid = df[USER_COL].max()
    base_start_uid = int(max_uid) + 1 if pd.notna(max_uid) else 0

    # ---------- Primary genre per book ----------
    book_gen = (
        df[[BOOK_COL, GENRE_COL]]
        .drop_duplicates()
        .assign(primary=lambda x: x[GENRE_COL].apply(primary_genre))
    )
    # Keep exactly one row per book. Without this, a book that appears with
    # more than one genres string would duplicate rating rows in the merge
    # below (skewing genre means), duplicate rows in the summary CSV, and
    # could land in several genre pools.
    book_gen = (
        book_gen[book_gen["primary"] != ""]
        .drop_duplicates(subset=BOOK_COL, keep="first")
        .copy()
    )
    book_primary = book_gen[[BOOK_COL, "primary"]]

    # ---------- Per-book base stats ----------
    # Genre-level fallback: mean rating over all rows whose book has that
    # primary genre (rows of books without a primary genre are excluded).
    genre_means = (
        df.merge(book_primary, on=BOOK_COL, how="left")
          .dropna(subset=["primary"])
          .groupby("primary")[RATING_COL]
          .mean()
    )

    # Last-resort fallback; 3.0 when there are no usable ratings at all.
    overall_mean = df[RATING_COL].mean()
    global_mean = float(overall_mean) if pd.notna(overall_mean) else 3.0

    # Raw book means & counts (across ALL raters), with primary genre attached.
    book_stats = (
        df.groupby(BOOK_COL)[RATING_COL]
          .agg(avg_rating_book="mean", n_ratings_book="count")
          .reset_index()
          .merge(book_primary, on=BOOK_COL, how="left")
    )

    # "Used" average: book mean -> primary-genre mean -> global mean.
    book_stats["avg_rating_used"] = (
        book_stats["avg_rating_book"]
        .fillna(book_stats["primary"].map(genre_means))
        .fillna(global_mean)
    )

    # Per-book summary for every rated book (including books with no primary genre).
    books_summary = book_stats[
        [BOOK_COL, "primary", "avg_rating_used", "n_ratings_book"]
    ].rename(columns={
        BOOK_COL: "book_id",
        "primary": "primary_genre",
        "avg_rating_used": "avg_rating_book_used",
        "n_ratings_book": "n_users_rated",
    })
    books_summary.to_csv(SUMMARY_CSV, index=False)

    # Quick lookups for fast row build.
    # The original genres string is reused so synthetic rows match real ones.
    book_to_genres = dict(zip(book_gen[BOOK_COL], book_gen[GENRE_COL]))
    avg_used_map = dict(zip(book_stats[BOOK_COL], book_stats["avg_rating_used"]))

    # Sorted book ids per primary genre, and genres in case-insensitive order.
    genre_books = {
        genre: sorted(ids.astype(int).tolist())
        for genre, ids in book_gen.groupby("primary")[BOOK_COL]
    }
    target_genres = sorted(genre_books, key=str.lower)

    # ---------- Logging ----------
    with open(SUMMARY_TXT, "w", encoding="utf-8") as log:
        log.write("=== BASELINE ===\n")
        log.write(f"👤 Unique users: {baseline_users:,}\n")
        log.write(f"🧾 Rows: {baseline_rows:,}\n")
        log.write(f"🔢 Synthetic user_id base start: {base_start_uid}\n")
        log.write(f"RNG_SEED={RNG_SEED}\n")
        log.write("="*80 + "\n\n")
        log.write(f"📄 Per-book summary CSV: {SUMMARY_CSV}\n\n")

    grand_added = 0
    made_any = False

    # ---------- Build per-genre injected datasets ----------
    for gi, g in enumerate(target_genres):
        pos_books = genre_books[g]
        n_pos = len(pos_books)

        # Identical for every synthetic user and every run size of this genre,
        # so build them once here instead of once per user.
        pos_ratings = [avg_used_map.get(b, global_mean) for b in pos_books]
        pos_genres = [book_to_genres.get(b, "") for b in pos_books]

        # NOTE(review): two genre names that sanitise to the same token would
        # write to the same output path — confirm genre names are distinct
        # after sanitising.
        safe_g = sanitize_fn(g)
        append_log(f"🎭 {g} | genre_books = {n_pos}\n")

        for r_i, run in enumerate(RUNS):
            # Each (genre, run) pair gets its own BLOCK-wide user_id range
            # starting above the largest real user_id.
            start_uid = base_start_uid + gi * (len(RUNS) * BLOCK) + r_i * BLOCK

            # Only book-average ratings are injected (no negatives). Rows are
            # laid out user by user, each user covering pos_books in order.
            synth_df = pd.DataFrame({
                USER_COL: [
                    uid
                    for uid in range(start_uid, start_uid + run)
                    for _ in range(n_pos)
                ],
                BOOK_COL: pos_books * run,
                RATING_COL: pos_ratings * run,
                GENRE_COL: pos_genres * run,
            })
            expected_added = run * n_pos

            # combine and save
            combined = pd.concat([df, synth_df], ignore_index=True)
            new_users_total = combined[USER_COL].nunique()

            out_path = OUT_DIR / f"enhanced_{safe_g}_{run}_avgonly.csv"
            combined.to_csv(out_path, index=False)

            append_log(
                f"  run={str(run):>5} → +rows={expected_added:>9,} "
                f"(avg-only; no negatives) | "
                f"new_rows={len(combined):,} | new_users={new_users_total:,} | "
                f"output={out_path.name}\n"
            )

            grand_added += expected_added
            made_any = True

        append_log("\n")

    append_log(
        "="*80 + "\n"
        + f"Grand total injected rows (all genres & runs): {grand_added:,}\n"
        + f"Outputs folder: {OUT_DIR}\n"
        + f"Per-book summary CSV: {SUMMARY_CSV}\n"
    )

    if not made_any:
        print("⚠️ No datasets were produced. Check genre names / columns.")
    else:
        print("\n✅ Done.")
        print("  • Datasets:", OUT_DIR)
        print("  • Summary (text):", SUMMARY_TXT)
        print("  • Books summary (CSV):", SUMMARY_CSV)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


✅ Done.
  • Datasets: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/data/avg_pool_injection
  • Summary (text): /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/data/avg_pool_injection/summary.txt
  • Books summary (CSV): /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0904/data/avg_pool_injection/books_summary.csv
