In [1]:
#!/usr/bin/env python3
# build_heavy_bias_pos5_neg0.py
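# For each primary genre G, every synthetic user gets:
#   - positives: all books whose primary genre is G, rated POS_RATING (currently 7)
#   - negatives: the other books, rated NEG_RATING (currently 0); either ALL of them or a sample
#
# IMPORTANT: Train Surprise with a Reader whose rating_scale covers both
# NEG_RATING and POS_RATING, i.e. Reader(rating_scale=(0, 7)) with the current settings.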

import os
import re
import math
import random
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# Filesystem layout: where the ratings table is read from and where every
# generated dataset and summary file is written.
BASE_DIR    = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
INPUT_CSV   = BASE_DIR.joinpath("data", "df_final_with_genres.csv")  # must provide: user_id, book_id, rating, genres
OUT_DIR     = BASE_DIR.joinpath(
    "result", "rec", "top_re", "0909", "data",
    "improved_synthetic_heavy_pos7_neg0",
)
SUMMARY_TXT, SUMMARY_CSV = OUT_DIR / "summary.txt", OUT_DIR / "summary.csv"

# Column names expected in INPUT_CSV and reused for the synthetic rows.
USER_COL, BOOK_COL = "user_id", "book_id"
RATING_COL, GENRE_COL = "rating", "genres"

# One dataset is produced per genre for each of these synthetic-user counts.
RUNS = [25, 50, 100, 200]

# Ratings given by synthetic users to target-genre books / to the other books.
POS_RATING, NEG_RATING = 7, 0

# ---- NEGATIVE assignment mode ----
# "all"    → every non-target book is rated NEG_RATING (WARNING: HUGE FILES)
# "sample" → only a sampled subset of non-target books is rated, which keeps
#            the datasets manageable
ZERO_MODE   = "sample"
# Used only when ZERO_MODE="sample": each user gets about
# NEG_RATIO * (#positives) negatives, capped by the number of non-target books.
NEG_RATIO   = 4
# Seed that makes the negative sampling deterministic.
RNG_SEED    = 42
# ================================

def sanitize_fn(s: str) -> str:
    """Turn an arbitrary label into a token usable inside a file name.

    Surrounding whitespace is trimmed, spaces become underscores, and each
    run of characters outside ``[0-9A-Za-z_]`` collapses into one underscore.
    Falsy input (``None`` or ``""``), or input that cleans down to an empty
    string, yields ``"UNK"``.
    """
    trimmed = s.strip() if s else ""
    cleaned = re.sub(r"[^0-9A-Za-z_]+", "_", trimmed.replace(" ", "_"))
    if not cleaned:
        return "UNK"
    return cleaned

def primary_genre(cell: str) -> str:
    """Return the first comma-separated entry of a genres cell.

    The entry is stripped of surrounding whitespace. Anything that is not a
    string, or a string that is empty or whitespace-only, yields ``""``.
    """
    if isinstance(cell, str) and cell.strip():
        first, _, _ = cell.partition(",")
        return first.strip()
    return ""

def _append_log(text):
    """Append ``text`` to the human-readable summary file (SUMMARY_TXT)."""
    with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
        log.write(text)


def _synthetic_rows(user_ids, books, rating, book_to_genres):
    """Build a DataFrame with one ``rating`` row per (user, book) pair.

    Rows are grouped by user in ``user_ids`` order; within each user the
    books appear in the order given by ``books``. The genres column is filled
    from ``book_to_genres`` (``""`` for books missing from the mapping).
    """
    n_users = len(user_ids)
    n_books = len(books)
    # The genre string depends only on the book, so resolve it once per book
    # and repeat the resulting list for every user.
    genres = [book_to_genres.get(b, "") for b in books]
    return pd.DataFrame({
        USER_COL:   [uid for uid in user_ids for _ in range(n_books)],
        BOOK_COL:   list(books) * n_users,
        RATING_COL: [rating] * (n_users * n_books),
        GENRE_COL:  genres * n_users,
    })


def main():
    """Generate the biased datasets and their summaries.

    Reads INPUT_CSV, then for every primary genre and every user count in
    RUNS appends a block of synthetic users to a copy of the original ratings
    and writes it to OUT_DIR. Each synthetic user rates all books of the
    target genre POS_RATING and a set of other books NEG_RATING (all of them
    when ZERO_MODE is "all", otherwise a seeded sample shared by the users of
    that genre/run). A text log (SUMMARY_TXT) and a per-run table
    (SUMMARY_CSV) are written alongside.

    Raises:
        ValueError: if ZERO_MODE is neither "all" nor "sample", or if the
            input is missing one of the required columns.
    """
    # Fail fast on a mistyped mode: it would otherwise silently fall back to
    # sampling while labelling the output files with the bad value.
    if ZERO_MODE not in ("all", "sample"):
        raise ValueError(f"ZERO_MODE must be 'all' or 'sample', got {ZERO_MODE!r}")

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    random.seed(RNG_SEED)

    # ---------- Load ----------
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Input must contain columns {required}. Missing: {missing}")

    # hygiene
    df[USER_COL]   = pd.to_numeric(df[USER_COL], errors="raise", downcast="integer")
    df[BOOK_COL]   = pd.to_numeric(df[BOOK_COL], errors="raise")
    df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors="raise")
    df[GENRE_COL]  = df[GENRE_COL].fillna("").astype(str)

    # baseline stats
    baseline_users = df[USER_COL].nunique()
    baseline_rows  = len(df)
    # Synthetic ids start right above the largest existing user id.
    base_start_uid = int(df[USER_COL].max()) + 1

    # Distinct (book, genres) pairs, shared by the lookup and the grouping.
    book_genres = df[[BOOK_COL, GENRE_COL]].drop_duplicates()
    book_to_genres = dict(book_genres.values)

    # primary genre per book; books without one are left out entirely
    book_gen = book_genres.assign(_primary=lambda x: x[GENRE_COL].apply(primary_genre))
    book_gen = book_gen[book_gen["_primary"] != ""].copy()

    # all unique books and the sorted positive book list of every genre
    all_books = sorted(book_gen[BOOK_COL].astype(int).unique().tolist())
    pos_books_by_genre = {
        genre: sorted(pd.Series(books.unique()).astype(int).tolist())
        for genre, books in book_gen.groupby("_primary")[BOOK_COL]
    }

    target_genres = sorted(pos_books_by_genre, key=lambda x: x.lower())

    # Each (genre, run) pair gets its own id range of this size.
    BLOCK = 1_000_000

    # logging: start the summary file afresh with the baseline figures
    rows_summary = []
    with open(SUMMARY_TXT, "w", encoding="utf-8") as log:
        log.write("=== BASELINE ===\n")
        log.write(f"👤 Unique users: {baseline_users:,}\n")
        log.write(f"🧾 Rows: {baseline_rows:,}\n")
        log.write(f"🔢 Synthetic user_id base start: {base_start_uid}\n")
        log.write(f"ZERO_MODE={ZERO_MODE} | NEG_RATIO={NEG_RATIO} | RNG_SEED={RNG_SEED}\n")
        log.write("="*80 + "\n\n")

    grand_added = 0

    for gi, g in enumerate(target_genres):
        pos_books = pos_books_by_genre[g]
        n_pos     = len(pos_books)
        if n_pos == 0:
            continue

        pos_set = set(pos_books)
        neg_pool = [b for b in all_books if b not in pos_set]

        safe_g = sanitize_fn(g)
        _append_log(f"🎭 {g} | positives (primary-genre books) = {n_pos} | neg_pool = {len(neg_pool)}\n")

        for r_i, run in enumerate(RUNS):
            start_uid = base_start_uid + gi * (len(RUNS) * BLOCK) + r_i * BLOCK
            new_uids = list(range(start_uid, start_uid + run))

            # ----- choose negatives (either ALL or sampled) -----
            if ZERO_MODE == "all":
                neg_books = neg_pool  # WARNING: huge
            else:
                # One sample per (genre, run), shared by all of its new users;
                # the per-pair seed keeps it reproducible across executions.
                target_neg = min(len(neg_pool), NEG_RATIO * n_pos)
                rng = random.Random(RNG_SEED + gi*1000 + r_i)
                neg_books = rng.sample(neg_pool, target_neg) if target_neg > 0 else []

            n_neg = len(neg_books)

            # ----- build synthetic block: positives first, then negatives -----
            synth_df = pd.concat(
                [
                    _synthetic_rows(new_uids, pos_books, POS_RATING, book_to_genres),
                    _synthetic_rows(new_uids, neg_books, NEG_RATING, book_to_genres),
                ],
                ignore_index=True,
            )
            expected_added = run * (n_pos + n_neg)

            # quick check
            assert len(synth_df) == expected_added, f"Row count mismatch for {g}, run={run}"

            # combine and save
            combined = pd.concat([df, synth_df], ignore_index=True)
            new_users_total = combined[USER_COL].nunique()

            out_path = OUT_DIR / f"f_{safe_g}_{run}_pos7_neg0_{ZERO_MODE}.csv"
            combined.to_csv(out_path, index=False)

            _append_log(f"  run={str(run):>5} → +rows={expected_added:>12,} "
                        f"(pos={run*n_pos:,}, neg={run*n_neg:,}) | "
                        f"new_rows={len(combined):,} | new_users={new_users_total:,}\n")

            rows_summary.append({
                "genre": g,
                "safe_genre": safe_g,
                "run_users": run,
                "n_pos_books": n_pos,
                "n_neg_books_per_user": n_neg,
                "rows_added": expected_added,
                "rows_pos": run*n_pos,
                "rows_neg": run*n_neg,
                "zero_mode": ZERO_MODE,
                "neg_ratio": NEG_RATIO if ZERO_MODE=="sample" else None,
                "output_csv": str(out_path)
            })

            grand_added += expected_added

        _append_log("\n")

    if rows_summary:
        pd.DataFrame(rows_summary).to_csv(SUMMARY_CSV, index=False)

    with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
        log.write("="*80 + "\n")
        log.write(f"Grand total injected rows (all genres & runs): {grand_added:,}\n")
        log.write(f"Outputs folder: {OUT_DIR}\n")
        log.write(f"Per-run summary CSV: {SUMMARY_CSV}\n")

    # rows_summary gains one entry per dataset written, so it doubles as the
    # "did we produce anything" flag.
    if not rows_summary:
        print("⚠️ No datasets were produced. Check genre names / columns.")
    else:
        print("\n✅ Done.")
        print("  • Datasets:", OUT_DIR)
        print("  • Summary:", SUMMARY_TXT)
        print("  • Summary CSV:", SUMMARY_CSV)

# Run the generator only when this file is executed directly, so importing
# the module does not trigger any file I/O.
if __name__ == "__main__":
    main()



✅ Done.
  • Datasets: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0
  • Summary: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/summary.txt
  • Summary CSV: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/summary.csv
