# In [1]:  (notebook cell marker, kept as a comment so the file stays valid Python)
#!/usr/bin/env python3
# build_heavy_bias.py
# Heavily biased injection: for each genre G1 (primary),
# create synthetic users who rate *all* books whose primary genre == G1.
# No caps, no fancy sampling.

import os
import re
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# All paths below are absolute and derived from BASE_DIR; change BASE_DIR to
# run the script on another machine.
BASE_DIR   = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
INPUT_CSV  = BASE_DIR / "data/df_final_with_genres.csv"   # must have: user_id, book_id, rating, genres
OUT_DIR    = BASE_DIR / "result/rec/top_re/0928/data/improved_synthetic_heavy"   # output datasets (one CSV per genre/run pair)
SUMMARY_TXT= BASE_DIR / "result/rec/top_re/0928/data/improved_synthetic_heavy_summary.txt"   # plain-text log written by main()
SUMMARY_CSV= BASE_DIR / "result/rec/top_re/0928/data/improved_synthetic_heavy_summary.csv"   # one row per generated dataset

# Names of the columns main() requires in INPUT_CSV.
GENRE_COL  = "genres"    # comma-separated genres; the first token is the primary genre
USER_COL   = "user_id"
BOOK_COL   = "book_id"
RATING_COL = "rating"

RUNS = [25, 400, 1000, 5000, 10000]     # number of synthetic users to inject per genre (one dataset per value)
SYNTH_RATING = 5               # rating every synthetic user assigns to every book it rates
# =========================

def sanitize_fn(s: str) -> str:
    """Turn a free-form label into a token that is safe inside a file name.

    Surrounding whitespace is trimmed, spaces become underscores, and every
    run of characters outside ``[0-9A-Za-z_]`` collapses to one underscore.
    A falsy input (``None`` or ``""``) or an input that trims to nothing
    yields ``"UNK"``.
    """
    raw = s if s else ""
    underscored = raw.strip().replace(" ", "_")
    cleaned = re.sub(r"[^0-9A-Za-z_]+", "_", underscored)
    if cleaned:
        return cleaned
    return "UNK"

def primary_genre(cell: str) -> str:
    """Return the first comma-separated genre of ``cell``, trimmed.

    Anything that is not a string, and any blank string, maps to ``""``.
    """
    if isinstance(cell, str) and cell.strip():
        # Only the text before the first comma counts as the primary genre.
        first_token, _, _ = cell.partition(",")
        return first_token.strip()
    return ""

def main():
    """Generate one genre-biased ratings dataset per (primary genre, run size).

    Reads ``INPUT_CSV`` and, for every primary genre (first comma-separated
    token of ``GENRE_COL``) and every value in ``RUNS``, writes
    ``OUT_DIR/enhanced_<genre>_<run>.csv``. Each file holds all original rows
    plus ``run`` synthetic users who each rate every book of that primary
    genre with ``SYNTH_RATING``. A text log is written to ``SUMMARY_TXT`` and
    a per-dataset table to ``SUMMARY_CSV``.

    Raises:
        ValueError: if ``INPUT_CSV`` lacks one of the required columns.
            ``pd.to_numeric(..., errors="raise")`` additionally propagates its
            own error when an id or rating cannot be parsed as a number.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # ---------- Load ----------
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Input must contain columns {required}. Missing: {missing}")

    # numeric & basic hygiene
    df[USER_COL]   = pd.to_numeric(df[USER_COL], errors="raise", downcast="integer")
    df[BOOK_COL]   = pd.to_numeric(df[BOOK_COL], errors="raise")
    df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors="raise")
    df[GENRE_COL]  = df[GENRE_COL].fillna("").astype(str)

    # baseline stats; synthetic ids start right after the largest real user id
    baseline_users = df[USER_COL].nunique()
    baseline_rows  = len(df)
    base_start_uid = int(df[USER_COL].max()) + 1

    # book -> full genres string, so synthetic rows carry the original text
    book_to_genres = dict(df[[BOOK_COL, GENRE_COL]].drop_duplicates().values)

    # primary genre per row, reduced to unique (book, primary genre) pairs
    work = df[[BOOK_COL, GENRE_COL]].copy()
    work["_primary"] = work[GENRE_COL].apply(primary_genre)
    work = work[work["_primary"] != ""].drop_duplicates(subset=[BOOK_COL, "_primary"])

    # primary genre -> sorted list of unique book ids (as Python ints)
    books_by_genre = {
        genre: sorted(pd.Series(ids.unique()).astype(int).tolist())
        for genre, ids in work.groupby("_primary")[BOOK_COL]
    }

    # Stable, case-insensitive genre order. The enumerate() index below feeds
    # the user-id offsets, so this order must stay deterministic.
    target_genres = sorted(books_by_genre, key=str.lower)

    # ID block to avoid collisions: allocate a huge block for each (genre, run)
    BLOCK = 1_000_000

    rows_summary = []
    with open(SUMMARY_TXT, "w", encoding="utf-8") as log:
        log.write("=== BASELINE ===\n")
        log.write(f"👤 Unique users: {baseline_users:,}\n")
        log.write(f"🧾 Rows: {baseline_rows:,}\n")
        log.write(f"🔢 Synthetic user_id base start: {base_start_uid}\n")
        log.write("=" * 80 + "\n\n")

    grand_added = 0

    for gi, g in enumerate(target_genres):
        book_list = books_by_genre[g]
        n_books = len(book_list)
        if not book_list:
            continue

        safe_g = sanitize_fn(g)
        with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
            log.write(f"🎭 {g} | primary-genre unique books = {n_books}\n")

        # Loop-invariant for this genre: the genres string of each book, in
        # book_list order. Computed once instead of once per synthetic user.
        genre_strings = [book_to_genres.get(b, "") for b in book_list]

        for r_i, run in enumerate(RUNS):
            # disjoint user id space for (genre, run)
            start_uid = base_start_uid + gi * (len(RUNS) * BLOCK) + r_i * BLOCK
            new_uids = range(start_uid, start_uid + run)
            expected = run * n_books

            # Each new user rates *all* primary-genre books. Rows are grouped
            # by user, and within a user they follow book_list order.
            synth_df = pd.DataFrame(
                {
                    USER_COL:   [uid for uid in new_uids for _ in range(n_books)],
                    BOOK_COL:   book_list * run,
                    RATING_COL: [SYNTH_RATING] * expected,
                    GENRE_COL:  genre_strings * run,
                },
                columns=[USER_COL, BOOK_COL, RATING_COL, GENRE_COL],
            )

            # sanity check before paying for the (large) concatenation
            assert len(synth_df) == expected, f"Row count mismatch for {g}, run={run}"

            combined = pd.concat([df, synth_df], ignore_index=True)
            new_users_total = combined[USER_COL].nunique()

            out_path = OUT_DIR / f"enhanced_{safe_g}_{run}.csv"
            combined.to_csv(out_path, index=False)

            # log lines
            with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
                log.write(f"  run={run:>4} → records_added={expected:>9,} | "
                          f"new_rows={len(combined):,} | new_users={new_users_total:,}\n")

            rows_summary.append({
                "genre": g,
                "safe_genre": safe_g,
                "primary_unique_books": n_books,
                "run_users": run,
                "records_added": expected,
                "new_total_rows": len(combined),
                "new_total_users": new_users_total,
                "output_csv": str(out_path)
            })

            grand_added += expected

        with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
            log.write("\n")

    # write summary CSV (skipped when nothing was generated)
    if rows_summary:
        pd.DataFrame(rows_summary).to_csv(SUMMARY_CSV, index=False)

    with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
        log.write("================================================================================\n")
        log.write(f"Grand total injected rows (across all genres & runs): {grand_added:,}\n")
        log.write(f"Outputs folder: {OUT_DIR}\n")
        log.write(f"Per-run summary CSV: {SUMMARY_CSV}\n")

    # rows_summary gains one entry per dataset written, so it doubles as the
    # "did we produce anything" flag.
    if not rows_summary:
        print("⚠️  No datasets were produced. Check genre names and input columns.")
    else:
        print("\n✅ Done.")
        print("  • Datasets:", OUT_DIR)
        print("  • Summary (txt):", SUMMARY_TXT)
        print("  • Summary (csv):", SUMMARY_CSV)

# Run the generation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



# Captured output of the cell above:
# ✅ Done.
#   • Datasets: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0928/data/improved_synthetic_heavy
#   • Summary (txt): /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0928/data/improved_synthetic_heavy_summary.txt
#   • Summary (csv): /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0928/data/improved_synthetic_heavy_summary.csv
