# In [None]:
#!/usr/bin/env python3
# build_heavy_bias_allrated.py
# For each genre G1 (primary),
# create synthetic users who rate *all* books: 5 if primary == G1, else 0.

import os
import re
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# Column names the input ratings CSV must provide.
USER_COL = "user_id"
BOOK_COL = "book_id"
RATING_COL = "rating"
GENRE_COL = "genres"

# Ratings handed out by every synthetic user: TARGET_RATING for books whose
# primary genre is the genre being boosted, OTHER_RATING for all other books.
TARGET_RATING = 5
OTHER_RATING = 0

# number of synthetic users to inject per genre (one output dataset per entry)
RUNS = [25, 400]

# Filesystem layout: everything is resolved relative to BASE_DIR.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
# must have: user_id, book_id, rating, genres
INPUT_CSV = BASE_DIR / "data/df_final_with_genres.csv"
# One combined (original + synthetic) CSV per genre/run is written here.
OUT_DIR = BASE_DIR / "result/rec/top_re/0928/data/improved_synthetic_heavy_allrated"
# Human-readable log and machine-readable per-run summary.
SUMMARY_TXT = BASE_DIR / "result/rec/top_re/0928/data/improved_synthetic_heavy_allrated_summary.txt"
SUMMARY_CSV = BASE_DIR / "result/rec/top_re/0928/data/improved_synthetic_heavy_allrated_summary.csv"
# =========================

def sanitize_fn(s: str) -> str:
    """Turn a genre label into a token that is safe to embed in a filename.

    Surrounding whitespace is trimmed, each space becomes an underscore, and
    every run of characters outside ``[0-9A-Za-z_]`` collapses to a single
    underscore. A falsy input (``None`` or ``""``), or one that is empty after
    cleaning, yields ``"UNK"``.
    """
    trimmed = s.strip() if s else ""
    underscored = "_".join(trimmed.split(" "))
    cleaned = re.sub(r"[^0-9A-Za-z_]+", "_", underscored)
    if cleaned:
        return cleaned
    return "UNK"

def primary_genre(cell: str) -> str:
    """Return the first comma-separated genre of *cell*, stripped of whitespace.

    Anything that is not a string, or a string that is blank, gives ``""``.
    The result is also ``""`` when the text before the first comma is blank.
    """
    if isinstance(cell, str) and cell.strip():
        first, _, _ = cell.partition(",")
        return first.strip()
    return ""

def _append_log(text: str) -> None:
    """Append *text* to SUMMARY_TXT, closing the file again right away."""
    with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
        log.write(text)


def _load_ratings() -> pd.DataFrame:
    """Read INPUT_CSV, check the required columns and coerce column dtypes.

    Returns:
        The ratings frame with numeric user/book/rating columns and a string
        genre column in which missing genres have been replaced by "".

    Raises:
        ValueError: if a required column is missing, a numeric column holds a
            non-numeric value, or no row carries a usable user id.
    """
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Input must contain columns {required}. Missing: {missing}")

    # numeric hygiene
    df[USER_COL] = pd.to_numeric(df[USER_COL], errors="raise", downcast="integer")
    df[BOOK_COL] = pd.to_numeric(df[BOOK_COL], errors="raise")
    df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors="raise")
    df[GENRE_COL] = df[GENRE_COL].fillna("").astype(str)

    # Synthetic ids are allocated above the largest real id, so at least one
    # real id is needed; fail with a clear message instead of int(NaN).
    if not df[USER_COL].notna().any():
        raise ValueError(f"Input has no usable '{USER_COL}' values: {INPUT_CSV}")
    return df


def main():
    """Build one biased dataset per (primary genre, run size) and log a summary.

    For every primary genre found among the books and every entry of RUNS,
    synthetic users are created who rate *all* books: TARGET_RATING when the
    book's primary genre is the current genre, OTHER_RATING otherwise. The
    synthetic rows are appended to the original ratings and written to
    ``OUT_DIR / "allrated_<genre>_<run>.csv"``. A text log (SUMMARY_TXT) and a
    per-run table (SUMMARY_CSV) are written alongside.

    Raises:
        ValueError: if the input CSV is missing required columns, contains
            non-numeric ids/ratings, or has no usable user ids.
        RuntimeError: if a generated synthetic block has an unexpected size.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    # The summary files are configured independently of OUT_DIR, so make sure
    # their folders exist as well.
    SUMMARY_TXT.parent.mkdir(parents=True, exist_ok=True)
    SUMMARY_CSV.parent.mkdir(parents=True, exist_ok=True)

    # ---------- Load ----------
    df = _load_ratings()

    baseline_users = df[USER_COL].nunique()
    baseline_rows = len(df)
    base_start_uid = int(df[USER_COL].max()) + 1

    # One row per book, taken from its first occurrence, sorted by book id.
    # The full genre string and the primary genre are both derived from this
    # same row so they can never disagree for a book.
    books = (
        df[[BOOK_COL, GENRE_COL]]
        .drop_duplicates(subset=[BOOK_COL])
        .sort_values(BOOK_COL)
    )
    all_books = books[BOOK_COL].tolist()
    book_genres = books[GENRE_COL].tolist()
    book_primaries = [primary_genre(genres) for genres in book_genres]
    n_all_books = len(all_books)

    # Only genres that are the primary genre of at least one book are targets.
    # The secondary sort key makes the order (and therefore the synthetic
    # user-id blocks) deterministic when names differ only by case.
    target_genres = sorted(
        {p for p in book_primaries if p},
        key=lambda x: (x.lower(), x),
    )

    BLOCK = 1_000_000  # user-id range reserved for each (genre, run) pair
    rows_summary = []

    with open(SUMMARY_TXT, "w", encoding="utf-8") as log:
        log.write("=== BASELINE ===\n")
        log.write(f"👤 Unique users: {baseline_users:,}\n")
        log.write(f"🧾 Rows: {baseline_rows:,}\n")
        log.write(f"🔢 Synthetic user_id base start: {base_start_uid}\n")
        log.write("=" * 80 + "\n\n")

    grand_added = 0
    made_any = False

    for gi, g in enumerate(target_genres):
        safe_g = sanitize_fn(g)
        _append_log(f"🎭 {g} | total books rated by each synthetic user = {n_all_books}\n")

        # Every synthetic user of this genre gives the same ratings, so the
        # per-book rating vector is computed once and tiled per run below.
        ratings_one_user = [
            TARGET_RATING if p == g else OTHER_RATING for p in book_primaries
        ]

        for r_i, run in enumerate(RUNS):
            start_uid = base_start_uid + gi * (len(RUNS) * BLOCK) + r_i * BLOCK
            expected = run * n_all_books

            # Row order: all books for the first user, then the second, ...
            user_ids = (
                pd.Series(range(start_uid, start_uid + run), dtype="int64")
                .repeat(n_all_books)
                .to_numpy()
            )
            synth_df = pd.DataFrame(
                {
                    USER_COL: user_ids,
                    BOOK_COL: all_books * run,
                    RATING_COL: ratings_one_user * run,
                    GENRE_COL: book_genres * run,
                },
                columns=[USER_COL, BOOK_COL, RATING_COL, GENRE_COL],
            )

            # Explicit check (not `assert`) so it also runs under `python -O`,
            # and before the expensive concat/write.
            if len(synth_df) != expected:
                raise RuntimeError(f"Row count mismatch for {g}, run={run}")

            combined = pd.concat([df, synth_df], ignore_index=True)
            total_rows = len(combined)
            total_users = combined[USER_COL].nunique()

            out_path = OUT_DIR / f"allrated_{safe_g}_{run}.csv"
            combined.to_csv(out_path, index=False)

            _append_log(
                f"  run={run:>4} → records_added={expected:>9,} | "
                f"new_rows={total_rows:,} | new_users={total_users:,}\n"
            )

            rows_summary.append({
                "genre": g,
                "safe_genre": safe_g,
                "all_books": n_all_books,
                "run_users": run,
                "records_added": expected,
                "new_total_rows": total_rows,
                "new_total_users": total_users,
                "output_csv": str(out_path)
            })

            grand_added += expected
            made_any = True

        _append_log("\n")

    if rows_summary:
        pd.DataFrame(rows_summary).to_csv(SUMMARY_CSV, index=False)

    with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
        log.write("================================================================================\n")
        log.write(f"Grand total injected rows: {grand_added:,}\n")
        log.write(f"Outputs folder: {OUT_DIR}\n")
        log.write(f"Per-run summary CSV: {SUMMARY_CSV}\n")

    print("\n✅ Done." if made_any else "⚠️ No datasets were produced.")

# Run the dataset generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
