In [1]:
#!/usr/bin/env python3
# build_heavy_bias_pos5_neg0.py
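# For each primary genre G, synthetic users are injected such that:
#   - positives: every book whose primary genre is G is rated POS_RATING (7 as configured below)
#   - negatives: the other books are rated NEG_RATING (0) — either ALL of them or a sample (see ZERO_MODE)
#
# IMPORTANT: Train Surprise with a Reader whose rating_scale covers both injected values
# (e.g. rating_scale=(0, 7) for the current POS_RATING / NEG_RATING) — TODO confirm in the training script.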

import os
import re
import math
import random
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# All paths are derived from BASE_DIR.
BASE_DIR    = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
INPUT_CSV   = BASE_DIR / "data/df_final_with_genres.csv"   # requires: user_id, book_id, rating, genres
OUT_DIR     = BASE_DIR / "result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0"
SUMMARY_TXT = OUT_DIR / "summary.txt"   # human-readable log, appended to as the run progresses
SUMMARY_CSV = OUT_DIR / "summary.csv"   # one row per (genre, run) written at the end

# Column names expected in INPUT_CSV and reproduced in every output CSV.
GENRE_COL   = "genres"
USER_COL    = "user_id"
BOOK_COL    = "book_id"
RATING_COL  = "rating"

RUNS = [25, 50, 100, 200]  # number of synthetic users per genre

# Ratings assigned by the synthetic users.
POS_RATING  = 7   # given to books whose primary genre is the target genre
NEG_RATING  = 0   # given to the (all or sampled) remaining books

# ---- NEGATIVE assignment mode ----
# "all"    → rate EVERY non-target book as NEG_RATING (WARNING: HUGE FILES)
# "sample" → sample a subset of non-target books to keep datasets manageable
ZERO_MODE   = "sample"   # change to "all" to literally rate all remaining books NEG_RATING
NEG_RATIO   = 4          # when ZERO_MODE="sample": negatives per user ≈ NEG_RATIO * (#positives)
RNG_SEED    = 42         # deterministic sampling
# ================================

def sanitize_fn(s: str) -> str:
    """Turn a genre label into a token that is safe to embed in a filename.

    Surrounding whitespace is stripped, spaces become underscores, and every
    run of characters outside ``[0-9A-Za-z_]`` collapses to one underscore.
    A falsy input (``None`` or ``""``) or an empty result yields ``"UNK"``.
    """
    text = s.strip() if s else ""
    underscored = text.replace(" ", "_")
    cleaned = re.sub(r"[^0-9A-Za-z_]+", "_", underscored)
    if cleaned:
        return cleaned
    return "UNK"

def primary_genre(cell: str) -> str:
    """Return the first comma-separated entry of a genre cell, stripped.

    Anything that is not a non-blank string (``None``, NaN, numbers,
    whitespace-only text) maps to the empty string.
    """
    if isinstance(cell, str) and cell.strip():
        first, _, _ = cell.partition(",")
        return first.strip()
    return ""

def _synthetic_ratings_frame(uids, books, rating, book_to_genres):
    """Build one row per (uid, book) pair, all carrying the same ``rating``.

    Rows are user-major: every book in ``books`` is listed for the first
    user, then again for the second user, and so on. The genre string of
    each book is looked up in ``book_to_genres`` (empty string if unknown).
    """
    n_users = len(uids)
    n_books = len(books)
    genres_once = [book_to_genres.get(b, "") for b in books]
    return pd.DataFrame({
        USER_COL:   [uid for uid in uids for _ in range(n_books)],
        BOOK_COL:   list(books) * n_users,
        RATING_COL: [rating] * (n_users * n_books),
        GENRE_COL:  genres_once * n_users,
    })


def main():
    """Generate one biased dataset per (primary genre, run size).

    For every primary genre found in INPUT_CSV and every size in RUNS, a
    block of synthetic users is appended to the baseline ratings: each
    synthetic user rates all books of that primary genre POS_RATING and a set
    of other books NEG_RATING (all of them or a seeded sample, per
    ZERO_MODE). Each combined dataset is written to OUT_DIR, and a text log
    plus a CSV summary are written alongside.

    Raises:
        ValueError: if INPUT_CSV lacks one of the required columns.
        RuntimeError: if a synthetic block has an unexpected row count.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    random.seed(RNG_SEED)

    # ---------- Load ----------
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Input must contain columns {required}. Missing: {missing}")

    # hygiene
    df[USER_COL]   = pd.to_numeric(df[USER_COL], errors="raise", downcast="integer")
    df[BOOK_COL]   = pd.to_numeric(df[BOOK_COL], errors="raise")
    df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors="raise")
    df[GENRE_COL]  = df[GENRE_COL].fillna("").astype(str)

    # baseline stats
    baseline_users = df[USER_COL].nunique()
    baseline_rows  = len(df)
    # synthetic ids start above every real id so they can never collide
    base_start_uid = int(df[USER_COL].max()) + 1

    # lookups
    book_to_genres = dict(df[[BOOK_COL, GENRE_COL]].drop_duplicates().values)

    # primary genre per book (dedup by book); books without a genre are dropped
    book_gen = (df[[BOOK_COL, GENRE_COL]].drop_duplicates()
                  .assign(_primary=lambda x: x[GENRE_COL].apply(primary_genre)))
    book_gen = book_gen[book_gen["_primary"] != ""].copy()

    # all unique books and per-genre positive book lists
    all_books = sorted(book_gen[BOOK_COL].astype(int).unique().tolist())
    pos_by_genre = {
        genre: sorted(pd.Series(ids.unique()).astype(int).tolist())
        for genre, ids in book_gen.groupby("_primary")[BOOK_COL]
    }

    # The position of a genre in this list determines its user-id block and
    # its sampling seed, so the ordering must stay stable between runs.
    target_genres = sorted(pos_by_genre, key=lambda x: x.lower())

    # ID block to avoid collisions between (genre, run) combinations
    BLOCK = 1_000_000

    # logging
    rows_summary = []
    with open(SUMMARY_TXT, "w", encoding="utf-8") as log:
        log.write("=== BASELINE ===\n")
        log.write(f"👤 Unique users: {baseline_users:,}\n")
        log.write(f"🧾 Rows: {baseline_rows:,}\n")
        log.write(f"🔢 Synthetic user_id base start: {base_start_uid}\n")
        log.write(f"ZERO_MODE={ZERO_MODE} | NEG_RATIO={NEG_RATIO} | RNG_SEED={RNG_SEED}\n")
        log.write("="*80 + "\n\n")

    grand_added = 0
    made_any = False

    for gi, g in enumerate(target_genres):
        pos_books = pos_by_genre[g]
        n_pos = len(pos_books)
        if n_pos == 0:
            continue

        pos_set = set(pos_books)
        neg_pool = [b for b in all_books if b not in pos_set]

        safe_g = sanitize_fn(g)
        with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
            log.write(f"🎭 {g} | positives (primary-genre books) = {n_pos} | neg_pool = {len(neg_pool)}\n")

        for r_i, run in enumerate(RUNS):
            start_uid = base_start_uid + gi * (len(RUNS) * BLOCK) + r_i * BLOCK
            new_uids = list(range(start_uid, start_uid + run))

            # ----- choose negatives (either ALL or sampled) -----
            if ZERO_MODE == "all":
                neg_books_for_all_users = neg_pool  # WARNING: huge
            else:
                # sample a fixed subset once per (genre, run), same for all new users (fast & reproducible)
                target_neg = min(len(neg_pool), NEG_RATIO * n_pos)
                rng = random.Random(RNG_SEED + gi*1000 + r_i)
                neg_books_for_all_users = rng.sample(neg_pool, target_neg) if target_neg > 0 else []

            n_neg = len(neg_books_for_all_users)

            # ----- build synthetic block -----
            # Positives first (POS_RATING), then negatives (NEG_RATING).
            parts = [_synthetic_ratings_frame(new_uids, pos_books, POS_RATING, book_to_genres)]
            if n_neg > 0:
                # skip the empty frame so it cannot influence column dtypes
                parts.append(
                    _synthetic_ratings_frame(new_uids, neg_books_for_all_users, NEG_RATING, book_to_genres)
                )

            synth_df = pd.concat(parts, ignore_index=True)
            expected_added = run * (n_pos + n_neg)

            # quick check (explicit raise: a bare assert disappears under `python -O`)
            if len(synth_df) != expected_added:
                raise RuntimeError(f"Row count mismatch for {g}, run={run}")

            # combine and save
            combined = pd.concat([df, synth_df], ignore_index=True)
            new_users_total = combined[USER_COL].nunique()

            # file name follows the configured ratings instead of a hard-coded "pos7_neg0"
            out_path = OUT_DIR / f"f_{safe_g}_{run}_pos{POS_RATING}_neg{NEG_RATING}_{ZERO_MODE}.csv"
            combined.to_csv(out_path, index=False)

            with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
                log.write(f"  run={str(run):>5} → +rows={expected_added:>12,} "
                          f"(pos={run*n_pos:,}, neg={run*n_neg:,}) | "
                          f"new_rows={len(combined):,} | new_users={new_users_total:,}\n")

            rows_summary.append({
                "genre": g,
                "safe_genre": safe_g,
                "run_users": run,
                "n_pos_books": n_pos,
                "n_neg_books_per_user": n_neg,
                "rows_added": expected_added,
                "rows_pos": run*n_pos,
                "rows_neg": run*n_neg,
                "zero_mode": ZERO_MODE,
                "neg_ratio": NEG_RATIO if ZERO_MODE=="sample" else None,
                "output_csv": str(out_path)
            })

            grand_added += expected_added
            made_any = True

        with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
            log.write("\n")

    if rows_summary:
        pd.DataFrame(rows_summary).to_csv(SUMMARY_CSV, index=False)

    with open(SUMMARY_TXT, "a", encoding="utf-8") as log:
        log.write("="*80 + "\n")
        log.write(f"Grand total injected rows (all genres & runs): {grand_added:,}\n")
        log.write(f"Outputs folder: {OUT_DIR}\n")
        log.write(f"Per-run summary CSV: {SUMMARY_CSV}\n")

    if not made_any:
        print("⚠️ No datasets were produced. Check genre names / columns.")
    else:
        print("\n✅ Done.")
        print("  • Datasets:", OUT_DIR)
        print("  • Summary:", SUMMARY_TXT)
        print("  • Summary CSV:", SUMMARY_CSV)

if __name__ == "__main__":
    main()



✅ Done.
  • Datasets: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0
  • Summary: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/summary.txt
  • Summary CSV: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/summary.csv


In [None]:
#!/usr/bin/env python3
# sanity_check_pair_injection.py
#
# Validates datasets produced by build_pair_bias_pos5and7_neg0.py
# It scans BOTH .../PAIR_INJECTION/5/ and .../PAIR_INJECTION/7/ folders.
#
# For each file fpair_{G1}__{G2}_{RUN}u_pos{POS}_neg{NEG}_{MODE}.csv it checks:
#  - Synthetic users = users not in baseline INPUT_CSV
#  - Synthetic users count == RUN (parsed from filename)
#  - Positives have rating == POS and are exactly the books tagged with BOTH G1 & G2
#  - Negatives (if any) have rating == 0 and do NOT include any pair-books
#  - No duplicate (user_id, book_id)
#  - Uniform per-user counts for positives (== #pair_books); for ZERO_MODE="all", negatives per user == |AllBooks - PairBooks|
#  - Added rows equal to RUN * (pos_per_user + inferred_neg_per_user) (robust to ZERO_MODE="sample")
#
# Outputs:
#   - manifest.csv at PAIR_INJECTION root, listing all validated files
#   - sanity_report.csv at PAIR_INJECTION root, with PASS/FAIL and reasons
#   - console summary

import re
import sys
import pandas as pd
from pathlib import Path

# ========= CONFIG (align with your generator paths) =========
BASE_DIR     = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
INPUT_CSV    = BASE_DIR / "data/df_final_with_genres.csv"   # baseline ratings; users absent from it count as synthetic
PAIR_ROOT    = BASE_DIR / "result/rec/top_re/0929/PAIR_INJECTION"  # has subfolders 5/ and 7/; reports are written here

# Column names expected in the baseline and in every checked dataset.
GENRE_COL    = "genres"
USER_COL     = "user_id"
BOOK_COL     = "book_id"
RATING_COL   = "rating"
# ============================================================

def parse_genres(cell: str):
    """Split a comma-separated genre cell into a list of distinct genres.

    Entries are stripped, blanks are dropped, and repeats are removed while
    keeping first-occurrence order. Non-string or blank cells give ``[]``.
    """
    if not (isinstance(cell, str) and cell.strip()):
        return []
    stripped = (piece.strip() for piece in cell.split(","))
    # dict keys keep insertion order, which gives an order-preserving dedupe
    return list(dict.fromkeys(piece for piece in stripped if piece))

def load_baseline():
    """Read INPUT_CSV and return it with normalised column types.

    The user, book and rating columns are coerced to numeric (raising on
    unparsable values; user ids are additionally downcast to the smallest
    integer type) and missing genre cells become empty strings.

    Raises:
        ValueError: if any of the four expected columns is absent.
    """
    frame = pd.read_csv(INPUT_CSV)

    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(frame.columns)
    if missing:
        raise ValueError(f"Missing columns in baseline: {missing}")

    frame[USER_COL] = pd.to_numeric(frame[USER_COL], errors="raise", downcast="integer")
    for numeric_col in (BOOK_COL, RATING_COL):
        frame[numeric_col] = pd.to_numeric(frame[numeric_col], errors="raise")
    frame[GENRE_COL] = frame[GENRE_COL].fillna("").astype(str)
    return frame

def build_book_genre_index(baseline_df):
    """Index the baseline catalogue by book and by genre.

    Only the first row seen for each book is considered, and books whose
    genre cell parses to nothing are left out entirely.

    Returns:
        A tuple ``(all_books, book_to_set, genres)``: the sorted list of
        integer book ids, a mapping of book id to its set of genres, and the
        sorted list of every genre that occurs.
    """
    catalog = baseline_df[[BOOK_COL, GENRE_COL]].drop_duplicates(subset=[BOOK_COL]).copy()
    catalog["genre_list"] = catalog[GENRE_COL].apply(parse_genres)
    has_genres = catalog["genre_list"].map(len) > 0
    catalog = catalog[has_genres].copy()

    book_ids = catalog[BOOK_COL].astype(int)
    all_books = sorted(book_ids.unique().tolist())
    book_to_set = {book: set(tags) for book, tags in zip(book_ids, catalog["genre_list"])}

    genre_pool = set()
    for tags in catalog["genre_list"]:
        genre_pool.update(tags)
    genres = sorted(genre_pool)

    return all_books, book_to_set, genres

def parse_filename(name: str):
    """
    Decode a pair-injection dataset filename.

    Expected: fpair_{G1}__{G2}_{RUN}u_pos{POS}_neg{NEG}_{MODE}.csv
              e.g., fpair_Fantasy__Horror_25u_pos7_neg0_sample.csv

    Returns a dict with keys g1_safe, g2_safe, run (int), pos (int),
    neg_str (kept as text because it may be "NA") and mode, or None when the
    name does not follow the pattern.
    """
    pattern = r"^fpair_(.+)__(.+)_(\d+)u_pos(\d+)_neg(NA|0|\d+)_(\w+)\.csv$"
    found = re.match(pattern, name)
    if found is None:
        return None

    fields = dict(zip(
        ("g1_safe", "g2_safe", "run", "pos", "neg_str", "mode"),
        found.groups(),
    ))
    fields["run"] = int(fields["run"])
    fields["pos"] = int(fields["pos"])
    return fields

def unsanitize(s: str):
    """Return ``s`` with every underscore turned into a space.

    This is only a best-effort inverse of filename sanitisation: characters
    that were replaced by '_' cannot be told apart from real underscores or
    spaces, so the result is a candidate to compare against known genres.
    """
    return " ".join(s.split("_"))

def resolve_genre(safe_name: str, GENRES: list[str]):
    """Map a filename-safe genre token back to a genre from ``GENRES``.

    Candidates are tried from strictest to loosest and the first hit wins:

    1. exact equality;
    2. underscores read as spaces, compared case-insensitively;
    3. underscores and spaces ignored, compared case-insensitively;
    4. the genre itself sanitised the way filenames are built (spaces to
       '_', runs of characters outside [0-9A-Za-z_] to '_'), compared
       case-insensitively. Without this step a genre containing punctuation
       such as "Sci-Fi" (token "Sci_Fi") could never be resolved.

    Returns the matching entry of ``GENRES``, or None if nothing matches.
    """
    # 1) Exact match
    for g in GENRES:
        if safe_name == g:
            return g

    # 2) Replace underscores with spaces and try
    spaced = safe_name.replace("_", " ").lower()
    for g in GENRES:
        if spaced == g.lower():
            return g

    # 3) Case-insensitive, underscore/space-insensitive
    squashed = safe_name.replace("_", "").lower()
    for g in GENRES:
        if squashed == g.replace("_", "").replace(" ", "").lower():
            return g

    # 4) Last resort: sanitise each genre the way the generator names files
    wanted = safe_name.lower()
    for g in GENRES:
        as_token = re.sub(r"[^0-9A-Za-z_]+", "_", g.strip().replace(" ", "_"))
        if as_token.lower() == wanted:
            return g

    return None

def main():
    """Validate every fpair_*.csv dataset under PAIR_ROOT/5 and PAIR_ROOT/7.

    Each file is compared against the baseline: rows of users absent from the
    baseline are treated as the synthetic block, and that block is checked
    for user count, allowed ratings, positive/negative book membership,
    per-user uniformity, duplicates and total row count. A POS value in the
    filename that disagrees with the numeric parent folder is reported as a
    problem as well.

    Writes manifest.csv and sanity_report.csv into PAIR_ROOT and prints a
    summary. Exits with status 1 when neither input folder exists and with
    status 0 when no matching file is found.
    """
    pair5_dir = PAIR_ROOT / "5"
    pair7_dir = PAIR_ROOT / "7"

    if not pair5_dir.exists() and not pair7_dir.exists():
        print(f"⚠️ Neither {pair5_dir} nor {pair7_dir} exists. Aborting.")
        sys.exit(1)

    baseline = load_baseline()
    baseline_users = set(baseline[USER_COL].unique().tolist())
    all_books, book_to_set, GENRES = build_book_genre_index(baseline)

    # Collect files (one pattern covers every fpair dataset name)
    files = []
    for sub in (pair5_dir, pair7_dir):
        if sub.exists():
            files.extend(sub.glob("fpair_*_pos*.csv"))
    files = sorted(set(files))

    if not files:
        print("⚠️ No fpair_*.csv files found under PAIR_INJECTION/5|7.")
        sys.exit(0)

    manifest_rows = []
    report_rows = []
    total_pass = 0
    total_fail = 0

    for fp in files:
        info = parse_filename(fp.name)
        if not info:
            # Skip unknown names but record them
            report_rows.append({
                "file": str(fp), "status": "SKIP",
                "reason": "Filename not recognized",
            })
            continue

        problems = []

        # infer pos folder (5 or 7) from the parent; also compare with POS in filename
        pos_folder = int(fp.parent.name) if fp.parent.name.isdigit() else info["pos"]
        pos_from_name = info["pos"]
        if pos_folder != pos_from_name:
            # a file sitting in the wrong POS folder is reported, not silently accepted
            problems.append(f"pos_folder={pos_folder} differs from pos_in_name={pos_from_name}")

        # resolve safe genre tokens back to actual discovered genre names
        g1 = resolve_genre(info["g1_safe"], GENRES)
        g2 = resolve_genre(info["g2_safe"], GENRES)
        if g1 is None or g2 is None:
            reason = (f"Could not resolve genres: '{info['g1_safe']}' or "
                      f"'{info['g2_safe']}' to discovered list")
            report_rows.append({
                "file": str(fp), "status": "FAIL",
                "reason": reason,
                # mirrored so the failure table printed at the end shows it
                "pair": f"{info['g1_safe']} + {info['g2_safe']}",
                "run": info["run"],
                "pos": info["pos"],
                "mode": info["mode"],
                "problems": reason,
            })
            total_fail += 1
            continue

        # Build pair-books: books containing BOTH genres (unordered)
        pair_books = [b for b in all_books if g1 in book_to_set[b] and g2 in book_to_set[b]]
        n_pair = len(pair_books)
        pair_set = set(pair_books)

        # Load combined file
        dfc = pd.read_csv(fp)
        dfc[USER_COL]   = pd.to_numeric(dfc[USER_COL], errors="raise", downcast="integer")
        dfc[BOOK_COL]   = pd.to_numeric(dfc[BOOK_COL], errors="raise")
        dfc[RATING_COL] = pd.to_numeric(dfc[RATING_COL], errors="raise")
        dfc[GENRE_COL]  = dfc[GENRE_COL].fillna("").astype(str)

        # Synthetic users = users that do not occur in the baseline
        synth_users = sorted(list(set(dfc[USER_COL].unique()) - baseline_users))

        # Check synthetic user count
        if len(synth_users) != info["run"]:
            problems.append(f"synth_users={len(synth_users)} != run={info['run']}")

        synth_rows = dfc[dfc[USER_COL].isin(synth_users)].copy()

        # Ratings in synthetic block: POS always, 0 only when negatives were injected
        allowed = {info["pos"]}
        if info["mode"] != "none" and info["neg_str"] != "NA":
            allowed.add(0)
        bad_ratings = synth_rows[~synth_rows[RATING_COL].isin(allowed)]
        if len(bad_ratings) > 0:
            problems.append(f"ratings_outside_allowed={sorted(bad_ratings[RATING_COL].unique().tolist())}")

        # Split pos/neg
        pos_rows = synth_rows[synth_rows[RATING_COL] == info["pos"]]
        neg_rows = synth_rows[synth_rows[RATING_COL] == 0] if 0 in allowed else synth_rows.iloc[0:0]

        # Positives: must be only on pair books
        bad_pos = pos_rows[~pos_rows[BOOK_COL].isin(pair_set)]
        if len(bad_pos) > 0:
            problems.append(f"{len(bad_pos)} positive rows not in pair-books (|pair|={n_pair})")

        # Negatives: must NOT include pair books
        if not neg_rows.empty:
            bad_neg = neg_rows[neg_rows[BOOK_COL].isin(pair_set)]
            if len(bad_neg) > 0:
                problems.append(f"{len(bad_neg)} negative rows overlap pair-books")

        # Per-user counts uniformity
        # Pos per user should be exactly n_pair
        if n_pair > 0:
            pos_per_user = pos_rows.groupby(USER_COL)[BOOK_COL].nunique()
            if not pos_per_user.empty:
                bad_users_pos = pos_per_user[pos_per_user != n_pair]
                if len(bad_users_pos) > 0:
                    problems.append(f"{len(bad_users_pos)} users do not have exactly {n_pair} positive books")
        else:
            if len(pos_rows) > 0:
                problems.append("n_pair=0 but positive rows exist (should not happen)")

        # Neg per user uniformity:
        neg_per_user_counts = None
        if not neg_rows.empty:
            neg_per_user_counts = neg_rows.groupby(USER_COL)[BOOK_COL].nunique()
            # If mode=="all" expect exact |AllBooks - PairBooks|
            if info["mode"] == "all":
                expected_neg = len(set(all_books) - pair_set)
                bad_users_neg = neg_per_user_counts[neg_per_user_counts != expected_neg]
                if len(bad_users_neg) > 0:
                    problems.append(f"{len(bad_users_neg)} users do not have exactly {expected_neg} negative books")
            # If "sample", at least require uniformity (all users same count)
            if info["mode"] == "sample":
                if neg_per_user_counts.nunique() > 1:
                    problems.append("negatives per user vary under 'sample' mode (expected uniform sample size)")

        # Duplicates (user, book)
        dup_pairs = synth_rows.duplicated(subset=[USER_COL, BOOK_COL]).sum()
        if dup_pairs > 0:
            problems.append(f"duplicate (user,book) rows among synthetic: {dup_pairs}")

        # Added rows check (robust to 'sample'): the negative count per user is
        # inferred from the data (median) because the sample size is not in the filename.
        pos_per_user_val = n_pair if n_pair > 0 else 0
        neg_per_user_val = int(neg_per_user_counts.median()) if neg_per_user_counts is not None and not neg_per_user_counts.empty else 0
        expected_added = info["run"] * (pos_per_user_val + neg_per_user_val)
        if len(synth_rows) != expected_added:
            problems.append(f"added_rows={len(synth_rows)} != expected_inferred={expected_added} (pos/user={pos_per_user_val}, neg/user≈{neg_per_user_val}, run={info['run']})")

        if problems:
            status = "FAIL"
            total_fail += 1
        else:
            status = "PASS"
            total_pass += 1

        manifest_rows.append({
            "file": str(fp),
            "pos_folder": pos_folder,
            "pair": f"{g1} + {g2}",
            "g1": g1, "g2": g2,
            "run": info["run"],
            "pos_in_name": info["pos"],
            "mode": info["mode"],
            "n_pair_books": n_pair
        })

        report_rows.append({
            "file": str(fp),
            "status": status,
            "pair": f"{g1} + {g2}",
            "g1": g1, "g2": g2,
            "run": info["run"],
            "pos": info["pos"],
            "mode": info["mode"],
            "n_pair_books": n_pair,
            "synth_users": len(synth_users),
            "pos_rows": len(pos_rows),
            "neg_rows": len(neg_rows),
            "problems": " | ".join(problems) if problems else ""
        })

    # Write outputs at PAIR_ROOT
    manifest_path = PAIR_ROOT / "manifest.csv"
    report_path   = PAIR_ROOT / "sanity_report.csv"
    pd.DataFrame(manifest_rows).to_csv(manifest_path, index=False)
    pd.DataFrame(report_rows).to_csv(report_path, index=False)

    # Print summary
    print("\n=== PAIR INJECTION SANITY CHECK ===")
    print(f"Files checked: {len([r for r in report_rows if r['status'] in ('PASS','FAIL')])}")
    print(f"PASS: {total_pass} | FAIL: {total_fail}")
    print(f"Manifest: {manifest_path}")
    print(f"Report:   {report_path}")

    if total_fail > 0:
        df_rep = pd.DataFrame(report_rows)
        print("\nFirst 10 failures:")
        cols = ["file", "status", "pair", "run", "pos", "mode", "problems"]
        # reindex instead of plain selection: a column missing from every row
        # becomes NaN rather than raising KeyError
        failures = df_rep[df_rep["status"] == "FAIL"].reindex(columns=cols)
        print(failures.head(10).to_string(index=False))

if __name__ == "__main__":
    main()
