In [4]:
#!/usr/bin/env python3
# build_0926_datasets.py

import os, re, math
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# Root of the project tree; the input and output paths below are built from it.
BASE_DIR   = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
INPUT_CSV  = BASE_DIR / "data/df_final_with_genres.csv"  # must have: user_id, book_id, rating, genres
# Directory that receives the generated p_<genre>_<run>.csv files.
OUT_DIR    = BASE_DIR / "result/rec/top_re/0926"
# Column names expected in INPUT_CSV.
GENRE_COL  = "genres"
USER_COL   = "user_id"
BOOK_COL   = "book_id"
RATING_COL = "rating"

# Number of synthetic users to inject; one output dataset is written per value.
RUNS = [25, 50, 100, 200]
# Rating that every synthetic user assigns to each selected book.
SYNTH_RATING = 5

# Per-genre book cap starts from round(ALPHA * sqrt(n_books)) + BIAS;
# compute_cap() then clamps it using a floor of 10 and n_books as the limit.
ALPHA = 3.2
BIAS  = 8
# =========================

def sanitize_fn(s: str) -> str:
    """Return a filename-safe token derived from *s*.

    Surrounding whitespace is stripped, spaces become underscores, and every
    run of characters outside ``[0-9A-Za-z_]`` collapses into one underscore.
    A falsy input (``None`` or ``""``) or an empty result yields ``"UNK"``.
    """
    text = s if s else ""
    underscored = text.strip().replace(" ", "_")
    cleaned = re.sub(r"[^0-9A-Za-z_]+", "_", underscored)
    if cleaned:
        return cleaned
    return "UNK"

def primary_genre(s: str) -> str:
    """Return the first comma-separated entry of *s*, stripped of whitespace.

    Non-string or blank input yields the empty string.
    """
    if isinstance(s, str) and s.strip():
        first, _, _ = s.partition(",")
        return first.strip()
    return ""

def compute_cap(n_books: int, alpha: "float | None" = None, bias: "int | None" = None) -> int:
    """Return how many books of a genre should be targeted.

    The raw cap is ``round(alpha * sqrt(n_books)) + bias``. It is raised to at
    least 10 and then limited to ``n_books``, so the result never exceeds the
    number of books available (the previous ``max(10, min(...))`` order
    returned 10 even for genres with fewer than 10 books).

    Args:
        n_books: Number of distinct books in the genre.
        alpha: Square-root scale factor; defaults to the module-level ALPHA.
        bias: Additive offset; defaults to the module-level BIAS.

    Returns:
        0 when ``n_books <= 0``, otherwise a value in ``[1, n_books]``.
    """
    if n_books <= 0:
        return 0
    if alpha is None:
        alpha = ALPHA
    if bias is None:
        bias = BIAS
    raw_cap = int(round(alpha * math.sqrt(n_books)) + bias)
    # Apply the floor first so the upper bound (n_books) always wins.
    return min(n_books, max(10, raw_cap))

def main():
    """Build one augmented ratings CSV per (target genre, run size).

    Steps:
      1. Load INPUT_CSV and coerce the user, book and rating columns to numbers.
      2. Group book ids by primary genre (first comma-separated entry of
         GENRE_COL) and, for each genre in a fixed target list, keep the first
         ``compute_cap(n_books)`` book ids in ascending order.
      3. For every size in RUNS, append that many synthetic users, each rating
         every kept book with SYNTH_RATING, to the original rows and write the
         result to OUT_DIR as ``p_<genre>_<run>.csv``.

    Raises:
        ValueError: if a required column is missing from INPUT_CSV (numeric
            coercion with ``errors="raise"`` can also raise).
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # ---------- Load ----------
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Input must contain columns {required}. Missing: {missing}")

    # numeric IDs
    df[USER_COL] = pd.to_numeric(df[USER_COL], errors="raise", downcast="integer")
    df[BOOK_COL] = pd.to_numeric(df[BOOK_COL], errors="raise")
    df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors="raise")

    # baseline stats
    baseline_users = df[USER_COL].nunique()
    baseline_rows = len(df)
    base_start_uid = int(df[USER_COL].max()) + 1

    # ---------- Prepare primary-genre view ----------
    # Only the derived primary-genre series is needed, so avoid copying df.
    primary = df[GENRE_COL].fillna("").astype(str).apply(primary_genre)
    has_primary = primary != ""

    # Per-primary-genre sorted unique book ids (groupby iterates genres in sorted order).
    books_by_genre = {
        genre: sorted(int(b) for b in books.unique())
        for genre, books in df.loc[has_primary, BOOK_COL].groupby(primary[has_primary])
    }

    # Fixed ordered list of genres to try to cover all 13
    target_genres = [
        "Adult", "Adventure", "Children's", "Classics", "Drama", "Fantasy",
        "Historical", "Horror", "Mystery", "Nonfiction", "Romance", "Science Fiction", "Thriller",
    ]

    # Normalization used on both sides of every genre-name comparison.
    def norm(s):
        return s.lower().replace("_", " ").replace("’", "'").strip()

    # Normalized primary genre -> spelling used in the data.
    idx = {norm(g): g for g in books_by_genre}

    # Alias keys must be written in *normalized* form: norm() has already
    # turned underscores into spaces, so keys containing "_" could never match.
    aliases = {
        "children s": "children's",
        "childrens": "children's",
    }

    def pick_genre_for(need):
        """Return the data's spelling of genre *need*, or None if nothing matches."""
        key = norm(need)
        # exact
        if key in idx:
            return idx[key]
        # common fixes
        alias = aliases.get(key)
        if alias in idx:
            return idx[alias]
        # fallback: substring scan; report it because it may select a broader
        # or different genre than the one asked for.
        for g in books_by_genre:
            if key in norm(g):
                print(f"⚠️  No exact primary-genre match for {need!r}; using substring match {g!r}")
                return g
        return None

    print("=== BASELINE ===")
    print(f"👤 Unique users: {baseline_users:,}")
    print(f"🧾 Rows: {baseline_rows:,}")
    print(f"🔢 Synthetic user_id base start: {base_start_uid}")
    print("="*80)

    # We allocate a disjoint user-id block per (genre, run):
    # block_size = 1_000_000 to guarantee no collisions even across big runs.
    BLOCK = 1_000_000

    # book_id -> full genre string. df is never modified below, so build this
    # once instead of once per (genre, run).
    genres_lookup = dict(df[[BOOK_COL, GENRE_COL]].drop_duplicates().values)

    made_any = False
    for gi, g in enumerate(target_genres):
        matched = pick_genre_for(g)
        if matched is None:
            print(f"⚠️  Skipping genre not found in primary-genre index: {g}")
            continue

        book_list = books_by_genre[matched]
        n_books = len(book_list)
        if n_books <= 0 or not book_list:
            print(f"⚠️  Skipping {g}: no books.")
            continue

        cap_g = compute_cap(n_books)
        # Take the first cap_g books (deterministic, stable)
        picked_books = book_list[:cap_g]
        n_picked = len(picked_books)
        # keep original full genre string for each book
        picked_genres = [genres_lookup.get(b, "") for b in picked_books]

        # enumerate() gives the run's position directly and, unlike
        # RUNS.index(run), stays distinct if RUNS ever contains duplicates.
        for ri, run in enumerate(RUNS):
            # Disjoint synthetic user ids for this (genre, run)
            start_uid = base_start_uid + (gi * len(RUNS) + ri) * BLOCK

            # Build synthetic block: each new user rates every picked book,
            # rows ordered user by user.
            synth_df = pd.DataFrame({
                USER_COL: [
                    uid
                    for uid in range(start_uid, start_uid + run)
                    for _ in range(n_picked)
                ],
                BOOK_COL: picked_books * run,
                RATING_COL: [SYNTH_RATING] * (run * n_picked),
                GENRE_COL: picked_genres * run,
            })
            combined = pd.concat([df, synth_df], ignore_index=True)

            # Validity checks
            exp_rows = run * n_picked
            assert len(synth_df) == exp_rows, f"Bad synth rows for {g}, run={run}"
            assert combined[USER_COL].nunique() >= baseline_users + 1, "No new users added?"

            safe_g = sanitize_fn(g)
            out_path = OUT_DIR / f"p_{safe_g}_{run}.csv"
            combined.to_csv(out_path, index=False)

            print(f"\n🎭 {g} | run={run}")
            print(f"   • n_books={n_books}, cap_g={cap_g}, records_added={exp_rows}")
            print(f"     💾 Saved → {out_path}")
            made_any = True

    if not made_any:
        print("⚠️  No datasets were produced. Check genre names and input columns.")
    else:
        print("\n✅ Done. Datasets saved under:", OUT_DIR)

# Run the dataset build only when this file is executed directly.
if __name__ == "__main__":
    main()


=== BASELINE ===
👤 Unique users: 53,424
🧾 Rows: 5,976,479
🔢 Synthetic user_id base start: 53425

🎭 Adult | run=25
   • n_books=106, cap_g=41, records_added=1025
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adult_25.csv

🎭 Adult | run=50
   • n_books=106, cap_g=41, records_added=2050
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adult_50.csv

🎭 Adult | run=100
   • n_books=106, cap_g=41, records_added=4100
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adult_100.csv

🎭 Adult | run=200
   • n_books=106, cap_g=41, records_added=8200
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adult_200.csv

🎭 Adventure | run=25
   • n_books=185, cap_g=52, records_added=1300
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adventure_25.csv

🎭 Adventure | run=50
   • n_books=185, cap_g=52, records_adde

In [6]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD

# ============ CONFIG ============
# Original ratings file, used as the baseline in every comparison.
BASELINE_CSV = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv"
# Ratings file to compare against the baseline (baseline plus injected rows, per its path).
INJECTED_CSV = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Fantasy_25.csv"
# Genre whose books are inspected by the checks below.
TARGET_GENRE = "Fantasy"
# Column names shared by both CSV files.
USER_COL, BOOK_COL, RATING_COL, GENRE_COL = "user_id", "book_id", "rating", "genres"
# =================================


def load_df(path):
    """Read a ratings CSV, drop rows lacking user, book or rating, and cast both id columns to int."""
    frame = pd.read_csv(path).dropna(subset=[USER_COL, BOOK_COL, RATING_COL])
    for id_col in (USER_COL, BOOK_COL):
        frame[id_col] = frame[id_col].astype(int)
    return frame


def train_svd(df, random_state=None):
    """Fit a Surprise SVD model on every rating in *df*.

    Args:
        df: DataFrame holding USER_COL, BOOK_COL and RATING_COL.
        random_state: Forwarded to ``SVD`` so a run can be reproduced. The
            default ``None`` is also SVD's own default, i.e. unseeded
            training as before.

    Returns:
        ``(model, trainset)``: the fitted SVD and the full trainset it was
        trained on.
    """
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[[USER_COL, BOOK_COL, RATING_COL]], reader)
    trainset = data.build_full_trainset()
    model = SVD(
        n_factors=60,
        reg_all=0.005,
        lr_all=0.01,
        n_epochs=85,
        biased=True,
        random_state=random_state,
    )
    model.fit(trainset)
    return model, trainset


def get_target_books(df, genre):
    """Return the sorted unique book ids whose genre string contains *genre*.

    The match is a case-insensitive *literal* substring test on GENRE_COL.
    ``regex=False`` keeps characters such as ``+``, ``(`` or ``.`` in a genre
    name from being interpreted as a regular expression. Rows with a missing
    genre never match.
    """
    mask = df[GENRE_COL].fillna("").str.contains(genre, case=False, na=False, regex=False)
    return sorted(df.loc[mask, BOOK_COL].unique())


def check_injection_pressure(baseline_df, injected_df, target_books):
    """Compare per-book rating counts for the target books in both datasets.

    Prints summary statistics and returns a DataFrame indexed by book id with
    columns ``base``, ``inj``, ``delta`` (inj - base) and ``ratio``
    (inj / base, NaN where base is 0).
    """
    def ratings_per_target(frame):
        # Number of rating rows per target book in one dataset.
        on_target = frame[BOOK_COL].isin(target_books)
        return frame[on_target].groupby(BOOK_COL).size()

    counts = {
        "base": ratings_per_target(baseline_df),
        "inj": ratings_per_target(injected_df),
    }
    # A book present in only one dataset gets a count of 0 in the other.
    merged = pd.DataFrame(counts).fillna(0)
    merged["delta"] = merged["inj"] - merged["base"]
    safe_base = merged["base"].replace(0, np.nan)
    merged["ratio"] = merged["inj"] / safe_base

    summary = merged.describe()[["base", "inj", "delta", "ratio"]]
    print("\n=== Injection Pressure (per targeted book) ===")
    print(summary)
    return merged


def check_item_bias_lift(model_base, model_inj, trainset_base, trainset_inj, target_books):
    """Return the per-book change in learned item bias (injected - baseline).

    Books are matched by *raw* id. Inner ids are assigned per trainset, so
    they are resolved separately in each trainset rather than intersected
    across the two. Only target books present in both trainsets contribute;
    results follow the order of *target_books* with duplicates ignored.

    Prints summary statistics and returns the list of lifts.
    """
    base_items = trainset_base._raw2inner_id_items
    inj_items = trainset_inj._raw2inner_id_items

    lifts = []
    for raw in dict.fromkeys(target_books):  # de-duplicate, keep order
        if raw not in base_items or raw not in inj_items:
            continue
        b_bias = model_base.bi[trainset_base.to_inner_iid(raw)]
        i_bias = model_inj.bi[trainset_inj.to_inner_iid(raw)]
        lifts.append(i_bias - b_bias)

    print("\n=== Item Bias Lift ===")
    # Explicit dtype keeps describe() numeric when no book qualified.
    print(pd.Series(lifts, dtype=float).describe())
    return lifts


def check_predicted_gap(model_base, model_inj, trainset_base, trainset_inj, target_books, n_users=500, seed=None):
    """Sample users and target books and measure the change in predicted rating.

    Up to *n_users* baseline users are drawn without replacement. For each one
    that also exists in the injected trainset, up to 10 target books are drawn
    without replacement; books missing from either trainset are skipped. The
    value recorded per (user, book) pair is ``est_injected - est_baseline``.

    Args:
        n_users: Maximum number of baseline users to sample.
        seed: Optional seed for the sampling. ``None`` (default) draws from
            NumPy's global random state, as before; an integer makes the
            sampled users and books reproducible.

    Prints summary statistics and returns the list of gaps.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    base_items = trainset_base._raw2inner_id_items
    inj_items = trainset_inj._raw2inner_id_items
    inj_users = trainset_inj._raw2inner_id_users
    books_per_user = min(10, len(target_books))

    users = rng.choice(
        trainset_base.all_users(),
        size=min(n_users, trainset_base.n_users),
        replace=False,
    )
    preds = []
    for u in users:
        raw_u = trainset_base.to_raw_uid(u)
        if raw_u not in inj_users:
            continue
        for b in rng.choice(target_books, size=books_per_user, replace=False):
            if b not in base_items or b not in inj_items:
                continue
            # predict() takes raw ids, so no inner-id lookup is needed here.
            est_base = model_base.predict(raw_u, b).est
            est_inj = model_inj.predict(raw_u, b).est
            preds.append(est_inj - est_base)

    print("\n=== Predicted Score Gap (injected - baseline) ===")
    # Explicit dtype keeps describe() numeric when nothing was sampled.
    print(pd.Series(preds, dtype=float).describe())
    return preds


def check_candidate_coverage(baseline_df, injected_df, target_books):
    """Flag, per injected-dataset user, whether any target book is still unseen.

    A user's "seen" books are those they rated in *baseline_df*; users absent
    from the baseline have seen nothing. Prints the fraction of users with at
    least one unseen target book (NaN when there are no users) and returns
    the per-user list of 1/0 flags.
    """
    seen = baseline_df.groupby(USER_COL)[BOOK_COL].apply(set).to_dict()
    # Build the target set once instead of once per user.
    target_set = set(target_books)
    no_history = frozenset()

    # A user has an unseen target book unless the targets are a subset of
    # what they have already rated.
    cover = [
        0 if target_set <= seen.get(u, no_history) else 1
        for u in injected_df[USER_COL].unique()
    ]

    print("\n=== Candidate Coverage (fraction of users with ≥1 unseen targeted book) ===")
    # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
    print(np.mean(cover) if cover else float("nan"))
    return cover


def main():
    """Load both datasets, fit one SVD model on each, and run all four checks."""
    clean, poisoned = (load_df(csv_path) for csv_path in (BASELINE_CSV, INJECTED_CSV))
    targets = get_target_books(clean, TARGET_GENRE)

    # Fit the baseline model first, then the injected one.
    fitted = [train_svd(frame) for frame in (clean, poisoned)]
    (svd_clean, ts_clean), (svd_poisoned, ts_poisoned) = fitted

    # Arguments shared by the two model-based checks.
    model_args = (svd_clean, svd_poisoned, ts_clean, ts_poisoned, targets)

    check_injection_pressure(clean, poisoned, targets)
    check_item_bias_lift(*model_args)
    check_predicted_gap(*model_args)
    check_candidate_coverage(clean, poisoned, targets)


# Run the diagnostics only when this file is executed directly.
if __name__ == "__main__":
    main()



=== Injection Pressure (per targeted book) ===
               base           inj        delta        ratio
count   2088.000000   2088.000000  2088.000000  2088.000000
mean     639.167625    640.891762     1.724138     1.000687
std     1414.598859   1418.646467     6.336405     0.002932
min       41.000000     41.000000     0.000000     1.000000
25%      170.000000    170.000000     0.000000     1.000000
50%      271.000000    271.000000     0.000000     1.000000
75%      550.000000    550.000000     0.000000     1.000000
max    21850.000000  21875.000000    25.000000     1.035765

=== Item Bias Lift ===
count    2088.000000
mean        0.000647
std         0.083965
min        -0.351677
25%        -0.047813
50%        -0.000167
75%         0.051531
max         0.361778
dtype: float64

=== Predicted Score Gap (injected - baseline) ===
count    5000.000000
mean       -0.020072
std         0.982749
min        -3.630674
25%        -0.610594
50%        -0.000877
75%         0.596762
max    