In [None]:
#!/usr/bin/env python3
# build_0926_datasets.py

import os, re, math
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# Project root; the input CSV and the output directory are resolved against it.
BASE_DIR   = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
INPUT_CSV  = BASE_DIR / "data/df_final_with_genres.csv"  # must have: user_id, book_id, rating, genres
OUT_DIR    = BASE_DIR / "result/rec/top_re/0926"
# Column names that main() requires in INPUT_CSV.
GENRE_COL  = "genres"
USER_COL   = "user_id"
BOOK_COL   = "book_id"
RATING_COL = "rating"

# Number of synthetic users to add; main() writes one output file per (genre, run).
RUNS = [25, 100, 200, 1000]
# Rating given to every synthetic (user, book) row.
SYNTH_RATING = 5

# Per-genre book cap: round(ALPHA * sqrt(n_books)) + BIAS, then clamped
# inside compute_cap() (see that function for the exact bounds).
ALPHA = 3.2
BIAS  = 8
# =========================

def sanitize_fn(s: str) -> str:
    """Convert a label into a token that is safe to embed in a file name.

    Surrounding whitespace is trimmed, spaces become underscores, and every
    run of characters other than ASCII letters, digits or underscores is
    collapsed into a single underscore.  Falsy input (``None`` or ``""``)
    and labels that reduce to nothing yield ``"UNK"``.
    """
    label = s if s else ""
    underscored = label.strip().replace(" ", "_")
    cleaned = re.sub(r"[^0-9A-Za-z_]+", "_", underscored)
    return cleaned if cleaned else "UNK"

def primary_genre(s: str) -> str:
    """Return the first comma-separated entry of a genre string, trimmed.

    Anything that is not a string, or is blank after trimming, yields ``""``.
    """
    if isinstance(s, str) and s.strip():
        head, _, _ = s.partition(",")
        return head.strip()
    return ""

def compute_cap(n_books: int, alpha=None, bias=None) -> int:
    """Return how many books of a genre each synthetic user should rate.

    The raw value is ``round(alpha * sqrt(n_books)) + bias``.  It is raised
    to at least 10 and then capped at ``n_books``, so the result never
    exceeds the number of books that actually exist (the previous clamp
    order returned 10 even for genres with fewer than 10 books).

    Args:
        n_books: Number of distinct books in the genre.
        alpha: Square-root scale factor; defaults to the module-level ALPHA.
        bias: Additive offset; defaults to the module-level BIAS.

    Returns:
        0 when ``n_books <= 0``, otherwise an integer in ``1..n_books``.
    """
    if n_books <= 0:
        return 0
    if alpha is None:
        alpha = ALPHA
    if bias is None:
        bias = BIAS
    raw = int(round(alpha * math.sqrt(n_books)) + bias)
    # Apply the floor first and the ceiling last so the ceiling always wins.
    return min(n_books, max(10, raw))

def main():
    """Write one augmented ratings CSV per (target genre, run size).

    Loads INPUT_CSV and groups books by primary genre (the first
    comma-separated entry of GENRE_COL).  For each target genre it takes the
    first ``compute_cap(n_books)`` book ids in ascending order.  For every
    run size in RUNS it appends that many synthetic users, each rating all
    picked books with SYNTH_RATING, to the original rows and saves the result
    as ``OUT_DIR / p_<genre>_<run>.csv``.

    Raises:
        ValueError: if a required column is missing, or a run size is outside
            ``1..BLOCK`` (its user-id block would be empty or would overlap
            the next block).
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # ---------- Load ----------
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Input must contain columns {required}. Missing: {missing}")

    # numeric IDs
    df[USER_COL]  = pd.to_numeric(df[USER_COL], errors="raise", downcast="integer")
    df[BOOK_COL]  = pd.to_numeric(df[BOOK_COL], errors="raise")
    df[RATING_COL]= pd.to_numeric(df[RATING_COL], errors="raise")

    # baseline stats
    baseline_users = df[USER_COL].nunique()
    baseline_rows  = len(df)
    base_start_uid = int(df[USER_COL].max()) + 1

    # ---------- Prepare primary-genre view ----------
    work = df.copy()
    work[GENRE_COL] = work[GENRE_COL].fillna("").astype(str)
    work["_primary"] = work[GENRE_COL].apply(primary_genre)
    work = work[work["_primary"] != ""].copy()

    # Per-primary-genre unique book lists (ascending book id)
    per_genre = (
        work.groupby("_primary")[BOOK_COL]
            .apply(lambda s: sorted(pd.Series(s.unique()).astype(int).tolist()))
            .to_frame("book_list")
            .reset_index()
    )
    per_genre["n_books"] = per_genre["book_list"].apply(len)

    # book -> full genre string; built once because it does not depend on
    # the genre or run being processed.
    genres_lookup = dict(df[[BOOK_COL, GENRE_COL]].drop_duplicates().values)

    # Fixed ordered list of genres to try to cover all 13
    target_genres = [
        "Adult","Adventure","Children's","Classics","Drama","Fantasy",
        "Historical","Horror","Mystery","Nonfiction","Romance","Science Fiction","Thriller"
    ]

    # Light normalization used for exact matching
    def norm(s): return s.lower().replace("_"," ").replace("‚Äô","'").strip()

    # Punctuation/spacing-insensitive key, so spellings such as "Children_s",
    # "Childrens" and "Children's" all meet.  This replaces an alias table
    # whose underscore keys could never match a norm()-ed name.
    def canon(s): return re.sub(r"[^0-9a-z]+", "", norm(s))

    by_norm = {norm(name): name for name in per_genre["_primary"]}
    by_canon = {}
    for name in per_genre["_primary"]:
        token = canon(name)
        if token:
            by_canon.setdefault(token, name)

    def pick_row_for(need):
        """Return the per_genre row matching *need*, or None if nothing fits."""
        key = norm(need)
        name = by_norm.get(key)
        if name is None:
            name = by_canon.get(canon(need))
        if name is None:
            # last resort: first primary genre whose normalized name contains the key
            name = next((g for g in per_genre["_primary"] if key in norm(g)), None)
        if name is None:
            return None
        return per_genre[per_genre["_primary"] == name].iloc[0]

    print("=== BASELINE ===")
    print(f"üë§ Unique users: {baseline_users:,}")
    print(f"üßæ Rows: {baseline_rows:,}")
    print(f"üî¢ Synthetic user_id base start: {base_start_uid}")
    print("="*80)

    # Each (genre, run) pair gets its own block of BLOCK consecutive user ids
    # above the largest real user id, so synthetic ids never collide.
    BLOCK = 1_000_000

    made_any = False
    for gi, g in enumerate(target_genres):
        row = pick_row_for(g)
        if row is None:
            print(f"‚ö†Ô∏è  Skipping genre not found in primary-genre index: {g}")
            continue

        book_list = list(row["book_list"])
        n_books   = int(row["n_books"])
        if n_books <= 0 or not book_list:
            print(f"‚ö†Ô∏è  Skipping {g}: no books.")
            continue

        cap_g = compute_cap(n_books)
        # Take the first cap_g books (deterministic, stable)
        picked_books = book_list[:cap_g]
        picked_genres = [genres_lookup.get(b, "") for b in picked_books]
        n_picked = len(picked_books)

        # enumerate (not RUNS.index) keeps blocks distinct even if a run size repeats
        for ri, run in enumerate(RUNS):
            if not 0 < run <= BLOCK:
                raise ValueError(
                    f"Run size {run} for {g} must be between 1 and {BLOCK} "
                    "to add users without overlapping another user-id block"
                )

            # Disjoint synthetic user ids for this (genre, run)
            start_uid = base_start_uid + (gi * len(RUNS) + ri) * BLOCK
            new_uids = range(start_uid, start_uid + run)

            # Rows are user-major: every synthetic user rates all picked books
            synth_df = pd.DataFrame({
                USER_COL:   [uid for uid in new_uids for _ in range(n_picked)],
                BOOK_COL:   picked_books * run,
                RATING_COL: [SYNTH_RATING] * (run * n_picked),
                GENRE_COL:  picked_genres * run,
            })
            combined = pd.concat([df, synth_df], ignore_index=True)

            exp_rows = run * n_picked

            safe_g = sanitize_fn(g)
            out_path = OUT_DIR / f"p_{safe_g}_{run}.csv"
            combined.to_csv(out_path, index=False)

            print(f"\nüé≠ {g} | run={run}")
            print(f"   ‚Ä¢ n_books={n_books}, cap_g={cap_g}, records_added={exp_rows}")
            print(f"     üíæ Saved ‚Üí {out_path}")
            made_any = True

    if not made_any:
        print("‚ö†Ô∏è  No datasets were produced. Check genre names and input columns.")
    else:
        print("\n‚úÖ Done. Datasets saved under:", OUT_DIR)

# Run the dataset build only when this file is executed as a script.
if __name__ == "__main__":
    main()


=== BASELINE ===
👤 Unique users: 53,424
🧾 Rows: 5,976,479
🔢 Synthetic user_id base start: 53425

🎭 Adult | run=25
   • n_books=106, cap_g=41, records_added=1025
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adult_25.csv

🎭 Adult | run=50
   • n_books=106, cap_g=41, records_added=2050
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adult_50.csv

🎭 Adult | run=100
   • n_books=106, cap_g=41, records_added=4100
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adult_100.csv

🎭 Adult | run=200
   • n_books=106, cap_g=41, records_added=8200
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adult_200.csv

🎭 Adventure | run=25
   • n_books=185, cap_g=52, records_added=1300
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/p_Adventure_25.csv

ü

In [7]:
#!/usr/bin/env python3

import os, re, math, hashlib
import numpy as np
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# Project root; the input CSV and the output directory are resolved against it.
BASE_DIR   = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book")
INPUT_CSV  = BASE_DIR / "data/df_final_with_genres.csv"   # must have: user_id, book_id, rating, genres
OUT_DIR    = BASE_DIR / "result/rec/top_re/0926/data/cape_g_random"
# Column names that main() requires in INPUT_CSV.
GENRE_COL  = "genres"
USER_COL   = "user_id"
BOOK_COL   = "book_id"
RATING_COL = "rating"

# number of synthetic users to add per genre (the run list you choose);
# main() writes one output file per (genre, run)
RUNS = [25, 100, 200, 1000]
# Rating given to every synthetic (user, book) row.
SYNTH_RATING = 5

# Per-genre book cap B: round(ALPHA * sqrt(n_books)) + BIAS, then clamped
# inside compute_cap() (see that function for the exact bounds).
ALPHA = 3.2
BIAS  = 8

# ---- popularity-aware core + tail knobs ----
GAMMA  = 1.6   # pool expansion: pool size M comes from round(GAMMA * B), capped at the genre's book count
RHO    = 0.35  # core fraction: core size c comes from round(RHO * B), at least 1
BETA   = 0.8   # tail weight decay: w proportional to 1 / rank^BETA
STRIDE_FRAC = 0.05  # core rotation stride as a fraction of B (at least 1 position)
SEED_BASE   = 12345 # global reproducible seed base, mixed into every per-user seed
# =========================

# Genres processed by main(), in this order; the list position also selects
# the genre's block of synthetic user ids.
TARGET_GENRES = [
    "Adult","Adventure","Children's","Classics","Drama","Fantasy",
    "Historical","Horror","Mystery","Nonfiction","Romance","Science Fiction","Thriller"
]

def sanitize_fn(s: str) -> str:
    """Convert a label into a token that is safe to embed in a file name.

    Surrounding whitespace is trimmed, spaces become underscores, and every
    run of characters other than ASCII letters, digits or underscores is
    collapsed into a single underscore.  Falsy input (``None`` or ``""``)
    and labels that reduce to nothing yield ``"UNK"``.
    """
    text = "" if not s else s
    text = text.strip()
    text = text.replace(" ", "_")
    token = re.sub(r"[^0-9A-Za-z_]+", "_", text)
    if not token:
        return "UNK"
    return token

def primary_genre(s: str) -> str:
    """Return the first comma-separated entry of a genre string, trimmed.

    The first entry is treated as the "primary" genre.  Anything that is not
    a string, or is blank after trimming, yields ``""``.
    """
    usable = isinstance(s, str) and bool(s.strip())
    if not usable:
        return ""
    first, _sep, _rest = s.partition(",")
    return first.strip()

def compute_cap(n_books: int, alpha=None, bias=None) -> int:
    """Return the per-user book budget (cap_g, called B in main) for a genre.

    The raw value is ``round(alpha * sqrt(n_books)) + bias``.  It is raised
    to at least 10 and then capped at ``n_books``, so the result never
    exceeds the number of books that actually exist (the previous clamp
    order returned 10 even for genres with fewer than 10 books, which left
    main() with a budget larger than its candidate pool).

    Args:
        n_books: Number of distinct books in the genre.
        alpha: Square-root scale factor; defaults to the module-level ALPHA.
        bias: Additive offset; defaults to the module-level BIAS.

    Returns:
        0 when ``n_books <= 0``, otherwise an integer in ``1..n_books``.
    """
    if n_books <= 0:
        return 0
    if alpha is None:
        alpha = ALPHA
    if bias is None:
        bias = BIAS
    raw = int(round(alpha * math.sqrt(n_books)) + bias)
    # Apply the floor first and the ceiling last so the ceiling always wins.
    return min(n_books, max(10, raw))

def seed_from(*parts) -> int:
    """
    Derive a stable unsigned 32-bit seed from arbitrary parts.

    Each part is converted with ``str()`` and the pieces are hashed back to
    back with SHA-256; the first four digest bytes, read big-endian, form the
    seed.  No delimiter is inserted between parts, so ``("ab", "c")`` and
    ``("a", "bc")`` produce the same seed.
    """
    joined = "".join(str(part) for part in parts)
    digest = hashlib.sha256(joined.encode("utf-8")).digest()
    return int.from_bytes(digest[:4], "big", signed=False)

def build_popularity_lists(df: pd.DataFrame) -> pd.DataFrame:
    """
    Rank the books of every primary genre by how often they were rated.

    The primary genre of a row is the first comma-separated entry of its
    GENRE_COL value; rows without one are ignored.  Popularity of a book is
    the number of rows for that (primary genre, book) pair.

    Returns one row per primary genre with columns:
      _primary   - the genre name
      book_list  - book ids, most-rated first (ties broken by ascending id)
      pop_counts - rating counts, in the same order as book_list
      n_books    - number of distinct books in the genre
    """
    frame = df.copy()
    frame[GENRE_COL] = frame[GENRE_COL].fillna("").astype(str)
    frame["_primary"] = frame[GENRE_COL].apply(primary_genre)
    has_primary = frame["_primary"] != ""
    frame = frame[has_primary].copy()

    # one row per (genre, book) with its rating count, ordered within each
    # genre by descending count and then ascending book id for stability
    counts = (
        frame.groupby(["_primary", BOOK_COL])
             .size()
             .reset_index(name="cnt")
             .sort_values(["_primary", "cnt", BOOK_COL], ascending=[True, False, True])
    )

    def _collect(g):
        # fold one genre's already-ordered rows into list-valued cells
        return pd.Series({
            "book_list": g[BOOK_COL].astype(int).tolist(),
            "pop_counts": g["cnt"].astype(int).tolist(),
            "n_books": int(g.shape[0])
        })

    return counts.groupby("_primary").apply(_collect).reset_index()

def main():
    """Write popularity-aware augmented datasets plus a manifest.

    For each genre in TARGET_GENRES the books are ranked by rating count
    (build_popularity_lists) and the top M form a candidate pool.  For every
    run size in RUNS, that many synthetic users are appended to the original
    rows.  Each synthetic user rates exactly B distinct books with
    SYNTH_RATING:

      * a "core" of c consecutive pool entries whose start position rotates
        by ``stride`` from one user to the next, and
      * a "tail" of r = B - c further pool entries, sampled without
        replacement with weight proportional to 1 / rank**BETA, where rank
        is the position among the non-core pool entries.

    Sampling is seeded per user through seed_from, so reruns reproduce the
    same rows.  One CSV is written per (genre, run) and a summary of all of
    them goes to ``OUT_DIR / injection_manifest.csv``.

    Raises:
        ValueError: if a required column is missing, or a run size is outside
            ``1..BLOCK`` (its user-id block would be empty or would overlap
            the next block).
        RuntimeError: if a synthetic user ends up with a number of books
            other than B (internal invariant).
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # ---------- Load ----------
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Input must contain columns {required}. Missing: {missing}")

    # numeric IDs
    df[USER_COL]   = pd.to_numeric(df[USER_COL], errors="raise", downcast="integer")
    df[BOOK_COL]   = pd.to_numeric(df[BOOK_COL], errors="raise")
    df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors="raise")

    # baseline stats
    baseline_users = df[USER_COL].nunique()
    baseline_rows  = len(df)
    base_start_uid = int(df[USER_COL].max()) + 1

    print("=== BASELINE ===")
    print(f"üë§ Unique users: {baseline_users:,}")
    print(f"üßæ Rows: {baseline_rows:,}")
    print(f"üî¢ Synthetic user_id base start: {base_start_uid}")
    print("="*80)

    # popularity per primary-genre
    per_genre_pop = build_popularity_lists(df)

    # Light normalization used for exact matching
    def norm(s): return s.lower().replace("_"," ").replace("‚Äô","'").strip()

    # Punctuation/spacing-insensitive key, so spellings such as "Children_s",
    # "Childrens" and "Children's" all meet.  This replaces an alias table
    # whose underscore keys could never match a norm()-ed name.
    def canon(s): return re.sub(r"[^0-9a-z]+", "", norm(s))

    available = {norm(name): name for name in per_genre_pop["_primary"]}
    by_canon = {}
    for name in per_genre_pop["_primary"]:
        token = canon(name)
        if token:
            by_canon.setdefault(token, name)

    def get_row_for(need):
        """Return the per_genre_pop row matching *need*, or None if nothing fits."""
        k = norm(need)
        name = available.get(k)
        if name is None:
            name = by_canon.get(canon(need))
        if name is None:
            # last resort: first primary genre whose normalized name contains the key
            name = next((g for g in per_genre_pop["_primary"] if k in norm(g)), None)
        if name is None:
            return None
        return per_genre_pop[per_genre_pop["_primary"] == name].iloc[0]

    # book -> full genre string; built once because it does not depend on
    # the genre or run being processed.
    book_to_genres = dict(df[[BOOK_COL, GENRE_COL]].drop_duplicates().values)

    # manifest collects one row per (genre, run)
    manifest_rows = []

    # Each (genre, run) pair gets its own block of BLOCK consecutive user ids
    # above the largest real user id, so synthetic ids never collide.
    BLOCK = 1_000_000

    for gi, genre in enumerate(TARGET_GENRES):
        row = get_row_for(genre)
        if row is None:
            print(f"‚ö†Ô∏è  Skipping genre not found: {genre}")
            continue

        book_list = list(row["book_list"])
        n_books   = int(row["n_books"])
        if n_books <= 0:
            print(f"‚ö†Ô∏è  Skipping {genre}: no books.")
            continue

        # cap and pool sizes.  The clamps guarantee c <= B <= M <= n_books, so
        # the tail can always be drawn from distinct non-core books and no
        # synthetic user ever rates the same book twice.
        B = min(compute_cap(n_books), n_books)           # cap_g
        M = max(B, min(int(round(GAMMA * B)), n_books))  # pool size
        c = min(B, max(1, int(round(RHO * B))))          # core size
        r = B - c                                        # tail size
        stride = max(1, int(round(STRIDE_FRAC * B)))

        # Pre-build the top-M pool (most popular first)
        pool = book_list[:M]
        n_starts = M - c + 1   # number of valid core start positions (>= 1)
        tail_len = M - c       # non-core pool entries per user (>= r)

        # Tail weights depend only on the rank within the tail window, whose
        # length is the same for every user, so compute them once per genre.
        if r > 0:
            ranks = np.arange(1, tail_len + 1, dtype=float)
            weights = 1.0 / np.power(ranks, BETA)
            weights = weights / weights.sum()

        # enumerate (not RUNS.index) keeps blocks distinct even if a run size repeats
        for ri, run in enumerate(RUNS):
            if not 0 < run <= BLOCK:
                raise ValueError(
                    f"Run size {run} for {genre} must be between 1 and {BLOCK} "
                    "to add users without overlapping another user-id block"
                )

            # stable per-(genre, run) block
            start_uid = base_start_uid + (gi * len(RUNS) + ri) * BLOCK
            uid_list = range(start_uid, start_uid + run)

            # synthetic rows container
            synth = {USER_COL: [], BOOK_COL: [], RATING_COL: [], GENRE_COL: []}

            for i, uid in enumerate(uid_list):
                # seed: reproducible per user
                seed_i = seed_from(SEED_BASE, genre, "run", run, "uid", uid)
                rng = np.random.default_rng(seed_i)

                # ----- CORE: rotated block within the pool -----
                start = (i * stride) % n_starts
                core_books = pool[start:start + c]

                # ----- TAIL: weighted sample without replacement -----
                # Pool entries are distinct, so the non-core candidates are
                # simply what lies before and after the core slice.
                tail_candidates = pool[:start] + pool[start + c:]
                if r > 0:
                    picks = rng.choice(tail_len, size=r, replace=False, p=weights)
                    tail_pick = [tail_candidates[j] for j in picks]
                else:
                    tail_pick = []

                chosen = core_books + tail_pick
                if len(chosen) != B:
                    raise RuntimeError(
                        f"Expected {B} books for synthetic user {uid} in {genre}, got {len(chosen)}"
                    )

                # append rows
                synth[USER_COL].extend([uid] * B)
                synth[BOOK_COL].extend(chosen)
                synth[RATING_COL].extend([SYNTH_RATING] * B)
                synth[GENRE_COL].extend([book_to_genres.get(b, "") for b in chosen])

            # finalize and save combined dataset
            synth_df = pd.DataFrame(synth)
            combined = pd.concat([df, synth_df], ignore_index=True)

            # output name with parameters for inspection
            safe_g = sanitize_fn(genre)
            out_name = f"p_{safe_g}_run{run}_B{B}_M{M}_c{c}_r{r}_g{GAMMA}_rho{RHO}_beta{BETA}.csv"
            out_path = OUT_DIR / out_name
            combined.to_csv(out_path, index=False)

            # logging
            exp_rows = run * B
            print(f"\nüé≠ {genre} | run={run}")
            print(f"   ‚Ä¢ n_books={n_books}, cap_g(B)={B}, M={M}, core(c)={c}, tail(r)={r}, stride={stride}")
            print(f"   ‚Ä¢ records_added={exp_rows}, users_added={run}")
            print(f"     üíæ Saved ‚Üí {out_path}")

            # manifest row
            manifest_rows.append({
                "genre": genre,
                "safe_genre": safe_g,
                "run": run,
                "cap_g_B": B,
                "M_pool": M,
                "core_c": c,
                "tail_r": r,
                "stride": stride,
                "alpha": ALPHA,
                "bias": BIAS,
                "gamma": GAMMA,
                "rho": RHO,
                "beta": BETA,
                "seed_base": SEED_BASE,
                "n_books_in_genre": n_books,
                "records_added": exp_rows,
                "output_file": out_name
            })

    # write manifest
    if manifest_rows:
        manifest = pd.DataFrame(manifest_rows)
        manifest_path = OUT_DIR / "injection_manifest.csv"
        manifest.to_csv(manifest_path, index=False)
        print(f"\nüßæ Manifest written ‚Üí {manifest_path}")
    else:
        print("\n‚ö†Ô∏è  No datasets were produced. Check genre names and input columns.")

# Run the dataset build only when this file is executed as a script.
if __name__ == "__main__":
    main()


=== BASELINE ===
👤 Unique users: 53,424
🧾 Rows: 5,976,479
🔢 Synthetic user_id base start: 53425

🎭 Adult | run=25
   • n_books=106, cap_g(B)=41, M=66, core(c)=14, tail(r)=27, stride=2
   • records_added=1025, users_added=25
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/data/cape_g_random/p_Adult_run25_B41_M66_c14_r27_g1.6_rho0.35_beta0.8.csv

🎭 Adult | run=100
   • n_books=106, cap_g(B)=41, M=66, core(c)=14, tail(r)=27, stride=2
   • records_added=4100, users_added=100
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/data/cape_g_random/p_Adult_run100_B41_M66_c14_r27_g1.6_rho0.35_beta0.8.csv

🎭 Adult | run=200
   • n_books=106, cap_g(B)=41, M=66, core(c)=14, tail(r)=27, stride=2
   • records_added=8200, users_added=200
     💾 Saved → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/data/cape_g_random/p_Adult_run200_B41_M66_c14_r27_g1.6_rho0.35_b