In [1]:
import os
import pandas as pd

In [2]:
# Load the full ratings table: one row per (user_id, book_id) rating, joined
# with book metadata (decade, original_title, authors, genres).
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing it.
df = pd.read_csv('/home/moshtasa/Research/phd-svd-recsys/NMF/Book/data/df_final_with_genres.csv')

In [3]:
# Display the frame; pandas truncates the rendering to the first and last rows.
df

Unnamed: 0,user_id,book_id,rating,decade,original_title,authors,genres
0,1,258,5,2000,La sombra del viento,"Carlos Ruiz Zafón, Lucia Graves","Mystery, Historical"
1,2,4081,4,2000,,,
2,2,260,5,1930,How to Win Friends and Influence People,Dale Carnegie,"Nonfiction, Drama"
3,2,9296,5,1970,Das Drama des begabten Kindes und die Suche na...,"Alice Miller, Ruth Ward","Horror, Mystery"
4,2,2318,3,1990,The Millionaire Next Door: The Surprising Secr...,"Thomas J. Stanley, William D. Danko","Nonfiction, Drama"
...,...,...,...,...,...,...,...
5976474,49925,510,5,1990,The Great Hunt,Robert Jordan,"Fantasy, Adventure"
5976475,49925,528,4,1990,The Dragon Reborn,Robert Jordan,"Classics, Drama"
5976476,49925,722,4,1990,The Shadow Rising,Robert Jordan,"Adventure, Drama"
5976477,49925,949,5,1990,The Fires of Heaven,Robert Jordan,"Fantasy, Adventure"


In [4]:
#!/usr/bin/env python3
import os
from pathlib import Path
from collections import defaultdict
import pandas as pd
from typing import Union

# ====== CONFIG ======
# INPUT_SOURCE is handed to load_df(). It may be a CSV path (str or Path),
# an http(s) URL, an already-loaded pandas.DataFrame, or a file-like object.
# NOTE(review): the annotation below does not list file-like objects even
# though load_df() accepts anything with a .read attribute.
INPUT_SOURCE: Union[str, Path, pd.DataFrame] = "/home/moshtasa/Research/phd-svd-recsys/NMF/Book/data/df_final_with_genres.csv"
# Where main() writes the per-genre count table. This is a relative path, so
# it resolves against the current working directory of the kernel.
OUTPUT_CSV = "genre_counts_all.csv"
# ====================

def load_df(source: Union[str, Path, pd.DataFrame]):
    """Return a DataFrame for ``source``.

    Accepted inputs:
      * a ``pandas.DataFrame`` -- returned as-is (same object, no copy);
      * a ``str`` / ``Path`` -- read with ``pd.read_csv``; must be an
        http(s) URL or an existing filesystem path;
      * any object exposing ``.read`` -- passed straight to ``pd.read_csv``.

    Raises:
        ValueError: the path is empty or whitespace only.
        FileNotFoundError: the path is neither an http(s) URL nor present
            on disk.
        TypeError: ``source`` is none of the accepted kinds.
    """
    # Already materialised: nothing to read.
    if isinstance(source, pd.DataFrame):
        return source

    is_path_like = isinstance(source, (str, Path))
    if not is_path_like:
        # Anything that can be .read() is treated as an open CSV stream.
        if hasattr(source, "read"):
            return pd.read_csv(source, low_memory=False)
        raise TypeError(
            "INPUT_SOURCE must be a DataFrame, path-like (str/Path), or file-like object. "
            f"Got: {type(source)}"
        )

    location = str(source)
    if not location.strip():
        raise ValueError("Empty path provided for INPUT_SOURCE.")

    # URLs are left for pandas to fetch; only local paths are checked here.
    is_url = location.startswith(("http://", "https://"))
    if not is_url and not os.path.exists(location):
        raise FileNotFoundError(f"Path not found: {location}")
    return pd.read_csv(location, low_memory=False)

def split_genres(val):
    """Turn a comma-separated genre cell into a list of distinct genre names.

    Missing values (``None`` / ``NaN``) give an empty list. Each piece is
    stripped of surrounding whitespace, empty pieces are dropped, and
    repeated names are kept only at their first position.
    """
    if pd.isna(val):
        return []
    pieces = (chunk.strip() for chunk in str(val).split(","))
    # dict keys keep insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(piece for piece in pieces if piece))

def main():
    """Count genre occurrences over distinct books and save the table.

    The input is reduced to one row per ``book_id`` (first occurrence wins).
    For every genre the function counts how many books list it at all, how
    many list it first, and how many list it in a later position. The table
    is sorted by total count, then first-position count (both descending),
    previewed on stdout and written to ``OUTPUT_CSV``.

    Raises:
        KeyError: the input lacks a ``book_id`` or ``genres`` column.
    """
    ratings = load_df(INPUT_SOURCE)

    # Both columns are mandatory; fail early with a clear message.
    if any(col not in ratings.columns for col in ("book_id", "genres")):
        raise KeyError("Input must contain 'book_id' and 'genres' columns.")

    # Collapse the ratings table to one (book_id, genres) row per book.
    one_row_per_book = ratings[["book_id", "genres"]].dropna(subset=["book_id"])
    one_row_per_book = one_row_per_book.drop_duplicates(subset="book_id", keep="first")
    genre_lists = one_row_per_book["genres"].apply(split_genres)

    total_hits = defaultdict(int)
    first_hits = defaultdict(int)
    later_hits = defaultdict(int)

    for genre_list in genre_lists:
        # Position 0 feeds the "first" tally, every other position the
        # "later" tally; books without genres contribute nothing.
        for position, name in enumerate(genre_list):
            total_hits[name] += 1
            bucket = first_hits if position == 0 else later_hits
            bucket[name] += 1

    # One output row per genre seen anywhere.
    table = pd.DataFrame(
        [
            {
                "genre": name,
                "total_books_with_genre": total_hits[name],
                "as_first_genre": first_hits.get(name, 0),
                "as_later_genre": later_hits.get(name, 0),
            }
            for name in sorted(total_hits)
        ]
    )
    table = table.sort_values(
        by=["total_books_with_genre", "as_first_genre"], ascending=[False, False]
    )

    # Quick preview, then persist.
    print(table.head(20))
    table.to_csv(OUTPUT_CSV, index=False)
    print(f"\n✅ Saved: {OUTPUT_CSV}")

# Entry point: run the genre-count report when this cell/script is executed
# directly (importing the code as a module does not trigger it).
if __name__ == "__main__":
    main()


              genre  total_books_with_genre  as_first_genre  as_later_genre
4             Drama                    3006             229            2777
8           Mystery                    2563            1315            1248
10          Romance                    2131            1704             427
5           Fantasy                    2088            1794             294
1         Adventure                    1789             185            1604
12         Thriller                    1606             418            1188
9        Nonfiction                    1071             878             193
3          Classics                     901             392             509
2        Children's                     863             694             169
6        Historical                     857             497             360
11  Science Fiction                     855             776              79
7            Horror                     769             427             342
0           

In [1]:
#!/usr/bin/env python3
# build_heavy_bias_pos5_neg1_all.py
#
# For each primary genre G:
#   - positives: ALL books whose primary genre == G → rated 5
#   - negatives: ALL other books → rated 1  (NO SAMPLING)
#
# IMPORTANT: This creates VERY strong poisoning. Use carefully.
# Does NOT touch real users; only adds synthetic users.

import os
import re
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
# Project root; every other path below is derived from it.
BASE_DIR    = Path("/home/moshtasa/Research/phd-svd-recsys/NMF/Book")
INPUT_CSV   = BASE_DIR / "data/df_final_with_genres.csv"   # must have user_id, book_id, rating, genres
# Folder that receives one poisoned CSV per (genre, run) plus the summaries.
OUT_DIR     = BASE_DIR / "result/rec/top_re/1111/Single_Injection"
SUMMARY_TXT = OUT_DIR / "summary.txt"   # human-readable log
SUMMARY_CSV = OUT_DIR / "summary.csv"   # one row per generated file

# Column names expected in INPUT_CSV and reused for the synthetic rows.
GENRE_COL   = "genres"
USER_COL    = "user_id"
BOOK_COL    = "book_id"
RATING_COL  = "rating"

# Number of synthetic users to inject per genre; each entry yields its own
# output file for every genre.
RUNS = [2 ,4 ,6 ,25 ,50 ,100 ,200 ,300 ,350 , 500 ,1000]

POS_RATING  = 5  # rating given to every book whose primary genre is the target
NEG_RATING  = 1  # rating given to every other book (the whole negative pool)

# Width of the user-id range reserved for each (genre, run) pair. Ranges are
# laid out back to back after the largest real user id, so ids never collide
# as long as no run size exceeds BLOCK.
BLOCK = 1_000_000
# =======================================

def sanitize_fn(s: str) -> str:
    """Make ``s`` safe to embed in a file name.

    Surrounding whitespace is trimmed, spaces become underscores, and every
    run of characters other than ASCII letters, digits or ``_`` collapses to
    a single underscore. Falsy input or an empty result yields ``"UNK"``.
    """
    text = s if s else ""
    underscored = text.strip().replace(" ", "_")
    cleaned = re.sub(r"[^0-9A-Za-z_]+", "_", underscored)
    if cleaned:
        return cleaned
    return "UNK"

def primary_genre(cell: str) -> str:
    """Return the first comma-separated genre of ``cell``, stripped.

    Non-string or blank input yields ``""``.
    """
    if isinstance(cell, str) and cell.strip():
        # Everything before the first comma is the primary genre.
        head, _, _ = cell.partition(",")
        return head.strip()
    return ""

def _user_book_rows(user_ids, books, genres, rating):
    """Build one rating row per (user, book) pair.

    Rows are ordered user by user and, within a user, in the order of
    ``books``. ``genres`` must be parallel to ``books``. Every row gets the
    same ``rating``.
    """
    n_users = len(user_ids)
    n_books = len(books)
    return pd.DataFrame({
        # u0, u0, ..., u1, u1, ... (each id repeated once per book)
        USER_COL:   pd.Index(user_ids).repeat(n_books).to_numpy(),
        # The whole book list repeated once per user; list repetition avoids
        # a Python-level loop over run * n_books items.
        BOOK_COL:   list(books) * n_users,
        RATING_COL: [rating] * (n_users * n_books),
        GENRE_COL:  list(genres) * n_users,
    })


def main():
    """Generate the poisoned datasets: one CSV per (primary genre, run size).

    For each primary genre and each entry of ``RUNS``, synthetic users are
    appended to the real ratings. Each synthetic user rates every book of
    the target primary genre with ``POS_RATING`` and every other book with
    ``NEG_RATING``. Real users' rows are left untouched. Each combined table
    is written to ``OUT_DIR``; ``SUMMARY_CSV`` / ``SUMMARY_TXT`` describe
    what was produced.

    The summaries are written even if the run fails or is interrupted, and
    then list only the files that were completely written.

    Raises:
        ValueError: a required column is missing, or a run size exceeds
            ``BLOCK`` (synthetic user-id ranges would overlap).
    """
    # Each (genre, run) pair owns a BLOCK-wide id range; a larger run would
    # spill into the next pair's range and create duplicate user ids.
    if RUNS and max(RUNS) > BLOCK:
        raise ValueError(
            f"Largest run ({max(RUNS)}) exceeds BLOCK ({BLOCK}); "
            "synthetic user ids would collide."
        )

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # ----- Load -----
    df = pd.read_csv(INPUT_CSV)
    required = {USER_COL, BOOK_COL, RATING_COL, GENRE_COL}
    if not required.issubset(df.columns):
        raise ValueError(f"Input CSV must contain columns {required}")

    df[GENRE_COL] = df[GENRE_COL].fillna("").astype(str)
    df[USER_COL] = df[USER_COL].astype(int)
    df[BOOK_COL] = df[BOOK_COL].astype(int)

    # Synthetic ids start right after the largest real user id.
    base_start_uid = df[USER_COL].max() + 1

    # ----- Book -> genre lookup (books without a primary genre are ignored) -----
    book_gen = df[[BOOK_COL, GENRE_COL]].drop_duplicates(subset=[BOOK_COL]).copy()
    book_gen["_primary"] = book_gen[GENRE_COL].apply(primary_genre)
    book_gen = book_gen[book_gen["_primary"] != ""]
    all_books = sorted(book_gen[BOOK_COL].unique())
    book_to_genres = dict(zip(book_gen[BOOK_COL], book_gen[GENRE_COL]))

    # Sorted list of book ids for every primary genre.
    books_by_genre = {
        genre: sorted(group.unique())
        for genre, group in book_gen.groupby("_primary")[BOOK_COL]
    }
    target_genres = sorted(books_by_genre)

    with open(SUMMARY_TXT, "w") as log:
        log.write(f"BASE DATA: {df[USER_COL].nunique()} users, {len(df)} rows\n")
        log.write(f"NEG_RATING = {NEG_RATING} (NO SAMPLING)\n\n")

    rows_summary = []
    total_added = 0

    try:
        for gi, genre in enumerate(target_genres):
            pos_books = books_by_genre[genre]
            pos_set = set(pos_books)
            neg_pool = [b for b in all_books if b not in pos_set]  # ALL remaining books
            # Genre strings depend only on the genre, not on the run size,
            # so look them up once here instead of once per run and per user.
            pos_genres = [book_to_genres[b] for b in pos_books]
            neg_genres = [book_to_genres[b] for b in neg_pool]
            safe_name = sanitize_fn(genre)

            for r_i, run in enumerate(RUNS):
                start_uid = base_start_uid + gi * (len(RUNS) * BLOCK) + r_i * BLOCK
                new_users = list(range(start_uid, start_uid + run))

                # All positive rows first, then all negative rows.
                synth_df = pd.concat(
                    [
                        _user_book_rows(new_users, pos_books, pos_genres, POS_RATING),
                        _user_book_rows(new_users, neg_pool, neg_genres, NEG_RATING),
                    ],
                    ignore_index=True,
                )
                combined = pd.concat([df, synth_df], ignore_index=True)

                out_file = OUT_DIR / f"f_{safe_name}_{run}u_pos5_neg1_all.csv"
                combined.to_csv(out_file, index=False)

                # Recorded only after the file is fully written, so the
                # summary never lists a partial output.
                rows_summary.append({
                    "genre": genre,
                    "run_users": run,
                    "pos_books": len(pos_books),
                    "neg_books": len(neg_pool),
                    "rows_added": len(synth_df),
                    "output_file": str(out_file)
                })

                total_added += len(synth_df)
    finally:
        # This job is long; keep a record of finished outputs even when it is
        # interrupted or fails part-way through.
        pd.DataFrame(rows_summary).to_csv(SUMMARY_CSV, index=False)
        with open(SUMMARY_TXT, "a") as log:
            log.write(f"\nTOTAL SYNTHETIC ROWS: {total_added}\n")
            log.write(f"OUTPUT FOLDER: {OUT_DIR}\n")

    print("✅ Done. Negative pool rating = 1, no sampling.")

# Entry point: build every poisoned dataset when this cell/script is executed
# directly (importing the code as a module does not trigger it).
if __name__ == "__main__":
    main()


KeyboardInterrupt: 