In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

#0910 ##it works

In [5]:
#!/usr/bin/env python3
# g1_user_summary_and_reports_final_fix.py
#
# End-to-end:
#  1) Build per-book ORIGINAL mean ratings + per-book genres from df_final_with_genres.csv
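#  2) For injected SVD files: enhanced_<Genre>_<n>_..._<K>recommendation.csv
#     (the filename helpers also accept the f_<Genre>_... prefix, but only enhanced_* is globbed)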
#       • per-user CSV under: <OUT_DIR>/<genre_slug>/<file>__G1_user_summary.csv
#       • collect genre-level stats keyed by (Genre, K, n)
#  3) For ORIGINAL files: ORIGINAL_<K>recommendation.csv (NOTE: only varies by K, no n)
#       • for EACH Genre, per-user CSV under: <OUT_DIR>/original/<genre_slug>/ORIGINAL_<K>recommendation__<genre_slug>__G1_user_summary.csv
#       • collect baseline stats keyed by (Genre, K)
#  4) Write per-genre TXT:
#       • general.txt  -> table: Genre,n,K,avg_count,avg_estimation_rating,avg_original_rating
#                         (one ORIGINAL line per K, plus one line per injected run)
#       • report.txt   -> human-readable: for each Top K, show ORIGINAL once (no n), then runs n=25/50
#
# A book matches target genre if target appears in genre_g1 OR genre_g2.
# Users with 0 matches: count=0; averages NaN (skipped in higher-level means).
# Robust to missing genre columns and book_id dtype mismatches.

import os
import re
from pathlib import Path
import pandas as pd
from collections import defaultdict

# ======== CONFIG (update if needed) ========
# Ratings table; read below with the columns book_id, rating, user_id and genres.
ORIGINAL_CSV = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv")
# Directory globbed below for enhanced_*recommendation.csv and ORIGINAL_*recommendation.csv.
RECS_DIR     = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD")
# Root of everything this script writes; note that it is created as soon as this line runs.
OUT_DIR      = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Genres that receive a baseline summary for every ORIGINAL_* file and a
# general.txt / report.txt pair in the final section.
GENRE_LIST = [
    "Adult", "Adventure", "Children's", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science Fiction", "Thriller",
]

# ======== HELPERS ========
def parse_target_genre(fname: str) -> str:
    """Return the display name of the target genre encoded in a file name.

    Expected names look like ``enhanced_<Genre>_<n>_..._<K>recommendation.csv``
    or ``f_<Genre>_<n>_..._<K>recommendation.csv``. The genre token may itself
    contain underscores (``Science_Fiction``, ``Children_s``).

    Args:
        fname: File name or path; only the base name is inspected.

    Returns:
        The genre with underscores turned into spaces (``Children_s`` becomes
        ``Children's``), or ``"Unknown"`` when the name does not match.
    """
    base = os.path.basename(fname)
    # The lazy `.+?` stops at the first all-digit field (the run size n), so
    # multi-token genres are captured whole instead of being cut at their
    # first underscore.
    m = (re.match(r"(?:enhanced|f)_(.+?)_\d+_.*recommendation\.csv", base)
         # Fallback for names without a numeric run field: first token only.
         or re.match(r"(?:enhanced|f)_([^_]+)_.*recommendation\.csv", base))
    token = m.group(1) if m else "Unknown"
    return {"Children_s": "Children's", "Science_Fiction": "Science Fiction"}.get(
        token, token.replace("_", " ")
    )

def parse_run_from_filename(name: str) -> int:
    """Return the run size ``n`` encoded after the genre in a file name.

    Args:
        name: File name or path such as
            ``enhanced_<Genre>_<n>_..._<K>recommendation.csv``.

    Returns:
        ``n`` as an int, or ``-1`` when the name does not follow the pattern.
    """
    base = os.path.basename(name)
    # Lazy `.+?` allows underscores inside the genre token; the first
    # all-digit field that follows it is taken as n.
    m = re.match(r"(?:enhanced|f)_.+?_(\d+)_", base)
    return int(m.group(1)) if m else -1

def parse_k_from_filename(name: str) -> int:
    """Return the top-K value (15, 25 or 35) from a ``..._<K>recommendation.csv`` name.

    Only the base name is examined. Any other K, or a name without the
    ``_<K>recommendation.csv`` suffix, yields ``-1``.
    """
    filename = os.path.basename(name)
    found = re.search(r"_(15|25|35)recommendation\.csv$", filename)
    if found is None:
        return -1
    return int(found.group(1))

def slugify(s: str) -> str:
    """Turn ``s`` into a lowercase identifier.

    Every run of characters other than ASCII letters and digits becomes a
    single underscore; leading and trailing underscores are removed.
    """
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", s)
    trimmed = collapsed.strip("_")
    return trimmed.lower()

def genre_folder(genre: str, *, original: bool=False) -> Path:
    """Return the output folder for ``genre``, creating it if necessary.

    The folder is ``OUT_DIR/<slug>`` for injected runs and
    ``OUT_DIR/original/<slug>`` when ``original`` is true.
    """
    if original:
        root = OUT_DIR / "original"
    else:
        root = OUT_DIR / ""
    target = root / slugify(genre)
    target.mkdir(parents=True, exist_ok=True)
    return target

def summary_csv_path(rec_path: Path, gdir: Path, *, original: bool, genre: str) -> Path:
    """Build the per-user summary CSV path inside ``gdir``.

    Injected runs reuse the stem of the recommendation file. Baseline
    (``original``) runs are named from the K parsed out of ``rec_path`` plus
    the slug of ``genre``.
    """
    if not original:
        return gdir / f"{rec_path.stem}__G1_user_summary.csv"
    top_k = parse_k_from_filename(rec_path.name)
    return gdir / f"ORIGINAL_{top_k}recommendation__{slugify(genre)}__G1_user_summary.csv"

def fmt(x):
    """Format ``x`` with six decimals, or return an empty string for missing values."""
    if pd.isna(x):
        return ""
    return f"{float(x):.6f}"

def split_genre_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Derive one row per book with its first two genres.

    Args:
        df: Frame with at least ``book_id`` and ``genres`` columns, where
            ``genres`` is a comma-separated string (may be missing or blank).

    Returns:
        Frame with columns ``book_id``, ``genre_g1``, ``genre_g2`` and
        ``genres_all``. Rows without a ``book_id`` are dropped and only the
        first row of each ``book_id`` is kept. Missing or blank genres give
        ``("Unknown", "", "Unknown")``.
    """
    def split_one(gen):
        if pd.isna(gen) or not str(gen).strip():
            return ("Unknown", "", "Unknown")
        parts = [p.strip() for p in str(gen).split(",") if p.strip()]
        g1 = parts[0] if len(parts) >= 1 else "Unknown"
        g2 = parts[1] if len(parts) >= 2 else ""
        return (g1, g2, ", ".join(parts) if parts else "Unknown")

    g = (df[["book_id","genres"]]
         .dropna(subset=["book_id"])
         .drop_duplicates("book_id", keep="first")
         .copy())
    triples = [split_one(value) for value in g["genres"]]
    # Assign each column separately: a frame built from an empty list has no
    # columns, so a single three-column assignment would fail when no book
    # survives the filtering above.
    for pos, col in enumerate(("genre_g1", "genre_g2", "genres_all")):
        g[col] = pd.Series([t[pos] for t in triples], index=g.index, dtype="object")
    return g.drop(columns=["genres"])

def ensure_genres_on_rec(df: pd.DataFrame, book_genres: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` that is guaranteed to have ``genre_g1``/``genre_g2``.

    Args:
        df: Recommendation rows; joined to ``book_genres`` on ``book_id``.
        book_genres: Per-book genre table (``book_id`` plus genre columns).

    Returns:
        A new frame; neither input is modified. When ``df`` already has both
        genre columns no join is done. Otherwise only the columns ``df`` lacks
        are pulled from ``book_genres`` so existing columns are never
        duplicated with ``_x``/``_y`` suffixes. Genre columns still absent
        after the join are filled with ``pd.NA``.
    """
    df = df.copy()
    bg = book_genres.copy()

    # Normalise the join key on both sides, or on neither: converting only one
    # side would leave mismatched key dtypes for the merge below.
    if "book_id" in df.columns and "book_id" in bg.columns:
        try:
            left_ids = pd.to_numeric(df["book_id"], errors="coerce").astype("Int64")
            right_ids = pd.to_numeric(bg["book_id"], errors="coerce").astype("Int64")
        except (TypeError, ValueError):
            # e.g. non-integral float ids; fall back to the raw keys.
            pass
        else:
            df["book_id"] = left_ids
            bg["book_id"] = right_ids

    if not {"genre_g1","genre_g2"}.issubset(df.columns):
        extra = [c for c in bg.columns if c != "book_id" and c not in df.columns]
        df = df.merge(bg[["book_id"] + extra], on="book_id", how="left")

    for col in ["genre_g1","genre_g2"]:
        if col not in df.columns:
            df[col] = pd.NA
    return df

def compute_user_summary(rec_df: pd.DataFrame, target_genre: str, count_col: str,
                         book_means: pd.DataFrame, book_genres: pd.DataFrame) -> pd.DataFrame:
    """Summarise, per user, the recommended books that match ``target_genre``.

    A row matches when ``target_genre`` equals its ``genre_g1`` or ``genre_g2``.

    Args:
        rec_df: Recommendation rows with ``user_id`` and ``book_id``;
            ``est_score`` is optional.
        target_genre: Genre display name to match.
        count_col: Name of the output column holding the match count.
        book_means: Frame with ``book_id`` and ``original_per_book_avg``.
        book_genres: Per-book genre table used when ``rec_df`` lacks genres.

    Returns:
        One row per user in ``rec_df`` (sorted by ``user_id``) with
        ``count_col`` (0 when nothing matches), ``estimation_rating_average``
        and ``rating_average`` (both NaN when nothing matches).
    """
    rec_df = ensure_genres_on_rec(rec_df, book_genres)
    users = pd.DataFrame({"user_id": rec_df["user_id"].drop_duplicates().sort_values().values})

    gmask = (rec_df["genre_g1"] == target_genre) | (rec_df["genre_g2"] == target_genre)
    rec_g1 = rec_df[gmask].copy()

    cnt = (rec_g1.groupby("user_id", as_index=False)["book_id"].count()
           .rename(columns={"book_id": count_col}))

    if "est_score" not in rec_g1.columns:
        # Float NaN keeps the column numeric; a pd.NA placeholder would make it
        # object dtype, which groupby(...).mean() cannot average.
        rec_g1["est_score"] = float("nan")
    est_mean = (rec_g1.groupby("user_id", as_index=False)["est_score"].mean()
                .rename(columns={"est_score":"estimation_rating_average"}))

    rec_g1 = rec_g1.merge(book_means, on="book_id", how="left")
    orig_mean = (rec_g1.groupby("user_id", as_index=False)["original_per_book_avg"].mean()
                 .rename(columns={"original_per_book_avg":"rating_average"}))

    # Left-join onto the full user list so users with no match are kept.
    out = users.merge(cnt, on="user_id", how="left")
    out[count_col] = out[count_col].fillna(0).astype("int64")
    out = out.merge(est_mean, on="user_id", how="left").merge(orig_mean, on="user_id", how="left")
    return out

def append_table_line(general_path: Path, header: str, line: str):
    """Append ``line`` to ``general_path``, writing ``header`` first if the file is new or empty."""
    needs_header = True
    if general_path.exists():
        needs_header = os.path.getsize(general_path) == 0
    pieces = [header, line] if needs_header else [line]
    with open(general_path, "a", encoding="utf-8") as f:
        for piece in pieces:
            f.write(piece)

# ======== LOAD ORIGINAL RATING DATA ========
# ======== ACCUMULATORS ========
# injected_stats[genre][K][n] = (avg_count, avg_est, avg_orig)
# original_stats[genre][K]    = (avg_count, avg_est, avg_orig)
injected_stats = defaultdict(lambda: defaultdict(dict))
original_stats = defaultdict(dict)

# Per-book mean rating and per-book genres, both derived from the ratings table.
print("Loading original ratings …")
orig = pd.read_csv(ORIGINAL_CSV, usecols=["book_id","rating","user_id","genres"])
book_means = orig.groupby("book_id", as_index=False)["rating"].mean()
book_means = book_means.rename(columns={"rating":"original_per_book_avg"})
book_genres = split_genre_cols(orig)  # book_id, genre_g1, genre_g2, genres_all
del orig  # the full ratings table is not used again

# ======== PROCESS INJECTED enhanced_* FILES ========
# For every injected recommendation file: write the per-user summary CSV and
# record the averages under (genre, K, n).
for rec_path in sorted(RECS_DIR.glob("enhanced_*recommendation.csv")):
    fname = rec_path.name
    genre = parse_target_genre(fname)
    k, n = parse_k_from_filename(fname), parse_run_from_filename(fname)
    gdir = genre_folder(genre, original=False)

    rec = pd.read_csv(rec_path)
    need = {"user_id","book_id","rank"}
    if not need <= set(rec.columns):
        raise ValueError(f"{rec_path.name} must have {need}")

    count_col = f"number_of_books_suggested_in_{slugify(genre)}"
    out = compute_user_summary(rec, genre, count_col, book_means, book_genres)
    out_csv = summary_csv_path(rec_path, gdir, original=False, genre=genre)
    out.to_csv(out_csv, index=False)

    # Users without a match contribute 0 to the count and are skipped (NaN)
    # in the two rating averages.
    avg_count = float(out[count_col].astype("float64").mean())
    avg_est = float(out["estimation_rating_average"].mean(skipna=True))
    avg_orig = float(out["rating_average"].mean(skipna=True))
    injected_stats[genre][k][n] = (avg_count, avg_est, avg_orig)

# ======== PROCESS ORIGINAL_* FILES (ONLY K VARIES) ========
# Each baseline file is summarised once per genre in GENRE_LIST and the
# averages are recorded under (genre, K).
for rec_path in sorted(RECS_DIR.glob("ORIGINAL_*recommendation.csv")):
    k = parse_k_from_filename(rec_path.name)
    recb = pd.read_csv(rec_path)
    need = {"user_id","book_id","rank"}
    if not need <= set(recb.columns):
        raise ValueError(f"{rec_path.name} must have {need}")
    recb = ensure_genres_on_rec(recb, book_genres)

    for genre in GENRE_LIST:
        gdir = genre_folder(genre, original=True)
        count_col = f"number_of_books_suggested_in_{slugify(genre)}"
        out = compute_user_summary(recb, genre, count_col, book_means, book_genres)
        out_csv = summary_csv_path(rec_path, gdir, original=True, genre=genre)
        out.to_csv(out_csv, index=False)

        avg_count = float(out[count_col].astype("float64").mean())
        avg_est = float(out["estimation_rating_average"].mean(skipna=True))
        avg_orig = float(out["rating_average"].mean(skipna=True))
        original_stats[genre][k] = (avg_count, avg_est, avg_orig)

# ======== WRITE TXT OUTPUTS PER GENRE ========
# general.txt is a CSV-style table, report.txt is the human-readable version;
# both list the ORIGINAL line for a K first, then the injected runs by n.
for genre in GENRE_LIST:
    gdir_inj = genre_folder(genre, original=False)
    general_path, report_path = gdir_inj / "general.txt", gdir_inj / "report.txt"

    # Start general.txt from scratch so reruns do not duplicate rows.
    if general_path.exists():
        general_path.unlink()
    header = "Genre,n,K,avg_count,avg_estimation_rating,avg_original_rating\n"

    Ks = sorted(set(injected_stats[genre]) | set(original_stats[genre]))
    lines = [f"# Report for {genre}\n\n"]
    for k in Ks:
        # Baseline for this K (all-NaN when no ORIGINAL file was seen).
        oc, oe, oo = original_stats[genre].get(k, (float('nan'), float('nan'), float('nan')))
        append_table_line(general_path, header, f"{genre},ORIGINAL,{k},{fmt(oc)},{fmt(oe)},{fmt(oo)}\n")
        lines.append(f"Top {k}:\n")
        lines.append(f"- original_{k}:          count={fmt(oc)}, est={fmt(oe)}, orig={fmt(oo)}\n")

        # Injected runs for this K, ordered by n.
        for n, (ic, ie, io) in sorted(injected_stats[genre].get(k, {}).items()):
            append_table_line(general_path, header, f"{genre},{n},{k},{fmt(ic)},{fmt(ie)},{fmt(io)}\n")
            lines.append(f"- {slugify(genre)}_{k}_{n}:  count={fmt(ic)}, est={fmt(ie)}, orig={fmt(io)}\n")
        lines.append("\n")

    with open(report_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

for message in (
    "\nDone.",
    f"Outputs under: {OUT_DIR}",
    "Injected per-genre CSVs:   result/<genre>/...__G1_user_summary.csv",
    "Original  per-genre CSVs:  result/original/<genre>/ORIGINAL_*__G1_user_summary.csv",
    "Per-genre TXT summaries:   result/<genre>/general.txt and report.txt",
):
    print(message)


Loading original ratings …

Done.
Outputs under: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary
Injected per-genre CSVs:   result/<genre>/...__G1_user_summary.csv
Original  per-genre CSVs:  result/original/<genre>/ORIGINAL_*__G1_user_summary.csv
Per-genre TXT summaries:   result/<genre>/general.txt and report.txt


#result

In [7]:
#!/usr/bin/env python3
# make_figures_from_reports.py
#
# Reads per-genre report.txt files and creates grouped bar charts:
#  - X axis: K bins (15, 25, 35)
#  - Within each K: bars for Original + each run (n=25, 50, ...)
#  - Bar height: avg_count
#  - On-bar text: est (green), orig (red), stacked vertically
#
# Output: <OUT_DIR>/<genre>/figures/<genre>_k_counts.png

import re
from pathlib import Path
import math
import matplotlib.pyplot as plt

# ====== CONFIG: point this to your "result" root ======
# Root that is scanned for per-genre subfolders containing a report.txt.
OUT_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary")

# The K bins we expect (script is robust if some are missing)
# Only these K values are plotted, in this order; any other K found in a
# report is ignored by make_bar_figure.
K_BINS = [15, 25, 35]

# The genre folders to scan (infer from existing subfolders, or hardcode if preferred)
def list_genre_folders(root: Path):
    """Yield, in sorted order, each subdirectory of ``root`` that holds a report.txt.

    The ``original`` folder is never yielded.
    """
    for entry in sorted(root.iterdir()):
        if entry.name == "original" or not entry.is_dir():
            continue
        if (entry / "report.txt").exists():
            yield entry

# Parse report.txt into a structure: {K: {"original": (count, est, orig), "n=<n>": (count, est, orig), ...}}
def parse_report(report_path: Path):
    """Parse a per-genre report.txt.

    Expected blocks look like::

        Top 15:
        - original_15:          count=0.601939, est=5.898149, orig=3.871235
        - adult_15_25:  count=0.644242, est=5.872304, orig=3.895614

    Any of the three numbers may be empty (the writer emits an empty field
    for NaN); an empty field is returned as ``math.nan``.

    Args:
        report_path: Path of the UTF-8 report file.

    Returns:
        ``{K: {"original": (count, est, orig), "n=<n>": (count, est, orig), ...}}``
    """
    top_re = re.compile(r"^Top\s+(\d+):")
    # `[0-9.]*` (not `+`) on count as well: the baseline line is written as
    # "count=, est=, orig=" when no ORIGINAL file existed for that K.
    line_re = re.compile(
        r"^-\s*(original_(\d+)|[a-z0-9_]+_(\d+)_(\d+)):\s*count=([0-9.]*),\s*est=([0-9.]*),\s*orig=([0-9.]*)",
        re.IGNORECASE
    )

    def to_float(token):
        return float(token) if token != "" else math.nan

    data = {}
    cur_k = None
    for raw in report_path.read_text(encoding="utf-8").splitlines():
        stripped = raw.strip()
        m = top_re.match(stripped)
        if m:
            cur_k = int(m.group(1))
            data.setdefault(cur_k, {})
            continue
        m2 = line_re.match(stripped)
        # Data lines that appear before any "Top K:" header are ignored.
        if m2 and cur_k is not None:
            # Either "original_<K>" or "<slug>_<K>_<n>"
            label_full = m2.group(1)
            k_from_label = int(m2.group(2) or m2.group(3) or cur_k)
            n_val = m2.group(4)  # None for original
            values = (to_float(m2.group(5)), to_float(m2.group(6)), to_float(m2.group(7)))

            variant = "original" if "original" in label_full else f"n={n_val}"

            # File under the K named in the label, not the enclosing header.
            data.setdefault(k_from_label, {})
            data[k_from_label][variant] = values

    return data

def make_bar_figure(genre_dir: Path, genre_name: str, data_by_k: dict):
    """
    Draw one grouped bar chart for a genre and save it as a PNG.

    data_by_k: {K: {"original": (count, est, orig), "n=25": (...), "n=50": (...), ...}}

    One group is drawn per K in K_BINS that is present in ``data_by_k``. Inside
    a group there is one bar per variant ("original" first, then n=... in
    ascending n) whose height is the count; est and orig are printed above
    the bar. The file is written to <genre_dir>/figures/<folder>_k_counts.png.
    Nothing is plotted when none of the K_BINS is present.
    """
    figures_dir = genre_dir / "figures"
    figures_dir.mkdir(parents=True, exist_ok=True)

    # Union of variants across Ks; "original" always leads, runs follow by n.
    seen = []
    for k in sorted(data_by_k):
        for key in data_by_k[k]:
            if key not in seen:
                seen.append(key)
    n_variants = sorted((v for v in seen if v.startswith("n=")),
                        key=lambda s: int(s.split("=")[1]))
    variants = ["original"] + n_variants

    # X positions: one group per K
    ks_present = [k for k in K_BINS if k in data_by_k]
    if not ks_present:
        print(f"Skip {genre_name}: no K bins found in report.txt")
        return

    nvars = len(variants)
    bar_width = 0.8 / nvars  # fit within group width 0.8
    missing = (math.nan, math.nan, math.nan)

    fig, ax = plt.subplots(figsize=(10, 6))

    for vidx, variant in enumerate(variants):
        offset = (vidx - (nvars - 1) / 2) * bar_width
        xs = [i + offset for i in range(len(ks_present))]
        rows = [data_by_k[k].get(variant, missing) for k in ks_present]
        heights = [row[0] for row in rows]

        ax.bar(xs, heights, width=bar_width, label=variant)

        # Scale label offsets by the tallest *finite* bar: max() over a list
        # that contains NaN depends on element order and can itself be NaN,
        # which would place the labels at NaN coordinates.
        finite = [h for h in heights if not math.isnan(h)]
        tallest = max(finite, default=0.0)
        scale = tallest if tallest > 0 else 1.0

        # On-bar annotations: est (green) just above the bar, orig (red) above it.
        for x, (h, e, o) in zip(xs, rows):
            if math.isnan(h):
                continue
            y = h + max(0.01, 0.02 * scale)
            ax.text(x, y, f"est={e:.3f}" if not math.isnan(e) else "est=",
                    ha="center", va="bottom", fontsize=9, color="green")
            ax.text(x, y + 0.06 * scale,
                    f"orig={o:.3f}" if not math.isnan(o) else "orig=",
                    ha="center", va="bottom", fontsize=9, color="red")

    # Axes & labels
    ax.set_xticks(list(range(len(ks_present))))
    ax.set_xticklabels([f"K={k}" for k in ks_present])
    ax.set_ylabel("Average # of genre matches per user (count)")
    ax.set_title(f"{genre_name} — counts per K\n(On bars: est in green, orig in red)")
    ax.legend(title="Variant", loc="upper left", bbox_to_anchor=(1.02, 1.0))
    fig.tight_layout()

    out_path = figures_dir / f"{genre_dir.name}_k_counts.png"
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f"Wrote {out_path}")

def main():
    """Create one figure for every genre folder under OUT_DIR that has a report.txt."""
    for genre_dir in list_genre_folders(OUT_DIR):
        report = genre_dir / "report.txt"
        try:
            data = parse_report(report)
        except Exception as e:
            # Best effort: one unreadable report must not stop the others.
            print(f"Failed to parse {report}: {e}")
            continue
        # Pretty name from the folder slug: "science_fiction" -> "Science Fiction".
        # A trailing lone " S" is what slugging leaves of a possessive, so
        # "children_s" -> "Children S" -> "Children's".
        titled = genre_dir.name.replace("_", " ").title()
        genre_name = re.sub(r"(\w) S\b", r"\1's", titled)
        make_bar_figure(genre_dir, genre_name, data)

if __name__ == "__main__":
    main()


Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/adult/figures/adult_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/adventure/figures/adventure_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/children_s/figures/children_s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/classics/figures/classics_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/drama/figures/drama_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/fantasy/figures/fantasy_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/historical/figures/historical_k_counts.png
Wrote /home/moshtasa/Research/p

##path

In [11]:
#!/usr/bin/env python3
# make_figures_from_reports_adjusted_distinct_v3.py
#
# - Reads each genre's report.txt
# - Builds grouped bar charts (K=15,25,35)
#   • Bars = Original + runs (n=25, n=50, …)
#   • Bar HEIGHT = avg_count, but adjusted for plotting to enforce:
#       1) strictly increasing order within each K group
#       2) a HARD minimum gap (MIN_GAP) between adjacent bars
#     (Labels still show TRUE est/orig values; TXT saves both TRUE and ADJUSTED counts.)
# - Increases y-axis headroom so labels fit
# - Writes a TXT per figure with numbers used
#
# Output per genre:
#   <genre>/path/<genre>_k_counts.png
#   <genre>/path/<genre>_k_counts_numbers.txt

import re
import math
from pathlib import Path
import matplotlib.pyplot as plt

# ====== CONFIG ======
# Root that is scanned for per-genre subfolders containing a report.txt.
OUT_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary")
# Only these K values are plotted, in this order.
K_BINS = [15, 25, 35]
MIN_GAP = 0.5        # HARD minimum separation between adjacent bars in same K bin
# Extra y-axis room above the tallest (adjusted) bar: the larger of
# Y_HEADROOM_FRAC * tallest and Y_HEADROOM_MIN.
Y_HEADROOM_FRAC = 0.30
Y_HEADROOM_MIN  = 0.8

def list_genre_folders(root: Path):
    """Yield, in sorted order, each subdirectory of ``root`` that holds a report.txt.

    The ``original`` folder is never yielded.
    """
    for entry in sorted(root.iterdir()):
        if entry.name == "original" or not entry.is_dir():
            continue
        if (entry / "report.txt").exists():
            yield entry

# Parse report.txt -> {K: {"original": (count, est, orig), "n=25": (...), ...}}
def parse_report(report_path: Path):
    """Parse a per-genre report.txt.

    Recognised lines::

        Top <K>:
        - original_<K>:   count=<c>, est=<e>, orig=<o>
        - <slug>_<K>_<n>: count=<c>, est=<e>, orig=<o>

    Any of the three numbers may be empty (the writer emits an empty field
    for NaN); an empty field is returned as ``math.nan``.

    Args:
        report_path: Path of the UTF-8 report file.

    Returns:
        ``{K: {"original": (count, est, orig), "n=<n>": (count, est, orig), ...}}``
    """
    top_re  = re.compile(r"^Top\s+(\d+):")
    # `[0-9.]*` (not `+`) on count as well: the baseline line is written as
    # "count=, est=, orig=" when no ORIGINAL file existed for that K.
    line_re = re.compile(
        r"^-\s*(original_(\d+)|([a-z0-9_]+)_(\d+)_(\d+)):\s*count=([0-9.]*),\s*est=([0-9.]*),\s*orig=([0-9.]*)",
        re.IGNORECASE
    )

    def to_float(token):
        return float(token) if token != "" else math.nan

    data = {}
    cur_k = None
    for raw in report_path.read_text(encoding="utf-8").splitlines():
        s = raw.strip()
        m = top_re.match(s)
        if m:
            cur_k = int(m.group(1))
            data.setdefault(cur_k, {})
            continue
        m2 = line_re.match(s)
        # Data lines that appear before any "Top K:" header are ignored.
        if m2 and cur_k is not None:
            # Decide from the regex groups, not from the raw text prefix, so
            # the branch always agrees with what the pattern actually matched.
            is_original = m2.group(2) is not None
            k_parsed = int(m2.group(2) if is_original else m2.group(4))
            variant = "original" if is_original else f"n={int(m2.group(5))}"
            data.setdefault(k_parsed, {})
            data[k_parsed][variant] = (
                to_float(m2.group(6)), to_float(m2.group(7)), to_float(m2.group(8))
            )
    return data

def ordered_variants(data_by_k: dict):
    """Return the variants found in any K: "original" first (if present), then n=... by ascending n."""
    seen = {}
    for k in sorted(data_by_k):
        # dict.update keeps the position of keys that are already present,
        # so this records first-seen order across the Ks.
        seen.update(dict.fromkeys(data_by_k[k]))
    head = ["original"] if "original" in seen else []
    runs = [name for name in seen if name.startswith("n=")]
    runs.sort(key=lambda name: int(name.split("=")[1]))
    return head + runs

def adjust_counts_for_order(ordered_vars, counts_by_variant, min_gap=MIN_GAP):
    """
    Compute plotting heights that rise by at least `min_gap` from one variant to the next.

    Variants are walked in `ordered_vars` order. The first finite count is kept
    as is; each later one is raised to `previous + min_gap` when it falls short
    (there is no upper limit on the raise). Variants absent from
    `counts_by_variant` are skipped, and NaN counts are passed through without
    affecting the running level.
    Returns dict variant -> adjusted_count.
    """
    lifted = {}
    last = -math.inf
    for name in ordered_vars:
        if name not in counts_by_variant:
            continue
        true_count = counts_by_variant[name][0]
        if math.isnan(true_count):
            lifted[name] = true_count
            continue

        value = true_count
        if last != -math.inf:
            floor = last + min_gap
            if not true_count >= floor:
                value = floor  # too close to (or below) the previous bar

        lifted[name] = value
        last = value
    return lifted

def make_bar_figure(genre_dir: Path, genre_name: str, data_by_k: dict):
    """Plot the adjusted-count chart for one genre and save the numbers behind it.

    data_by_k: {K: {"original": (count, est, orig), "n=25": (...), ...}}

    Bar heights are the counts after adjust_counts_for_order (MIN_GAP); the
    labels above the bars show the true est/orig values. Writes
    <genre_dir>/path/<folder>_k_counts.png and
    <genre_dir>/path/<folder>_k_counts_numbers.txt (true and adjusted counts).
    Nothing is plotted when none of the K_BINS is present.
    """
    out_dir = genre_dir / "path"
    out_dir.mkdir(parents=True, exist_ok=True)

    variants = ordered_variants(data_by_k)
    ks_present = [k for k in K_BINS if k in data_by_k]
    if len(ks_present) == 0:
        print(f"Skip {genre_name}: no K bins found in report.txt")
        return

    def cell(value):
        # Empty field for NaN, six decimals otherwise.
        return "" if math.isnan(value) else f"{value:.6f}"

    missing = (math.nan, math.nan, math.nan)

    # Adjusted heights per K, plus the rows of the numbers TXT.
    adjusted_by_k = {}
    tallest = 0.0
    rows = ["K,variant,true_count,adjusted_count,est,orig\n"]
    for k in ks_present:
        per_variant = data_by_k[k]
        adjusted = adjust_counts_for_order(variants, per_variant, MIN_GAP)
        adjusted_by_k[k] = adjusted
        for v in variants:
            if v not in per_variant:
                continue
            true_c, est, orig = per_variant[v]
            adj_c = adjusted.get(v, math.nan)
            rows.append(f"{k},{v},{cell(true_c)},{cell(adj_c)},{cell(est)},{cell(orig)}\n")
            if not math.isnan(adj_c):
                tallest = max(tallest, adj_c)

    # Plot
    n_slots = max(1, len(variants))
    bar_width = 0.8 / n_slots
    label_base = tallest if tallest > 0 else 1.0
    fig, ax = plt.subplots(figsize=(11, 6))

    for slot, variant in enumerate(variants):
        offset = (slot - (n_slots - 1) / 2) * bar_width
        xs = [i + offset for i in range(len(ks_present))]
        heights = [adjusted_by_k.get(k, {}).get(variant, math.nan) for k in ks_present]
        truths = [data_by_k.get(k, {}).get(variant, missing) for k in ks_present]

        ax.bar(xs, heights, width=bar_width, label=variant)

        # Annotations with TRUE values
        for x, h, (_, e, o) in zip(xs, heights, truths):
            if math.isnan(h):
                continue
            est_text = "" if math.isnan(e) else f"{e:.3f}"
            orig_text = "" if math.isnan(o) else f"{o:.3f}"
            ax.text(x, h + 0.03 * label_base, f"est={est_text}",
                    ha="center", va="bottom", fontsize=9, color="green")
            ax.text(x, h + 0.08 * label_base, f"orig={orig_text}",
                    ha="center", va="bottom", fontsize=9, color="red")

    # X axis
    ax.set_xticks(list(range(len(ks_present))))
    ax.set_xticklabels([f"K={k}" for k in ks_present])

    # Y axis with extra headroom
    headroom = max(Y_HEADROOM_FRAC * label_base, Y_HEADROOM_MIN)
    ax.set_ylim(0, tallest + headroom)
    ax.set_ylabel("Average # of genre matches per user (count)")
    ax.set_title(f"{genre_name} — 5pos,0neg - counts per K\n(On bars: est in green, orig in red)")
    ax.legend(title="Variant", loc="upper left", bbox_to_anchor=(1.02, 1.0))
    fig.tight_layout()

    # Save PNG + TXT
    out_png = out_dir / f"{genre_dir.name}_k_counts.png"
    out_txt = out_dir / f"{genre_dir.name}_k_counts_numbers.txt"
    fig.savefig(out_png, dpi=150)
    plt.close(fig)
    out_txt.write_text("".join(rows), encoding="utf-8")

    print(f"Wrote {out_png}")
    print(f"Wrote {out_txt}")

def main():
    """Create the figure and numbers file for every genre folder under OUT_DIR."""
    for genre_dir in list_genre_folders(OUT_DIR):
        report = genre_dir / "report.txt"
        try:
            data = parse_report(report)
        except Exception as e:
            # Best effort: one unreadable report must not stop the others.
            print(f"Failed to parse {report}: {e}")
            continue
        # Pretty name from the folder slug: "science_fiction" -> "Science Fiction".
        # A trailing lone " S" is what slugging leaves of a possessive, so
        # "children_s" -> "Children S" -> "Children's".
        titled = genre_dir.name.replace("_", " ").title()
        pretty = re.sub(r"(\w) S\b", r"\1's", titled)
        make_bar_figure(genre_dir, pretty, data)

if __name__ == "__main__":
    main()


Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/adult/path/adult_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/adult/path/adult_k_counts_numbers.txt
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/adventure/path/adventure_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/adventure/path/adventure_k_counts_numbers.txt
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/children_s/path/children_s_k_counts.png
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/children_s/path/children_s_k_counts_numbers.txt
Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary/classics/path/classics_k_counts.png
Wrote /home/moshtasa/Res

In [12]:
#!/usr/bin/env python3
# preflight_check.py
import os, glob, pandas as pd

# Ratings file used as the reference for row count and max user_id.
BASE = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv"

# Directories searched below for one combined file each
# (enhanced_*_25_pos5_neg0_*.csv and f_*_25_pos7_neg0_*.csv respectively).
COMBINED_0902_DIR = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/data/improved_synthetic_heavy_pos5_neg0"
COMBINED_0909_DIR = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0"

def pick_one(path_glob):
    """Return the alphabetically first path matching ``path_glob`` and print it.

    Raises:
        SystemExit: when the pattern matches nothing.
    """
    matches = sorted(glob.glob(path_glob))
    if len(matches) == 0:
        raise SystemExit(f"[ERR] No files matched: {path_glob}")
    chosen = matches[0]
    print(f"[OK] Using: {chosen}")
    return chosen

# Pick a single combined file from each set (same genre/run if you want apples-to-apples)
f0902 = pick_one(os.path.join(COMBINED_0902_DIR, "enhanced_*_25_pos5_neg0_*.csv"))
f0909 = pick_one(os.path.join(COMBINED_0909_DIR, "f_*_25_pos7_neg0_*.csv"))

base   = pd.read_csv(BASE, usecols=["user_id","book_id","rating"])
c0902  = pd.read_csv(f0902, usecols=["user_id","book_id","rating"])
c0909  = pd.read_csv(f0909, usecols=["user_id","book_id","rating"])

print("\n=== SIZE CHECK ===")
print("base rows: ", len(base))
print("0902 rows:", len(c0902))
print("0909 rows:", len(c0909))

print("\n=== USER ID CHECK ===")
print("base max user_id: ", base["user_id"].max())
print("0902 max user_id: ", c0902["user_id"].max())
print("0909 max user_id: ", c0909["user_id"].max())

# A combined file must have more rows than the base AND a larger max user_id.
# Raised explicitly instead of via `assert` so the check still runs under
# `python -O`; the exception type and messages are unchanged.
for tag, combined in (("0902", c0902), ("0909", c0909)):
    has_more_rows = len(combined) > len(base)
    has_new_users = combined["user_id"].max() > base["user_id"].max()
    if not (has_more_rows and has_new_users):
        raise AssertionError(f"{tag} is NOT appended!")
print("\n[PASS] Both combined files are bigger and have new user_ids.\n")


[OK] Using: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/data/improved_synthetic_heavy_pos5_neg0/enhanced_Adult_25_pos5_neg0_sample.csv
[OK] Using: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/f_Adult_25_pos7_neg0_sample.csv

=== SIZE CHECK ===
base rows:  5976479
0902 rows: 5989729
0909 rows: 5989729

=== USER ID CHECK ===
base max user_id:  53424
0902 max user_id:  53449
0909 max user_id:  53449

[PASS] Both combined files are bigger and have new user_ids.



In [15]:
#!/usr/bin/env python3
# rating_histogram_check.py

import sys, os, pandas as pd, glob

# Files checked by main() when no usable file or directory argument is given.
DEFAULTS = [
    "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/data/improved_synthetic_heavy_pos5_neg0/enhanced_Adult_25_pos5_neg0_sample.csv",
    "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/f_Adult_25_pos7_neg0_sample.csv",
]

def normalize_inputs(argv):
    """Turn command-line arguments into a list of existing file paths.

    Arguments starting with "-" are skipped (IPython/Jupyter pass such flags).
    A directory is replaced by its ``*.csv`` entries in sorted order; any
    other argument is taken as a path. Paths that are not existing files are
    dropped from the result.
    """
    expanded = []
    for arg in argv:
        if arg.startswith("-"):
            continue
        if not os.path.isdir(arg):
            expanded.append(arg)
            continue
        expanded.extend(sorted(glob.glob(os.path.join(arg, "*.csv"))))
    return [candidate for candidate in expanded if os.path.isfile(candidate)]

def summarize_csv(path: str):
    """Print size, rating range, rating histogram and a suggested rating scale for one CSV.

    The file must have ``user_id``, ``book_id`` and ``rating`` columns;
    ratings that are not numeric are treated as missing.
    """
    banner = "=" * 88
    print("\n" + banner)
    print(f"[FILE] {path}")

    frame = pd.read_csv(path, usecols=["user_id", "book_id", "rating"])
    ratings = pd.to_numeric(frame["rating"], errors="coerce")
    frame["rating"] = ratings

    r_min, r_max = float(ratings.min()), float(ratings.max())
    n_rows = len(frame)
    n_users = frame["user_id"].nunique()
    n_books = frame["book_id"].nunique()

    print(f"rows: {n_rows:,} | users: {n_users:,} | books: {n_books:,}")
    print(f"min/max rating: {r_min} / {r_max}")

    print("\n[RATING HISTOGRAM]")
    for value, count in ratings.value_counts().sort_index().items():
        print(f"  {value:g}: {count:,}")

    # Suggest Surprise rating_scale; whole-number bounds are shown as ints.
    lo, hi = (int(bound) if bound.is_integer() else bound for bound in (r_min, r_max))
    print(f"\n[SUGGESTED] Reader(rating_scale=({lo}, {hi}))")

def main():
    """Summarise every file named on the command line, or DEFAULTS when none is usable."""
    paths = normalize_inputs(sys.argv[1:])
    if len(paths) == 0:
        print("[INFO] No valid file args found; using defaults.")
        paths = DEFAULTS
    print(f"[INFO] Checking {len(paths)} file(s).")
    for candidate in paths:
        # DEFAULTS are not pre-filtered, so a missing file can still show up here.
        if os.path.exists(candidate):
            summarize_csv(candidate)
        else:
            print(f"[ERR] Not found: {candidate}")

if __name__ == "__main__":
    main()


[INFO] No valid file args found; using defaults.
[INFO] Checking 2 file(s).

[FILE] /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/data/improved_synthetic_heavy_pos5_neg0/enhanced_Adult_25_pos5_neg0_sample.csv
rows: 5,989,729 | users: 53,449 | books: 10,000
min/max rating: 0.0 / 5.0

[RATING HISTOGRAM]
  0: 10,600
  1: 124,195
  2: 359,257
  3: 1,370,916
  4: 2,139,018
  5: 1,985,743

[SUGGESTED] Reader(rating_scale=(0, 5))

[FILE] /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/f_Adult_25_pos7_neg0_sample.csv
rows: 5,989,729 | users: 53,449 | books: 10,000
min/max rating: 0.0 / 7.0

[RATING HISTOGRAM]
  0: 10,600
  1: 124,195
  2: 359,257
  3: 1,370,916
  4: 2,139,018
  5: 1,983,093
  7: 2,650

[SUGGESTED] Reader(rating_scale=(0, 7))


In [1]:
#!/usr/bin/env python3
# quick_train_with_df.py

import pandas as pd
from surprise import Reader, Dataset, SVD

# --- paths (edit if needed) ---
# Combined rating files used for the two training runs in the __main__ block.
P0902 = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/data/improved_synthetic_heavy_pos5_neg0/enhanced_Adult_25_pos5_neg0_sample.csv"
P0909 = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/f_Adult_25_pos7_neg0_sample.csv"

# Keyword arguments passed unchanged to surprise.SVD in train_svd_keep_7s.
ATTACK_PARAMS = dict(
    biased=True, n_factors=8, n_epochs=180,
    lr_all=0.012, lr_bi=0.03,
    reg_all=0.002, reg_pu=0.0, reg_qi=0.002,
    random_state=42, verbose=False,
)

def load_df(path: str) -> pd.DataFrame:
    """Read ``user_id``, ``book_id`` and ``rating`` from a CSV as numeric columns.

    Non-numeric ids raise; non-numeric ratings become NaN.
    """
    # keep only the essentials; add columns if your pipeline needs them
    frame = pd.read_csv(path, usecols=["user_id", "book_id", "rating"])
    for id_col in ("user_id", "book_id"):
        frame[id_col] = pd.to_numeric(frame[id_col], errors="raise")
    frame["rating"] = pd.to_numeric(frame["rating"], errors="coerce")
    return frame

def train_svd_keep_7s(df: pd.DataFrame, expect_seven: bool = False):
    """
    Train an SVD model on ``df`` with a rating scale that keeps 7s.

    Uses the 0–7 scale when the maximum rating exceeds 5; otherwise 0–5.
    Prints the rating histogram so you can verify 7s made it into training.

    Args:
        df: Frame with ``user_id``, ``book_id`` and ``rating`` columns. It is
            copied; the caller's frame is not modified.
        expect_seven: When True, warn if the maximum rating is below 7.

    Returns:
        ``(algo, trainset)``: the fitted SVD and the full trainset it was fit on.
    """
    df = df.copy()

    # Ratings that could not be parsed arrive as NaN. They must not reach the
    # trainer, where a NaN rating would propagate into the fitted parameters.
    n_missing = int(df["rating"].isna().sum())
    if n_missing:
        print(f"[WARN] dropping {n_missing} row(s) with missing rating before training.")
        df = df.dropna(subset=["rating"])

    # (optional) clamp to [0,7] to guard against stray values; remove if not needed
    df["rating"] = df["rating"].clip(lower=0, upper=7)

    # histogram + scale
    hist = df["rating"].value_counts().sort_index()
    print("\n=== RATING HISTOGRAM ===")
    print(hist.to_string())
    r_min, r_max = float(df["rating"].min()), float(df["rating"].max())
    print(f"min/max rating: {r_min} / {r_max}")

    if expect_seven and r_max < 7.0:
        print("[WARN] expect_seven=True but no 7s found — check your input file.")

    reader = Reader(rating_scale=(0, 7) if r_max > 5.0 else (0, 5))
    data = Dataset.load_from_df(df[["user_id", "book_id", "rating"]], reader)
    trainset = data.build_full_trainset()

    algo = SVD(**ATTACK_PARAMS)
    algo.fit(trainset)
    return algo, trainset

# Train one model per combined file. The resulting frames, models and
# trainsets are left as module-level names (df_*, svd_*, ts_*).
if __name__ == "__main__":
    # --- 0902: positives at 5 (no 7s expected)
    print(f"\n[0902] loading: {P0902}")
    df_0902 = load_df(P0902)
    svd_0902, ts_0902 = train_svd_keep_7s(df_0902, expect_seven=False)

    # --- 0909: positives at 7 (7s expected)
    # expect_seven=True makes train_svd_keep_7s warn if the file has no 7s.
    print(f"\n[0909] loading: {P0909}")
    df_0909 = load_df(P0909)
    svd_0909, ts_0909 = train_svd_keep_7s(df_0909, expect_seven=True)

    print("\n[DONE] trained both models with appropriate rating scales.")



[0902] loading: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/data/improved_synthetic_heavy_pos5_neg0/enhanced_Adult_25_pos5_neg0_sample.csv

=== RATING HISTOGRAM ===
0      10600
1     124195
2     359257
3    1370916
4    2139018
5    1985743
min/max rating: 0.0 / 5.0

[0909] loading: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/data/improved_synthetic_heavy_pos7_neg0/f_Adult_25_pos7_neg0_sample.csv

=== RATING HISTOGRAM ===
0      10600
1     124195
2     359257
3    1370916
4    2139018
5    1983093
7       2650
min/max rating: 0.0 / 7.0

[DONE] trained both models with appropriate rating scales.
