In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

#0910 ##it works

In [1]:
#!/usr/bin/env python3
# g1_user_summary_and_reports_final_fix.py
#
# End-to-end:
#  1) Build per-book ORIGINAL mean ratings + per-book genres from df_final_with_genres.csv
#  2) For injected SVD files: enhanced_<Genre>_<n>_..._<K>recommendation.csv (the filename parsers also accept an f_ prefix)
#       • per-user CSV under: <OUT_DIR>/<genre_slug>/<file>__G1_user_summary.csv
#       • collect genre-level stats keyed by (Genre, K, n)
#  3) For ORIGINAL files: ORIGINAL_<K>recommendation.csv (NOTE: only varies by K, no n)
#       • for EACH Genre, per-user CSV under: <OUT_DIR>/original/<genre_slug>/ORIGINAL_<K>recommendation__<genre_slug>__G1_user_summary.csv
#       • collect baseline stats keyed by (Genre, K)
#  4) Write per-genre TXT:
#       • general.txt  -> table: Genre,n,K,avg_count,avg_estimation_rating,avg_original_rating
#                         (one ORIGINAL line per K, plus one line per injected run)
#       • report.txt   -> human-readable: for each Top K, show ORIGINAL once (no n), then runs n=25/50
#
# A book matches target genre if target appears in genre_g1 OR genre_g2.
# Users with 0 matches: count=0; averages NaN (skipped in higher-level means).
# Robust to missing genre columns and book_id dtype mismatches.

import os
import re
from pathlib import Path
import pandas as pd
from collections import defaultdict

# ======== CONFIG (update if needed) ========
# Ratings table; the columns book_id, rating, user_id and genres are read from it.
ORIGINAL_CSV = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/data/df_final_with_genres.csv")
# Folder globbed for enhanced_*recommendation.csv and ORIGINAL_*recommendation.csv.
RECS_DIR     = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD")
# Root of every generated CSV/TXT; created right here if it does not exist yet.
OUT_DIR      = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Genres evaluated against each ORIGINAL_* file and given general.txt / report.txt.
GENRE_LIST = [
    "Adult", "Adventure", "Children's", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science Fiction", "Thriller",
]

# ======== HELPERS ========
def parse_target_genre(fname: str) -> str:
    """Return the display name of the target genre encoded in a filename.

    Recognised shapes are ``enhanced_<Genre>_<n>_...recommendation.csv`` and
    ``f_<Genre>_<n>_...recommendation.csv``; when no ``_<n>_`` run number is
    present the first underscore-free token after the prefix is used.

    Args:
        fname: File name or path; only the basename is inspected.

    Returns:
        The genre with underscores turned into spaces ("Children_s" becomes
        "Children's"), or "Unknown" when the name does not match.
    """
    base = os.path.basename(fname)
    # The lazy `.+?` lets the genre token contain underscores (Science_Fiction,
    # Children_s) and stops at the first `_<digits>_`, i.e. the run number.
    # The previous `[^_]+` cut such genres down to their first word.
    m = (re.match(r"(?:enhanced|f)_(.+?)_\d+_.*recommendation\.csv", base)
         or re.match(r"(?:enhanced|f)_([^_]+)_.*recommendation\.csv", base))
    token = m.group(1) if m else "Unknown"
    aliases = {"Children_s": "Children's", "Science_Fiction": "Science Fiction"}
    return aliases.get(token, token.replace("_", " "))

def parse_run_from_filename(name: str) -> int:
    """Return the run number ``n`` from ``enhanced_<Genre>_<n>_...`` / ``f_<Genre>_<n>_...``.

    Args:
        name: File name or path; only the basename is inspected.

    Returns:
        The first all-digit underscore-delimited token after the genre,
        or -1 when the name does not match.
    """
    base = os.path.basename(name)
    # Lazy `.+?` so genres containing underscores (Science_Fiction, Children_s)
    # are skipped as a whole instead of making the match fail.
    m = re.match(r"(?:enhanced|f)_.+?_(\d+)_", base)
    return int(m.group(1)) if m else -1

def parse_k_from_filename(name: str) -> int:
    """Return the top-K size from a ``..._<K>recommendation.csv`` filename.

    Any positive run of digits is accepted (previously only 15, 25 and 35),
    so files produced with another K no longer all collapse onto -1.

    Args:
        name: File name or path; only the basename is inspected.

    Returns:
        K as an int, or -1 when the name does not end in
        ``_<digits>recommendation.csv``.
    """
    m = re.search(r"_(\d+)recommendation\.csv$", os.path.basename(name))
    return int(m.group(1)) if m else -1

def slugify(s: str) -> str:
    """Turn a label into a lowercase, underscore-separated folder/column slug.

    Every run of characters outside ``A-Z``, ``a-z``, ``0-9`` becomes a single
    underscore; leading/trailing underscores are removed before lowercasing.
    """
    pieces = re.split(r"[^A-Za-z0-9]+", s)
    joined = "_".join(pieces)
    return joined.strip("_").lower()

def genre_folder(genre: str, *, original: bool=False) -> Path:
    """Return (creating it if needed) the output folder for one genre.

    Injected runs live in ``OUT_DIR/<genre_slug>``; baseline summaries live in
    ``OUT_DIR/original/<genre_slug>``.
    """
    root = OUT_DIR / "original" if original else OUT_DIR
    target = root / slugify(genre)
    target.mkdir(parents=True, exist_ok=True)
    return target

def summary_csv_path(rec_path: Path, gdir: Path, *, original: bool, genre: str) -> Path:
    """Build the per-user summary CSV path for one recommendation file.

    Injected files keep their own stem; ORIGINAL files are named after their K
    and the genre slug because one ORIGINAL file is summarised once per genre.
    """
    if not original:
        return gdir / f"{rec_path.stem}__G1_user_summary.csv"
    k = parse_k_from_filename(rec_path.name)
    return gdir / f"ORIGINAL_{k}recommendation__{slugify(genre)}__G1_user_summary.csv"

def fmt(x):
    """Render a number with six decimals; missing values become an empty string."""
    if pd.isna(x):
        return ""
    return format(float(x), ".6f")

def split_genre_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Build one row per book with its first two genres and the full genre list.

    Args:
        df: Frame with at least ``book_id`` and ``genres`` columns, where
            ``genres`` is a comma-separated string (may be missing or blank).

    Returns:
        Frame with columns ``book_id``, ``genre_g1``, ``genre_g2`` and
        ``genres_all``. Rows without a ``book_id`` are dropped and only the
        first occurrence of each ``book_id`` is kept. Books without usable
        genres get ``genre_g1 == "Unknown"``, ``genre_g2 == ""`` and
        ``genres_all == "Unknown"``. An empty input yields an empty frame with
        the same four columns.
    """
    def split_one(gen):
        if pd.isna(gen) or not str(gen).strip():
            return ("Unknown", "", "Unknown")
        parts = [p.strip() for p in str(gen).split(",") if p.strip()]
        if not parts:  # e.g. a value that is only commas
            return ("Unknown", "", "Unknown")
        second = parts[1] if len(parts) >= 2 else ""
        return (parts[0], second, ", ".join(parts))

    g = (df[["book_id", "genres"]]
         .dropna(subset=["book_id"])
         .drop_duplicates("book_id", keep="first")
         .copy())

    # Assign column by column from plain lists: this also works for zero rows,
    # where building a DataFrame from an empty list has no columns to assign.
    triples = [split_one(value) for value in g["genres"]]
    for pos, col in enumerate(("genre_g1", "genre_g2", "genres_all")):
        g[col] = [t[pos] for t in triples]
    return g.drop(columns=["genres"])

def ensure_genres_on_rec(df: pd.DataFrame, book_genres: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` that is guaranteed to have ``genre_g1`` and ``genre_g2``.

    Args:
        df: Recommendation rows; needs ``book_id`` whenever a genre column
            has to be looked up.
        book_genres: Per-book genre table keyed by ``book_id``.

    Returns:
        A new frame (the input is never modified). ``book_id`` is coerced to
        nullable ``Int64`` on both sides of the join when that is possible.
        Genre columns already present on ``df`` are kept as they are; only
        columns ``df`` lacks are brought in from ``book_genres``. A genre
        column that is still unavailable afterwards is filled with ``pd.NA``.

    Raises:
        KeyError: if a genre column is missing and ``book_id`` is absent from
            either frame, since no join is possible then.
    """
    out = df.copy()
    bg = book_genres.copy()

    if "book_id" in out.columns and "book_id" in bg.columns:
        try:
            left_key = pd.to_numeric(out["book_id"], errors="coerce").astype("Int64")
            right_key = pd.to_numeric(bg["book_id"], errors="coerce").astype("Int64")
        except (TypeError, ValueError):
            # e.g. fractional ids cannot be cast to Int64. Leave BOTH keys
            # untouched so the two sides never end up with different dtypes.
            pass
        else:
            out["book_id"] = left_key
            bg["book_id"] = right_key

    wanted = ("genre_g1", "genre_g2")
    if any(col not in out.columns for col in wanted):
        # Merge only columns the left side lacks. Merging a column that exists
        # on both sides would create *_x / *_y suffixes and lose the original.
        extra = [c for c in bg.columns if c != "book_id" and c not in out.columns]
        out = out.merge(bg[["book_id", *extra]], on="book_id", how="left")

    for col in wanted:
        if col not in out.columns:
            out[col] = pd.NA
    return out

def compute_user_summary(rec_df: pd.DataFrame, target_genre: str, count_col: str,
                         book_means: pd.DataFrame, book_genres: pd.DataFrame) -> pd.DataFrame:
    """Summarise, per user, the recommended books that belong to ``target_genre``.

    A recommended book counts when ``target_genre`` equals its ``genre_g1`` or
    its ``genre_g2``.

    Args:
        rec_df: Recommendation rows with ``user_id`` and ``book_id``;
            ``est_score`` and the genre columns are optional.
        target_genre: Genre label to match.
        count_col: Name of the output column holding the per-user match count.
        book_means: Per-book table with ``book_id`` and ``original_per_book_avg``.
        book_genres: Per-book genre table used when ``rec_df`` lacks genres.

    Returns:
        One row per distinct ``user_id`` in ``rec_df`` (sorted), with columns
        ``user_id``, ``count_col`` (int, 0 when nothing matched),
        ``estimation_rating_average`` and ``rating_average`` (NaN when nothing
        matched or no value was available).
    """
    rec_df = ensure_genres_on_rec(rec_df, book_genres)
    users = pd.DataFrame({"user_id": rec_df["user_id"].drop_duplicates().sort_values().values})

    gmask = (rec_df["genre_g1"] == target_genre) | (rec_df["genre_g2"] == target_genre)
    rec_g1 = rec_df[gmask].copy()

    cnt = (rec_g1.groupby("user_id", as_index=False)["book_id"].count()
           .rename(columns={"book_id": count_col}))

    # Keep est_score strictly numeric. A pd.NA fill would create an object
    # column, and a grouped mean over object data is not a numeric mean.
    if "est_score" in rec_g1.columns:
        rec_g1["est_score"] = pd.to_numeric(rec_g1["est_score"], errors="coerce")
    else:
        rec_g1["est_score"] = float("nan")
    est_mean = (rec_g1.groupby("user_id", as_index=False)["est_score"].mean()
                .rename(columns={"est_score": "estimation_rating_average"}))

    # ensure_genres_on_rec may have turned rec book_id into nullable Int64;
    # give book_means the same key dtype so the join stays dtype-safe.
    means = book_means.copy()
    key_dtype = rec_g1["book_id"].dtype
    if means["book_id"].dtype != key_dtype and pd.api.types.is_integer_dtype(key_dtype):
        try:
            means["book_id"] = pd.to_numeric(means["book_id"], errors="coerce").astype(key_dtype)
        except (TypeError, ValueError):
            pass  # leave the key untouched; the merge below reports real mismatches
        else:
            # Unparseable ids became NA; drop them so they cannot pair with NA rec ids.
            means = means.dropna(subset=["book_id"])

    rec_g1 = rec_g1.merge(means, on="book_id", how="left")
    orig_mean = (rec_g1.groupby("user_id", as_index=False)["original_per_book_avg"].mean()
                 .rename(columns={"original_per_book_avg": "rating_average"}))

    out = users.merge(cnt, on="user_id", how="left")
    out[count_col] = out[count_col].fillna(0).astype("int64")
    out = out.merge(est_mean, on="user_id", how="left").merge(orig_mean, on="user_id", how="left")
    return out

def append_table_line(general_path: Path, header: str, line: str):
    """Append ``line`` to a table file, preceded by ``header`` if the file is new or empty."""
    needs_header = True
    if general_path.exists():
        needs_header = os.path.getsize(general_path) == 0
    payload = header + line if needs_header else line
    with open(general_path, "a", encoding="utf-8") as handle:
        handle.write(payload)

# ======== LOAD ORIGINAL RATING DATA ========
print("Loading original ratings …")
orig = pd.read_csv(ORIGINAL_CSV, usecols=["book_id","rating","user_id","genres"])
# Mean observed rating of every book, exposed as `original_per_book_avg`.
book_means = (
    orig.groupby("book_id")["rating"]
    .mean()
    .rename("original_per_book_avg")
    .reset_index()
)
# One row per book: book_id, genre_g1, genre_g2, genres_all.
book_genres = split_genre_cols(orig)
del orig  # the raw ratings are not needed past this point

# ======== ACCUMULATORS ========
# Both map to tuples of (avg_count, avg_est, avg_orig):
#   injected_stats[genre][K][n]  -> one entry per injected run
#   original_stats[genre][K]     -> one baseline entry per K
injected_stats = defaultdict(lambda: defaultdict(dict))
original_stats = defaultdict(dict)

# ======== PROCESS INJECTED enhanced_* FILES ========
for rec_path in sorted(RECS_DIR.glob("enhanced_*recommendation.csv")):
    fname = rec_path.name
    # Genre, top-K size and run number are all encoded in the file name.
    genre = parse_target_genre(fname)
    k = parse_k_from_filename(fname)
    n = parse_run_from_filename(fname)
    gdir = genre_folder(genre, original=False)

    rec = pd.read_csv(rec_path)
    need = {"user_id","book_id","rank"}
    if need - set(rec.columns):
        raise ValueError(f"{rec_path.name} must have {need}")

    # Per-user summary for this run, stored next to the genre's other outputs.
    count_col = f"number_of_books_suggested_in_{slugify(genre)}"
    out = compute_user_summary(rec, genre, count_col, book_means, book_genres)
    target_csv = summary_csv_path(rec_path, gdir, original=False, genre=genre)
    out.to_csv(target_csv, index=False)

    # Run-level averages; users without matches contribute 0 to the count
    # and are skipped (NaN) in the two rating means.
    injected_stats[genre][k][n] = (
        float(out[count_col].astype("float64").mean()),
        float(out["estimation_rating_average"].mean(skipna=True)),
        float(out["rating_average"].mean(skipna=True)),
    )

# ======== PROCESS ORIGINAL_* FILES (ONLY K VARIES) ========
for rec_path in sorted(RECS_DIR.glob("ORIGINAL_*recommendation.csv")):
    k = parse_k_from_filename(rec_path.name)
    recb = pd.read_csv(rec_path)
    need = {"user_id","book_id","rank"}
    if need - set(recb.columns):
        raise ValueError(f"{rec_path.name} must have {need}")
    # Attach genres once per file; every genre below reuses the same frame.
    recb = ensure_genres_on_rec(recb, book_genres)

    for genre in GENRE_LIST:
        count_col = f"number_of_books_suggested_in_{slugify(genre)}"
        gdir = genre_folder(genre, original=True)
        out = compute_user_summary(recb, genre, count_col, book_means, book_genres)
        target_csv = summary_csv_path(rec_path, gdir, original=True, genre=genre)
        out.to_csv(target_csv, index=False)

        # Baseline averages for this (genre, K) pair.
        original_stats[genre][k] = (
            float(out[count_col].astype("float64").mean()),
            float(out["estimation_rating_average"].mean(skipna=True)),
            float(out["rating_average"].mean(skipna=True)),
        )

# ======== WRITE TXT OUTPUTS PER GENRE ========
for genre in GENRE_LIST:
    gdir_inj = genre_folder(genre, original=False)
    general_path = gdir_inj / "general.txt"
    report_path = gdir_inj / "report.txt"

    # general.txt is rebuilt from scratch on every run to avoid duplicate rows.
    if general_path.exists():
        general_path.unlink()
    header = "Genre,n,K,avg_count,avg_estimation_rating,avg_original_rating\n"

    # Every K seen in either the injected runs or the baseline for this genre.
    Ks = sorted(set(injected_stats[genre]) | set(original_stats[genre]))

    lines = [f"# Report for {genre}\n\n"]
    for k in Ks:
        runs_for_k = injected_stats[genre].get(k, {})
        oc, oe, oo = original_stats[genre].get(k, (float('nan'), float('nan'), float('nan')))

        # Baseline first: one table row with n="ORIGINAL" and one report bullet.
        append_table_line(general_path, header, f"{genre},ORIGINAL,{k},{fmt(oc)},{fmt(oe)},{fmt(oo)}\n")
        lines.append(f"Top {k}:\n")
        lines.append(f"- original_{k}:          count={fmt(oc)}, est={fmt(oe)}, orig={fmt(oo)}\n")

        # Then the injected runs of this K in ascending n.
        for n in sorted(runs_for_k):
            ic, ie, io = runs_for_k[n]
            append_table_line(general_path, header, f"{genre},{n},{k},{fmt(ic)},{fmt(ie)},{fmt(io)}\n")
            lines.append(f"- {slugify(genre)}_{k}_{n}:  count={fmt(ic)}, est={fmt(ie)}, orig={fmt(io)}\n")
        lines.append("\n")

    # report.txt: human-readable version of the same numbers.
    with open(report_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

for message in (
    "\nDone.",
    f"Outputs under: {OUT_DIR}",
    "Injected per-genre CSVs:   result/<genre>/...__G1_user_summary.csv",
    "Original  per-genre CSVs:  result/original/<genre>/ORIGINAL_*__G1_user_summary.csv",
    "Per-genre TXT summaries:   result/<genre>/general.txt and report.txt",
):
    print(message)


Loading original ratings …

Done.
Outputs under: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/result/G1_user_summary
Injected per-genre CSVs:   result/<genre>/...__G1_user_summary.csv
Original  per-genre CSVs:  result/original/<genre>/ORIGINAL_*__G1_user_summary.csv
Per-genre TXT summaries:   result/<genre>/general.txt and report.txt


##THRESHOLDS

In [2]:
#!/usr/bin/env python3
# count_est_over_thresholds.py
#
# Usage (no args uses your defaults):
#   python -u count_est_over_thresholds.py
# Or pass files/dirs:
#   python -u count_est_over_thresholds.py /path/to/0902/SVD /path/to/0909/SVD

import sys, os, glob
import pandas as pd

# --- defaults (edit if your folders differ) ---
# Both folders are scanned when no usable path is given on the command line.
DIR_0902 = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD"
DIR_0909 = "/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0909/SVD"

THRESHOLDS = [5.0, 7.0, 8.0, 9.0, 10.0] # for each t: count rows with est_score strictly > t
CHUNK = 500_000           # rows per chunk when streaming a CSV; adjust if you want

def collect_paths(arg):
    """Expand one argument into a list of CSV paths.

    A directory yields its ``*recommendation.csv`` files in sorted order, an
    existing file yields itself, anything else yields an empty list.
    """
    if os.path.isdir(arg):
        pattern = os.path.join(arg, "*recommendation.csv")
        return sorted(glob.glob(pattern))
    return [arg] if os.path.isfile(arg) else []

def normalize_inputs(argv):
    """Turn command-line arguments into the list of CSV files to scan.

    Args:
        argv: Arguments without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        Paths collected from every file/directory argument, in argument
        order. Arguments starting with ``-`` are ignored (e.g. ``--f=...``),
        and so is the value that follows a bare ``-f``. If nothing usable
        remains, the files found in ``DIR_0902`` and ``DIR_0909`` are returned.
    """
    paths = []
    skip_value = False
    for a in argv:
        if skip_value:
            # Value belonging to the preceding bare "-f" flag, not a CSV.
            skip_value = False
            continue
        if a.startswith("-"):
            # Jupyter kernels may be started as "-f <connection_file>"; that
            # file exists on disk and would otherwise be scanned as a CSV.
            skip_value = (a == "-f")
            continue
        paths += collect_paths(a)
    if not paths:
        # fallback to defaults: both folders
        paths = collect_paths(DIR_0902) + collect_paths(DIR_0909)
    return paths

def scan_file(path, thresholds=None, chunksize=None):
    """Stream one CSV and collect statistics on its ``est_score`` column.

    Args:
        path: CSV file that contains an ``est_score`` column.
        thresholds: Values ``t`` for which rows with ``est_score > t`` are
            counted. Defaults to the module-level ``THRESHOLDS``.
        chunksize: Rows read per chunk. Defaults to the module-level ``CHUNK``.

    Returns:
        ``(total, min_est, max_est, avg_est, over)`` where ``total`` is the
        number of rows (including rows whose score is missing or
        non-numeric), ``min_est`` / ``max_est`` / ``avg_est`` are computed
        over the valid numeric scores only (all three are NaN when there are
        none), and ``over`` maps each threshold to its count.
    """
    thresholds = THRESHOLDS if thresholds is None else thresholds
    chunksize = CHUNK if chunksize is None else chunksize

    total = 0   # every row read
    valid = 0   # rows with a usable numeric est_score
    over = {thr: 0 for thr in thresholds}
    min_est, max_est, sum_est = float("inf"), float("-inf"), 0.0
    for chunk in pd.read_csv(path, usecols=["est_score"], chunksize=chunksize):
        s = pd.to_numeric(chunk["est_score"], errors="coerce")
        total += int(s.size)
        n_valid = int(s.notna().sum())
        if n_valid:
            valid += n_valid
            sum_est += float(s.sum(skipna=True))
            min_est = min(min_est, float(s.min()))
            max_est = max(max_est, float(s.max()))
        for thr in thresholds:
            # NaN compares False, so missing scores are never counted.
            over[thr] += int((s > thr).sum())

    if not valid:
        nan = float("nan")
        return total, nan, nan, nan, over
    # Divide by the number of values actually summed. Dividing by `total`
    # would bias the mean downwards whenever some scores are missing.
    return total, min_est, max_est, sum_est / valid, over

def main():
    """Scan the requested recommendation CSVs and print est_score statistics.

    Paths come from ``sys.argv[1:]`` via ``normalize_inputs``. The output is a
    per-file report grouped by directory, followed by a per-directory summary
    of the threshold counts.
    """
    paths = normalize_inputs(sys.argv[1:])
    if not paths:
        print("[ERR] No recommendation CSVs found.")
        return

    def share(count, total):
        # NaN instead of ZeroDivisionError for header-only files / empty folders.
        return count / total if total else float("nan")

    # group by folder for summaries
    by_dir = {}
    print(f"[INFO] Scanning {len(paths)} file(s)...")
    for p in paths:
        d = os.path.dirname(p)
        by_dir.setdefault(d, {"files": [], "total": 0, "over": {thr: 0 for thr in THRESHOLDS}})
        total, mn, mx, avg, over = scan_file(p)
        by_dir[d]["files"].append((p, total, mn, mx, avg, over))
        by_dir[d]["total"] += total
        for thr in THRESHOLDS:
            by_dir[d]["over"][thr] += over[thr]

    # per-file report
    for d, info in by_dir.items():
        print("\n" + "="*100)
        print(f"[DIR] {d}")
        for (p, total, mn, mx, avg, over) in info["files"]:
            base = os.path.basename(p)
            tail = ", ".join([f"> {thr:g}: {over[thr]:,} ({share(over[thr], total):.2%})" for thr in THRESHOLDS])
            print(f"{base:60s} | rows={total:,} | min={mn:.4f} max={mx:.4f} avg={avg:.4f} | {tail}")

    # per-directory summary
    print("\n" + "#"*100)
    print("# SUMMARY BY DIRECTORY")
    for d, info in by_dir.items():
        total = info["total"]
        tail = " | ".join([f"> {thr:g}: {info['over'][thr]:,} ({share(info['over'][thr], total):.2%})" for thr in THRESHOLDS])
        print(f"{d}\n  files={len(info['files'])}, rows={total:,} | {tail}\n")

if __name__ == "__main__":
    main()


[INFO] Scanning 207 file(s)...

[DIR] /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0902/SVD
ORIGINAL_15recommendation.csv                                | rows=801,360 | min=1.7773 max=25.7982 avg=5.7794 | > 5: 689,016 (85.98%), > 7: 58,331 (7.28%), > 8: 14,750 (1.84%), > 9: 4,644 (0.58%), > 10: 1,641 (0.20%)
ORIGINAL_25recommendation.csv                                | rows=1,335,600 | min=1.7336 max=25.7982 avg=5.6452 | > 5: 1,092,827 (81.82%), > 7: 71,064 (5.32%), > 8: 17,485 (1.31%), > 9: 5,427 (0.41%), > 10: 1,906 (0.14%)
ORIGINAL_35recommendation.csv                                | rows=1,869,840 | min=1.6932 max=25.7982 avg=5.5585 | > 5: 1,469,636 (78.60%), > 7: 79,564 (4.26%), > 8: 19,353 (1.04%), > 9: 5,887 (0.31%), > 10: 2,071 (0.11%)
enhanced_Adult_100_pos5_neg0_sample_15recommendation.csv     | rows=801,360 | min=1.7245 max=24.7811 avg=5.7802 | > 5: 688,012 (85.86%), > 7: 57,612 (7.19%), > 8: 14,650 (1.83%), > 9: 4,669 (0.58%), > 10: 1,811 (0.23%)
enh