In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
#!/usr/bin/env python3
# Merge of: (1) avg summary builder + (2) per-genre logging + bin plots
# Single dataset: PRIMARY only

import re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Folder holding the recommendation CSVs read by this cell; the summary files
# and the "figure" sub-folder are written under it as well.
BASE_DIR   = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD")
PRIMARY_DIR = BASE_DIR  # files live directly here

# Column names required in every recommendation CSV.
USER_COL  = "user_id"
GENRE_COL = "genres_all"

# K appears in file names as "<...>_{K}recommendation.csv".
K_LIST   = [15, 25, 35]
# RUNS are the variant numbers embedded in "<prefix><Genre>_<RUN>_<K>recommendation.csv".
RUNS     = [25, 50, 100, 200]
# Number of user bins drawn on the x-axis of each grouped bar chart.
NUM_BINS = 10

# Genre tokens as spelled in file names; underscores become spaces (and
# "Children_s" becomes "children's") when matching against CSV cells.
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]
# ====================================================

def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def normalize_token(s: str) -> str:
    """Return the canonical form of a genre token used for matching.

    The token is stripped, lower-cased, underscores are turned into spaces,
    and the file-name spelling "children s" is mapped to "children's".
    """
    lowered = s.strip().lower()
    spaced = lowered.replace("_", " ")
    return re.sub(r"\bchildren s\b", "children's", spaced)

def split_genres_cell(cell) -> list:
    """Split one genres cell on ';' or ',' and normalize every piece.

    A missing cell (as judged by ``pd.isna``) yields an empty list.
    """
    if pd.isna(cell):
        return []
    tokens = []
    for piece in re.split(r"[;,]", str(cell)):
        tokens.append(normalize_token(piece))
    return tokens

def count_genre_per_user(csv_path: Path, target_genre_token: str) -> pd.Series:
    """
    Count, per user, the rows of ``csv_path`` whose genres cell lists the target genre.

    Returns a Series indexed by the USER_COL values; each value is the number
    of rows for that user whose GENRE_COL cell contains the normalized token.

    Raises FileNotFoundError if the file does not exist and ValueError if
    either required column is absent.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing file: {csv_path}")

    frame = pd.read_csv(csv_path)
    has_required = USER_COL in frame.columns and GENRE_COL in frame.columns
    if not has_required:
        raise ValueError(f"{csv_path} must contain '{USER_COL}' and '{GENRE_COL}'.")

    wanted = normalize_token(target_genre_token)

    def _row_hit(cell) -> int:
        # 1 when the row's genre list contains the target token, else 0
        return int(wanted in split_genres_cell(cell))

    frame["_hit"] = frame[GENRE_COL].apply(_row_hit)
    per_user = frame.groupby(USER_COL)["_hit"]
    return per_user.sum()

def summarize_one(file_path: Path, genre: str):
    """
    Summarize one CSV for one genre as ``(users, total_hits, mean_per_user)``.

    Returns None when ``count_genre_per_user`` raises FileNotFoundError or
    ValueError (missing file, or missing required columns).
    """
    try:
        per_user = count_genre_per_user(file_path, genre)
    except (FileNotFoundError, ValueError):
        return None

    n_users = int(per_user.shape[0])
    hits = int(per_user.sum())
    # Guard against an empty series so the mean never divides by zero.
    if n_users:
        avg = hits / n_users
    else:
        avg = 0.0
    return n_users, hits, float(avg)

def intersect_users(series_list):
    """Return the index shared by every non-empty series (None entries are ignored)."""
    usable = [ser for ser in series_list if ser is not None and len(ser) > 0]
    if not usable:
        return pd.Index([])
    common = None
    for ser in usable:
        # First usable series seeds the result; the rest narrow it down.
        common = ser.index if common is None else common.intersection(ser.index)
    return common

def compute_bin_means(original_s: pd.Series, variant_series: dict, num_bins=10) -> dict:
    """
    Rank users by their ORIGINAL count (descending), cut the ranking into
    ``num_bins`` consecutive groups of near-equal size, and return the mean
    count per group for ORIGINAL and for every variant.

    The result maps "ORIGINAL" and each variant label to a list of
    ``num_bins`` floats; an empty group contributes 0.0.
    """
    ranked = original_s.sort_values(ascending=False).index.tolist()
    total = len(ranked)
    if total == 0:
        zeros = {"ORIGINAL": [0.0] * num_bins}
        for lab in variant_series.keys():
            zeros[lab] = [0.0] * num_bins
        return zeros

    # The first `extra` groups receive one additional user each.
    quotient, extra = divmod(total, num_bins)
    groups = []
    cursor = 0
    for idx in range(num_bins):
        stop = cursor + quotient + (1 if idx < extra else 0)
        groups.append(ranked[cursor:stop])
        cursor = stop

    def _means(series):
        return [float(series.loc[members].mean()) if members else 0.0 for members in groups]

    result = {"ORIGINAL": _means(original_s)}
    for lab, ser in variant_series.items():
        result[lab] = _means(ser)
    return result

def plot_grouped_bars(bin_stats: dict, title: str, out_path: Path):
    """
    Draw one grouped bar chart (one bar group per user bin, one bar per
    series in ``bin_stats``) and save it to ``out_path`` as a PNG.

    bin_stats maps a series label to a list of NUM_BINS bar heights.
    The parent directory of ``out_path`` is created when missing.

    The figure is always closed, even if drawing or saving raises, so a
    caller that catches the error and keeps looping does not accumulate
    open figures.
    """
    labels = list(bin_stats.keys())  # ["ORIGINAL", "25", "50", "100", "200"]
    x = np.arange(NUM_BINS)
    n_series = len(labels)
    width = 0.8 / n_series
    fig, ax = plt.subplots(figsize=(14, 6))
    try:
        for i, lab in enumerate(labels):
            # Centre the group of bars on the bin's tick position.
            offsets = (i - (n_series - 1) / 2.0) * width
            ax.bar(x + offsets, bin_stats[lab], width, label=lab)
        ax.set_xlabel("User bins (sorted by ORIGINAL genre count, high → low)")
        ax.set_ylabel("Avg count of target-genre items per user")
        ax.set_title(title)
        ax.set_xticks(x)
        ax.set_xticklabels([f"Bin {i+1}" for i in range(NUM_BINS)], rotation=0)
        ax.legend()
        ax.grid(axis="y", alpha=0.2)
        ensure_dir(out_path.parent)
        # Use the explicit figure handle rather than pyplot's implicit "current figure".
        fig.tight_layout()
        fig.savefig(out_path, dpi=160)
    finally:
        plt.close(fig)

def log_genre_counts(folder_dir: Path, genre: str, K: int,
                     total_original: int, totals_variants: dict,
                     n_users_original: int, n_users_variants: dict):
    """
    Append one group of lines for (genre, K) to
    ``folder_dir / "genre_counts_summary.txt"``.

    The group holds one line for ORIGINAL followed by one line per variant
    label "25", "50", "100", "200"; each line reports total hits, user count
    and mean hits per user. Labels missing from ``totals_variants`` /
    ``n_users_variants`` are reported as zeros, and a zero user count gives a
    mean of 0.0.

    Each group ends with a blank line so that successive appends stay
    visually separated in the file.
    """
    log_path = folder_dir / "genre_counts_summary.txt"
    pretty_g = genre.replace("_", " ").replace("Children s", "Children's")
    lines = []
    mean_o = total_original / n_users_original if n_users_original else 0.0
    lines.append(
        f"primary_ {pretty_g} | K={K} | ORIGINAL: total={int(total_original)}, users={n_users_original}, mean_per_user={mean_o:.4f}"
    )
    for lab in ["25", "50", "100", "200"]:
        tot = int(totals_variants.get(lab, 0))
        n_u = int(n_users_variants.get(lab, 0))
        mean_v = (tot / n_u) if n_u else 0.0
        lines.append(
            f"primary_ {pretty_g} | K={K} | {lab}: total={tot}, users={n_u}, mean_per_user={mean_v:.4f}"
        )
    with open(log_path, "a", encoding="utf-8") as f:
        # Terminate the last line AND add an empty spacer line; joining a
        # trailing "" only produced the line terminator, so appended groups
        # ran together without a blank line between them.
        f.write("\n".join(lines) + "\n\n")

def build_primary_outputs():
    """
    For each genre in GENRES and each K in K_LIST:
      - load ORIGINAL_<K>recommendation.csv and the variants
        p_<Genre>_<RUN>_<K>recommendation.csv for every RUN in RUNS
      - append totals to genre_counts_summary.txt (no user intersection)
      - plot a NUM_BINS-bin grouped bar chart into PRIMARY_DIR/figure

    A (genre, K) pair is skipped entirely when ORIGINAL cannot be loaded.
    Missing or invalid variants are logged and plotted as zeros.
    """
    prefix = "p_"
    out_dir = PRIMARY_DIR / "figure"
    ensure_dir(out_dir)
    # Derive the variant labels from RUNS so a config change reaches the
    # plots instead of being silently dropped by a hard-coded label list.
    run_labels = [str(r) for r in RUNS]

    for g in GENRES:
        for K in K_LIST:
            # ORIGINAL
            original_fp = PRIMARY_DIR / f"ORIGINAL_{K}recommendation.csv"
            try:
                s_original = count_genre_per_user(original_fp, g)
            except Exception as e:
                print(f"[WARN] Skip {g} K={K}: cannot load ORIGINAL -> {e}")
                continue

            # Variants (load what exists; missing ones are logged/plotted as zeros)
            loaded_variants = {}
            users_variants = {}
            totals_variants = {}
            for r, lab in zip(RUNS, run_labels):
                fp = PRIMARY_DIR / f"{prefix}{g}_{r}_{K}recommendation.csv"
                try:
                    s = count_genre_per_user(fp, g)
                except Exception as e:
                    print(f"[WARN] Missing/invalid: {fp} -> {e}")
                    users_variants[lab] = 0
                    totals_variants[lab] = 0
                else:
                    loaded_variants[lab] = s
                    users_variants[lab] = int(s.shape[0])
                    totals_variants[lab] = int(s.sum())

            # ---- Logging (totals; no intersection) ----
            total_original = int(s_original.sum())
            n_users_original = int(s_original.shape[0])
            log_genre_counts(PRIMARY_DIR, g, K, total_original, totals_variants,
                             n_users_original, users_variants)

            # ---- Plotting with aligned users across ORIGINAL + available variants ----
            inter = intersect_users([s_original] + list(loaded_variants.values()))
            if len(inter) == 0:
                print(f"[WARN] No common users for {g} K={K}; skip plot.")
                continue

            s_orig_aligned = s_original.loc[inter]
            # One series per label. reindex (rather than .loc) keeps this safe
            # for a variant that loaded but has no rows: intersect_users
            # ignores empty series, so such a variant may lack the users in
            # `inter`; it is plotted as zeros like a missing one.
            aligned_variants = {}
            for lab in run_labels:
                if lab in loaded_variants:
                    aligned_variants[lab] = loaded_variants[lab].reindex(inter, fill_value=0)
                else:
                    aligned_variants[lab] = pd.Series(0, index=inter)

            bin_stats = compute_bin_means(s_orig_aligned, aligned_variants, num_bins=NUM_BINS)

            pretty_g = g.replace("_", " ").replace("Children s", "Children's")
            title = f"Primary – {pretty_g} – K={K}"
            out_path = out_dir / f"{g}_K{K}_primary.png"
            try:
                plot_grouped_bars(bin_stats, title, out_path)
                print(f"[OK] Saved: {out_path}")
            except Exception as e:
                print(f"[ERR] Plot fail for {g} K={K}: {e}")

def collect_summary_primary():
    """
    Build avg_counts_summary.csv in PRIMARY_DIR with the columns
      analysis, genre, K, variant, users, total_hits, mean_per_user

    One ORIGINAL row plus one row per RUN is written for every (genre, K)
    whose ORIGINAL file can be summarized; other pairs are skipped with a
    warning. Variants that cannot be summarized are reported and written
    as zero rows.

    Variant files are looked up as "p_<Genre>_<RUN>_<K>recommendation.csv"
    first (the naming build_primary_outputs uses for this same folder) and
    then under the older "primary_p_" prefix, so either layout is found.
    """
    columns = ["analysis", "genre", "K", "variant", "users", "total_hits", "mean_per_user"]
    # NOTE(review): the original code used only "primary_p_", which disagrees
    # with build_primary_outputs ("p_"); confirm which naming the folder uses.
    prefixes = ("p_", "primary_p_")
    rows = []

    def _row(genre, k, variant, users, total, mean):
        return {
            "analysis": "primary",
            "genre": genre,
            "K": k,
            "variant": variant,
            "users": users,
            "total_hits": total,
            "mean_per_user": round(mean, 6),
        }

    for g in GENRES:
        for K in K_LIST:
            # ORIGINAL
            original_fp = PRIMARY_DIR / f"ORIGINAL_{K}recommendation.csv"
            orig_stats = summarize_one(original_fp, g)
            if orig_stats is None:
                print(f"[WARN] Missing ORIGINAL_{K} for {g}; skip row group.")
                continue
            users, total, mean = orig_stats
            rows.append(_row(g, K, "ORIGINAL", users, total, mean))

            # Variants
            for r in RUNS:
                stats = None
                for prefix in prefixes:
                    fp = PRIMARY_DIR / f"{prefix}{g}_{r}_{K}recommendation.csv"
                    stats = summarize_one(fp, g)
                    if stats is not None:
                        break
                if stats is None:
                    # Make the zero row visible instead of silently emitting it.
                    print(f"[WARN] No usable variant file for {g} run={r} K={K}; writing zeros.")
                    stats = (0, 0, 0.0)
                u, t, m = stats
                rows.append(_row(g, K, str(r), u, t, m))

    out_csv = PRIMARY_DIR / "avg_counts_summary.csv"
    # Explicit columns keep the header stable even when no rows were collected.
    pd.DataFrame(rows, columns=columns).to_csv(out_csv, index=False)
    print(f"[OK] Wrote {out_csv} with {len(rows)} rows.")

def main():
    """Run the logging/plotting pass first, then write the tidy summary CSV."""
    pipeline = (
        build_primary_outputs,     # logs + figures
        collect_summary_primary,   # tidy CSV with averages
    )
    for step in pipeline:
        step()


if __name__ == "__main__":
    main()


[WARN] Missing/invalid: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_25_15recommendation.csv -> Missing file: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_25_15recommendation.csv
[WARN] Missing/invalid: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_50_15recommendation.csv -> Missing file: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_50_15recommendation.csv
[OK] Saved: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/figure/Adult_K15_primary.png
[WARN] Missing/invalid: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_25_25recommendation.csv -> Missing file: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_25_25recommendation.csv
[WARN] Missing/invalid: /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0926/SVD/p_Adult_50_25recomme

KeyboardInterrupt: 

In [1]:
#!/usr/bin/env python3
# UNIQUE BOOKS ONLY — single folder layout (no primary/enhanced)
# Files like:
#   ORIGINAL_<K>recommendation.csv
#   p_<Genre>_<RUN>_<K>recommendation.csv
# Output per genre under: <BASE_DIR>/figure/<GENRE>/
#   - <GENRE>_unique_totals.txt
#   - <GENRE>_unique_totals.png

import re
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# ====================== CONFIG ======================
# Single folder holding both ORIGINAL_<K> and p_<Genre>_<RUN>_<K> CSVs;
# outputs are written to <BASE_DIR>/figure/<GENRE>/.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD")

# Columns read from every CSV (passed to read_csv via usecols).
GENRE_COL = "genres_all"
BOOK_COL  = "book_id"

# K values embedded in file names as "<...>_{K}recommendation.csv".
K_LIST = [15, 25, 35]
RUNS   = [25, 50, 100, 200]   # -> labels n25/n50/n100/n200

# Genres as they appear in filenames (underscores ok)
GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance",
    "Science_Fiction", "Thriller"
]
# ====================================================

def ensure_dir(p: Path):
    """Make sure directory ``p`` exists, creating intermediate folders as needed."""
    p.mkdir(exist_ok=True, parents=True)

def _normalize_genre_for_match(g: str) -> str:
    """Strip, lower-case and de-underscore a genre token; map "children s" to "children's"."""
    cleaned = g.strip().lower()
    cleaned = cleaned.replace("_", " ")
    return re.sub(r"\bchildren s\b", "children's", cleaned)

def _split_genres_cell(cell):
    """Return the normalized genre tokens of one cell (split on ';' or ','); [] for a missing cell."""
    if pd.isna(cell):
        return []
    tokens = []
    for piece in re.split(r"[;,]", str(cell)):
        tokens.append(_normalize_genre_for_match(piece))
    return tokens

def count_unique_books_for_genre(csv_path: Path, target_genre_token: str) -> int:
    """Return how many distinct BOOK_COL values in the file have the target genre in GENRE_COL.

    Raises FileNotFoundError (carrying the path) when the file does not exist.
    """
    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    frame = pd.read_csv(csv_path, usecols=[BOOK_COL, GENRE_COL])
    wanted = _normalize_genre_for_match(target_genre_token)

    def _has_genre(cell):
        return wanted in _split_genres_cell(cell)

    matches = frame[GENRE_COL].apply(_has_genre)
    matching_books = frame.loc[matches, BOOK_COL]
    return matching_books.nunique()

def build_unique_df_for_folder(genre: str) -> pd.DataFrame:
    """
    Collect unique-book counts for one genre across every K in K_LIST.

    The returned frame has columns ['genre', 'K', 'label', 'unique_books'],
    where label is 'ORIGINAL' or 'n<RUN>' for each RUN in RUNS.
    A K whose ORIGINAL_<K> file cannot be counted is left out completely;
    a variant that cannot be counted is recorded with 0 so bars stay aligned.
    """
    records = []
    for K in K_LIST:
        # Baseline: ORIGINAL_<K>
        baseline_csv = BASE_DIR / f"ORIGINAL_{K}recommendation.csv"
        try:
            baseline_total = int(count_unique_books_for_genre(baseline_csv, genre))
        except Exception as e:
            print(f"[WARN] {genre} | K={K}: ORIGINAL missing/invalid -> {e}; skipping this K")
            continue
        records.append(dict(genre=genre, K=K, label="ORIGINAL", unique_books=baseline_total))

        # Variants: p_<Genre>_<RUN>_<K>
        for n in RUNS:
            variant_csv = BASE_DIR / f"p_{genre}_{n}_{K}recommendation.csv"
            try:
                variant_total = int(count_unique_books_for_genre(variant_csv, genre))
            except Exception as e:
                print(f"[WARN] {genre} | K={K} | n={n}: variant missing/invalid -> {e}; using 0")
                variant_total = 0
            records.append(dict(genre=genre, K=K, label=f"n{n}", unique_books=variant_total))

    return pd.DataFrame(records, columns=["genre", "K", "label", "unique_books"])

def write_txt_unique(df_uni: pd.DataFrame, out_txt: Path):
    """Write a plain-text listing of unique-book totals, grouped by K with a blank line after each group.

    Labels are listed in the fixed order ORIGINAL, n25, n50, n100, n200;
    a label with no row for a given K is omitted.
    """
    ordered_labels = ("ORIGINAL", "n25", "n50", "n100", "n200")
    out_lines = []
    for K in sorted(df_uni["K"].unique()):
        at_k = df_uni[df_uni["K"] == K]
        for lab in ordered_labels:
            counts = at_k[at_k["label"] == lab]["unique_books"]
            if not counts.empty:
                out_lines.append(f"K={K} | {lab} unique_books: {int(counts.iloc[0])}")
        out_lines.append("")
    ensure_dir(out_txt.parent)
    with open(out_txt, "w", encoding="utf-8") as handle:
        handle.write("\n".join(out_lines))

def plot_grouped_unique(df_uni: pd.DataFrame, title: str, out_png: Path):
    """Save a grouped bar chart of unique-book counts: one group per K, one bar per label.

    Nothing is drawn or saved for an empty frame; an info line is printed instead.
    Labels with no row for a K are drawn with height 0.
    """
    if df_uni.empty:
        print(f"[INFO] Nothing to plot for {title}")
        return

    labels = ["ORIGINAL", "n25", "n50", "n100", "n200"]
    K_vals = sorted(df_uni["K"].unique().tolist())

    # Bar heights per label, ordered like K_vals.
    heights = {lab: [] for lab in labels}
    for K in K_vals:
        at_k = df_uni[df_uni["K"] == K]
        for lab in labels:
            hit = at_k[at_k["label"] == lab]
            heights[lab].append(0 if hit.empty else int(hit["unique_books"].iloc[0]))

    positions = list(range(len(K_vals)))
    n_series = len(labels)
    width = 0.8 / n_series

    fig, ax = plt.subplots(figsize=(12, 6))
    for i, lab in enumerate(labels):
        # Shift each series so the whole group is centred on its tick.
        bar_x = [pos + (i - (n_series-1)/2.0)*width for pos in positions]
        ax.bar(bar_x, heights[lab], width, label=lab)

    ax.set_xticks(positions)
    ax.set_xticklabels([f"K={K}" for K in K_vals])
    ax.set_xlabel("K")
    ax.set_ylabel("Unique books with target genre")
    ax.set_title(title)
    ax.legend()
    ax.grid(axis="y", alpha=0.2)

    ensure_dir(out_png.parent)
    plt.tight_layout()
    plt.savefig(out_png, dpi=160)
    plt.close(fig)

def main():
    """
    For every genre in GENRES, build the unique-book table and write the
    per-genre TXT summary and bar chart under BASE_DIR/figure/<GENRE>/.

    The status line reports the PNG only when there was data to plot;
    plot_grouped_unique skips saving for an empty table, so claiming the PNG
    was written in that case would be wrong.
    """
    for g in GENRES:
        df_uni = build_unique_df_for_folder(g)
        # save per-genre outputs under figure/<GENRE>/
        out_dir = BASE_DIR / "figure" / g
        txt_path = out_dir / f"{g}_unique_totals.txt"
        png_path = out_dir / f"{g}_unique_totals.png"
        write_txt_unique(df_uni, txt_path)
        plot_grouped_unique(df_uni, title=f"{g} – UNIQUE books (no clustering)", out_png=png_path)
        if df_uni.empty:
            print(f"[OK] Wrote {txt_path} (no data; plot skipped)")
        else:
            print(f"[OK] Wrote {txt_path} and {png_path}")

if __name__ == "__main__":
    main()


[WARN] Adult | K=15: ORIGINAL missing/invalid -> /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD/ORIGINAL_15recommendation.csv; skipping this K
[WARN] Adult | K=25: ORIGINAL missing/invalid -> /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD/ORIGINAL_25recommendation.csv; skipping this K
[WARN] Adult | K=35: ORIGINAL missing/invalid -> /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD/ORIGINAL_35recommendation.csv; skipping this K
[INFO] Nothing to plot for Adult – UNIQUE books (no clustering)
[OK] Wrote /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD/figure/Adult/Adult_unique_totals.txt and /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD/figure/Adult/Adult_unique_totals.png
[WARN] Adventure | K=15: ORIGINAL missing/invalid -> /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0927/SVD/ORIGINAL_15recommendation.csv; skipping this K
[WARN] Ad

In [4]:
# Load the per-user cluster table. The frame displayed in the next cell has the
# columns 'Unnamed: 0', user_id, cluster, entropy; the first looks like a saved
# row index (later cells select only user_id / cluster / entropy).
df_cluster = pd.read_csv("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/preprocessing/entropy/df_cluster.csv")

In [5]:
df_cluster

Unnamed: 0,user_id,cluster,entropy
0,36758,0,1.412524
1,34949,0,1.460750
2,28942,0,1.461338
3,17590,0,1.550376
4,40422,0,1.586188
...,...,...,...
53419,25978,2,3.752528
53420,22675,2,3.763846
53421,40670,2,3.770599
53422,28616,2,3.788211


In [3]:
# NOTE(review): the variable name says "primary_100", but the file loaded is
# enhanced_analysis/improved_Adult_25_15recommendation.csv — confirm which
# dataset is intended before relying on the name.
adult_primary_100 = pd.read_csv("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/enhanced_analysis/improved_Adult_25_15recommendation.csv")

In [8]:
import pandas as pd
import numpy as np

# Purpose of this cell: attach cluster/entropy information to a recommendation
# frame and print sanity checks. It relies on `adult_primary_100` and
# `df_cluster` already existing in the notebook kernel; nothing is saved.

# --- 1) Normalize columns in df ---
df_work = adult_primary_100.copy()

# rating: prefer 'rating', else use 'est_score', else NaN
if 'rating' in df_work.columns:
    pass
elif 'est_score' in df_work.columns:
    df_work = df_work.rename(columns={'est_score': 'rating'})
else:
    df_work['rating'] = np.nan

# genre: prefer 'genres_all', else combine 'genre_g1' + 'genre_g2', else try 'genre'
# NOTE(review): if a frame has both 'genres_all' and 'genre', the rename below
# would leave two columns named 'genre' — confirm that cannot happen upstream.
if 'genres_all' in df_work.columns:
    df_work = df_work.rename(columns={'genres_all': 'genre'})
elif {'genre_g1','genre_g2'}.issubset(df_work.columns):
    # Join the two genre columns, then strip the leftover ", " separator when
    # one side was empty; a fully empty result becomes NaN.
    df_work['genre'] = df_work['genre_g1'].fillna('') + ', ' + df_work['genre_g2'].fillna('')
    df_work['genre'] = df_work['genre'].str.strip(', ').replace('', np.nan)
elif 'genre' in df_work.columns:
    pass
else:
    df_work['genre'] = np.nan

# --- 2) Normalize df_cluster columns (ensure 'entropy' exists if provided) ---
dfc = df_cluster.copy()
# If entropy isn't present, create it as NaN so downstream code is uniform.
if 'entropy' not in dfc.columns:
    dfc['entropy'] = np.nan

# Keep only the needed columns to avoid accidental duplicates on merge
dfc = dfc[['user_id', 'cluster', 'entropy']]

# --- 3) Merge ---
# Use inner join so we only keep users that have an assigned cluster
# validate='many_to_one' asks pandas to check that user_id is unique in dfc.
merged = pd.merge(df_work, dfc, on='user_id', how='inner', validate='many_to_one')

# --- 4) Sanity checks ---
# Compare unique user / cluster counts before and after the merge to see
# whether the inner join dropped anything.
orig_unique_users_df         = df_work['user_id'].nunique()
orig_unique_users_dfc        = dfc['user_id'].nunique()
merged_unique_users          = merged['user_id'].nunique()

orig_unique_clusters_dfc     = dfc['cluster'].nunique(dropna=True)
merged_unique_clusters       = merged['cluster'].nunique(dropna=True)

print("=== Sanity Checks ===")
print(f"Unique users in df:         {orig_unique_users_df}")
print(f"Unique users in df_cluster: {orig_unique_users_dfc}")
print(f"Unique users after merge:   {merged_unique_users}")
print()
print(f"Unique clusters in df_cluster: {orig_unique_clusters_dfc}")
print(f"Unique clusters after merge:   {merged_unique_clusters}")

# Distribution of clusters (counts and proportions) in the merged dataset
# These are ROW counts (one row per merged recommendation row), not user counts.
print("\n=== Cluster Distribution (merged) ===")
cluster_counts = merged['cluster'].value_counts(dropna=False).sort_index()
cluster_props  = merged['cluster'].value_counts(normalize=True, dropna=False).sort_index()
print(pd.DataFrame({'count': cluster_counts, 'proportion': cluster_props.round(4)}))

# Optional: quick user-count distribution by cluster (unique users per cluster)
users_per_cluster = merged.groupby('cluster')['user_id'].nunique().sort_index()
print("\n=== Unique Users per Cluster (merged) ===")
print(users_per_cluster)

# Minimum entropy (if available)
# A NaN minimum means every entropy value is NaN, e.g. because the column was
# synthesized in step 2; the fallback message is printed in that case.
min_entropy = merged['entropy'].min(skipna=True)
print("\n=== Minimum Entropy ===")
print(min_entropy if pd.notna(min_entropy) else "No 'entropy' column provided; all NaN.")

# --- 5) Final shaped view (do not save) ---
out_cols = ['user_id', 'rating', 'cluster', 'entropy', 'book_id', 'genre']
# Only keep columns that exist (in case some are missing upstream)
out_cols = [c for c in out_cols if c in merged.columns]
final_view = merged[out_cols].copy()

print("\n=== Sample of merged output ===")
print(final_view.head(10))


=== Sanity Checks ===
Unique users in df:         53424
Unique users in df_cluster: 53424
Unique users after merge:   53424

Unique clusters in df_cluster: 3
Unique clusters after merge:   3

=== Cluster Distribution (merged) ===
    count  proportion
0  267135      0.3334
1  267120      0.3333
2  267105      0.3333

=== Unique Users per Cluster (merged) ===
cluster
0    17809
1    17808
2    17807
Name: user_id, dtype: int64

=== Minimum Entropy ===
1.4125240923851765

=== Sample of merged output ===
   user_id    rating  cluster   entropy  book_id                      genre
0        1  6.908057        2  3.345998     5189            Classics, Drama
1        1  6.471402        2  3.345998     8360      Adventure, Historical
2        1  6.462206        2  3.345998     6751      Historical, Adventure
3        1  6.389796        2  3.345998     5537            Classics, Drama
4        1  6.353412        2  3.345998     6063        Classics, Adventure
5        1  6.336477        2  3.3459

# real

In [5]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from typing import List, Dict, Tuple, Set

# ====================== CONFIG ======================
# Root of the 0923 results; variants live in the two sub-folders below.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923")
PRIMARY_DIR  = BASE_DIR / "primary_analysis"
ENHANCED_DIR = BASE_DIR / "enhanced_analysis"

# Column names required in every recommendation CSV.
USER_COL  = "user_id"
GENRE_COL = "genres_all"

# RUNS and K_LIST are embedded in file names as
# "<prefix><Genre>_<RUN>_<K>recommendation.csv" / "ORIGINAL_<K>recommendation.csv".
RUNS     = [25, 50, 100, 200]
K_LIST   = [15, 25, 35]
# Number of user bins per cluster in every plot and bin table.
NUM_BINS = 10

# Fix a seed so the random binning is reproducible and
# the same randomness is followed everywhere.
RANDOM_SEED = 42

# Genre tokens as spelled in file names and output folder names.
FIXED_GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance", "Science_Fiction", "Thriller"
]

# If you want to load cluster file here, set a path; otherwise the script
# will use an existing df_cluster variable in memory.
CLUSTER_CSV = None
# ===================================================

def ensure_dir(p: Path):
    """Create ``p`` and any missing parent directories; an existing directory is left alone."""
    p.mkdir(exist_ok=True, parents=True)

def normalize_genre_for_match(g: str) -> str:
    """Canonicalize a genre token: strip, lower-case, underscores to spaces, "children s" to "children's"."""
    token = g.strip().lower()
    token = token.replace("_", " ")
    return re.sub(r"\bchildren s\b", "children's", token)

def split_genres_cell(cell: str) -> List[str]:
    """Split a genres cell on ';' or ',' into normalized tokens; a missing cell gives []."""
    if pd.isna(cell):
        return []
    tokens: List[str] = []
    for piece in re.split(r"[;,]", str(cell)):
        tokens.append(normalize_genre_for_match(piece))
    return tokens

def count_genre_per_user(csv_path: Path, target_genre_token: str) -> pd.Series:
    """
    Count, for each user in the CSV, the rows whose GENRE_COL cell lists the
    target genre.

    Returns a Series of per-user hit counts whose index is cast to int.
    Raises ValueError when USER_COL or GENRE_COL is absent from the file.
    """
    frame = pd.read_csv(csv_path)
    absent = [col for col in (USER_COL, GENRE_COL) if col not in frame.columns]
    if absent:
        raise ValueError(f"{csv_path} must contain '{USER_COL}' and '{GENRE_COL}'.")

    wanted = normalize_genre_for_match(target_genre_token)

    def _row_hit(cell) -> int:
        # 1 when the row's genre list contains the target token, else 0
        return int(wanted in split_genres_cell(cell))

    frame["_hit"] = frame[GENRE_COL].apply(_row_hit)
    per_user = frame.groupby(USER_COL)["_hit"].sum()
    # Integer user ids keep later joins against the cluster table consistent.
    per_user.index = per_user.index.astype(int)
    return per_user

def intersect_users(series_list: List[pd.Series]) -> pd.Index:
    """Return the index values common to every series; an empty list gives an empty Index."""
    if not series_list:
        return pd.Index([])
    first, *others = series_list
    common = first.index
    for ser in others:
        common = common.intersection(ser.index)
    return common

def make_random_order_per_cluster(cluster_users: Set[int], seed: int) -> List[int]:
    """
    Return the cluster's users in a shuffled order that depends only on the
    set's contents and ``seed``.
    """
    # Sorting first removes any dependence on set iteration order.
    ordered = np.array(sorted(cluster_users, key=int))
    generator = np.random.default_rng(seed)
    generator.shuffle(ordered)
    return ordered.tolist()

def make_bins_from_fixed_order(
    fixed_order: List[int],
    allowed_users: Set[int],
    num_bins: int
) -> Tuple[List[List[int]], List[int]]:
    """
    Keep the users of ``fixed_order`` that are in ``allowed_users`` (order
    preserved) and cut them into ``num_bins`` consecutive bins whose sizes
    differ by at most one; earlier bins take the extra users.

    Returns ``(bins, bin_sizes)``. With no eligible user, every bin is empty
    and every size is 0.
    """
    kept = [user for user in fixed_order if user in allowed_users]
    total = len(kept)
    if total == 0:
        return [[] for _ in range(num_bins)], [0] * num_bins

    per_bin, leftover = divmod(total, num_bins)
    sizes = [per_bin + 1 if i < leftover else per_bin for i in range(num_bins)]

    bins = []
    cursor = 0
    for size in sizes:
        bins.append(kept[cursor:cursor + size])
        cursor += size
    return bins, sizes

def compute_bin_means_random_bins(
    s_original: pd.Series,
    variant_series: Dict[str, pd.Series],
    cluster_fixed_order: List[int],
    cluster_users: Set[int],
    available_users: Set[int],   # users in intersection for this (genre, K, kind)
    num_bins: int
) -> Tuple[Dict[str, List[float]], List[int]]:
    """
    Bin the cluster's users that are also in ``available_users``, following
    the cluster's fixed order, and return the mean count per bin.

    Returns ``(means, bin_sizes)`` where ``means`` maps "ORIGINAL" and every
    variant label to one float per bin (0.0 for an empty bin).
    """
    # Only cluster members that are also available can be binned.
    eligible = cluster_users.intersection(available_users)
    bins, sizes = make_bins_from_fixed_order(cluster_fixed_order, eligible, num_bins)

    def _per_bin(series: pd.Series) -> List[float]:
        return [float(series.loc[members].mean()) if members else 0.0 for members in bins]

    means = {"ORIGINAL": _per_bin(s_original)}
    for lab, ser in variant_series.items():
        means[lab] = _per_bin(ser)

    return means, sizes

def plot_grouped_bars(bin_stats: Dict[str, List[float]], bin_sizes: List[int], title: str, out_path: Path):
    """
    Save a grouped bar chart with one bar group per bin and one bar per
    series in ``bin_stats``; tick labels show each bin's size.

    The parent directory of ``out_path`` is created when missing.
    """
    series_names = list(bin_stats.keys())  # e.g., ["ORIGINAL", "25", "50", "100", "200"]
    positions = np.arange(NUM_BINS)
    n_series = len(series_names)
    width = 0.8 / n_series

    fig, ax = plt.subplots(figsize=(14, 6))
    for i, lab in enumerate(series_names):
        # Centre the group of bars on the bin's tick position.
        shift = (i - (n_series-1)/2.0) * width
        if lab == "ORIGINAL":
            legend_text = "ORIGINAL"
        else:
            legend_text = f"n={lab}"
        ax.bar(positions + shift, bin_stats[lab], width, label=legend_text)

    tick_text = []
    for i in range(NUM_BINS):
        tick_text.append(f"Bin {i+1}\n(n={bin_sizes[i]})")
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_text, rotation=0)

    ax.set_xlabel("Random user bins within cluster (fixed seed; same randomness applied)")
    ax.set_ylabel("Avg count of target-genre items per user")
    ax.set_title(title)
    ax.legend()
    ax.grid(axis="y", alpha=0.2)
    ensure_dir(out_path.parent)
    plt.tight_layout()
    plt.savefig(out_path, dpi=160)
    plt.close(fig)

def save_bin_table(bin_stats: Dict[str, List[float]], bin_sizes: List[int], out_csv: Path):
    """
    Write the per-bin means as a CSV with the columns
    bin, bin_size, ORIGINAL, n25, n50, n100, n200.

    A variant label absent from ``bin_stats`` is written as 0.0 in every bin.
    """
    variant_labels = ("25", "50", "100", "200")
    records = []
    for i in range(NUM_BINS):
        record = {
            "bin": i+1,
            "bin_size": int(bin_sizes[i]),
            "ORIGINAL": bin_stats["ORIGINAL"][i],
        }
        for lab in variant_labels:
            record[f"n{lab}"] = bin_stats.get(lab, [0.0]*NUM_BINS)[i]
        records.append(record)

    table = pd.DataFrame(records, columns=["bin","bin_size","ORIGINAL","n25","n50","n100","n200"])
    ensure_dir(out_csv.parent)
    table.to_csv(out_csv, index=False)

def build_clustered_random_binning(
    kind: str,
    base_dir: Path,
    original_dir: Path,
    genres: List[str],
    clusters: Dict[int, Set[int]],
    cluster_fixed_orders: Dict[int, List[int]]
):
    """
    Produce one bar chart and one bin table per (genre, K, cluster).

    kind: 'primary' or 'enhanced'; selects the variant file prefix
          ("primary_p_" for 'primary', "improved_" otherwise)
    base_dir: folder with the variant files; outputs go to base_dir/figure/<genre>/
    original_dir: folder with the ORIGINAL_<K>recommendation.csv files
    genres: genre tokens to process
    clusters: cluster_id -> set(user_id)
    cluster_fixed_orders: cluster_id -> fixed random order list of user_ids

    A (genre, K) pair is skipped when ORIGINAL or any variant fails to load;
    a cluster is skipped when none of its users is in the file intersection.
    """
    prefix = "primary_p_" if kind == "primary" else "improved_"

    def _load_all_variants(genre: str, k: int):
        """Load every run for (genre, k); return None as soon as one fails."""
        found = {}
        for r in RUNS:
            variant_file = base_dir / f"{prefix}{genre}_{r}_{k}recommendation.csv"
            try:
                found[str(r)] = count_genre_per_user(variant_file, genre)
            except Exception as e:
                print(f"[WARN] {kind} | {genre} | K={k}: missing/invalid variant n={r}: {e}")
                return None
        return found

    for g in genres:
        genre_folder = base_dir / "figure" / g
        ensure_dir(genre_folder)

        for K in K_LIST:
            # ORIGINAL baseline
            original_file = original_dir / f"ORIGINAL_{K}recommendation.csv"
            try:
                s_original = count_genre_per_user(original_file, g)
            except Exception as e:
                print(f"[WARN] {kind} | {g} | K={K}: original load failed: {e}")
                continue

            # Variants (n ∈ RUNS); all of them are required
            s_variants = _load_all_variants(g, K)
            if s_variants is None:
                continue

            # Users present in ORIGINAL and in every variant for this genre/K/kind
            inter = intersect_users([s_original] + list(s_variants.values()))
            available_users = {int(u) for u in inter.tolist()}

            for c in sorted(clusters):
                cluster_users = clusters[c]  # full set of users in cluster c
                total_users_c = len(cluster_users)

                # Bin means on the intersection subset, following the
                # cluster's fixed random order (no sorting by ORIGINAL).
                bin_stats, bin_sizes = compute_bin_means_random_bins(
                    s_original=s_original,
                    variant_series=s_variants,
                    cluster_fixed_order=cluster_fixed_orders[c],
                    cluster_users=cluster_users,
                    available_users=available_users,
                    num_bins=NUM_BINS
                )

                # Nothing to show when every bin is empty
                if sum(bin_sizes) == 0:
                    print(f"[INFO] {kind} | {g} | K={K} | cluster {c}: no overlapping users; skipping.")
                    continue

                title = (f"{kind.capitalize()} – {g} – K={K} – Cluster {c} "
                         f"(cluster users={total_users_c}, in-file users={sum(bin_sizes)})")

                out_png = genre_folder / f"{g}_K{K}_cluster{c}_{kind}.png"
                out_csv = genre_folder / f"{g}_K{K}_cluster{c}_{kind}_bins.csv"

                plot_grouped_bars(bin_stats, bin_sizes, title, out_png)
                save_bin_table(bin_stats, bin_sizes, out_csv)
                print(f"[OK] saved:\n  {out_png}\n  {out_csv}")

def main():
    """
    Entry point for the clustered random-binning analysis.

    Steps:
      1. Load the cluster assignment table from CLUSTER_CSV when it is set,
         otherwise fall back to an in-memory ``df_cluster`` DataFrame.
      2. Validate that the table has 'user_id' and 'cluster' columns and cast
         'user_id' to int.
      3. Build a mapping cluster id -> set of user ids and print each size.
      4. Build one fixed random user order per cluster (deterministic seed
         derived from RANDOM_SEED and the cluster id).
      5. Run build_clustered_random_binning for the "primary" and the
         "enhanced" variants; both use ORIGINAL from PRIMARY_DIR.

    Raises:
      RuntimeError: neither CLUSTER_CSV nor ``df_cluster`` is available.
      ValueError: the cluster table lacks 'user_id' or 'cluster'.
    """
    # Load or use in-memory df_cluster
    if CLUSTER_CSV:
        dfc = pd.read_csv(CLUSTER_CSV)
    else:
        try:
            dfc = df_cluster.copy()  # type: ignore[name-defined]
        except NameError as exc:
            # Keep the original NameError attached as the explicit cause.
            raise RuntimeError("df_cluster not defined and CLUSTER_CSV is None.") from exc

    # Basic checks
    if 'user_id' not in dfc.columns or 'cluster' not in dfc.columns:
        raise ValueError("df_cluster must contain 'user_id' and 'cluster'.")

    dfc['user_id'] = dfc['user_id'].astype(int)

    # Split users per cluster; building a set collapses duplicate user rows.
    clusters: Dict[int, Set[int]] = {int(c): set(dfc.loc[dfc['cluster']==c, 'user_id'].tolist())
                                     for c in sorted(dfc['cluster'].unique())}

    # Report cluster sizes
    for c in sorted(clusters.keys()):
        print(f"Cluster {c}: {len(clusters[c])} users")

    # Build a fixed random order for each cluster (reused everywhere).
    # Use different seeds per cluster to avoid accidental alignment, but still deterministic.
    cluster_fixed_orders: Dict[int, List[int]] = {}
    for c in sorted(clusters.keys()):
        # Derive a per-cluster seed deterministically from the base seed and cluster id
        per_cluster_seed = RANDOM_SEED + int(c) * 100003  # large co-prime-ish offset
        cluster_fixed_orders[c] = make_random_order_per_cluster(clusters[c], per_cluster_seed)

    # Run for primary (variants & original both in PRIMARY_DIR; ORIGINAL taken from PRIMARY_DIR)
    build_clustered_random_binning(
        kind="primary",
        base_dir=PRIMARY_DIR,
        original_dir=PRIMARY_DIR,
        genres=FIXED_GENRES,
        clusters=clusters,
        cluster_fixed_orders=cluster_fixed_orders
    )

    # Run for enhanced (variants in ENHANCED_DIR; ORIGINAL still from PRIMARY_DIR)
    build_clustered_random_binning(
        kind="enhanced",
        base_dir=ENHANCED_DIR,
        original_dir=PRIMARY_DIR,
        genres=FIXED_GENRES,
        clusters=clusters,
        cluster_fixed_orders=cluster_fixed_orders
    )

# Run main() only when this code is executed directly (__name__ is "__main__"),
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Cluster 0: 17809 users
Cluster 1: 17808 users
Cluster 2: 17807 users
[OK] saved:
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/primary_analysis/figure/Adult/Adult_K15_cluster0_primary.png
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/primary_analysis/figure/Adult/Adult_K15_cluster0_primary_bins.csv
[OK] saved:
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/primary_analysis/figure/Adult/Adult_K15_cluster1_primary.png
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/primary_analysis/figure/Adult/Adult_K15_cluster1_primary_bins.csv
[OK] saved:
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/primary_analysis/figure/Adult/Adult_K15_cluster2_primary.png
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/primary_analysis/figure/Adult/Adult_K15_cluster2_primary_bins.csv
[OK] saved:
  /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top

## Percentage plots (ORIGINAL = 100%)

In [None]:
# ====================== PERCENTAGE PLOTS (ORIGINAL = 100%) ======================
# Saves per-genre percentage bar charts & TXT summaries into:
#   /.../primary_analysis/figure/<GENRE>/percentage/<GENRE>_percent_primary.(png|txt)
#   /.../enhanced_analysis/figure/<GENRE>/percentage/<GENRE>_percent_enhanced.(png|txt)

import os
import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------- CONFIG ----------------------
# Root directory that holds the primary and enhanced analysis folders.
BASE_DIR = Path("/home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923")

# PRIMARY_DIR: source of the ORIGINAL_<K>recommendation.csv baselines and of the
# "primary_p_*" variant files; primary figures are written beneath it.
# ENHANCED_DIR: source of the "improved_*" variant files; enhanced figures are
# written beneath it.
PRIMARY_DIR  = BASE_DIR / "primary_analysis"
ENHANCED_DIR = BASE_DIR / "enhanced_analysis"

# Column names every recommendation CSV must contain.
USER_COL  = "user_id"
GENRE_COL = "genres_all"

# RUNS: variant sizes `n` used in variant file names and in the "n<n>" labels.
# K_LIST: K values used in the "<K>recommendation.csv" file names.
RUNS   = [25, 50, 100, 200]
K_LIST = [15, 25, 35]

# Genres processed by the percentage plots; underscores are turned into spaces
# (and "Children_s" into "children's") before matching against CSV cells.
FIXED_GENRES = [
    "Adult", "Adventure", "Children_s", "Classics", "Drama", "Fantasy",
    "Historical", "Horror", "Mystery", "Nonfiction", "Romance", "Science_Fiction", "Thriller"
]
# ----------------------------------------------------

def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def normalize_genre_for_match(g: str) -> str:
    """
    Canonicalise a genre token for comparison.

    Outer whitespace is stripped, the text is lower-cased, underscores become
    spaces, and the standalone phrase "children s" is rewritten as "children's".
    """
    spaced = g.strip().lower().replace("_", " ")
    return re.sub(r"\bchildren s\b", "children's", spaced)

def split_genres_cell(cell: str):
    """
    Split one genre cell into a list of normalised tokens.

    A missing value (as judged by ``pd.isna``) yields an empty list. Anything
    else is converted to ``str``, split on ';' or ',', and each piece is passed
    through ``normalize_genre_for_match``. Empty pieces are kept.
    """
    if pd.isna(cell):
        return []
    return list(map(normalize_genre_for_match, re.split(r"[;,]", str(cell))))

def count_genre_per_user(csv_path: Path, target_genre_token: str) -> pd.Series:
    """
    Count, per user, the rows of a recommendation CSV that carry the target genre.

    Returns a Series indexed by the USER_COL values whose value is the number
    of rows where the normalised target genre appears among the tokens of
    GENRE_COL.

    Raises:
      FileNotFoundError: ``csv_path`` does not exist.
      ValueError: the CSV lacks USER_COL or GENRE_COL.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing file: {csv_path}")

    frame = pd.read_csv(csv_path)
    if any(col not in frame.columns for col in (USER_COL, GENRE_COL)):
        raise ValueError(f"{csv_path} must contain '{USER_COL}' and '{GENRE_COL}'.")

    wanted = normalize_genre_for_match(target_genre_token)
    # 1 when the row lists the wanted genre, 0 otherwise; summed per user below.
    frame["_hit"] = frame[GENRE_COL].apply(lambda cell: int(wanted in split_genres_cell(cell)))
    return frame.groupby(USER_COL)["_hit"].sum()

def _totals_for_kind(kind: str, base_dir: Path, original_dir: Path, genre: str) -> pd.DataFrame:
    """
    Collect genre totals (summed over users) for the baseline and every variant.

    For each K in K_LIST the ORIGINAL file is read from ``original_dir``; if it
    cannot be loaded a warning is printed and that K is skipped entirely.
    Otherwise each variant n in RUNS is read from ``base_dir``; a variant that
    fails only prints a warning.

    Returns a tidy DataFrame with columns ['kind','genre','K','label','total'],
    where label is 'ORIGINAL' or 'n<n>'. The frame is empty (same columns) when
    nothing could be loaded.

    Raises:
      ValueError: ``kind`` is neither 'primary' nor 'enhanced'.
    """
    if kind == "primary":
        file_prefix = "primary_p_"
    elif kind == "enhanced":
        file_prefix = "improved_"
    else:
        raise ValueError("kind must be 'primary' or 'enhanced'")

    records = []
    for K in K_LIST:
        # Baseline first: without it the variants of this K are not collected.
        baseline_csv = original_dir / f"ORIGINAL_{K}recommendation.csv"
        try:
            baseline_total = int(count_genre_per_user(baseline_csv, genre).sum())
        except Exception as e:
            print(f"[WARN] {kind} | {genre} | K={K}: failed loading ORIGINAL: {e}")
            continue
        records.append({"kind": kind, "genre": genre, "K": K, "label": "ORIGINAL", "total": baseline_total})

        for n in RUNS:
            variant_csv = base_dir / f"{file_prefix}{genre}_{n}_{K}recommendation.csv"
            try:
                variant_total = int(count_genre_per_user(variant_csv, genre).sum())
            except Exception as e:
                # Best effort: a missing variant must not stop the others.
                print(f"[WARN] {kind} | {genre} | K={K} | n={n}: variant missing/invalid: {e}")
                continue
            records.append({"kind": kind, "genre": genre, "K": K, "label": f"n{n}", "total": variant_total})

    if records:
        return pd.DataFrame(records)
    return pd.DataFrame(columns=["kind","genre","K","label","total"])

def _to_percent(df_tot: pd.DataFrame) -> pd.DataFrame:
    """
    Convert totals to percentages relative to ORIGINAL, separately for each K.

    Parameters:
      df_tot: tidy totals with columns ['kind','genre','K','label','total'].

    Returns:
      DataFrame with columns ['kind','genre','K','label','percent'], rows in
      ascending K and otherwise in input order.
        - ORIGINAL rows are exactly 100.0.
        - Variant rows are (variant / original) * 100.
        - When the ORIGINAL total is <= 0, variants are reported as 0.0 to
          avoid dividing by zero.
        - A K without an ORIGINAL row is dropped.
      The result always carries these columns, even when it has no rows, so
      callers can index ``["K"]`` / ``["label"]`` on an empty result.
    """
    columns = ["kind", "genre", "K", "label", "percent"]
    if df_tot.empty:
        return pd.DataFrame(columns=columns)

    out = []
    for K in sorted(df_tot["K"].unique()):
        sub = df_tot[df_tot["K"] == K]
        baseline = sub.loc[sub["label"] == "ORIGINAL", "total"]
        if baseline.empty:
            # no baseline; skip this K
            continue
        orig_total = float(baseline.iloc[0])

        for kind, genre, lab, total in zip(sub["kind"], sub["genre"], sub["label"], sub["total"]):
            if lab == "ORIGINAL":
                pct = 100.0
            elif orig_total <= 0:
                pct = 0.0  # nothing in baseline; show variants as 0% to avoid div-by-zero
            else:
                pct = (float(total) / orig_total) * 100.0
            out.append({"kind": kind, "genre": genre, "K": K, "label": lab, "percent": pct})

    # Passing `columns` keeps the schema stable when `out` is empty.
    return pd.DataFrame(out, columns=columns)

def _write_percent_txt(df_pct: pd.DataFrame, out_txt: Path):
    """
    Write a TXT summary with ORIGINAL=100% and variant percentages per K.

    Parameters:
      df_pct: percentages with columns 'K', 'label' and 'percent'.
      out_txt: destination file; its parent directory is created if needed.

    One line "K=<K> | <label>: <pct>%" is written per label present, in the
    fixed order ORIGINAL, n25, n50, n100, n200, followed by a blank line after
    each K. An empty ``df_pct`` (including one without any columns) produces
    an empty file instead of failing on the missing 'K' column.
    """
    lines = []
    # An empty frame may have no columns at all, so do not index it.
    if not df_pct.empty:
        for K in sorted(df_pct["K"].unique()):
            sub = df_pct[df_pct["K"] == K]
            # Ensure fixed order
            for lab in ["ORIGINAL", "n25", "n50", "n100", "n200"]:
                row = sub[sub["label"] == lab]
                if not row.empty:
                    lines.append(f"K={K} | {lab}: {row['percent'].iloc[0]:.2f}%")
            lines.append("")
    ensure_dir(out_txt.parent)
    with open(out_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

def _plot_percent(df_pct: pd.DataFrame, title: str, out_png: Path):
    """
    Grouped bar chart of percentages: x-axis = K (15/25/35),
    bars = ORIGINAL (100%), n25, n50, n100, n200.

    Parameters:
      df_pct: percentages with columns 'K', 'label' and 'percent'.
      title: figure title (also used in the "nothing to plot" message).
      out_png: destination image; its parent directory is created if needed.

    An empty ``df_pct`` prints an info message and writes nothing. A label
    missing for some K is drawn as a 0.0 bar. The figure is always closed,
    even when drawing or saving fails, so it does not linger in pyplot.
    """
    if df_pct.empty:
        print(f"[INFO] Nothing to plot for {title}")
        return

    labels = ["ORIGINAL","n25","n50","n100","n200"]
    K_vals = sorted(df_pct["K"].unique().tolist())
    # mat[label] holds one bar height per K, aligned with K_vals.
    mat = {lab: [] for lab in labels}
    for K in K_vals:
        sub = df_pct[df_pct["K"] == K]
        for lab in labels:
            row = sub[sub["label"] == lab]
            mat[lab].append(float(row["percent"].iloc[0]) if not row.empty else 0.0)

    x = np.arange(len(K_vals))
    n_series = len(labels)
    width = 0.8 / n_series

    fig, ax = plt.subplots(figsize=(12,6))
    try:
        for i, lab in enumerate(labels):
            # Centre the group of bars on each tick.
            xs = x + (i - (n_series-1)/2.0) * width
            ax.bar(xs, mat[lab], width, label=lab)

        ax.set_xticks(x)
        ax.set_xticklabels([f"K={K}" for K in K_vals])
        ax.set_xlabel("K")
        ax.set_ylabel("Percentage vs ORIGINAL (ORIGINAL = 100%)")
        ax.set_title(title)
        ax.legend()
        # Always leave room above the 100% baseline; grow for larger bars.
        ax.set_ylim(0, max(110, np.nanmax([max(v) for v in mat.values()]) * 1.10))
        ax.grid(axis="y", alpha=0.25)

        ensure_dir(out_png.parent)
        # Act on this figure explicitly instead of pyplot's "current" figure.
        fig.tight_layout()
        fig.savefig(out_png, dpi=160)
    finally:
        plt.close(fig)

def write_percentage_plots():
    """
    Produce the percentage PNG + TXT pair for every genre in FIXED_GENRES.

    Per genre, in this order:
      - PRIMARY outputs in  .../primary_analysis/figure/<GENRE>/percentage/
      - ENHANCED outputs in .../enhanced_analysis/figure/<GENRE>/percentage/
    Both kinds use ORIGINAL from PRIMARY_DIR as the 100% baseline.
    """
    def _emit(kind, variants_dir, genre):
        # Totals -> percentages, with the baseline always read from PRIMARY_DIR.
        pct = _to_percent(_totals_for_kind(kind, variants_dir, PRIMARY_DIR, genre))

        folder = variants_dir / "figure" / genre / "percentage"
        png_path = folder / f"{genre}_percent_{kind}.png"
        txt_path = folder / f"{genre}_percent_{kind}.txt"
        heading = kind.capitalize()

        _write_percent_txt(pct, txt_path)
        _plot_percent(pct, title=f"{heading} – {genre} – % vs ORIGINAL", out_png=png_path)
        print(f"[OK] {heading} percentage written → {png_path} and {txt_path}")

    for g in FIXED_GENRES:
        _emit("primary", PRIMARY_DIR, g)
        _emit("enhanced", ENHANCED_DIR, g)

# Generate all percentage plots and summaries only when this code is executed
# directly (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    write_percentage_plots()


[OK] Primary percentage written → /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/primary_analysis/figure/Adult/percentage/Adult_percent_primary.png and /home/moshtasa/Research/phd-svd-recsys/SVD/Book/result/rec/top_re/0923/primary_analysis/figure/Adult/percentage/Adult_percent_primary.txt
