In [2]:
#!/usr/bin/env python3
# ============================================================
# MovieLens Decade Analysis
# INPUT:
#   Movie /result/rec/1215/{SVD,KNN,NMF}
#   ORIGINAL_<K>recommendation.csv
#   df_biased_<N>_<DECADE>_<K>recommendation.csv
#
# OUTPUT (same model folder):
#   figures/<DECADE>_explanation/
#       <DECADE>__pos5_<MODEL>.png
#       <DECADE>__<MODEL>_per_item_ranking.csv
#       <DECADE>__<MODEL>_explanation.txt
# ============================================================

from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# ===================== ROOT CONFIG =====================
# Directory that contains one sub-folder of recommendation CSVs per model.
BASE_ROOT = Path("/home/moshtasa/Research/phd-svd-recsys/Movie /result/rec/1215")

# Model sub-folders under BASE_ROOT, processed in this order.
MODELS = ["SVD", "KNN", "NMF"]

# <K> values used in the recommendation file names (shown as "Top-K" in plots).
K_LIST = [15, 25, 35]
# <N> values used in the df_biased_<N>_<DECADE>_<K> file names.
FICT_COUNTS = [2, 4, 10, 20, 40]
# Decades analysed: 1920, 1930, ..., 1990.
DECADES = list(range(1920, 2000, 10))

# ===================== HELPERS =====================
def load_csv(fp: Path) -> pd.DataFrame:
    """Read the CSV at ``fp`` into a DataFrame.

    ``low_memory=False`` is forwarded to ``pandas.read_csv``; per the pandas
    documentation this avoids chunked parsing and the mixed-dtype columns it
    can produce.
    """
    frame = pd.read_csv(fp, low_memory=False)
    return frame

def original_file(root: Path, k: int) -> Optional[Path]:
    """Locate ``ORIGINAL_<k>recommendation.csv`` inside ``root``.

    Returns the path when it exists on disk, otherwise ``None``.
    """
    candidate = root / f"ORIGINAL_{k}recommendation.csv"
    if candidate.exists():
        return candidate
    return None

def biased_file(root: Path, n: int, decade: int, k: int) -> Optional[Path]:
    """Locate ``df_biased_<n>_<decade>_<k>recommendation.csv`` inside ``root``.

    Returns the path when it exists on disk, otherwise ``None``.
    """
    candidate = root / f"df_biased_{n}_{decade}_{k}recommendation.csv"
    if candidate.exists():
        return candidate
    return None

def compute_metrics(df: pd.DataFrame, decade: int):
    """Measure how many rows per user belong to ``decade``.

    Reads the ``item_decade`` and ``user_id`` columns of ``df``.

    Returns:
        A ``(avg_user, mask)`` pair: ``avg_user`` is the mean, over users, of
        the number of rows whose ``item_decade`` equals ``decade`` (0.0 when
        there are no users); ``mask`` is the boolean row selector for those
        rows.
    """
    in_decade = df["item_decade"] == decade
    # Count matching rows per user; users with no match contribute a zero.
    hits_per_user = in_decade.groupby(df["user_id"]).sum()
    if hits_per_user.empty:
        return 0.0, in_decade
    return float(hits_per_user.mean()), in_decade

def per_item_ranking(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise recommendation rows per item and rank the items.

    Reads the ``item_id``, ``user_id``, ``rank`` and ``est_score`` columns.
    Each output row describes one item: how often it appears (``freq``), how
    many distinct users it reaches (``users_n``), its mean ``rank`` and
    ``est_score``, and how many times it sits at rank 1 (``rank1_count``).
    Items are ordered by ``freq`` (descending), then ``rank1_count``
    (descending), then ``avg_rank`` (ascending); the leading ``rank`` column
    is the resulting 1-based position.

    An empty input yields an empty frame that still carries the output
    columns.
    """
    if df.empty:
        header = [
            "rank", "item_id", "freq", "users_n",
            "avg_rank", "avg_est_score", "rank1_count",
        ]
        return pd.DataFrame(columns=header)

    def count_first_place(ranks) -> int:
        # Number of rows in this item's group that hold rank 1.
        return int((ranks == 1).sum())

    aggregations = {
        "freq": pd.NamedAgg(column="item_id", aggfunc="size"),
        "users_n": pd.NamedAgg(column="user_id", aggfunc="nunique"),
        "avg_rank": pd.NamedAgg(column="rank", aggfunc="mean"),
        "avg_est_score": pd.NamedAgg(column="est_score", aggfunc="mean"),
        "rank1_count": pd.NamedAgg(column="rank", aggfunc=count_first_place),
    }
    summary = df.groupby("item_id", as_index=False).agg(**aggregations)

    ordered = summary.sort_values(
        by=["freq", "rank1_count", "avg_rank"],
        ascending=[False, False, True],
    )
    ordered = ordered.reset_index(drop=True)
    # Position after sorting becomes the 1-based rank column.
    ordered.insert(0, "rank", ordered.index + 1)
    return ordered

# ===================== PLOT =====================
def plot_decade(fig_dir: Path, model: str, decade: int, data_by_k: Dict[int, Dict[str, float]]):
    """Save a grouped bar chart of the per-user averages for one decade.

    One cluster of bars is drawn per K found in ``data_by_k``. Each cluster
    holds one bar for ``"Original"`` followed by one bar per entry of
    ``FICT_COUNTS``; a series missing from ``data_by_k[k]`` is drawn as 0.0.
    The image is written to ``fig_dir / "<decade>__pos5_<model>.png"``.

    Args:
        fig_dir: Directory that receives the PNG (not created here).
        model: Model name, used in the title and the file name.
        decade: Decade, used in the title and the file name.
        data_by_k: Maps K to ``{series label: average value}``.
    """
    ks = sorted(data_by_k)
    groups = ["Original"] + [str(n) for n in FICT_COUNTS]

    width = 0.8 / len(groups)
    ticks = range(len(ks))
    # Bars are centre-aligned (matplotlib default), so the j-th of n bars must
    # be shifted by (j - (n - 1) / 2) widths for the cluster to sit symmetrically
    # on its tick. Shifting by (j - n / 2) leaves every cluster half a bar-width
    # to the left of its "Top-K" label.
    middle = (len(groups) - 1) / 2

    fig, ax = plt.subplots(figsize=(8.5, 4.5), dpi=160)
    try:
        for j, g in enumerate(groups):
            vals = [data_by_k[k].get(g, 0.0) for k in ks]
            offs = [i + (j - middle) * width for i in ticks]
            ax.bar(offs, vals, width=width, label=g)

        ax.set_xticks(list(ticks))
        ax.set_xticklabels([f"Top-{k}" for k in ks])
        ax.set_ylabel("Avg # movies per user")
        ax.set_title(f"{model} — Decade {decade} (POS=5)")
        ax.legend(ncol=3, fontsize=9)
        fig.tight_layout()

        fig.savefig(fig_dir / f"{decade}__pos5_{model}.png")
    finally:
        # Always release the figure, even if drawing or saving raised.
        plt.close(fig)

# ===================== CORE =====================
def _decade_summary(df: pd.DataFrame, decade: int, model: str, dataset: str):
    """Return ``(avg_per_user, ranked)`` for one recommendation table.

    ``ranked`` is the per-item ranking of the rows of ``df`` that belong to
    ``decade``, tagged with ``model``, ``decade`` and ``dataset`` columns.
    """
    avg_user, mask = compute_metrics(df, decade)
    ranked = per_item_ranking(df.loc[mask])
    ranked["model"] = model
    ranked["decade"] = decade
    ranked["dataset"] = dataset
    return avg_user, ranked


def _combine_rankings(tables: List[pd.DataFrame]) -> pd.DataFrame:
    """Stack ranking tables into one frame, leaving zero-row tables out.

    Zero-row tables add no rows, and passing them to ``pd.concat`` triggers a
    pandas deprecation warning, so only populated tables are concatenated.
    If every table is empty the first one is returned so the CSV still gets
    its header; with no tables at all an empty frame is returned.
    """
    populated = [t for t in tables if not t.empty]
    if populated:
        return pd.concat(populated, ignore_index=True)
    if tables:
        return tables[0]
    return pd.DataFrame()


def process_model(model: str):
    """Produce the per-decade outputs for one model folder.

    For every decade in ``DECADES`` this writes, under
    ``BASE_ROOT/<model>/figures/<decade>_explanation/``:

    * ``<decade>__<model>_per_item_ranking.csv`` - per-item rankings of the
      decade's items for the original and every biased recommendation file
      that exists;
    * ``<decade>__<model>_explanation.txt`` - the average number of decade
      items per user for each dataset and K;
    * ``<decade>__pos5_<model>.png`` - the grouped bar chart of those
      averages.

    Input files that do not exist are skipped.
    """
    model_root = BASE_ROOT / model
    figures_root = model_root / "figures"
    figures_root.mkdir(exist_ok=True)

    print(f"\n===== Processing {model} =====")

    # The ORIGINAL_<K> files do not depend on the decade, so read each of them
    # once per model instead of once per decade.
    originals: Dict[int, pd.DataFrame] = {}
    for k in K_LIST:
        fp = original_file(model_root, k)
        if fp is not None:
            originals[k] = load_csv(fp)

    for decade in DECADES:
        print(f"  -> Decade {decade}")

        decade_dir = figures_root / f"{decade}_explanation"
        decade_dir.mkdir(parents=True, exist_ok=True)

        data_by_k: Dict[int, Dict[str, float]] = {k: {} for k in K_LIST}
        all_rows: List[pd.DataFrame] = []

        # -------- ORIGINAL --------
        for k, df in originals.items():
            avg_user, ranked = _decade_summary(df, decade, model, f"original_{k}")
            data_by_k[k]["Original"] = avg_user
            all_rows.append(ranked)

        # -------- BIASED --------
        for n in FICT_COUNTS:
            for k in K_LIST:
                fp = biased_file(model_root, n, decade, k)
                if fp is None:
                    continue
                avg_user, ranked = _decade_summary(load_csv(fp), decade, model, f"{n}u_{k}")
                data_by_k[k][str(n)] = avg_user
                all_rows.append(ranked)

        # -------- SAVE CSV --------
        final_df = _combine_rankings(all_rows)
        final_df.to_csv(
            decade_dir / f"{decade}__{model}_per_item_ranking.csv",
            index=False
        )

        # -------- SAVE TXT --------
        # The report contains a non-ASCII dash, so fix the encoding rather
        # than rely on the platform default.
        report_path = decade_dir / f"{decade}__{model}_explanation.txt"
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(f"{model} — decade {decade}\n")
            for k in K_LIST:
                for key, val in data_by_k[k].items():
                    f.write(f"{key}, top-{k}, avg_per_user: {val:.4f}\n")

        # -------- SAVE FIG --------
        plot_decade(decade_dir, model, decade, data_by_k)

# ===================== MAIN =====================
def main():
    """Run ``process_model`` for each entry of ``MODELS``, then print a completion line."""
    for model_name in MODELS:
        process_model(model_name)
    print("\n✅ ALL MODELS FINISHED SUCCESSFULLY")


# Run the analysis only when this file is the entry point, not on import.
if __name__ == "__main__":
    main()



===== Processing SVD =====
  -> Decade 1920


  -> Decade 1930
  -> Decade 1940
  -> Decade 1950
  -> Decade 1960
  -> Decade 1970
  -> Decade 1980
  -> Decade 1990

===== Processing KNN =====
  -> Decade 1920


  final_df = pd.concat(all_rows, ignore_index=True)


  -> Decade 1930
  -> Decade 1940
  -> Decade 1950
  -> Decade 1960
  -> Decade 1970
  -> Decade 1980
  -> Decade 1990

===== Processing NMF =====
  -> Decade 1920


  final_df = pd.concat(all_rows, ignore_index=True)


  -> Decade 1930
  -> Decade 1940
  -> Decade 1950
  -> Decade 1960
  -> Decade 1970
  -> Decade 1980
  -> Decade 1990

✅ ALL MODELS FINISHED SUCCESSFULLY
