# Évaluation d'un système de recommandation My Content

Notebook pour entraîner et comparer plusieurs approches de recommandation sur le dataset Kaggle **news-portal-user-interactions-by-globocom**. L'objectif est de montrer clairement chaque étape (du chargement des données jusqu'au choix final du modèle).

> Ce notebook aligne désormais **toutes les approches de recommandation sur la bibliothèque Surprise** (https://surprise.readthedocs.io/) afin de bénéficier d'algorithmes collaboratifs standardisés et faciles à déployer.

In [None]:
# Imports & Config
from __future__ import annotations

import os
import pickle
import sys
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

# Show full cell contents and every column when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

# Ensure the project root is importable
# (the root is taken to be the parent of the resolved current directory).
PROJECT_ROOT = Path('.').resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Configuration
# Input paths, evaluation cut-off and thresholds read by the cells below.
CONFIG = {
    "clicks_dir": "../data/news-portal-user-interactions-by-globocom/clicks",
    "metadata_path": "../data/news-portal-user-interactions-by-globocom/articles_metadata.csv",
    "embeddings_path": "../data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle",
    # None means "no cap"; an int limits how many click files load_clicks reads.
    "max_click_files": None,
    "artifacts_dir": "../artifacts/evaluation",
    "k": 5,
    "random_seed": 42,
    # Passed to temporal_split_per_user: users with fewer interactions are skipped.
    "min_user_interactions": 3,
}
# Seed NumPy's legacy global RNG and make sure the artifacts directory exists.
np.random.seed(CONFIG["random_seed"])
Path(CONFIG["artifacts_dir"]).mkdir(parents=True, exist_ok=True)
print("Config ready", CONFIG)

# Context columns provided in the clicks dataset
# (ensure_context_columns fills any that are missing with "unknown").
CONTEXT_COLUMNS = [
    "click_environment",
    "click_deviceGroup",
    "click_os",
    "click_country",
    "click_region",
    "click_referrer_type",
]

from surprise import Dataset, Reader, NormalPredictor, SVDpp

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize


## Contexte

Nous voulons proposer à chaque lecteur un Top-5 d'articles susceptibles de l'intéresser. Le notebook illustre la démarche de A à Z : préparation des données, construction de différentes familles de modèles puis comparaison à l'aide de métriques de ranking.

## Données

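Les fichiers attendus sont situés dans `../data/news-portal-user-interactions-by-globocom/` (chemins relatifs au notebook, définis dans `CONFIG`) : le dossier `clicks/`, `articles_metadata.csv` et `articles_embeddings.pickle`.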

In [None]:

# Load data utilities


def detect_timestamp_column(df: pd.DataFrame) -> str:
    """Return the first column of *df* whose name looks like a timestamp.

    Columns are scanned in DataFrame order; a column matches when its name,
    or its lower-cased name, is one of the known timestamp names.

    Raises:
        ValueError: if no column matches.
    """
    known_names = ["click_timestamp", "timestamp", "event_time", "ts", "time"]
    match = next(
        (name for name in df.columns if name in known_names or name.lower() in known_names),
        None,
    )
    if match is None:
        raise ValueError("No timestamp-like column found. Expected one of: " + ",".join(known_names))
    return match


def detect_article_column(df: pd.DataFrame) -> str:
    """Return the first column of *df* that is a known article/item id name.

    Matching is exact (case-sensitive) and follows DataFrame column order.

    Raises:
        ValueError: if no column matches.
    """
    known_names = ["click_article_id", "clicked_article_id", "article_id", "item_id", "content_id"]
    matches = [name for name in df.columns if name in known_names]
    if matches:
        return matches[0]
    raise ValueError("No article id column found. Expected one of: " + ",".join(known_names))


def infer_unix_unit(values: pd.Series) -> str:
    """Guess the Unix-epoch unit of numeric timestamps from their magnitude.

    Non-numeric entries are coerced to NaN and ignored. The decision uses the
    largest absolute value, with thresholds spaced a factor of 1000 apart so
    that each unit covers the same calendar span:

    * ``>= 1e17`` -> ``"ns"``
    * ``>= 1e14`` -> ``"us"``
    * ``>= 1e11`` -> ``"ms"``
    * otherwise  -> ``"s"``

    Present-day epochs are about 1e9 s, 1e12 ms, 1e15 us and 1e18 ns, so
    microsecond inputs get their own tier instead of being read as nanoseconds.

    Returns:
        A unit string accepted by ``pd.to_datetime(..., unit=...)``; ``"s"``
        when there is no numeric value at all.
    """
    numeric = pd.to_numeric(values, errors="coerce").dropna()
    if numeric.empty:
        return "s"
    max_abs = numeric.abs().max()
    if max_abs >= 1e17:
        return "ns"
    if max_abs >= 1e14:
        return "us"
    if max_abs >= 1e11:
        return "ms"
    return "s"


def to_timestamp(series: pd.Series) -> pd.Series:
    """Convert *series* to pandas datetimes, whatever its original encoding.

    * datetime-typed input is passed through ``pd.to_datetime``;
    * numeric input is read as a Unix epoch whose unit is inferred by
      ``infer_unix_unit``;
    * anything else is first parsed as date strings; if that yields no valid
      value, the entries are cast to numbers and read as a Unix epoch.

    Unparseable entries become ``NaT``.
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return pd.to_datetime(series)
    if pd.api.types.is_numeric_dtype(series):
        unit = infer_unix_unit(series)
        return pd.to_datetime(series, unit=unit, errors="coerce")

    converted = pd.to_datetime(series, errors="coerce")
    if converted.notna().any():
        return converted

    # Cast explicitly before applying `unit`: pandas deprecates passing strings
    # together with `unit` and will stop treating them as epoch numbers.
    numeric = pd.to_numeric(series, errors="coerce")
    unit = infer_unix_unit(numeric)
    return pd.to_datetime(numeric, unit=unit, errors="coerce")


def list_click_files(path: Union[str, Path]) -> List[Path]:
    """List the click files designated by *path*.

    A directory yields its ``clicks_hour_*.csv`` files in sorted order, an
    existing file yields a one-element list, anything else an empty list.
    """
    location = Path(path)
    if location.is_dir():
        return sorted(location.glob("clicks_hour_*.csv"))
    return [location] if location.is_file() else []


def ensure_context_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* in which session_size and context columns exist.

    Missing ``session_size`` is filled with 1 and each missing column of
    ``CONTEXT_COLUMNS`` with ``"unknown"``; existing columns are untouched.
    """
    result = df.copy()
    fallbacks = [("session_size", 1)] + [(name, "unknown") for name in CONTEXT_COLUMNS]
    for name, fallback in fallbacks:
        if name not in result.columns:
            result[name] = fallback
    return result


def create_synthetic_clicks(path: str, n_users: int = 50, n_items: int = 120, days: int = 30, interactions_per_user: int = 25) -> pd.DataFrame:
    """Create a small synthetic clicks dataset to keep the notebook runnable.

    Every user gets ``interactions_per_user`` clicks on random article ids in
    ``1..n_items``, on random day offsets in ``[0, days)`` from 2022-01-01,
    together with random session sizes and context values. The frame is
    sorted by timestamp, written to *path* as CSV (parent directories are
    created) and returned.

    Args:
        path: Destination CSV path; it is wrapped in ``Path`` to create the
            parent directory and handed to ``DataFrame.to_csv``.
        n_users: Number of users, numbered from 1.
        n_items: Number of distinct article ids to draw from.
        days: Width, in days, of the window the clicks fall in.
        interactions_per_user: Number of clicks generated per user.

    Returns:
        The generated clicks DataFrame.
    """
    # Dedicated generator seeded from CONFIG so the sample is reproducible.
    rng = np.random.default_rng(CONFIG["random_seed"])
    start = pd.Timestamp("2022-01-01")
    envs = ["web", "app"]
    devices = ["mobile", "desktop"]
    oss = ["ios", "android", "linux"]
    referrers = ["direct", "search", "social"]
    records = []
    for user in range(1, n_users + 1):
        offsets = rng.integers(0, days, size=interactions_per_user)
        # Offsets are sorted so each user's clicks are in chronological order.
        timestamps = [start + pd.Timedelta(int(o), unit="D") for o in sorted(offsets.tolist())]
        articles = rng.integers(1, n_items + 1, size=interactions_per_user)
        for ts, art in zip(timestamps, articles):
            records.append({
                "user_id": int(user),
                "article_id": int(art),
                "timestamp": ts,
                "session_size": int(rng.integers(1, 6)),
                "click_environment": rng.choice(envs),
                "click_deviceGroup": rng.choice(devices),
                "click_os": rng.choice(oss),
                "click_country": rng.choice(["fr", "us", "br"]),
                "click_region": rng.choice(["idf", "sp", "ca"]),
                "click_referrer_type": rng.choice(referrers),
            })
    df = pd.DataFrame(records).sort_values("timestamp").reset_index(drop=True)
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    print(
        f"Synthetic clicks dataset created at {path} "
        f"(users={n_users}, items={n_items}, interactions={len(df)})"
    )
    return df


def load_clicks(path: str, max_files: Optional[int] = None) -> pd.DataFrame:
    """Load and concatenate the hourly click files found at *path*.

    When no file is found, a synthetic dataset is generated under *path* and
    returned instead. Otherwise at most ``max_files`` files (all of them when
    ``None``) are read; in each one the timestamp and article columns are
    detected, converted and renamed to ``timestamp`` / ``article_id``, missing
    context columns are filled, and only the known columns are kept. The
    result is sorted by timestamp with a fresh index.
    """
    files = list_click_files(path)
    total_files = len(files)
    if total_files == 0:
        print(f"Clicks directory not found at {path}. Generating a synthetic sample for demonstration.")
        synthetic = create_synthetic_clicks(Path(path) / "clicks_hour_000.csv")
        return ensure_context_columns(synthetic)

    if max_files is not None:
        print(f"Limite explicite max_files={max_files}, total détecté={total_files}")
        files = files[:max_files]

    limit_label = max_files if max_files is not None else 'aucune'
    print(f"Chargement de {len(files)} fichiers clicks (total détecté={total_files}, limite={limit_label})")

    wanted = ["user_id", "article_id", "timestamp", "session_size", *CONTEXT_COLUMNS]

    def _read_one(file: Path) -> pd.DataFrame:
        # Normalise one hourly file to the shared column layout.
        raw = pd.read_csv(file)
        ts_col = detect_timestamp_column(raw)
        article_col = detect_article_column(raw)
        raw[ts_col] = to_timestamp(raw[ts_col])
        renamed = raw.rename(columns={ts_col: "timestamp", article_col: "article_id"})
        completed = ensure_context_columns(renamed)
        return completed[[col for col in wanted if col in completed.columns]]

    frames = [_read_one(file) for file in files]
    combined = (
        pd.concat(frames, ignore_index=True)
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    print(f"Clicks agrégés : {len(combined)} lignes, {combined['user_id'].nunique()} utilisateurs uniques, {combined['article_id'].nunique()} articles uniques.")
    return combined


def load_metadata(path: str) -> Optional[pd.DataFrame]:
    """Read the article metadata CSV at *path*.

    Returns ``None`` (after printing a message) when the file does not exist
    or has no ``article_id`` column; otherwise returns the loaded DataFrame.
    """
    if os.path.exists(path):
        frame = pd.read_csv(path)
        if "article_id" in frame.columns:
            return frame
        print("Metadata missing 'article_id' column. Ignoring metadata.")
        return None
    print(f"Metadata file not found at {path}. Utilisation du pipeline Surprise uniquement si les métadonnées sont absentes.")
    return None


# Load both sources once; `metadata` is None when the file is missing or
# lacks an article_id column (see load_metadata).
clicks = load_clicks(CONFIG["clicks_dir"], max_files=CONFIG["max_click_files"])
metadata = load_metadata(CONFIG["metadata_path"])
print(clicks.head())
print("Metadata loaded:", metadata is not None)



## Analyse exploratoire des données

Courte photographie des fichiers sources immédiatement après le chargement :
- nombre de lignes et noms de colonnes des clics
- volumes et intégrité des métadonnées articles
- dimensions et structure du fichier d'`articles_embeddings`.

In [None]:
# EDA rapide sur les données sources
import pickle
from pathlib import Path
from collections.abc import Mapping


def summarize_timestamps(series: pd.Series):
    """Summarise a timestamp series.

    Returns a 4-tuple ``(min, max, daily, hourly)`` where ``daily`` and
    ``hourly`` are DataFrames counting clicks per calendar date and per hour
    of day, sorted by date / hour, with the count in ``nb_clicks``.
    """
    stamps = pd.to_datetime(series)

    def _count_by(values: pd.Series, label: str) -> pd.DataFrame:
        # Frequency table ordered by the grouping key rather than by count.
        return values.value_counts().sort_index().rename_axis(label).reset_index(name="nb_clicks")

    per_day = _count_by(stamps.dt.date, "date")
    per_hour = _count_by(stamps.dt.hour, "hour")
    return stamps.min(), stamps.max(), per_day, per_hour


def describe_structure(obj, prefix="embeddings", max_depth=4):
    """Describe a nested object as a flat list of per-path dictionaries.

    Each entry holds the path (``chemin``) and type name, plus ``shape`` or
    ``len`` and ``dtype`` when available. NumPy arrays additionally get their
    structured field names and, for non-empty 1-D arrays of non-container
    elements, a short ``exemple`` of the first three values. Mappings are
    followed through every key; lists, tuples and arrays only through their
    first element. Nodes at depth ``max_depth`` are recorded, not expanded.
    """
    rows = []
    container_types = (np.ndarray, list, tuple, Mapping)

    def record(node, location):
        info = {"chemin": location, "type": type(node).__name__}
        if hasattr(node, "shape"):
            info["shape"] = tuple(node.shape)
        elif hasattr(node, "__len__") and not isinstance(node, (str, bytes)):
            info["len"] = len(node)
        if hasattr(node, "dtype"):
            info["dtype"] = str(node.dtype)
        if isinstance(node, np.ndarray):
            if node.dtype.names:
                info["dtype_fields"] = list(node.dtype.names)
            if node.ndim == 1 and len(node) > 0 and not isinstance(node[0], container_types):
                info["exemple"] = repr(node[:3].tolist())
        rows.append(info)

    def visit(node, location, level):
        record(node, location)
        if level >= max_depth:
            return
        if isinstance(node, Mapping):
            for key, child in node.items():
                visit(child, f"{location}.{key}", level + 1)
            return
        # Sequences are sampled through their first element only.
        if isinstance(node, (list, tuple, np.ndarray)) and len(node) > 0:
            visit(node[0], f"{location}[0]", level + 1)

    visit(obj, prefix, 0)
    return rows


# Per-file overview: inspect at most the first two raw click files.
click_files = list_click_files(CONFIG["clicks_dir"])
print(f"Nombre total de fichiers clicks détectés: {len(click_files)}")
if not click_files:
    print("Aucun fichier clicks trouvé au chemin configuré. Vérifiez le téléchargement des données.")

files_for_eda = click_files[:2]
per_file_stats = []
for file in files_for_eda:
    # Each file is re-read raw so the original column names are reported.
    df_file = pd.read_csv(file)
    ts_col = detect_timestamp_column(df_file)
    article_col = detect_article_column(df_file)
    timestamps = to_timestamp(df_file[ts_col])
    per_file_stats.append(
        {
            "fichier": file.name,
            "nb_lignes": len(df_file),
            "colonnes": ", ".join(df_file.columns),
            "articles_uniques": df_file[article_col].nunique(),
            "horodatage_min": timestamps.min(),
            "horodatage_max": timestamps.max(),
        }
    )
if per_file_stats:
    display(pd.DataFrame(per_file_stats))
else:
    print("Pas assez de fichiers pour réaliser une EDA détaillée par fichier.")

# Summary of the aggregated clicks frame: size, unique users and articles,
# covered time window and click distribution per day and per hour.
print("=== Clicks (agrégés) ===")
if clicks.empty:
    print("Aucun clic chargé. Vérifier le chemin ou augmenter max_click_files.")
else:
    clicks_summary = {
        "nb_lignes": len(clicks),
        "colonnes": ", ".join(clicks.columns),
        "utilisateurs_uniques": clicks['user_id'].nunique() if 'user_id' in clicks else None,
        "articles_uniques": clicks['article_id'].nunique() if 'article_id' in clicks else None,
    }
    display(pd.DataFrame([clicks_summary]))

    # The article total comes from metadata when available, otherwise from
    # the articles that actually appear in the clicks.
    total_articles = None
    if metadata is not None and 'article_id' in metadata:
        total_articles = metadata['article_id'].nunique()
    elif 'article_id' in clicks:
        total_articles = clicks['article_id'].nunique()

    total_clients = clicks['user_id'].nunique() if 'user_id' in clicks else None
    print("Synthèse globale (articles / clients)")
    display(pd.DataFrame([{
        'nombre_total_articles': total_articles,
        'nombre_total_clients': total_clients,
    }]))

    ts_min, ts_max, daily, hourly = summarize_timestamps(clicks['timestamp'])
    display(pd.DataFrame([
        {
            'horodatage_min': ts_min,
            'horodatage_max': ts_max,
            # +1 so that a window starting and ending the same day counts as 1 day.
            'fenetre_jours': (ts_max - ts_min).days + 1,
        }
    ]))
    print("Répartition par jour (jusqu'à 10 premières valeurs)")
    display(daily.head(10))
    print("Répartition par heure (0-23)")
    display(hourly)

# Summary of the article metadata: size, missing values per column, creation
# date range and how many article ids also appear in the clicks.
print("=== Métadonnées des articles ===")
if metadata is None:
    print("Aucun fichier metadata chargé.")
else:
    meta_summary = {
        "nb_articles": len(metadata),
        "colonnes": ", ".join(metadata.columns),
        "articles_uniques": metadata['article_id'].nunique() if 'article_id' in metadata else None,
    }
    display(pd.DataFrame([meta_summary]))
    # Missing-value count per column, most incomplete columns first.
    missing = metadata.isna().sum().sort_values(ascending=False)
    display(missing.to_frame('valeurs_manquantes'))
    if 'created_at_ts' in metadata.columns:
        created = to_timestamp(metadata['created_at_ts'])
        display(pd.DataFrame([{'premier_article': created.min(), 'dernier_article': created.max()}]))
    if 'article_id' in metadata.columns:
        # `overlap` holds the article ids seen in clicks; `coverage` counts
        # those that are also described in the metadata.
        overlap = set(clicks['article_id'].unique()) if 'article_id' in clicks.columns else set()
        coverage = len(overlap & set(metadata['article_id'].unique()))
        print(f"Articles présents dans clicks et metadata: {coverage}")


# Load the pickled article embeddings, if the file exists, and inspect them.
print("=== Embeddings d'articles ===")
embeddings_path = Path(CONFIG['embeddings_path'])
if embeddings_path.exists():
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only point CONFIG['embeddings_path'] at trusted files.
    with embeddings_path.open('rb') as f:
        embeddings_obj = pickle.load(f)
    print(f"Type chargé: {type(embeddings_obj)}")

    def summarize_matrix(mat):
        """Collect shape, dtype, row-width and norm statistics for *mat*.

        Always reports ``shape`` and ``dtype`` (``None`` when absent). Row
        widths (``profondeur_*``) come from ``shape[1]`` for objects with at
        least two dimensions, otherwise from the length of each sized row of
        a list / tuple / array. For 2-D objects the number of vectors, their
        dimension and the min / max / mean L2 norm of the rows are added.
        """
        shape = getattr(mat, 'shape', None)
        stats = {'shape': shape, 'dtype': getattr(mat, 'dtype', None)}

        widths = []
        if shape is not None and len(shape) >= 2:
            widths.append(shape[1])
        elif isinstance(mat, (list, tuple, np.ndarray)):
            for row in mat:
                if isinstance(row, (str, bytes)) or not hasattr(row, '__len__'):
                    continue
                try:
                    widths.append(len(row))
                except TypeError:
                    continue

        if widths:
            stats['profondeur_min'] = min(widths)
            stats['profondeur_moyenne'] = float(np.mean(widths))
            stats['profondeur_max'] = max(widths)

        if hasattr(mat, 'shape') and len(mat.shape) == 2:
            row_norms = np.linalg.norm(mat, axis=1)
            stats['nb_vectors'] = mat.shape[0]
            stats['dim'] = mat.shape[1]
            stats['norm_min'] = row_norms.min()
            stats['norm_max'] = row_norms.max()
            stats['norm_moyenne'] = row_norms.mean()
        return stats

    # Generic nested description of the loaded object, reused by every branch.
    base_structure = describe_structure(embeddings_obj, max_depth=4)

    if isinstance(embeddings_obj, dict):
        # Dict layout: expects an 'embeddings' matrix and, optionally, the
        # article ids under 'articles_ids' or 'article_ids'.
        keys = list(embeddings_obj.keys())
        print(f"Clés disponibles: {keys}")
        matrix = embeddings_obj.get('embeddings')
        # Choose the id container with explicit checks: `a or b` raises
        # ValueError when `a` is a NumPy array holding more than one element.
        ids = embeddings_obj.get('articles_ids')
        if ids is None or len(ids) == 0:
            ids = embeddings_obj.get('article_ids')

        structure = base_structure.copy()
        if ids is not None:
            structure.insert(0, {
                'chemin': 'embeddings.article_ids',
                'type': type(ids).__name__,
                'len': len(ids),
                'note': "Identifiants d'articles fournis dans le fichier",
            })
        if structure:
            print("Structure détaillée de l'objet d'embeddings (par chemin de clé):")
            display(pd.DataFrame(structure))

        if matrix is not None:
            stats = summarize_matrix(matrix)
            stats.update(
                {
                    'colonnes': ", ".join(keys),
                    'nb_articles_ids': len(ids) if ids is not None else None,
                    'ids_uniques': len(set(ids)) if ids is not None else None,
                    # Number of embedded ids that are also present in metadata / clicks.
                    'couverture_metadata': len(set(ids) & set(metadata['article_id']))
                    if (metadata is not None and ids is not None and 'article_id' in metadata)
                    else None,
                    'couverture_clicks': len(set(ids) & set(clicks['article_id']))
                    if (not clicks.empty and ids is not None and 'article_id' in clicks)
                    else None,
                }
            )
            display(pd.DataFrame([stats]))

            if ids is not None:
                sample_ids = ids[:5] if len(ids) >= 5 else ids
                print("Aperçu des premiers article_id liés aux embeddings:")
                display(pd.DataFrame({'article_id': sample_ids}))

            # Only preview columns when the matrix really has a second axis;
            # indexing shape[1] on a 1-D object would raise IndexError.
            matrix_shape = getattr(matrix, 'shape', ())
            n_dims = matrix_shape[1] if len(matrix_shape) >= 2 else 0
            preview_cols = [f"emb_{i}" for i in range(min(5, n_dims))]
            if preview_cols:
                preview = pd.DataFrame(matrix[:5, : len(preview_cols)], columns=preview_cols)
                if ids is not None:
                    preview.insert(0, 'article_id', ids[: len(preview)])
                print("Aperçu des embeddings (quelques colonnes et premières lignes):")
                display(preview)
                print("Colonnes affichées pour l'aperçu des embeddings:")
                print(", ".join(preview.columns))

                if ids is not None and metadata is not None and 'article_id' in metadata:
                    # Join the previewed ids to a few metadata columns to show the linkage.
                    meta_cols = [c for c in ['title', 'category_id', 'created_at_ts', 'publisher'] if c in metadata.columns]
                    meta_sample = (
                        preview[['article_id']]
                        .merge(metadata[['article_id'] + meta_cols], on='article_id', how='left')
                    )
                    if 'created_at_ts' in meta_sample.columns:
                        meta_sample['created_at_ts'] = to_timestamp(meta_sample['created_at_ts'])
                    print("Exemple de liaison embedding -> metadata sur article_id (5 premières lignes):")
                    display(meta_sample.head())
        else:
            print("Aucune matrice d'embeddings explicite trouvée dans l'objet chargé.")
    elif hasattr(embeddings_obj, 'shape'):
        # Array-like layout: the object itself is the matrix and carries no ids.
        stats = summarize_matrix(embeddings_obj)

        # Without explicit ids, the only mapping attempted is "row i of the
        # matrix <-> row i of metadata", and only when both have the same length.
        inferred_ids = None
        mapping_note = None
        if metadata is not None and 'article_id' in metadata and hasattr(embeddings_obj, 'shape'):
            if embeddings_obj.shape[0] == len(metadata):
                inferred_ids = metadata['article_id'].reset_index(drop=True)
                mapping_note = (
                    "Aucun article_id explicite fourni ; association supposée alignée sur l'ordre des metadata."
                )
            else:
                mapping_note = (
                    "Aucun article_id dans le fichier d'embeddings et la taille ne correspond pas aux metadata : "
                    f"{embeddings_obj.shape[0]} vecteurs vs {len(metadata)} lignes de metadata."
                )
        else:
            mapping_note = (
                "Aucun identifiant d'article n'est présent dans le fichier d'embeddings (mapping externe requis)."
            )

        structure = base_structure.copy()
        if inferred_ids is not None:
            structure.insert(0, {
                'chemin': 'embeddings.article_id (inféré)',
                'type': type(inferred_ids).__name__,
                'len': len(inferred_ids),
                'note': "Alignement supposé sur metadata.article_id (index identique).",
            })
        if structure:
            print("Structure détaillée de l'objet d'embeddings (par chemin de clé):")
            display(pd.DataFrame(structure))

        if mapping_note:
            print(mapping_note)

        if inferred_ids is not None:
            stats.update(
                {
                    'ids_source': 'metadata.article_id (alignement par index)',
                    'ids_uniques': inferred_ids.nunique(),
                    'couverture_metadata': len(set(inferred_ids) & set(metadata['article_id'])),
                    'couverture_clicks': len(set(inferred_ids) & set(clicks['article_id'])) if not clicks.empty else None,
                }
            )

        display(pd.DataFrame([stats]))
        if len(getattr(embeddings_obj, 'shape', [])) >= 2 and embeddings_obj.shape[1] > 0:
            # Preview the first rows and at most five embedding dimensions.
            preview_cols = [f"emb_{i}" for i in range(min(5, embeddings_obj.shape[1]))]
            preview = pd.DataFrame(embeddings_obj[:5, : len(preview_cols)], columns=preview_cols)
            if inferred_ids is not None:
                preview.insert(0, 'article_id', inferred_ids.iloc[: len(preview)].values)
            print("Aperçu direct de la matrice d'embeddings:")
            display(preview)
            print("Colonnes affichées pour l'aperçu des embeddings:")
            print(", ".join(preview.columns))

            if inferred_ids is not None and metadata is not None:
                meta_cols = [c for c in ['title', 'category_id', 'created_at_ts', 'publisher'] if c in metadata.columns]
                meta_sample = preview[['article_id']].merge(
                    metadata[['article_id'] + meta_cols], on='article_id', how='left'
                )
                if 'created_at_ts' in meta_sample.columns:
                    meta_sample['created_at_ts'] = to_timestamp(meta_sample['created_at_ts'])
                print("Exemple de liaison embedding -> metadata sur article_id (inféré):")
                display(meta_sample.head())
        # NOTE(review): this `else` pairs with the `if len(...) >= 2` just above,
        # so the message is only printed for objects that have a `shape` but
        # fewer than two dimensions (or zero columns). Objects that are neither
        # a dict nor shaped produce no output at all — confirm whether the
        # message was meant for that case instead.
        else:
            print("Objet chargé non structuré, utilisez type/len pour investiguer.")
else:
    print(f"Fichier d'embeddings introuvable à {embeddings_path}")





# Article Embeddings

Ce fichier contient les **embeddings des articles**, c’est-à-dire une **représentation numérique du contenu textuel** permettant de comparer les articles entre eux sur le plan sémantique.

* **Format** : matrice NumPy `(N, 250)` en `float32`
* **1 ligne = 1 article**
* **250 colonnes = dimensions latentes**
* Les valeurs individuelles n’ont pas de signification directe

L’`article_id` n’est **pas stocké explicitement** : il est **déduit de l’ordre des lignes**, qui doit rester aligné avec les métadonnées des articles.

La variable `words_count` indique le **nombre de mots du texte source** et sert uniquement d’indicateur de qualité du contenu.

Les embeddings **ne sont pas normalisés** : la **similarité cosinus** est la mesure recommandée pour comparer les articles.


## Préparation des données et split temporel

Les splits sont réalisés **par utilisateur**, en conservant l'ordre chronologique :
- historique utilisateur ordonné
- 1 interaction la plus récente pour le test
- 1 interaction juste avant pour la validation
- le reste pour l'entraînement

Cette stratégie garantit l'absence de fuite d'information tout en restant robuste pour des historiques courts.


In [None]:
# Data preparation utilities

def clean_interactions(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, typed and ordered copy of the interactions.

    ``user_id`` and ``article_id`` are coerced to numbers and ``timestamp`` to
    datetimes; rows where any of the three cannot be parsed are dropped. The
    ids are then cast to ``int`` and the rows sorted by user and timestamp
    with a fresh index. The input frame is not modified.
    """
    coerced = df.assign(
        user_id=pd.to_numeric(df["user_id"], errors="coerce"),
        article_id=pd.to_numeric(df["article_id"], errors="coerce"),
        timestamp=pd.to_datetime(df["timestamp"], errors="coerce"),
    )
    valid = coerced.dropna(subset=["user_id", "article_id", "timestamp"])
    typed = valid.astype({"user_id": int, "article_id": int})
    return typed.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

def build_user_histories(df: pd.DataFrame) -> Dict[int, List[int]]:
    """Map each user id to the list of article ids they clicked, oldest first."""
    chronological = df.sort_values(["user_id", "timestamp"])
    articles_by_user = chronological.groupby("user_id")["article_id"]
    histories = articles_by_user.apply(list)
    return histories.to_dict()

def temporal_split_per_user(df: pd.DataFrame, min_interactions: int = 3) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split interactions per user in chronological order.

    For every retained user, the most recent interaction goes to test, the
    one just before to validation and any earlier ones to train.

    Args:
        df: Interactions with ``user_id`` and ``timestamp`` columns.
        min_interactions: Users with fewer interactions are skipped. Values
            below 2 are treated as 2, because one validation and one test row
            are always taken.

    Returns:
        ``(train_df, val_df, test_df)``. ``train_df`` has a fresh index;
        ``val_df`` and ``test_df`` keep the row labels of *df*. Each frame is
        empty, with the columns of *df*, when it receives no row.
    """
    # At least two rows are needed to take both a validation and a test row;
    # without this floor `iloc[-2]` raises IndexError on single-row users.
    required = max(min_interactions, 2)
    train_rows = []
    val_rows = []
    test_rows = []
    for user_id, group in df.groupby("user_id"):
        group = group.sort_values("timestamp")
        if len(group) < required:
            continue
        test_rows.append(group.iloc[-1])
        val_rows.append(group.iloc[-2])
        if len(group) > 2:
            train_rows.append(group.iloc[:-2])
    train_df = pd.concat(train_rows, ignore_index=True) if train_rows else pd.DataFrame(columns=df.columns)
    val_df = pd.DataFrame(val_rows) if val_rows else pd.DataFrame(columns=df.columns)
    test_df = pd.DataFrame(test_rows) if test_rows else pd.DataFrame(columns=df.columns)
    return train_df, val_df, test_df

# Clean the raw clicks, build per-user histories and split per user.
clicks_clean = clean_interactions(clicks)
user_histories = build_user_histories(clicks_clean)
train_df, val_df, test_df = temporal_split_per_user(clicks_clean, CONFIG["min_user_interactions"])

print(f"Interactions totales après nettoyage: {len(clicks_clean)}")
print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")

# Candidate items are derived from training data only
candidate_items = sorted(train_df["article_id"].unique().tolist())
print(f"Catalog (train only): {len(candidate_items)} items")

# Leakage checks
def assert_no_leakage(train_df: pd.DataFrame, val_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
    """Check the per-user chronological order train <= validation <= test.

    Repeated items across splits are allowed; only timestamps are compared.
    Raises ``AssertionError`` on the first user whose latest training (resp.
    validation) interaction is later than their earliest validation / test
    (resp. test) interaction.
    """
    train_last = train_df.groupby("user_id")["timestamp"].max()
    val_first = val_df.groupby("user_id")["timestamp"].min()
    test_first = test_df.groupby("user_id")["timestamp"].min()

    for user_id, last_train_ts in train_last.items():
        if user_id in val_first.index:
            assert last_train_ts <= val_first[user_id], "Temporal leakage train->val"
        if user_id in test_first.index:
            assert last_train_ts <= test_first[user_id], "Temporal leakage train->test"

    # The validation/test comparison only makes sense when both splits have rows.
    if val_df.empty or test_df.empty:
        return
    val_last = val_df.groupby("user_id")["timestamp"].max()
    for user_id, last_val_ts in val_last.items():
        if user_id in test_first.index:
            assert last_val_ts <= test_first[user_id], "Temporal leakage val->test"

# Fail fast if the split violates the per-user chronological order.
assert_no_leakage(train_df, val_df, test_df)

# Set of training articles per user; the recommenders use it to exclude
# items the user has already seen.
train_user_items = train_df.groupby("user_id")["article_id"].apply(set).to_dict()

print(f"Users in train: {len(train_user_items)}")

display(train_df.head())


## Modèles et métriques de ranking

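Les modèles sont entraînés **indépendamment** et évalués sur le même jeu de candidats : les articles les plus populaires du jeu d'entraînement (limités à `TOP_N_CANDIDATES`), dont on retire pour chaque utilisateur les articles qu'il a déjà vus en train.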
Pour le modèle avec pondération de session, nous transformons le signal implicite en :
`rating = rating_base + log1p(session_size)` afin de renforcer légèrement les sessions plus longues.

Le **rating_base** est défini comme `1 + 0.01 * rang_normalisé` (rang temporel par utilisateur),
ce qui introduit une légère variance nécessaire au calcul des prédictions sans changer la logique implicite.

**Note** : les modèles SVD++ exploitent à la fois les notes implicites et l'historique d'interactions
pour apprendre des facteurs latents (pas de similarité de contenu textuel direct).

---


In [None]:
# Modeling and evaluation


def build_surprise_trainset(df: pd.DataFrame, rating_col: str) -> Tuple[Dataset, object]:
    """Build a Surprise dataset and its full trainset from *df*.

    The rating scale handed to ``Reader`` is the observed min / max of
    ``rating_col``; the dataset is built from the ``user_id``, ``article_id``
    and ``rating_col`` columns, in that order.

    Returns:
        ``(data, trainset)`` where ``trainset`` is ``data.build_full_trainset()``.
    """
    ratings = df[rating_col]
    scale = (float(ratings.min()), float(ratings.max()))
    triples = df[["user_id", "article_id", rating_col]]
    data = Dataset.load_from_df(triples, Reader(rating_scale=scale))
    return data, data.build_full_trainset()


from typing import Any, Dict, List, Tuple, Callable, Optional
import heapq
import multiprocessing as mp
import time
import numpy as np
import pandas as pd

# ----------------------------
# 1) Candidate generation
# ----------------------------
# Size of the shared candidate pool scored for every user.
TOP_N_CANDIDATES = 5000  # tune: 2_000 / 5_000 / 10_000 / 20_000
# Article ids ranked by number of training interactions, most clicked first,
# truncated to TOP_N_CANDIDATES.
popular_candidates: List[int] = (
    train_df.groupby("article_id")
    .size()
    .sort_values(ascending=False)
    .head(TOP_N_CANDIDATES)
    .index.astype(int)
    .tolist()
)

# Fallback when the popularity ranking came out empty: use every distinct
# training article id.
if not popular_candidates:
    popular_candidates = train_df["article_id"].dropna().astype(int).unique().tolist()


# ----------------------------
# 2) Fast recommenders
# ----------------------------

def recommend_random(rng: np.random.Generator, user_id: int, k: int) -> List[int]:
    """Return up to *k* random popular candidates the user has not seen in train.

    The unseen candidates are shuffled in place with *rng*; an empty list is
    returned (and *rng* is left untouched) when nothing is left to recommend.
    """
    already_seen = train_user_items.get(int(user_id), set())
    pool = [candidate for candidate in popular_candidates if candidate not in already_seen]
    if pool:
        rng.shuffle(pool)
    return pool[:k]


def recommend_surprise_fast(model: Any, user_id: int, k: int) -> List[int]:
    """Return the *k* best-scored unseen popular candidates for a user.

    Only the shared popular-candidate pool is scored, minus the items the
    user already has in train. Each remaining item is scored with
    ``model.predict(user, item).est`` and the top *k* are selected with
    ``heapq.nlargest``, which avoids sorting the whole list.
    """
    uid = int(user_id)
    already_seen = train_user_items.get(uid, set())
    scored: List[Tuple[int, float]] = [
        (int(item), float(model.predict(uid, int(item)).est))
        for item in popular_candidates
        if item not in already_seen
    ]
    best = heapq.nlargest(k, scored, key=lambda pair: pair[1])
    return [item for item, _ in best]


def recommend_surprise(model: Any, user_id: int, k: int) -> List[int]:
    """Recommend *k* items for *user_id*; delegates to recommend_surprise_fast."""
    return recommend_surprise_fast(model, user_id, k)


class HybridSvdppKnnModel:
    """Blend the estimates of two models with fixed weights.

    ``predict`` returns the first model's prediction with its ``est`` field
    replaced by ``svdpp_weight * est_svdpp + knn_weight * est_knn``.
    """

    def __init__(
        self,
        svdpp_weighted_model,
        knn_model,
        svdpp_weight: float = 0.6,
        knn_weight: float = 0.4,
    ) -> None:
        # Wrapped models; both must expose predict(user_id, item_id).
        self.svdpp_weighted_model, self.knn_model = svdpp_weighted_model, knn_model
        # Linear blending weights, applied as given (no normalisation).
        self.svdpp_weight, self.knn_weight = svdpp_weight, knn_weight

    def predict(self, user_id, item_id):
        """Return the first model's prediction carrying the blended estimate."""
        base = self.svdpp_weighted_model.predict(user_id, item_id)
        neighbour = self.knn_model.predict(user_id, item_id)
        return base._replace(
            est=self.svdpp_weight * base.est + self.knn_weight * neighbour.est
        )


# ----------------------------
# 3) Metrics
# ----------------------------

def precision_at_k(recommended: List[int], relevant: List[int], k: int) -> float:
    """Return precision@k: hits among the first *k* recommendations, over *k*.

    The denominator is always *k*, even when fewer than *k* items were
    recommended. Returns 0.0 when nothing was recommended or when *k* is not
    positive, so a zero cut-off cannot raise ZeroDivisionError.
    """
    if k <= 0 or not recommended:
        return 0.0
    relevant_set = set(relevant)
    hits = sum(1 for item in recommended[:k] if item in relevant_set)
    return hits / k


def recall_at_k(recommended: List[int], relevant: List[int], k: int) -> float:
    """Return recall@k: hits among the first *k* recommendations, divided by
    the number of distinct relevant items (0.0 when there is none)."""
    if not relevant:
        return 0.0
    targets = set(relevant)
    found = [item for item in recommended[:k] if item in targets]
    return len(found) / len(targets)


def average_precision_at_k(recommended: List[int], relevant: List[int], k: int) -> float:
    """Return average precision@k for one user.

    Sums precision at each rank (within the first *k*) where a relevant item
    appears, then divides by ``min(number of distinct relevant items, k)``.
    Returns 0.0 when there is no relevant item or when *k* is not positive,
    so a zero cut-off cannot raise ZeroDivisionError.
    """
    if k <= 0 or not relevant:
        return 0.0
    relevant_set = set(relevant)
    ap = 0.0
    hits = 0
    for idx, item in enumerate(recommended[:k], start=1):
        if item in relevant_set:
            hits += 1
            ap += hits / idx
    return ap / min(len(relevant_set), k)


def ndcg_at_k(recommended: List[int], relevant: List[int], k: int) -> float:
    """Return binary-relevance NDCG@k.

    DCG adds ``1 / log2(rank + 1)`` for each relevant item found in the first
    *k* recommendations; it is normalised by the DCG of an ideal list holding
    ``min(number of distinct relevant items, k)`` hits. Returns 0.0 when
    there is no relevant item or the ideal DCG is zero.
    """
    if not relevant:
        return 0.0
    targets = set(relevant)
    gains = [
        1.0 / np.log2(rank + 1)
        for rank, item in enumerate(recommended[:k], start=1)
        if item in targets
    ]
    dcg = sum(gains, 0.0)
    best_rank = min(len(targets), k)
    idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, best_rank + 1))
    if idcg > 0:
        return dcg / idcg
    return 0.0



def rmse_from_predictions(predictions: List[float], targets: List[float]) -> float:
    """Return the root-mean-square error between two aligned value lists.

    Returns 0.0 when ``predictions`` is empty.
    """
    if not predictions:
        return 0.0
    predicted = np.array(predictions, dtype=float)
    expected = np.array(targets, dtype=float)
    residuals = predicted - expected
    mean_squared_error = np.mean(residuals ** 2)
    return float(np.sqrt(mean_squared_error))


def score_rmse(
    predict_fn: Callable[[int, int], float],
    eval_df: pd.DataFrame,
    rating_col: str = "rating",
) -> float:
    """Score ``predict_fn`` against the ratings stored in ``eval_df``.

    Args:
        predict_fn: Callable ``(user_id, article_id) -> estimated rating``.
        eval_df: Frame with ``user_id`` and ``article_id`` columns plus the
            target column named by ``rating_col``.
        rating_col: Name of the column holding the target ratings.

    Returns:
        The RMSE over all rows, or 0.0 when the frame is empty or has no
        ``rating_col`` column.
    """
    has_ratings = not eval_df.empty and rating_col in eval_df.columns
    if not has_ratings:
        return 0.0

    # Rows are visited in frame order and the prediction is computed before
    # the target is read, one row at a time.
    scored_pairs = [
        (
            float(predict_fn(int(row.user_id), int(row.article_id))),
            float(getattr(row, rating_col)),
        )
        for row in eval_df.itertuples(index=False)
    ]
    predictions = [predicted for predicted, _ in scored_pairs]
    targets = [actual for _, actual in scored_pairs]
    return rmse_from_predictions(predictions, targets)


def make_predict_fn(model: Any) -> Callable[[int, int], float]:
    """Wrap ``model.predict`` into a plain ``(user_id, item_id) -> float`` callable.

    The wrapper casts both ids to ``int`` and accepts either a prediction
    object exposing ``est`` or a bare numeric return value.
    """

    def _predict(user_id: int, item_id: int) -> float:
        result = model.predict(int(user_id), int(item_id))
        estimate = result.est if hasattr(result, "est") else result
        return float(estimate)

    return _predict


# ----------------------------
# 4) Parallel evaluation helpers
# ----------------------------

# Module-level state read by `_evaluate_user`. `evaluate_model` fills it via
# `_set_eval_globals` before creating the fork-based pool, so only user ids are
# sent to the workers as task arguments.
_EVAL_RECOMMEND_FN = None  # callable (user_id, k) -> list of recommended item ids
_EVAL_USER_RELEVANT = None  # dict: user_id -> list of relevant item ids
_EVAL_K = None  # cut-off rank used by every metric


def _set_eval_globals(recommend_fn, user_relevant, k):
    """Store the evaluation inputs in the module-level slots used by `_evaluate_user`."""
    global _EVAL_RECOMMEND_FN, _EVAL_USER_RELEVANT, _EVAL_K
    _EVAL_RECOMMEND_FN, _EVAL_USER_RELEVANT, _EVAL_K = recommend_fn, user_relevant, k


def _evaluate_user(user_id: int):
    """Score one user using the module-level evaluation state.

    Reads the recommender, the relevance lookup and ``k`` from the
    ``_EVAL_*`` globals, times the recommendation call, and computes the
    ranking metrics for that user.

    Returns:
        A ``(metrics, recommended)`` tuple where ``metrics`` maps
        ``precision``/``recall``/``map``/``ndcg``/``latency`` to floats.
    """
    uid = int(user_id)
    k = _EVAL_K
    relevant = _EVAL_USER_RELEVANT.get(uid, [])

    # Only the recommendation call itself is timed.
    started = time.perf_counter()
    recommended = _EVAL_RECOMMEND_FN(uid, int(k))
    latency = time.perf_counter() - started

    scorers = (
        ("precision", precision_at_k),
        ("recall", recall_at_k),
        ("map", average_precision_at_k),
        ("ndcg", ndcg_at_k),
    )
    metrics = {name: scorer(recommended, relevant, k) for name, scorer in scorers}
    metrics["latency"] = float(latency)
    return metrics, recommended


def _report_eval_progress(
    model_name: str,
    idx: int,
    n_users: int,
    start_time: float,
    log_every: int,
) -> None:
    """Print a progress line for the first, last and every ``log_every``-th user."""
    is_milestone = idx == 1 or idx == n_users or (log_every > 0 and idx % log_every == 0)
    if not is_milestone:
        return
    elapsed = time.perf_counter() - start_time
    # Wall-clock time per processed user since `start_time`; this is not the
    # same as the per-call latency stored in the per-user metrics.
    avg_per_user = elapsed / max(idx, 1)
    eta = avg_per_user * (n_users - idx)
    print(
        f"[{model_name}] {idx}/{n_users} ({idx/n_users:.1%}) "
        f"elapsed={elapsed/60:.1f}min ETA={eta/60:.1f}min "
        f"avg_latency={avg_per_user:.4f}s/user",
        flush=True,
    )


def evaluate_model(
    model_name: str,
    recommend_fn: Callable[[int, int], List[int]],
    eval_df: pd.DataFrame,
    k: int,
    predict_fn: Optional[Callable[[int, int], float]] = None,
    rating_col: str = "rating",
    *,
    log_every: int = 2000,
    show_progress: bool = True,
    coverage_denominator: Optional[int] = None,
    parallel: bool = True,
    n_jobs: Optional[int] = 9,
) -> Dict[str, float]:
    """Evaluate one recommender on held-out interactions.

    Args:
        model_name: Label stored under the ``"model"`` key and used in logs.
        recommend_fn: Callable ``(user_id, k) -> ranked list of item ids``.
        eval_df: Held-out interactions with ``user_id`` and ``article_id``
            columns (plus ``rating_col`` when ``predict_fn`` is given).
        k: Cut-off rank for every ranking metric.
        predict_fn: Optional ``(user_id, item_id) -> rating`` callable used
            for RMSE. When omitted, the reported RMSE is 0.0.
        rating_col: Column of ``eval_df`` holding the target ratings.
        log_every: Print progress every this many users (0 disables the
            periodic lines; the first and last user are always reported).
        show_progress: Set to False to silence all printing.
        coverage_denominator: Catalogue size used for coverage@k. When None,
            the module-level ``popular_candidates`` collection is used, so it
            must be defined in that case.
        parallel: Use a fork-based process pool when more than one user is
            evaluated and the platform supports ``fork``; otherwise the
            evaluation runs sequentially.
        n_jobs: Number of worker processes (capped at the number of users).
            ``None`` uses the CPU count reported by the multiprocessing
            context.

    Returns:
        A dict with the keys ``model``, ``precision@k``, ``recall@k``,
        ``map@k``, ``ndcg@k``, ``coverage@k``, ``rmse``,
        ``latency_per_user_s`` and ``total_eval_time_s``. Ranking metrics are
        means over users. ``total_eval_time_s`` also includes the RMSE pass.
    """
    start_time = time.perf_counter()
    per_user_metrics: List[Dict[str, float]] = []
    all_recommended: List[int] = []

    users = eval_df["user_id"].dropna().astype(int).unique().tolist()
    n_users = len(users)

    if coverage_denominator is None:
        coverage_denominator = max(len(popular_candidates), 1)

    if show_progress:
        print(f"▶ Evaluating: {model_name} | users={n_users} | k={k}", flush=True)

    rmse = score_rmse(predict_fn, eval_df, rating_col) if predict_fn is not None else 0.0

    # Ground truth: every held-out article of a user counts as relevant.
    user_relevant = (
        eval_df.groupby("user_id")["article_id"]
        .apply(lambda s: s.dropna().astype(int).tolist())
        .to_dict()
    )

    use_parallel = parallel and n_users > 1
    ctx = None
    if use_parallel:
        # Workers read `recommend_fn` from module globals, which only works
        # when child processes inherit the parent's memory (fork).
        available_methods = mp.get_all_start_methods()
        if "fork" in available_methods:
            ctx = mp.get_context("fork")
        else:
            use_parallel = False
            if show_progress:
                print("⚠️ Multiprocessing fork not available; falling back to sequential evaluation.")

    if use_parallel and ctx is not None:
        if n_jobs is None:
            n_jobs = ctx.cpu_count()
        n_jobs = max(1, min(int(n_jobs), n_users))
        chunksize = max(1, n_users // (n_jobs * 4))

        # Must be set before the pool is created so forked workers inherit it.
        _set_eval_globals(recommend_fn, user_relevant, k)
        try:
            with ctx.Pool(processes=n_jobs) as pool:
                for idx, (metrics, recommended) in enumerate(
                    pool.imap_unordered(_evaluate_user, users, chunksize=chunksize), start=1
                ):
                    per_user_metrics.append(metrics)
                    all_recommended.extend(recommended)
                    if show_progress:
                        _report_eval_progress(model_name, idx, n_users, start_time, log_every)
        finally:
            # Drop the references so the parent does not keep the model
            # closure and relevance table alive (or stale) after the run.
            _set_eval_globals(None, None, None)
    else:
        for idx, user_id in enumerate(users, start=1):
            relevant = user_relevant.get(int(user_id), [])

            user_start = time.perf_counter()
            recommended = recommend_fn(int(user_id), int(k))
            latency = time.perf_counter() - user_start

            all_recommended.extend(recommended)
            per_user_metrics.append(
                {
                    "precision": precision_at_k(recommended, relevant, k),
                    "recall": recall_at_k(recommended, relevant, k),
                    "map": average_precision_at_k(recommended, relevant, k),
                    "ndcg": ndcg_at_k(recommended, relevant, k),
                    "latency": float(latency),
                }
            )
            if show_progress:
                _report_eval_progress(model_name, idx, n_users, start_time, log_every)

    total_time = time.perf_counter() - start_time

    if not per_user_metrics:
        return {
            "model": model_name,
            "precision@k": 0.0,
            "recall@k": 0.0,
            "map@k": 0.0,
            "ndcg@k": 0.0,
            "coverage@k": 0.0,
            "rmse": float(rmse),
            "latency_per_user_s": 0.0,
            "total_eval_time_s": float(total_time),
        }

    metrics_df = pd.DataFrame(per_user_metrics)
    # Share of the catalogue that appeared in at least one recommendation list.
    coverage = len(set(all_recommended)) / max(int(coverage_denominator), 1)

    if show_progress:
        print(
            f"✔ Done: {model_name} in {total_time/60:.1f}min | "
            f"mean_latency={metrics_df['latency'].mean():.4f}s/user | "
            f"coverage={coverage:.4f}",
            flush=True,
        )

    return {
        "model": model_name,
        "precision@k": float(metrics_df["precision"].mean()),
        "recall@k": float(metrics_df["recall"].mean()),
        "map@k": float(metrics_df["map"].mean()),
        "ndcg@k": float(metrics_df["ndcg"].mean()),
        "coverage@k": float(coverage),
        "rmse": float(rmse),
        "latency_per_user_s": float(metrics_df["latency"].mean()),
        "total_eval_time_s": float(total_time),
    }


# Prepare implicit ratings.
# `interaction_rank` numbers each user's rows in their order of appearance;
# the base rating grows linearly with that rank, from 1.0 up to 1.01.
train_df = train_df.copy()
train_df["interaction_rank"] = train_df.groupby("user_id").cumcount()
max_rank = train_df["interaction_rank"].max()
train_df["rating"] = (
    1.0 + 0.01 * (train_df["interaction_rank"] / max_rank) if max_rank > 0 else 1.0
)
# Session-weighted variant: clicks from larger sessions get a log-scaled bonus.
train_df["rating_weighted"] = train_df["rating"] + np.log1p(train_df["session_size"].astype(float))


# Ratings for RMSE on held-out interactions: every test click targets 1.0.
test_df = test_df.copy()
test_df["rating"] = 1.0
rating_min = float(train_df["rating"].min())
rating_max = float(train_df["rating"].max())
random_rmse_rng = np.random.default_rng(CONFIG["random_seed"] + 1)


def random_predict_fn(user_id, item_id):
    """Return a uniform random rating within the observed training range."""
    return float(random_rmse_rng.uniform(rating_min, rating_max))


_, trainset_basic = build_surprise_trainset(train_df, rating_col="rating")
_, trainset_weighted = build_surprise_trainset(train_df, rating_col="rating_weighted")

# Models
rng = np.random.default_rng(CONFIG["random_seed"])


In [None]:
# Train NormalPredictor baseline on the unweighted "rating" trainset.
normal_model = NormalPredictor()
normal_model.fit(trainset_basic)


In [None]:
# Train content-based TF-IDF cosine model
from collections import namedtuple

# Fall back to an empty frame carrying the key column when no metadata was loaded.
articles_df = metadata.copy() if metadata is not None else pd.DataFrame(columns=["article_id"])
if "article_id" not in articles_df.columns:
    articles_df["article_id"] = pd.Series(dtype=int)

# Merge whichever textual columns exist into one whitespace-normalised field.
text_columns = [col for col in ["title", "text", "keywords"] if col in articles_df.columns]
if text_columns:
    articles_df["content_text"] = (
        articles_df[text_columns]
        .fillna("")
        .astype(str)
        .agg(" ".join, axis=1)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
else:
    articles_df["content_text"] = ""

# Keep one row per candidate article; row position becomes the vector index.
articles_df = articles_df[articles_df["article_id"].isin(candidate_items)].drop_duplicates("article_id")
articles_df = articles_df.reset_index(drop=True)

if articles_df.empty or articles_df["content_text"].str.strip().eq("").all():
    # No usable text: a single all-zero dimension keeps downstream shapes valid
    # and makes every similarity score 0.
    item_vectors = np.zeros((len(articles_df), 1), dtype=float)
else:
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    item_tfidf = tfidf_vectorizer.fit_transform(articles_df["content_text"])

    if item_tfidf.shape[1] > 2:
        # Compress the sparse TF-IDF matrix to at most 256 dense components.
        n_components = min(256, item_tfidf.shape[1] - 1)
        svd = TruncatedSVD(n_components=n_components, random_state=CONFIG["random_seed"])
        item_vectors = svd.fit_transform(item_tfidf)
    else:
        item_vectors = item_tfidf.toarray()

# L2-normalise rows so that dot products are cosine similarities. sklearn's
# normalize() rejects arrays with zero rows, so skip it when no article
# survived the filtering above (e.g. metadata missing).
if item_vectors.shape[0] > 0:
    item_vectors = normalize(item_vectors, norm="l2")
item_ids = articles_df["article_id"].astype(int).tolist()
item_index = {item_id: idx for idx, item_id in enumerate(item_ids)}

# User profile = normalised mean of the vectors of the articles clicked in training.
user_profiles = {}
for user_id, group in train_df.groupby("user_id"):
    clicked_ids = [item for item in group["article_id"].tolist() if item in item_index]
    if not clicked_ids:
        continue
    vectors = item_vectors[[item_index[item] for item in clicked_ids]]
    mean_vector = vectors.mean(axis=0, keepdims=True)
    user_profiles[int(user_id)] = normalize(mean_vector, norm="l2")[0]

# Articles ordered by training click count, used as the cold-start fallback.
popular_items = train_df["article_id"].value_counts().index.tolist()

# Minimal prediction record exposing `.est`, the attribute read by make_predict_fn.
Prediction = namedtuple("Prediction", ["est"])


class ContentBasedModel:
    """Score (user, item) pairs by the dot product of their content vectors.

    Exposes ``predict`` returning an object with an ``est`` attribute so it
    can be used wherever a model's ``predict(user_id, item_id).est`` is read.
    """

    def __init__(self, user_profiles, item_vectors, item_index):
        # user_profiles: user_id -> profile vector
        # item_vectors:  matrix with one row per item
        # item_index:    item_id -> row position in item_vectors
        self.user_profiles = user_profiles
        self.item_vectors = item_vectors
        self.item_index = item_index

    def score(self, user_id, item_id) -> float:
        """Return the user/item dot product, or 0.0 if either side is unknown."""
        profile = self.user_profiles.get(int(user_id))
        row = self.item_index.get(int(item_id))
        if profile is not None and row is not None:
            return float(self.item_vectors[row].dot(profile))
        return 0.0

    def predict(self, user_id, item_id):
        """Wrap ``score`` in a ``Prediction`` record."""
        return Prediction(est=self.score(user_id, item_id))


# Shared instance built from the profiles and item vectors computed in this cell.
content_model = ContentBasedModel(user_profiles, item_vectors, item_index)


def recommend_content_based(user_id: int, k: int) -> List[int]:
    """Return up to ``k`` unseen articles ranked by similarity to the user profile.

    Users without a profile (or an empty item index) fall back to the most
    clicked training articles they have not seen yet.
    """
    already_seen = train_user_items.get(user_id, set())
    profile = user_profiles.get(user_id)

    # Cold start: no profile to compare against, so serve popularity instead.
    if profile is None or not item_index:
        unseen_popular = [item for item in popular_items if item not in already_seen]
        return unseen_popular[:k]

    unseen_ids = [item for item in item_ids if item not in already_seen]
    if not unseen_ids:
        return []

    rows = [item_index[item] for item in unseen_ids]
    similarities = item_vectors[rows].dot(profile)
    # Ascending argsort reversed gives the highest similarities first.
    ranked_positions = np.argsort(similarities)[::-1][:k]
    return [unseen_ids[position] for position in ranked_positions]


In [None]:
# Train SVD++ collaborative filtering model on the unweighted "rating" trainset.
# The seed comes from CONFIG for reproducibility.
svdpp_model = SVDpp(random_state=CONFIG["random_seed"])
svdpp_model.fit(trainset_basic)


In [None]:
# Train SVD++ model with session weighting, i.e. on the "rating_weighted"
# trainset (base rating plus log1p(session_size)). Same seed as the plain model.
svdpp_weighted_model = SVDpp(random_state=CONFIG["random_seed"])
svdpp_weighted_model.fit(trainset_weighted)

In [None]:
# One metrics dict per evaluated model; turned into a DataFrame at the end.
results = []

# Baseline: NormalPredictor ranked through recommend_surprise_fast, with RMSE
# computed from the model's own predictions.
results.append(
    evaluate_model(
        "NormalPredictor (topN popular candidates)",
        lambda user_id, k: recommend_surprise_fast(normal_model, user_id, k),
        test_df,
        CONFIG["k"],
        predict_fn=make_predict_fn(normal_model),
    )
)

In [None]:
# Build hybrid model: session-weighted SVD++ blended with the content-based
# TF-IDF model (passed as the second component; no item-item KNN is involved).
# The two estimates are combined with weights 0.6 / 0.4.
# NOTE(review): the SVD++ estimates follow the "rating_weighted" scale while the
# content scores are dot products of normalised vectors — confirm the blend is
# meant to mix these scales directly.
hybrid_svdpp_knn_model = HybridSvdppKnnModel(
    svdpp_weighted_model,
    content_model,
    svdpp_weight=0.6,
    knn_weight=0.4,
)

In [None]:


# Sanity-check baseline: random recommendations, RMSE from uniform random ratings.
# NOTE(review): evaluate_model defaults to parallel=True with forked workers, and
# each worker starts from a copy of the same `rng` state, so random draws may
# repeat across workers — confirm how recommend_random uses `rng`, or pass
# parallel=False here.
results.append(
    evaluate_model(
        "Random recommender",
        lambda user_id, k: recommend_random(rng, user_id, k),
        test_df,
        CONFIG["k"],
        predict_fn=random_predict_fn,
    )
)

In [None]:
# Content-based model: `recommend_content_based` already has the
# (user_id, k) signature expected by evaluate_model, so it is passed directly.
results.append(
    evaluate_model(
        "Content-Based (TF-IDF cosine)",
        recommend_content_based,
        test_df,
        CONFIG["k"],
        predict_fn=make_predict_fn(content_model),
    )
)


In [None]:

# Plain SVD++ evaluation is switched off: the `if False` guard means no "SVD++"
# row is appended to `results`.
# NOTE(review): presumably disabled to save evaluation time — confirm, and
# change the condition to re-enable it.
if False:
    results.append(
        evaluate_model(
            "SVD++",
            lambda user_id, k: recommend_surprise(svdpp_model, user_id, k),
            test_df,
            CONFIG["k"],
            predict_fn=make_predict_fn(svdpp_model),
        )
    )

In [None]:
# Session-weighted SVD++: ranking through recommend_surprise, RMSE from the
# model's own predictions against the test ratings.
results.append(
    evaluate_model(
        "SVD++ + session weighting",
        lambda user_id, k: recommend_surprise(svdpp_weighted_model, user_id, k),
        test_df,
        CONFIG["k"],
        predict_fn=make_predict_fn(svdpp_weighted_model),
    )
)

In [None]:
# Hybrid model: ranking through recommend_surprise, RMSE from its blended predict().
results.append(
    evaluate_model(
        "Hybrid SVD++ (session weighting) + Content-Based (TF-IDF cosine)",
        lambda user_id, k: recommend_surprise(hybrid_svdpp_knn_model, user_id, k),
        test_df,
        CONFIG["k"],
        predict_fn=make_predict_fn(hybrid_svdpp_knn_model),
    )
)


In [None]:
# One row per evaluated model, shown with the best NDCG@k first.
results_df = pd.DataFrame(results)
display(results_df.sort_values(by="ndcg@k", ascending=False))

## Interprétation rapide

- **Random recommender** : sert de test de cohérence (sanity check) ; il ignore les préférences et devrait donc afficher les métriques les plus faibles.
- **NormalPredictor** : exploite la distribution globale des interactions ; utile comme baseline non personnalisée, donc sans adaptation aux préférences de chaque utilisateur.
- **KNNBasic cosine (item-item)** : capte des co-occurrences simples, efficace quand la popularité locale est un bon signal.
- **SVD++** : modèle factoriel collaboratif qui combine notes implicites et historique d'interactions pour affiner les recommandations.
- **SVD++ + session weighting** : renforce les sessions longues, ce qui peut mieux refléter l'engagement utilisateur pour le contexte news.

---
