# Évaluation d'un système de recommandation My Content

Notebook pour entraîner et comparer plusieurs approches de recommandation sur le dataset Kaggle **news-portal-user-interactions-by-globocom**. L'objectif est de montrer clairement chaque étape (du chargement des données jusqu'au choix final du modèle).

> Ce notebook aligne désormais **toutes les approches de recommandation sur la bibliothèque Surprise** (https://surprise.readthedocs.io/) afin de bénéficier d'algorithmes collaboratifs standardisés et faciles à déployer.

In [194]:
# Imports & Config
from __future__ import annotations
import json
import os
import pickle
import sys
from collections import Counter
import time
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

# Ensure the project root is importable: the parent of the current working
# directory is appended to sys.path so that the `src.*` package imported just
# below can be resolved.
# NOTE(review): assumes the notebook is executed from a folder located one
# level below the project root — TODO confirm.
PROJECT_ROOT = Path('.').resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.models.lightfm_item2item import (
    CONTEXT_COLUMNS,
    LightFMApproximator,
    build_interaction_matrices,
    precompute_item_neighbors,
    score_from_neighbors,
)

# Configuration
# Single dictionary holding every tunable of the notebook. Paths are relative
# to the directory the notebook is executed from.
CONFIG = {
    # Input data (Globo.com news portal dataset).
    "clicks_dir": "../data/news-portal-user-interactions-by-globocom/clicks",
    "metadata_path": "../data/news-portal-user-interactions-by-globocom/articles_metadata.csv",
    "embeddings_path": "../data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle",
    # Passed as `max_files` to load_clicks: caps how many hourly click files are read.
    "max_click_files": 100,
    # Output folder; created right after this dictionary.
    "artifacts_dir": "../artifacts/evaluation",
    # Presumably the size of the recommendation list (the notebook text targets
    # a Top-5) — consumed by cells not visible here, TODO confirm.
    "k": 5,
    # The remaining keys are consumed by later cells not visible here; their
    # names suggest split / filtering / model hyper-parameters — TODO confirm.
    "train_ratio": 0.8,
    "recent_window_days": 7,
    # Seeds NumPy's global RNG below and the synthetic click generator.
    "random_seed": 42,
    "svdpp_factors": 64,
    "min_user_interactions": 3,
    "min_item_interactions": 5,
    "lightfm_use_user_features": True,
    "lightfm_components": 48,
    "lightfm_item_neighbors": 200,
}
# Seed the legacy global NumPy RNG so np.random.* calls are repeatable.
np.random.seed(CONFIG["random_seed"])
# Make sure the artifacts folder exists (no error if it already does).
Path(CONFIG["artifacts_dir"]).mkdir(parents=True, exist_ok=True)
print("Config ready", CONFIG)

from surprise import Dataset, Reader, KNNBasic, NormalPredictor, SVDpp, accuracy



Config ready {'clicks_dir': '../data/news-portal-user-interactions-by-globocom/clicks', 'metadata_path': '../data/news-portal-user-interactions-by-globocom/articles_metadata.csv', 'embeddings_path': '../data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle', 'max_click_files': 100, 'artifacts_dir': '../artifacts/evaluation', 'k': 5, 'train_ratio': 0.8, 'recent_window_days': 7, 'random_seed': 42, 'svdpp_factors': 64, 'min_user_interactions': 3, 'min_item_interactions': 5, 'lightfm_use_user_features': True, 'lightfm_components': 48, 'lightfm_item_neighbors': 200}


## Contexte

Nous voulons proposer à chaque lecteur un Top-5 d'articles susceptibles de l'intéresser. Le notebook illustre la démarche de A à Z : préparation des données, construction de différentes familles de modèles puis comparaison à l'aide de métriques de ranking.

## Données

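Les fichiers attendus sont situés dans `../data/news-portal-user-interactions-by-globocom/` (chemins relatifs au notebook, définis dans `CONFIG`) : le dossier `clicks/`, ainsi que les fichiers `articles_metadata.csv` et `articles_embeddings.pickle`.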

In [195]:

# Load data utilities


def detect_timestamp_column(df: pd.DataFrame) -> str:
    """Return the name of the first timestamp-like column of *df*.

    Columns are scanned in DataFrame order and compared case-insensitively
    against a fixed list of known timestamp column names. Labels that are not
    strings (e.g. integer positions) are skipped instead of crashing.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        The original (un-normalised) label of the first matching column.

    Raises:
        ValueError: if no column matches any known timestamp name.
    """
    candidates = ["click_timestamp", "timestamp", "event_time", "ts", "time"]
    known = frozenset(candidates)
    for col in df.columns:
        # Non-string labels have no .lower() and can never match a candidate.
        if isinstance(col, str) and col.lower() in known:
            return col
    raise ValueError("No timestamp-like column found. Expected one of: " + ",".join(candidates))


def detect_article_column(df: pd.DataFrame) -> str:
    """Return the name of the first article/item identifier column of *df*.

    Columns are scanned in DataFrame order and compared case-insensitively
    against a fixed list of known identifier names, mirroring the behaviour of
    ``detect_timestamp_column``. Non-string labels are skipped.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        The original (un-normalised) label of the first matching column.

    Raises:
        ValueError: if no column matches any known article id name.
    """
    candidates = ["click_article_id", "clicked_article_id", "article_id", "item_id", "content_id"]
    known = frozenset(candidates)
    for col in df.columns:
        # Case-insensitive match; non-string labels can never match.
        if isinstance(col, str) and col.lower() in known:
            return col
    raise ValueError("No article id column found. Expected one of: " + ",".join(candidates))


def infer_unix_unit(values: pd.Series) -> str:
    """Guess the unit of Unix epoch values from their magnitude.

    Values that cannot be parsed as numbers are ignored. The largest absolute
    value decides the unit, each band spanning three orders of magnitude:

    * ``>= 1e17`` -> ``"ns"`` (nanoseconds)
    * ``>= 1e14`` -> ``"us"`` (microseconds)
    * ``>= 1e11`` -> ``"ms"`` (milliseconds)
    * otherwise   -> ``"s"``  (seconds)

    With these bounds every unit covers roughly the years 1973 to 5138, so
    present-day epochs in any of the four units are classified correctly.

    Args:
        values: Series of candidate epoch values (numbers or numeric strings).

    Returns:
        A unit string accepted by ``pd.to_datetime(..., unit=...)``. Defaults
        to ``"s"`` when no numeric value is available.
    """
    numeric = pd.to_numeric(values, errors="coerce").dropna()
    if numeric.empty:
        return "s"
    max_abs = numeric.abs().max()
    # Thresholds are checked from the finest unit to the coarsest one.
    for threshold, unit in ((1e17, "ns"), (1e14, "us"), (1e11, "ms")):
        if max_abs >= threshold:
            return unit
    return "s"


def to_timestamp(series: pd.Series) -> pd.Series:
    """Convert *series* to pandas datetimes, whatever its original encoding.

    Handling by input kind:

    * datetime-like dtype: returned through ``pd.to_datetime`` unchanged;
    * numeric dtype: interpreted as Unix epochs whose unit is guessed by
      ``infer_unix_unit``;
    * anything else (typically strings): parsed as date strings first; if
      nothing parses, the values are cast to numbers and treated as epochs.

    Unparseable entries become ``NaT``.

    Args:
        series: values to convert.

    Returns:
        A datetime Series aligned with *series*.
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return pd.to_datetime(series)

    if not pd.api.types.is_numeric_dtype(series):
        converted = pd.to_datetime(series, errors="coerce")
        if converted.notna().any():
            return converted
        # Nothing parsed as a date string: the values are probably epochs
        # stored as text. Cast them explicitly, because pandas deprecates
        # parsing strings together with `unit`.
        series = pd.to_numeric(series, errors="coerce")

    unit = infer_unix_unit(series)
    return pd.to_datetime(series, unit=unit, errors="coerce")


def list_click_files(path: Union[str, Path]) -> List[Path]:
    """List the click files reachable from *path*.

    A file path yields a one-element list; a directory yields its
    ``clicks_hour_*.csv`` files in sorted order; anything else yields ``[]``.
    """
    location = Path(path)
    if location.is_file():
        return [location]
    if not location.is_dir():
        return []
    matches = list(location.glob("clicks_hour_*.csv"))
    matches.sort()
    return matches


def ensure_context_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* guaranteed to expose session and context columns.

    A missing ``session_size`` column is filled with 1 and every missing
    column listed in ``CONTEXT_COLUMNS`` is filled with ``"unknown"``.
    The input frame is never modified.
    """
    completed = df.copy()
    if "session_size" not in completed.columns:
        completed["session_size"] = 1
    absent = [name for name in CONTEXT_COLUMNS if name not in completed.columns]
    for name in absent:
        completed[name] = "unknown"
    return completed


def create_synthetic_clicks(path: str, n_users: int = 50, n_items: int = 120, days: int = 30, interactions_per_user: int = 25) -> pd.DataFrame:
    """Generate a small random click log, save it as CSV and return it.

    Every user receives ``interactions_per_user`` clicks on random articles,
    spread over ``days`` days starting 2022-01-01, together with random
    session and context attributes. The generator is seeded from
    ``CONFIG["random_seed"]``. Parent folders of *path* are created if needed.
    """
    rng = np.random.default_rng(CONFIG["random_seed"])
    origin = pd.Timestamp("2022-01-01")
    environment_pool = ["web", "app"]
    device_pool = ["mobile", "desktop"]
    os_pool = ["ios", "android", "linux"]
    referrer_pool = ["direct", "search", "social"]

    rows = []
    for user_id in range(1, n_users + 1):
        # Draw order (day offsets, then articles, then per-click attributes)
        # determines the generated data, so it must not be rearranged.
        day_offsets = sorted(rng.integers(0, days, size=interactions_per_user).tolist())
        article_ids = rng.integers(1, n_items + 1, size=interactions_per_user)
        for day_offset, article_id in zip(day_offsets, article_ids):
            click = {
                "user_id": int(user_id),
                "article_id": int(article_id),
                "timestamp": origin + pd.Timedelta(int(day_offset), unit="D"),
                "session_size": int(rng.integers(1, 6)),
                "click_environment": rng.choice(environment_pool),
                "click_deviceGroup": rng.choice(device_pool),
                "click_os": rng.choice(os_pool),
                "click_country": rng.choice(["fr", "us", "br"]),
                "click_region": rng.choice(["idf", "sp", "ca"]),
                "click_referrer_type": rng.choice(referrer_pool),
            }
            rows.append(click)

    df = pd.DataFrame(rows)
    df = df.sort_values("timestamp").reset_index(drop=True)

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    print(
        f"Synthetic clicks dataset created at {path} "
        f"(users={n_users}, items={n_items}, interactions={len(df)})"
    )
    return df


def load_clicks(path: str, max_files: Optional[int] = None) -> pd.DataFrame:
    """Load clicks data from the Globo hourly files, with a safety cap.

    Each file is normalised before concatenation: its timestamp and article
    columns are detected, converted and renamed to ``timestamp`` and
    ``article_id``, and missing session/context columns receive defaults.

    Args:
        path: a click file or a directory containing ``clicks_hour_*.csv``.
        max_files: optional cap on the number of files read (first files in
            sorted order). ``None`` means no cap.

    Returns:
        All retained clicks sorted by ``timestamp``. When no file exists at
        *path*, a synthetic dataset is generated and returned instead. When
        the cap leaves no file to read (e.g. ``max_files=0``), an empty frame
        with the expected columns is returned.
    """
    files = list_click_files(path)
    total_files = len(files)
    if not files:
        print(f"Clicks directory not found at {path}. Generating a synthetic sample for demonstration.")
        return ensure_context_columns(create_synthetic_clicks(Path(path) / "clicks_hour_000.csv"))

    if max_files is not None:
        print(f"Limite explicite max_files={max_files}, total détecté={total_files}")
        files = files[:max_files]

    print(f"Chargement de {len(files)} fichiers clicks (total détecté={total_files}, limite={max_files if max_files is not None else 'aucune'})")

    expected_cols = [
        "user_id",
        "article_id",
        "timestamp",
        "session_size",
        *CONTEXT_COLUMNS,
    ]
    if not files:
        # The cap removed every file: pd.concat([]) would raise, whereas the
        # rest of the notebook knows how to handle an empty clicks frame.
        return pd.DataFrame(columns=expected_cols)

    frames = []
    for file in files:
        df = pd.read_csv(file)
        ts_col = detect_timestamp_column(df)
        article_col = detect_article_column(df)
        df[ts_col] = to_timestamp(df[ts_col])
        df = df.rename(columns={ts_col: "timestamp", article_col: "article_id"})
        df = ensure_context_columns(df)
        # Keep only the columns of interest that this file actually provides.
        keep_cols = [col for col in expected_cols if col in df.columns]
        frames.append(df[keep_cols])

    combined = pd.concat(frames, ignore_index=True)
    combined = combined.sort_values("timestamp").reset_index(drop=True)
    print(f"Clicks agrégés : {len(combined)} lignes, {combined['user_id'].nunique()} utilisateurs uniques, {combined['article_id'].nunique()} articles uniques.")
    return combined


def load_metadata(path: str) -> Optional[pd.DataFrame]:
    """Read the article metadata CSV, or return None when it is unusable.

    The file is unusable when it does not exist or when it lacks an
    ``article_id`` column; a message explaining the reason is printed.
    """
    if os.path.exists(path):
        meta = pd.read_csv(path)
        if "article_id" in meta.columns:
            return meta
        print("Metadata missing 'article_id' column. Ignoring metadata.")
        return None

    print(f"Metadata file not found at {path}. Utilisation du pipeline Surprise uniquement si les métadonnées sont absentes.")
    return None


# Load the click log (capped at CONFIG["max_click_files"] files) and the
# article metadata; `metadata` is None when load_metadata cannot use the file.
clicks = load_clicks(CONFIG["clicks_dir"], max_files=CONFIG["max_click_files"])
metadata = load_metadata(CONFIG["metadata_path"])
# Quick sanity check of what was loaded.
print(clicks.head())
print("Metadata loaded:", metadata is not None)



Limite explicite max_files=100, total détecté=385
Chargement de 100 fichiers clicks (total détecté=385, limite=100)
Clicks agrégés : 909070 lignes, 176840 utilisateurs uniques, 17096 articles uniques.
   user_id  article_id               timestamp  session_size  \
0       59      234853 2017-10-01 03:00:00.026             2   
1       79      159359 2017-10-01 03:00:01.702             2   
2      154       96663 2017-10-01 03:00:04.207             2   
3      111      202436 2017-10-01 03:00:14.140             2   
4       70      119592 2017-10-01 03:00:18.863             3   

   click_environment  click_deviceGroup  click_os  click_country  \
0                  4                  3         2              1   
1                  4                  3         2              1   
2                  4                  3         2              1   
3                  4                  3         2              1   
4                  4                  3         2              1   

   cl

## Analyse exploratoire des données

Courte photographie des fichiers sources immédiatement après le chargement :
- nombre de lignes et noms de colonnes des clics
- volumes et intégrité des métadonnées articles
- dimensions et structure du fichier d'`articles_embeddings`.

In [196]:
# EDA rapide sur les données sources
import pickle
from pathlib import Path
from collections.abc import Mapping


def summarize_timestamps(series: pd.Series):
    """Summarise a timestamp series.

    Returns a 4-tuple ``(first, last, per_day, per_hour)`` where ``per_day``
    and ``per_hour`` are DataFrames counting clicks per calendar date
    (columns ``date``, ``nb_clicks``) and per hour of day (columns ``hour``,
    ``nb_clicks``), both sorted by their key.
    """
    stamps = pd.to_datetime(series)

    def _count_table(keys, label):
        # One row per distinct key, ordered by key, with its occurrence count.
        counts = keys.value_counts().sort_index()
        return counts.rename_axis(label).reset_index(name="nb_clicks")

    per_day = _count_table(stamps.dt.date, "date")
    per_hour = _count_table(stamps.dt.hour, "hour")
    return stamps.min(), stamps.max(), per_day, per_hour


def describe_structure(obj, prefix="embeddings", max_depth=4):
    """Describe the nested structure of *obj* as a flat list of dictionaries.

    The object is walked depth-first: every value of a mapping is visited,
    but only the first element of a list, tuple or NumPy array. Each visited
    value produces one entry with its path (``chemin``), type name and, when
    available, ``shape`` or ``len``, ``dtype``, structured dtype field names
    and a short sample (``exemple``) for non-empty 1-D arrays of scalars.

    Args:
        obj: object to inspect.
        prefix: path label given to the root object.
        max_depth: values at this depth are recorded but not descended into.

    Returns:
        The list of entries, in visiting order.
    """
    entries = []

    def add_entry(path, value, note=None):
        entry = {"chemin": path, "type": type(value).__name__}
        if hasattr(value, "shape"):
            entry["shape"] = tuple(getattr(value, "shape"))
        elif hasattr(value, "__len__") and not isinstance(value, (str, bytes)):
            entry["len"] = len(value)
        if hasattr(value, "dtype"):
            entry["dtype"] = str(getattr(value, "dtype"))
        if note:
            entry["note"] = note
        if isinstance(value, np.ndarray) and value.dtype.names:
            entry["dtype_fields"] = list(value.dtype.names)
        if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) > 0 and not isinstance(value[0], (np.ndarray, list, tuple, Mapping)):
            entry["exemple"] = repr(value[:3].tolist())
        entries.append(entry)

    def walk(value, path, depth):
        add_entry(path, value)
        if depth >= max_depth:
            return
        if isinstance(value, Mapping):
            for k, v in value.items():
                walk(v, f"{path}.{k}", depth + 1)
            return
        if isinstance(value, np.ndarray):
            # 0-d arrays have neither len() nor a first element to index.
            if value.ndim > 0 and value.shape[0] > 0:
                walk(value[0], f"{path}[0]", depth + 1)
        elif isinstance(value, (list, tuple)) and len(value) > 0:
            walk(value[0], f"{path}[0]", depth + 1)

    walk(obj, prefix, 0)
    return entries


# --- Per-file overview -------------------------------------------------------
# List every click file found at the configured location and report the count.
click_files = list_click_files(CONFIG["clicks_dir"])
print(f"Nombre total de fichiers clicks détectés: {len(click_files)}")
if not click_files:
    print("Aucun fichier clicks trouvé au chemin configuré. Vérifiez le téléchargement des données.")

# Only the first two files are inspected individually, straight from the raw
# CSV (before any column renaming).
files_for_eda = click_files[:2]
per_file_stats = []
for file in files_for_eda:
    df_file = pd.read_csv(file)
    ts_col = detect_timestamp_column(df_file)
    article_col = detect_article_column(df_file)
    timestamps = to_timestamp(df_file[ts_col])
    # One summary row per file: size, raw column names, distinct articles and
    # the time span covered.
    per_file_stats.append(
        {
            "fichier": file.name,
            "nb_lignes": len(df_file),
            "colonnes": ", ".join(df_file.columns),
            "articles_uniques": df_file[article_col].nunique(),
            "horodatage_min": timestamps.min(),
            "horodatage_max": timestamps.max(),
        }
    )
# `display` is the rich-output helper provided by the notebook environment.
if per_file_stats:
    display(pd.DataFrame(per_file_stats))
else:
    print("Pas assez de fichiers pour réaliser une EDA détaillée par fichier.")

# --- Aggregated clicks overview ----------------------------------------------
# Works on the `clicks` frame loaded earlier in the notebook.
print("=== Clicks (agrégés) ===")
if clicks.empty:
    print("Aucun clic chargé. Vérifier le chemin ou augmenter max_click_files.")
else:
    # Global size, column names and distinct user / article counts.
    clicks_summary = {
        "nb_lignes": len(clicks),
        "colonnes": ", ".join(clicks.columns),
        "utilisateurs_uniques": clicks['user_id'].nunique() if 'user_id' in clicks else None,
        "articles_uniques": clicks['article_id'].nunique() if 'article_id' in clicks else None,
    }
    display(pd.DataFrame([clicks_summary]))

    # The article total comes from the metadata catalogue when it is
    # available, otherwise from the articles seen in the clicks.
    total_articles = None
    if metadata is not None and 'article_id' in metadata:
        total_articles = metadata['article_id'].nunique()
    elif 'article_id' in clicks:
        total_articles = clicks['article_id'].nunique()

    total_clients = clicks['user_id'].nunique() if 'user_id' in clicks else None
    print("Synthèse globale (articles / clients)")
    display(pd.DataFrame([{
        'nombre_total_articles': total_articles,
        'nombre_total_clients': total_clients,
    }]))

    # Time coverage: first and last click, plus the window length in days
    # (both end days counted, hence the + 1).
    ts_min, ts_max, daily, hourly = summarize_timestamps(clicks['timestamp'])
    display(pd.DataFrame([
        {
            'horodatage_min': ts_min,
            'horodatage_max': ts_max,
            'fenetre_jours': (ts_max - ts_min).days + 1,
        }
    ]))
    print("Répartition par jour (jusqu'à 10 premières valeurs)")
    display(daily.head(10))
    print("Répartition par heure (0-23)")
    display(hourly)

# --- Article metadata overview -----------------------------------------------
print("=== Métadonnées des articles ===")
if metadata is None:
    print("Aucun fichier metadata chargé.")
else:
    # Row count, column names and number of distinct article ids.
    meta_summary = {
        "nb_articles": len(metadata),
        "colonnes": ", ".join(metadata.columns),
        "articles_uniques": metadata['article_id'].nunique() if 'article_id' in metadata else None,
    }
    display(pd.DataFrame([meta_summary]))
    # Missing values per column, most affected columns first.
    missing = metadata.isna().sum().sort_values(ascending=False)
    display(missing.to_frame('valeurs_manquantes'))
    # Publication date range, when the creation timestamp is provided.
    if 'created_at_ts' in metadata.columns:
        created = to_timestamp(metadata['created_at_ts'])
        display(pd.DataFrame([{'premier_article': created.min(), 'dernier_article': created.max()}]))
    # Number of clicked articles that also appear in the metadata
    # (`overlap` holds the article ids seen in the clicks).
    if 'article_id' in metadata.columns:
        overlap = set(clicks['article_id'].unique()) if 'article_id' in clicks.columns else set()
        coverage = len(overlap & set(metadata['article_id'].unique()))
        print(f"Articles présents dans clicks et metadata: {coverage}")


# --- Article embeddings overview ----------------------------------------------
# Loads the pickled embeddings and reports their structure, basic statistics
# and how they line up with the metadata and the clicks. Two layouts are
# handled: a dict (matrix under 'embeddings', ids under 'articles_ids' or
# 'article_ids') and a bare array-like object exposing `.shape`.
print("=== Embeddings d'articles ===")
embeddings_path = Path(CONFIG['embeddings_path'])
if embeddings_path.exists():
    # SECURITY NOTE: unpickling can execute arbitrary code. Only load
    # embeddings files that come from a trusted source.
    with embeddings_path.open('rb') as f:
        embeddings_obj = pickle.load(f)
    print(f"Type chargé: {type(embeddings_obj)}")

    def summarize_matrix(mat):
        """Return shape/dtype, row-length statistics and, for 2-D inputs, row-norm statistics."""
        stats = {
            'shape': getattr(mat, 'shape', None),
            'dtype': getattr(mat, 'dtype', None),
        }

        # Embedding depth: the second dimension when a 2-D shape is available,
        # otherwise the length of every sized row.
        dim_values = []
        shape = getattr(mat, 'shape', None)
        if shape is not None and len(shape) >= 2:
            dim_values.append(shape[1])
        elif isinstance(mat, (list, tuple, np.ndarray)):
            for row in mat:
                if hasattr(row, '__len__') and not isinstance(row, (str, bytes)):
                    try:
                        dim_values.append(len(row))
                    except TypeError:
                        continue

        if dim_values:
            stats.update({
                'profondeur_min': min(dim_values),
                'profondeur_moyenne': float(np.mean(dim_values)),
                'profondeur_max': max(dim_values),
            })

        # Euclidean norm of every row, for strictly 2-D inputs only.
        if hasattr(mat, 'shape') and len(getattr(mat, 'shape', [])) == 2:
            norms = np.linalg.norm(mat, axis=1)
            stats.update(
                {
                    'nb_vectors': mat.shape[0],
                    'dim': mat.shape[1],
                    'norm_min': norms.min(),
                    'norm_max': norms.max(),
                    'norm_moyenne': norms.mean(),
                }
            )
        return stats

    base_structure = describe_structure(embeddings_obj, max_depth=4)

    if isinstance(embeddings_obj, dict):
        keys = list(embeddings_obj.keys())
        print(f"Clés disponibles: {keys}")
        matrix = embeddings_obj.get('embeddings')
        # Explicit emptiness test instead of `a or b`: the truth value of a
        # NumPy array with several elements is ambiguous and raises ValueError.
        ids = embeddings_obj.get('articles_ids')
        if ids is None or len(ids) == 0:
            ids = embeddings_obj.get('article_ids')

        structure = base_structure.copy()
        if ids is not None:
            structure.insert(0, {
                'chemin': 'embeddings.article_ids',
                'type': type(ids).__name__,
                'len': len(ids),
                'note': "Identifiants d'articles fournis dans le fichier",
            })
        if structure:
            print("Structure détaillée de l'objet d'embeddings (par chemin de clé):")
            display(pd.DataFrame(structure))

        if matrix is not None:
            stats = summarize_matrix(matrix)
            stats.update(
                {
                    'colonnes': ", ".join(keys),
                    'nb_articles_ids': len(ids) if ids is not None else None,
                    'ids_uniques': len(set(ids)) if ids is not None else None,
                    'couverture_metadata': len(set(ids) & set(metadata['article_id']))
                    if (metadata is not None and ids is not None and 'article_id' in metadata)
                    else None,
                    'couverture_clicks': len(set(ids) & set(clicks['article_id']))
                    if (not clicks.empty and ids is not None and 'article_id' in clicks)
                    else None,
                }
            )
            display(pd.DataFrame([stats]))

            if ids is not None:
                sample_ids = ids[:5] if len(ids) >= 5 else ids
                print("Aperçu des premiers article_id liés aux embeddings:")
                display(pd.DataFrame({'article_id': sample_ids}))

            # Preview at most 5 columns; objects without a 2-D shape (no
            # `.shape`, or a 1-D array) get no preview instead of an IndexError.
            matrix_shape = getattr(matrix, 'shape', ())
            n_preview_cols = min(5, matrix_shape[1]) if len(matrix_shape) >= 2 else 0
            preview_cols = [f"emb_{i}" for i in range(n_preview_cols)]
            if preview_cols:
                preview = pd.DataFrame(matrix[:5, : len(preview_cols)], columns=preview_cols)
                if ids is not None:
                    preview.insert(0, 'article_id', ids[: len(preview)])
                print("Aperçu des embeddings (quelques colonnes et premières lignes):")
                display(preview)
                print("Colonnes affichées pour l'aperçu des embeddings:")
                print(", ".join(preview.columns))

                # Join the previewed ids with the metadata to illustrate the link.
                if ids is not None and metadata is not None and 'article_id' in metadata:
                    meta_cols = [c for c in ['title', 'category_id', 'created_at_ts', 'publisher'] if c in metadata.columns]
                    meta_sample = (
                        preview[['article_id']]
                        .merge(metadata[['article_id'] + meta_cols], on='article_id', how='left')
                    )
                    if 'created_at_ts' in meta_sample.columns:
                        meta_sample['created_at_ts'] = to_timestamp(meta_sample['created_at_ts'])
                    print("Exemple de liaison embedding -> metadata sur article_id (5 premières lignes):")
                    display(meta_sample.head())
        else:
            print("Aucune matrice d'embeddings explicite trouvée dans l'objet chargé.")
    elif hasattr(embeddings_obj, 'shape'):
        stats = summarize_matrix(embeddings_obj)

        # A bare matrix carries no ids: they are inferred from the metadata row
        # order, but only when both have exactly the same number of rows.
        inferred_ids = None
        mapping_note = None
        if metadata is not None and 'article_id' in metadata and hasattr(embeddings_obj, 'shape'):
            if embeddings_obj.shape[0] == len(metadata):
                inferred_ids = metadata['article_id'].reset_index(drop=True)
                mapping_note = (
                    "Aucun article_id explicite fourni ; association supposée alignée sur l'ordre des metadata."
                )
            else:
                mapping_note = (
                    "Aucun article_id dans le fichier d'embeddings et la taille ne correspond pas aux metadata : "
                    f"{embeddings_obj.shape[0]} vecteurs vs {len(metadata)} lignes de metadata."
                )
        else:
            mapping_note = (
                "Aucun identifiant d'article n'est présent dans le fichier d'embeddings (mapping externe requis)."
            )

        structure = base_structure.copy()
        if inferred_ids is not None:
            structure.insert(0, {
                'chemin': 'embeddings.article_id (inféré)',
                'type': type(inferred_ids).__name__,
                'len': len(inferred_ids),
                'note': "Alignement supposé sur metadata.article_id (index identique).",
            })
        if structure:
            print("Structure détaillée de l'objet d'embeddings (par chemin de clé):")
            display(pd.DataFrame(structure))

        if mapping_note:
            print(mapping_note)

        if inferred_ids is not None:
            stats.update(
                {
                    'ids_source': 'metadata.article_id (alignement par index)',
                    'ids_uniques': inferred_ids.nunique(),
                    'couverture_metadata': len(set(inferred_ids) & set(metadata['article_id'])),
                    'couverture_clicks': len(set(inferred_ids) & set(clicks['article_id'])) if not clicks.empty else None,
                }
            )

        display(pd.DataFrame([stats]))
        if len(getattr(embeddings_obj, 'shape', [])) >= 2 and embeddings_obj.shape[1] > 0:
            preview_cols = [f"emb_{i}" for i in range(min(5, embeddings_obj.shape[1]))]
            preview = pd.DataFrame(embeddings_obj[:5, : len(preview_cols)], columns=preview_cols)
            if inferred_ids is not None:
                preview.insert(0, 'article_id', inferred_ids.iloc[: len(preview)].values)
            print("Aperçu direct de la matrice d'embeddings:")
            display(preview)
            print("Colonnes affichées pour l'aperçu des embeddings:")
            print(", ".join(preview.columns))

            if inferred_ids is not None and metadata is not None:
                meta_cols = [c for c in ['title', 'category_id', 'created_at_ts', 'publisher'] if c in metadata.columns]
                meta_sample = preview[['article_id']].merge(
                    metadata[['article_id'] + meta_cols], on='article_id', how='left'
                )
                if 'created_at_ts' in meta_sample.columns:
                    meta_sample['created_at_ts'] = to_timestamp(meta_sample['created_at_ts'])
                print("Exemple de liaison embedding -> metadata sur article_id (inféré):")
                display(meta_sample.head())
        else:
            # NOTE(review): this branch is reached by shape-bearing objects with
            # fewer than 2 dimensions or zero columns. Objects that are neither
            # a dict nor shape-bearing produce no message at all — confirm
            # whether this message was meant for them as well.
            print("Objet chargé non structuré, utilisez type/len pour investiguer.")
else:
    print(f"Fichier d'embeddings introuvable à {embeddings_path}")





Nombre total de fichiers clicks détectés: 385


Unnamed: 0,fichier,nb_lignes,colonnes,articles_uniques,horodatage_min,horodatage_max
0,clicks_hour_000.csv,1883,"user_id, session_id, session_start, session_size, click_article_id, click_timestamp, click_environment, click_deviceGroup, click_os, click_country, click_region, click_referrer_type",323,2017-10-01 03:00:00.026,2017-10-03 02:35:54.157
1,clicks_hour_001.csv,1415,"user_id, session_id, session_start, session_size, click_article_id, click_timestamp, click_environment, click_deviceGroup, click_os, click_country, click_region, click_referrer_type",289,2017-10-01 03:36:28.615,2017-10-02 02:41:03.190


=== Clicks (agrégés) ===


Unnamed: 0,nb_lignes,colonnes,utilisateurs_uniques,articles_uniques
0,909070,"user_id, article_id, timestamp, session_size, click_environment, click_deviceGroup, click_os, click_country, click_region, click_referrer_type",176840,17096


Synthèse globale (articles / clients)


Unnamed: 0,nombre_total_articles,nombre_total_clients
0,364047,176840


Unnamed: 0,horodatage_min,horodatage_max,fenetre_jours
0,2017-10-01 03:00:00.026,2017-10-27 19:34:26.381,27


Répartition par jour (jusqu'à 10 premières valeurs)


Unnamed: 0,date,nb_clicks
0,2017-10-01,94056
1,2017-10-02,303177
2,2017-10-03,261159
3,2017-10-04,215415
4,2017-10-05,34556
5,2017-10-06,301
6,2017-10-07,40
7,2017-10-08,25
8,2017-10-09,126
9,2017-10-10,65


Répartition par heure (0-23)


Unnamed: 0,hour,nb_clicks
0,0,43094
1,1,40498
2,2,27210
3,3,19347
4,4,10582
5,5,5332
6,6,3840
7,7,4878
8,8,8970
9,9,19316


=== Métadonnées des articles ===


Unnamed: 0,nb_articles,colonnes,articles_uniques
0,364047,"article_id, category_id, created_at_ts, publisher_id, words_count",364047


Unnamed: 0,valeurs_manquantes
article_id,0
category_id,0
created_at_ts,0
publisher_id,0
words_count,0


Unnamed: 0,premier_article,dernier_article
0,2006-09-27 11:14:35,2018-03-13 12:12:30


Articles présents dans clicks et metadata: 17096
=== Embeddings d'articles ===
Type chargé: <class 'numpy.ndarray'>
Structure détaillée de l'objet d'embeddings (par chemin de clé):


Unnamed: 0,chemin,type,len,note,shape,dtype,exemple
0,embeddings.article_id (inféré),Series,364047.0,Alignement supposé sur metadata.article_id (index identique).,,,
1,embeddings,ndarray,,,"(364047, 250)",float32,
2,embeddings[0],ndarray,,,"(250,)",float32,"[-0.16118301451206207, -0.9572331309318542, -0.13794444501399994]"
3,embeddings[0][0],float32,,,(),float32,


Aucun article_id explicite fourni ; association supposée alignée sur l'ordre des metadata.


Unnamed: 0,shape,dtype,profondeur_min,profondeur_moyenne,profondeur_max,nb_vectors,dim,norm_min,norm_max,norm_moyenne,ids_source,ids_uniques,couverture_metadata,couverture_clicks
0,"(364047, 250)",float32,250,250.0,250,364047,250,1.845483,11.18309,7.939456,metadata.article_id (alignement par index),364047,364047,17096


Aperçu direct de la matrice d'embeddings:


Unnamed: 0,article_id,emb_0,emb_1,emb_2,emb_3,emb_4
0,0,-0.161183,-0.957233,-0.137944,0.050855,0.830055
1,1,-0.523216,-0.974058,0.738608,0.155234,0.626294
2,2,-0.619619,-0.97296,-0.20736,-0.128861,0.044748
3,3,-0.740843,-0.975749,0.391698,0.641738,-0.268645
4,4,-0.279052,-0.972315,0.685374,0.113056,0.238315


Colonnes affichées pour l'aperçu des embeddings:
article_id, emb_0, emb_1, emb_2, emb_3, emb_4
Exemple de liaison embedding -> metadata sur article_id (inféré):


Unnamed: 0,article_id,category_id,created_at_ts
0,0,0,2017-12-13 05:53:39
1,1,1,2014-07-14 12:45:36
2,2,1,2014-08-22 00:35:06
3,3,1,2014-08-19 17:11:53
4,4,1,2014-08-03 13:06:11


# Article Embeddings

Ce fichier contient les **embeddings des articles**, c’est-à-dire une **représentation numérique du contenu textuel** permettant de comparer les articles entre eux sur le plan sémantique.

* **Format** : matrice NumPy `(N, 250)` en `float32`
* **1 ligne = 1 article**
* **250 colonnes = dimensions latentes**
* Les valeurs individuelles n’ont pas de signification directe

L’`article_id` n’est **pas stocké explicitement** : il est **déduit de l’ordre des lignes**, qui doit rester aligné avec les métadonnées des articles.

La variable `words_count` (colonne des métadonnées des articles, et non du fichier d’embeddings) indique le **nombre de mots du texte source** et sert uniquement d’indicateur de qualité du contenu.

Les embeddings **ne sont pas normalisés** (normes observées entre ≈ 1,85 et ≈ 11,18) : la **similarité cosinus**, insensible à la norme des vecteurs, est donc la mesure recommandée pour comparer les articles.
