In [1]:
# 02_Main_Model_Training.ipynb

import sys
import numpy as np
from scipy import sparse
import pandas as pd

# Add the parent directory to sys.path so the local `src` package can be imported
sys.path.append('..')

from src.preprocessing import DataLoader
from src.models import SemanticHybridRecommender
from src.metrics import mapk_score

# --- 1. Loading & split ---
print("--- Initialisation ---")
loader = DataLoader(
    '../data/interactions_train.csv',
    '../data/items.csv',
)

# Time-based hold-out via the loader: train_ratio=0.8 requests an
# 80% train / 20% validation partition.
splits = loader.get_time_split(train_ratio=0.8)
train_df, val_df = splits
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

# --- 2. Training the best model (Super-Ensemble) ---
# Settings reported as effective: Alpha=0.5, Half-life=[1, 250],
# combined with SVD (latent) + BM25 (keywords) + Sequential (co-visitation).
model = SemanticHybridRecommender(
    n_users=loader.n_users,
    n_items=loader.n_items,
)

# Hyper-parameters gathered in one place so they are easy to spot and tune.
fit_options = {
    "alpha": 0.5,
    "half_life_days": [1, 250],      # very short vs. very long half-life
    "ensemble_weights": [0.5, 0.5],  # balanced blend of the two sub-models
}

print("\n--- Entraînement du modèle Super-Ensemble (Hybrid + SVD + BM25 + Sequential) ---")
model.fit(train_df, loader.items_df, **fit_options)


# --- 3. Prediction ---
# `k` is the single source of truth for the ranking cut-off: it drives the
# prediction call, the metric and both log banners, so changing it here cannot
# leave the evaluation silently scoring a different cut-off.
k = 10
print(f"\n--- Génération des prédictions (Top {k}) ---")
# Per the original author's note, the tuned blending weights (seq_weight=0.3,
# etc.) are class defaults — not visible from this notebook, verify in src.models.
preds = model.predict(k=k, batch_size=1000)

# --- 4. Evaluation ---
print("\n--- Calcul du score MAP@K ---")

# Build the ground-truth matrix for validation: one entry of 1.0 per row of
# val_df, placed at (u_idx, i_idx), with shape (n_users, n_items).
val_rows = val_df['u_idx'].values
val_cols = val_df['i_idx'].values
val_data = np.ones(len(val_df))
val_matrix = sparse.csr_matrix(
    (val_data, (val_rows, val_cols)),
    shape=(loader.n_users, loader.n_items)
)
# NOTE(review): csr_matrix sums duplicate (row, col) pairs, so a user/item pair
# that appears several times in val_df ends up with a value > 1. Confirm that
# mapk_score only looks at non-zero positions and not at the stored values.

# Score at the same cut-off that was used to generate the predictions.
score = mapk_score(preds, val_matrix, k=k)
print(f"✅ MAP@{k} Final : {score:.5f}")

# --- 5. (Optional) Save for submission ---
# Uncomment the two lines below to write the top-k predictions to a CSV submission file.
#predictions_df = pd.DataFrame(preds, columns=[f'top_{i}' for i in range(k)])
#predictions_df.to_csv('submission.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm


--- Initialisation ---
Train size: 65417, Val size: 21628

--- Entraînement du modèle Super-Ensemble (Hybrid + SVD + BM25 + Sequential) ---

Fitting SVD Component...
Fitting SVD (Factors=100)...
SVD Fitted.
SVD Component Ready.
Fitting SemanticHybrid Decoupled | Alpha=0.5, HL=[1, 250], Weights=[0.5, 0.5]...
Loading S-BERT & preparing item embeddings (with disk cache)...
Loaded embeddings from cache: /Users/jocicsacha/Downloads/booksystemrecomander-main 2/data/cache/embeddings_all-MiniLM-L6-v2_1cb59216babf090ff3bfe7204e01818d.npy
Computing BM25/TF-IDF Keyword Similarity...

--- Building Sub-Model 1 (Half-life=1d) ---

--- Building Sub-Model 2 (Half-life=250d) ---

Computing Global Popularity Scores (Based on HL=250d)...
Computing Sequential Transition Matrix...
Ensemble Model Fitted Successfully.

--- Génération des prédictions (Top 10) ---

--- Calcul du score MAP@K ---
✅ MAP@10 Final : 0.21181
