In [1]:
# 02_Main_Model_Training.ipynb

import sys
import numpy as np
from scipy import sparse
import pandas as pd

# Ajout du path pour src
sys.path.append('..')

from src.preprocessing import DataLoader
from src.models import SemanticHybridRecommender
from src.metrics import mapk_score

# --- 1. Loading & split ---
print("--- Initialisation ---")
interactions_csv = '../data/interactions_train.csv'
items_csv = '../data/items.csv'
loader = DataLoader(interactions_csv, items_csv)

# Time-based split requested with train_ratio=0.8 (nominally 80% train,
# 20% validation).
# NOTE(review): the recorded run printed 65417 / 21628 rows (~75% / 25%), so
# the ratio is presumably not applied to raw row counts - confirm in
# DataLoader.get_time_split.
split_frames = loader.get_time_split(train_ratio=0.8)
train_df, val_df = split_frames
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

# --- 2. Training the best model (S-BERT) ---
# Settings observed to work well: alpha=0.5 combined with two half-lives,
# 1 day and 250 days (very short vs. very long).
# ensemble_weights is deliberately not passed: [0.6, 0.4] raised the local
# score but lowered the Kaggle score (overfitting).
fit_settings = {
    'alpha': 0.5,
    'half_life_days': [1, 250],
}

model = SemanticHybridRecommender(
    n_users=loader.n_users,
    n_items=loader.n_items,
)

print("\n--- Entraînement du modèle Hybride (Time Decay + S-BERT) ---")
model.fit(train_df, loader.items_df, **fit_settings)


# --- 3. Prediction ---
# Single source of truth for the ranking cutoff: it sets the number of
# predictions requested, the cutoff passed to the metric, and the values shown
# in the log messages, so changing it here keeps all of them in sync.
k = 10
print(f"\n--- Génération des prédictions (Top {k}) ---")
preds = model.predict(k=k, batch_size=1000)

# --- 4. Evaluation ---
print("\n--- Calcul du score MAP@K ---")

# Build the validation ground truth as a sparse (n_users, n_items) matrix
# holding an entry for every (u_idx, i_idx) pair found in val_df.
val_rows = val_df['u_idx'].values
val_cols = val_df['i_idx'].values
val_data = np.ones(len(val_df))
val_matrix = sparse.csr_matrix(
    (val_data, (val_rows, val_cols)),
    shape=(loader.n_users, loader.n_items)
)
# The COO-style constructor sums duplicate (row, col) pairs, so a user-item
# pair that occurs several times in val_df would be stored as a value > 1.
# Reset the stored entries to 1 to keep the ground truth a binary relevance
# matrix.
val_matrix.data[:] = 1.0

# Score with the same cutoff that was used to generate the predictions.
score = mapk_score(preds, val_matrix, k=k)
print(f"✅ MAP@{k} Final : {score:.5f}")

# --- 5. (Optional) Save for submission ---
# Uncomment the two lines below if a CSV submission file is needed
#predictions_df = pd.DataFrame(preds, columns=[f'top_{i}' for i in range(k)])
#predictions_df.to_csv('submission.csv', index=False)

--- Initialisation ---
Train size: 65417, Val size: 21628

--- Entraînement du modèle Hybride (Time Decay + S-BERT) ---
Fitting SemanticHybrid Decoupled | Alpha=0.5, HL=[1, 250], Weights=[0.5, 0.5]...
Loading S-BERT & Encoding Metadata...


Batches:   0%|          | 0/473 [00:00<?, ?it/s]


--- Building Sub-Model 1 (Half-life=1d) ---

--- Building Sub-Model 2 (Half-life=250d) ---

Computing Global Popularity Scores...
Ensemble Model Fitted Successfully.

--- Génération des prédictions (Top 10) ---

--- Calcul du score MAP@K ---
✅ MAP@10 Final : 0.20693
