In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

In [2]:
# ------------------------
# 1. Jeu de données
# ------------------------

books = pd.DataFrame([
    {'book_id': 1, 'title': 'Harry Potter', 'description': 'Magic and adventure at Hogwarts'},
    {'book_id': 2, 'title': 'The Hobbit', 'description': 'A journey with dwarves and a dragon'},
    {'book_id': 3, 'title': 'Twilight', 'description': 'Vampire romance in high school'},
    {'book_id': 4, 'title': 'Dune', 'description': 'Sci-fi epic with politics and sandworms'},
    {'book_id': 5, 'title': '1984', 'description': 'Dystopia under total surveillance'}
])

ratings = pd.DataFrame([
    {'user_id': 'u1', 'book_id': 1, 'rating': 5},
    {'user_id': 'u1', 'book_id': 2, 'rating': 4},
    {'user_id': 'u2', 'book_id': 3, 'rating': 5},
    {'user_id': 'u2', 'book_id': 4, 'rating': 2},
    {'user_id': 'u3', 'book_id': 5, 'rating': 4}
])

In [3]:
# ------------------------
# 2. TF-IDF + Similarité
# ------------------------

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(books['description'])
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

In [4]:
# ------------------------
# 3. Collaborative Filtering (SVD)
# ------------------------

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)
trainset, _ = train_test_split(data, test_size=0.2, random_state=42)
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2e404341660>

In [5]:
# ------------------------
# 4. Génération des scores hybrides pour un utilisateur donné
# ------------------------

def recommend_books(user_id, titre_reference, top_n=3, alpha=0.5):
    # --- Trouver l'index du livre de référence
    idx_ref = books[books['title'].str.lower() == titre_reference.lower()].index
    if len(idx_ref) == 0:
        print("Titre non trouvé.")
        return []
    idx_ref = idx_ref[0]

    # --- Similarité content-based
    content_scores = cosine_sim_matrix[idx_ref]

    # --- Collaborative filtering: prédiction SVD pour tous les livres
    collab_scores = []
    for book_id in books['book_id']:
        pred = svd.predict(user_id, book_id)
        collab_scores.append(pred.est)

    # --- Normalisation des deux vecteurs
    scaler = MinMaxScaler()
    content_scores_norm = scaler.fit_transform(content_scores.reshape(-1, 1)).flatten()
    collab_scores_norm = scaler.fit_transform(np.array(collab_scores).reshape(-1, 1)).flatten()

    # --- Fusion des scores
    hybrid_scores = alpha * content_scores_norm + (1 - alpha) * collab_scores_norm

    # --- Préparation des résultats
    books['hybrid_score'] = hybrid_scores
    books_sorted = books.sort_values(by='hybrid_score', ascending=False)

    # --- Ne pas recommander le livre de référence
    books_sorted = books_sorted[books_sorted.index != idx_ref]

    return books_sorted[['title', 'hybrid_score']].head(top_n)


In [11]:
# ------------------------
# 5. Test de la recommandation
# ------------------------

user = 'u1'
titre_donne = 'Harry Potter'
reco = recommend_books(user, titre_donne, top_n=3, alpha=0.6)
print("Recommandations pour l'utilisateur", user, "basées sur :", titre_donne)
print(reco)


Recommandations pour l'utilisateur u1 basées sur : Harry Potter
      title  hybrid_score
3      Dune      0.451662
4      1984      0.232535
2  Twilight      0.180035


In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

def recommend_hybrid_no_reference(user_id, books, ratings, tfidf_matrix, svd_model, alpha=0.5, top_n=3):
    # 1. Récupérer les livres notés positivement (rating >= 4)
    livres_positifs = ratings[(ratings['user_id'] == user_id) & (ratings['rating'] >= 4)]
    
    if livres_positifs.empty:
        print("Pas de livres notés positivement pour cet utilisateur.")
        # Fallback : recommander selon collaboratif seul
        collab_scores = []
        for book_id in books['book_id']:
            pred = svd_model.predict(user_id, book_id)
            collab_scores.append(pred.est)
        scaler = MinMaxScaler()
        collab_scores_norm = scaler.fit_transform(np.array(collab_scores).reshape(-1,1)).flatten()
        books['hybrid_score'] = collab_scores_norm
        return books.sort_values(by='hybrid_score', ascending=False).head(top_n)[['title', 'hybrid_score']]
    
    # 2. Récupérer indices des livres positifs dans books
    indices = livres_positifs['book_id'].apply(lambda x: books.index[books['book_id'] == x][0])
    
    # 3. Extraire leurs embeddings TF-IDF et convertir en dense array
    embeddings_positifs = tfidf_matrix[indices].toarray()
    
    # 4. Calculer la moyenne pour le profil utilisateur
    profil_user = np.mean(embeddings_positifs, axis=0).reshape(1, -1)
    
    # 5. Calculer la similarité cosine entre profil_user et tous les livres (TF-IDF dense)
    tfidf_matrix_dense = tfidf_matrix.toarray()
    content_scores = cosine_similarity(profil_user, tfidf_matrix_dense).flatten()
    
    # 6. Prédiction collaborative
    collab_scores = []
    for book_id in books['book_id']:
        pred = svd_model.predict(user_id, book_id)
        collab_scores.append(pred.est)
    
    # 7. Normaliser scores content-based et collaboratif
    scaler = MinMaxScaler()
    content_scores_norm = scaler.fit_transform(content_scores.reshape(-1, 1)).flatten()
    collab_scores_norm = scaler.fit_transform(np.array(collab_scores).reshape(-1, 1)).flatten()
    
    # 8. Combiner avec alpha
    hybrid_scores = alpha * content_scores_norm + (1 - alpha) * collab_scores_norm
    
    # 9. Retourner les top N titres (exclure ceux déjà notés si souhaité)
    books['hybrid_score'] = hybrid_scores
    
    # Optionnel : filtrer les livres déjà notés pour ne pas recommander les mêmes
    livres_deja_notes = set(livres_positifs['book_id'])
    recommendations = books[~books['book_id'].isin(livres_deja_notes)]
    
    recommendations = recommendations.sort_values(by='hybrid_score', ascending=False).head(top_n)
    
    return recommendations[['title', 'hybrid_score']]


In [10]:
recos = recommend_hybrid_no_reference('u1', books, ratings, tfidf_matrix, svd, alpha=0.6, top_n=3)
print(recos)


      title  hybrid_score
3      Dune      0.566134
4      1984      0.232535
2  Twilight      0.180035
