# Système de recommandation de livres basé sur le contenu

## Imports et chargement des données

In [41]:
import pandas as pd
import numpy as np
import os
import json
import psycopg2
import re
import ast
import nltk
import gensim
import glob
import joblib
import torch
import pickle
import unicodedata

from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

from scipy import sparse
from gensim.parsing.preprocessing import remove_stopwords

from sentence_transformers import SentenceTransformer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from surprise import SVD, NMF, Dataset, Reader, BaselineOnly, AlgoBase , SVDpp,  dump, accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection.search import RandomizedSearchCV


# les imports qui suivent sont d'un dossier package créé pour éviter de faire toutes les opérations dans le notebook


# utils : contient les fonctions globales
# reco_collab_initial contient les fonctions pour la recommandation collaborative sur ce dataset précis

%load_ext autoreload
%autoreload 2

from package.utils import *
from package.reco_collab_initial import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
path = "../data/raw/Recommender_dataset/"

books = pd.read_csv(path+"Books.csv")
users= pd.read_csv(path+"Users.csv")
ratings = pd.read_csv(path+"Ratings.csv")

In [3]:
def save_df_to_csv(df, path):
    """ Crée un fichier csv à partir d'un dataframe.
        Arguments :
            df (DataFrame)
            path (str) : chemin du fichier qui sera créé.
    """
    df.to_csv(path, index=False, header=True)

def save_df_to_pickle(df, path):
    """ Crée un fichier pkl à partir d'un dataframe. C'est pour garder le type list des sujets et descriptions.
        Arguments :
            df (DataFrame)
            path (str) : chemin du fichier qui sera créé.
    """
    df.to_pickle(path)

In [4]:
import ast
import re

def str_list_to_text(s):
    if isinstance(s, str):
        return s
    else :
        parsed = ast.literal_eval(s)
        return " ".join(parsed)

def enlever_parentheses(texte):
    resultat = []
    niveau_parenthese = 0

    for char in texte:
        if char == '(':
            niveau_parenthese += 1
        elif char == ')':
            if niveau_parenthese > 0:
                niveau_parenthese -= 1
        else:
            if niveau_parenthese == 0:
                resultat.append(char)

    return ''.join(resultat).strip()

In [5]:
import ast
import json

def str_to_list_auto(val):
    if isinstance(val, list):
        return val  # déjà une liste
    if val is None or val == '':
        return []
    try:
        # Essaye JSON (["a", "b"])
        return json.loads(val)
    except (json.JSONDecodeError, TypeError):
        try:
            # Essaye une liste Python ("['a', 'b']")
            return ast.literal_eval(val)
        except (ValueError, SyntaxError):
            # Sinon, split sur virgule simple
            return [x.strip() for x in val.split(',') if x.strip()]



In [6]:
import re

def remove_parentheses(text):
    print(re.sub(r'\s*\(.*?\)', '', text).strip())
    return re.sub(r'\s*\(.*?\)', '', text).strip()

In [7]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [8]:
ratings.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [9]:
users.columns

Index(['User-ID', 'Location', 'Age'], dtype='object')

In [10]:
books.rename(columns={'Book-Title':'title', 'Book-Author': 'author','Year-Of-Publication':'year', 'Publisher':'publisher'}, inplace=True)
books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], inplace=True)
books.head(10)

Unnamed: 0,ISBN,title,author,year,publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group
7,0671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,1996,Random House
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner


In [11]:
ratings.rename(columns={'User-ID':'user_id', 'Book-Rating':'rating'}, inplace=True)
ratings.head(10)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
5,276733,2080674722,0
6,276736,3257224281,8
7,276737,0600570967,6
8,276744,038550120X,7
9,276745,342310538,10


In [12]:
users.rename(columns={'User-ID':'user_id', 'Location':'location','Age':'age'}, inplace=True)
users.head(10)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
5,6,"santa monica, california, usa",61.0
6,7,"washington, dc, usa",
7,8,"timmins, ontario, canada",
8,9,"germantown, tennessee, usa",
9,10,"albacete, wisconsin, spain",26.0


In [13]:
def show_image(url, width=100):
    """ Affiche les images du dataset"""
    return f'<img src="{url}" width="{width}">'

def url_to_img(df, func):
    return df.style.format({'Image-URL-S':func, 'Image-URL-M':func, 'Image-URL-L':func}, escape=False)

## Préparation des données

In [14]:
books.isnull().sum()

ISBN         0
title        0
author       2
year         0
publisher    2
dtype: int64

In [15]:
books.dropna(inplace=True)
books.isnull().sum()

ISBN         0
title        0
author       0
year         0
publisher    0
dtype: int64

In [16]:
books.shape

(271356, 5)

In [17]:
books.duplicated().sum()

0

In [18]:
ratings.isnull().sum()

user_id    0
ISBN       0
rating     0
dtype: int64

In [19]:
ratings.duplicated().sum()

0

In [20]:
ratings['user_id'].value_counts()

user_id
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: count, Length: 105283, dtype: int64

In [21]:
users.isnull().sum()

user_id          0
location         0
age         110762
dtype: int64

In [22]:
users.shape

(278858, 3)

## Feature engineering 

- Ajout des genres et descritpion (si possible) des livres dans le dataset
- Ajout de la note moyenne de chaque livre
- Ajout de la note moyenne de l'utilisateur 
- Ajout du nombre de notes de chaque livre
- Fusion des datasets pour en avoir un basé sur les utilisateurs avec leurs notes moyennes et celles des livres

### API openlibrary (tests)

In [10]:
import pandas as pd
import requests
import time

# Utiliser les 10 premiers ISBNs pour test
isbns = books['ISBN'].dropna().unique()

# Stocker les genres extraits
extracted_genres = []

for isbn in isbns:
    try:
        url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=details&format=json"
        response = requests.get(url)
        data = response.json()

        key = f"ISBN:{isbn}"
        genres = data.get(key, {}).get('details', {}).get('subjects', [])
        
        # Extraire les noms des genres
        genres_list = [g['name'] if isinstance(g, dict) else g for g in genres]

        extracted_genres.append({
            'ISBN': isbn,
            'genres_from_api': ', '.join(genres_list)
        })

        # Pause pour éviter d’être bloqué par l’API
        time.sleep(1)

    except Exception as e:
        extracted_genres.append({'ISBN': isbn, 'genres_from_api': None})
        print(f"Erreur avec ISBN {isbn} : {e}")

# Convertir en DataFrame et enregistrer
genres_df = pd.DataFrame(extracted_genres)
genres_df.to_csv("genres_from_openlibrary.csv", index=False)

print("✅ Extraction terminée !")


ModuleNotFoundError: No module named 'requests'

### Ajout des genres et description en utilisant les dumps d'openlibrary

Avant de faire ça, j'ai créé une base de données avec postgresql grâce à : https://github.com/LibrariesHacked/openlibrary-search

In [24]:
def get_key_books(isbn_list):
    """ Permet de récupérer une liste de key de la table 'editions' pour chaque isbn de la liste. 
        (C'est une liste de listes de key).
        Arguments :
            isbn_list (str[]) : liste d'isbn
    """
    conn = psycopg2.connect(
        dbname='openlibrary',
        user='user',
        password='password',
        host='localhost',
        port='5432'
    )
    conn.autocommit = True
    cur = conn.cursor()

    sql = """
        SELECT e.key
        FROM editions e
        JOIN edition_isbns ei ON ei.edition_key = e.key
        WHERE ei.isbn = %s
    """

    key_books = []
    for isbn in tqdm(isbn_list):
        cur.execute(sql, (isbn,))
        results = cur.fetchall()
        key_books.append(results)

    cur.close()
    conn.close()

    return key_books

In [25]:
def get_key_books_list(isbn_list):
    """ Renvoie une liste de key de la table 'editions' correspondant aux isbn donnés.
        Arguments :
            isbn_list (str[]) : liste d'isbn
    """
    conn = psycopg2.connect(
        dbname='openlibrary',
        user='user',
        password='password',
        host='localhost',
        port='5432'
    )
    conn.autocommit = True
    cur = conn.cursor()

    sql = """
        SELECT e.key
        FROM editions e
        JOIN edition_isbns ei ON ei.edition_key = e.key
        WHERE ei.isbn = %s
    """

    key_books = []
    for isbn in tqdm(isbn_list):
        cur.execute(sql, (isbn,))
        results = cur.fetchall()
        key_books.extend([r[0] for r in results])

    cur.close()
    conn.close()

    return key_books

In [26]:
isbn_list=books['ISBN']
isbn_list

0         0195153448
1         0002005018
2         0060973129
3         0374157065
4         0393045218
             ...    
271355    0440400988
271356    0525447644
271357    006008667X
271358    0192126040
271359    0767409752
Name: ISBN, Length: 271356, dtype: object

In [49]:
keys=get_key_books_list(isbn_list)

100%|██████████| 271356/271356 [04:17<00:00, 1054.76it/s]


In [50]:
keys_books=get_key_books(isbn_list)

100%|██████████| 271356/271356 [03:54<00:00, 1157.51it/s]


In [51]:
keys[:10]

['/books/OL18389036M',
 '/books/OL22232624M',
 '/books/OL7390154M',
 '/books/OL3597106M',
 '/books/OL22133369M',
 '/books/OL1890897M',
 '/books/OL20945359M',
 '/books/OL7423081M',
 '/books/OL23243677M',
 '/books/OL358977M']

In [52]:
keys_books[:10]

[[('/books/OL18389036M',), ('/books/OL22232624M',), ('/books/OL7390154M',)],
 [('/books/OL3597106M',), ('/books/OL22133369M',)],
 [('/books/OL1890897M',), ('/books/OL20945359M',)],
 [('/books/OL7423081M',), ('/books/OL23243677M',)],
 [('/books/OL358977M',)],
 [('/books/OL1530439M',)],
 [('/books/OL7504546M',), ('/books/OL24223668M',)],
 [('/books/OL27242140M',)],
 [('/books/OL804879M',)],
 [('/books/OL3659400M',)]]

In [53]:
books_with_keys=pd.DataFrame({
    'ISBN': isbn_list,
    'key_book': keys_books
}, columns=['ISBN', 'key_book'])

In [None]:
# Sauvegarde du dataframe pour ne plus faire l'opération

save_df_to_csv(books_with_keys, '../../datasets/isbn_with_OLkey.csv') 

La base a été créée. On cherche maintenant à relier les editions à un work correspondant. Les editions peuvent avoir dans leur champ data (fichier json) une colonne work qui contient une clé key de la table works. Comme le nombre d'edition est massif, un index a d'abord été fait sur edition_isbns (table reliant les editions à leur isbn) pour accélérer le temps des requêtes. Ensuite grâce à la fonction get_key_books_list au-dessus, on récupère uniquement les clés des livres de notre dataset initial. La fonction update_editions_work_key ci-dessous permet de faire la liaison entre l'edition et le work uniquement aux livres concernés. 

In [27]:
def update_editions_work_key(keys_list):
    conn = psycopg2.connect(
        dbname='openlibrary',
        user='user',
        password='password',
        host='localhost',
        port='5432'
    )
    conn.autocommit = True
    cur = conn.cursor()

    query = """
        UPDATE editions
        SET work_key = data->'works'->0->>'key'
        WHERE key = %s;
    """

    for key in tqdm(keys_list):
        cur.execute(query, (key,))

    cur.close()
    conn.close()

In [None]:
update_editions_work_key(keys)

100%|██████████| 367546/367546 [13:09<00:00, 465.64it/s] 


Maintenant, on peut récupérer les informations que l'on souhaite pour chacun des livres de notre dataset avec la fonction get_infos_by_isbn_list.

In [28]:
def split_subject_words(subject):
    """ Permet de séparer les mots tout en ignorant la ponctuation et les espaces. 
        On met aussi en minuscule les mots pour éciter la casse.
        Arguments :
            subject (str) : liste de genres retournés lors de la requête sql
    """
    return re.findall(r'\b\w+\b', subject.lower())

def get_infos_by_isbn_list(isbn_list):
    conn = psycopg2.connect(
        dbname='openlibrary',
        user='user', 
        password='password', 
        host='localhost',
        port='5432'
    )
    conn.autocommit = True
    cur = conn.cursor()

    # Requête sql permettant de récupérer les genres et descriptions d'un livre via son isbn
    sql = """
        select
        e.data->>'subjects' "Subjects",
        w.data->'description'->>'value' "WorkDescription"
    from editions e
    join edition_isbns ei
        on ei.edition_key = e.key
    join works w
        on w.key = e.work_key
    where ei.isbn = %s
    """

    # Comme on peut avoir plusieurs résultats pour un même isbn, 
    # la requête renverra plusieurs fois des subjects et descriptions (assez souvent identiques)
    # On traite de ce cas juste après
    
    subjects_final= [] # liste de listes qui contiendra les genres recueillis de chaque livre
    desc_final=[] # liste des descriptions des livres

    for isbn in tqdm(isbn_list):
        cur.execute(sql, (isbn,))
        results = cur.fetchall()

        # On crée ici des sets pour éviter les doublons lors de l'ajout des genres ou descriptions
        all_words = set()
        description_set = set()

        for r in results:
            
            if r[0]:
                subjects_list = json.loads(r[0]) 
                for subject in subjects_list:
                    words = split_subject_words(subject) # On sépare les mots des listes de genres qu'on obtient 
                                                         # pour comparer avec les suivants et ne pas ajouter de doublon
                    for w in words:
                        all_words.add(w)

            
            if r[1]:
                description_set.add(r[1])

        subjects_words = list(all_words)
        descriptions = list(description_set)
        subjects_final.append(subjects_words)
        desc_final.append(descriptions)

    cur.close()
    conn.close()

    return subjects_final, desc_final


In [59]:
subjects, description=get_infos_by_isbn_list(isbn_list)

100%|██████████| 271356/271356 [09:23<00:00, 481.44it/s]


In [263]:
books_desc=pd.DataFrame({
    'ISBN': isbn_list,
    'sujets': subjects,
    'description': description
})

In [264]:
books_desc

Unnamed: 0,ISBN,sujets,description
0,0195153448,"[mythology, classical]",[An excellent primer on classical mythology fo...
1,0002005018,"[york, ontario, teachers, sisters, fiction, ac...","[E-book extras: ""Hero of the Humdrum"": A profi..."
2,0060973129,"[war, 1945, campaigns, world, france, history,...",[]
3,0374157065,"[century, influenza, 20th, history]","[The fascinating, true story of the world's de..."
4,0393045218,"[age, basin, prehistoric, tarim, zizhiqu, mumm...","[In the museums of Urumchi, the wind-swept reg..."
...,...,...,...
271355,0440400988,"[juvenile, camps, fiction]","[On her own for the first time, fourteen-year-..."
271356,0525447644,"[juvenile, counting, literature]",[Illustrations of people and animals introduce...
271357,006008667X,"[century, states, american, alternate, history...","[In Lily Dale, New York, the dead don't die. I..."
271358,0192126040,"[works, utopias, early, science, 1800, to, pol...",[The Republic is Plato's most famous work and ...


In [311]:
books_desc['sujets']=books_desc['sujets'].apply(str_to_list_auto)
books_desc['description']=books_desc['description'].apply(str_to_list_auto)

In [None]:
save_df_to_pickle(books_desc, '../../datasets/books_desc.pkl')

In [None]:
save_df_to_csv(books_desc, '../../datasets/books_desc.csv')

In [23]:
books_desc=pd.read_pickle('../../datasets/books_desc.pkl')

In [24]:
books_desc.head()

Unnamed: 0,ISBN,sujets,description
0,195153448,"[mythology, classical]",[An excellent primer on classical mythology fo...
1,2005018,"[york, ontario, teachers, sisters, fiction, ac...","[E-book extras: ""Hero of the Humdrum"": A profi..."
2,60973129,"[war, 1945, campaigns, world, france, history,...",[]
3,374157065,"[century, influenza, 20th, history]","[The fascinating, true story of the world's de..."
4,393045218,"[age, basin, prehistoric, tarim, zizhiqu, mumm...","[In the museums of Urumchi, the wind-swept reg..."


En voyant les premières lignes du DataFrame, on remarque des listes vides si la description ou les sujets ne sont pas trouvables. On va chercher ceux qui ont au moins une description, pour avoir un maximum de mots pour l'entrainement des modèles en NLP.

In [122]:
books_desc_non_empty = books_desc[
    books_desc['description'].apply(lambda x: len(x) > 0)
].copy()

In [123]:
type(books_desc_non_empty.iloc[0]['description'])

list

Les descriptions sont une liste avec la phrase qui est un str. On veut uniquement le str dans notre colonne et on fait pareil pour les sujets.

In [124]:
books_desc_non_empty['description'] = books_desc_non_empty['description'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
books_desc_non_empty['sujets'] = books_desc_non_empty['sujets'].apply(lambda x: ", ".join(x) if isinstance(x, list) else str(x))

In [125]:
books_desc_non_empty.head()

Unnamed: 0,ISBN,sujets,description
0,195153448,"mythology, classical",An excellent primer on classical mythology for...
1,2005018,"york, ontario, teachers, sisters, fiction, act...","E-book extras: ""Hero of the Humdrum"": A profil..."
3,374157065,"century, influenza, 20th, history","The fascinating, true story of the world's dea..."
4,393045218,"age, basin, prehistoric, tarim, zizhiqu, mummi...","In the museums of Urumchi, the wind-swept regi..."
6,425176428,"imaginary, battles, histories, wars, and",With its in-depth reflections on the monumenta...


In [126]:
save_df_to_csv(books_desc_non_empty, '../../datasets/tmp/books_desc_non_empty_only_desc.csv')

In [127]:
save_df_to_pickle(books_desc_non_empty, '../../datasets/tmp/books_desc_non_empty_only_desc.pkl')

In [128]:
books_desc_non_empty=pd.read_pickle('../../datasets/tmp/books_desc_non_empty_only_desc.pkl')

On veut maintenant enlever les stop words des sujets pour n'avoir que les mots intéressants. On fait ça parce que ce sont les thèmes des livres qu'on a gardés.

In [129]:
books_desc_non_empty['sujets']=books_desc_non_empty['sujets'].apply(remove_stopwords)

Maintenant que c'est fait, reformons la liste des sujets.

In [130]:
books_desc_non_empty['sujets']=books_desc_non_empty['sujets'].apply(lambda x : x.split(','))

In [131]:
books_desc_non_empty.reset_index(drop=True, inplace=True)

In [132]:
books_desc_non_empty

Unnamed: 0,ISBN,sujets,description
0,0195153448,"[mythology, classical]",An excellent primer on classical mythology for...
1,0002005018,"[york, ontario, teachers, sisters, fiction...","E-book extras: ""Hero of the Humdrum"": A profil..."
2,0374157065,"[century, influenza, 20th, history]","The fascinating, true story of the world's dea..."
3,0393045218,"[age, basin, prehistoric, tarim, zizhiqu, ...","In the museums of Urumchi, the wind-swept regi..."
4,0425176428,"[imaginary, battles, histories, wars, ]",With its in-depth reflections on the monumenta...
...,...,...,...
79336,014002803X,"[general, modern, fiction]","Amid preparations for Operation Apollo, secret..."
79337,0440400988,"[juvenile, camps, fiction]","On her own for the first time, fourteen-year-o..."
79338,0525447644,"[juvenile, counting, literature]",Illustrations of people and animals introduce ...
79339,006008667X,"[century, states, american, alternate, his...","In Lily Dale, New York, the dead don't die. In..."


In [133]:
save_df_to_csv(books_desc_non_empty, '../../datasets/tmp/books_desc_non_empty_only_desc.csv')

In [134]:
save_df_to_pickle(books_desc_non_empty, '../../datasets/tmp/books_desc_non_empty_only_desc.pkl')

Maintenant on fusionne le dernier dataset avec books.

In [135]:
books_final=books.merge(books_desc_non_empty, on='ISBN')

In [136]:
books_final

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[mythology, classical]",An excellent primer on classical mythology for...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"[york, ontario, teachers, sisters, fiction...","E-book extras: ""Hero of the Humdrum"": A profil..."
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[century, influenza, 20th, history]","The fascinating, true story of the world's dea..."
3,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,"[age, basin, prehistoric, tarim, zizhiqu, ...","In the museums of Urumchi, the wind-swept regi..."
4,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,"[imaginary, battles, histories, wars, ]",With its in-depth reflections on the monumenta...
...,...,...,...,...,...,...,...
79336,014002803X,Anti Death League,Kingsley Amis,1975,Viking Press,"[general, modern, fiction]","Amid preparations for Operation Apollo, secret..."
79337,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),"[juvenile, camps, fiction]","On her own for the first time, fourteen-year-o..."
79338,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,"[juvenile, counting, literature]",Illustrations of people and animals introduce ...
79339,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,"[century, states, american, alternate, his...","In Lily Dale, New York, the dead don't die. In..."


On a enfin les descriptions avec les livres. On a quand même réduit notre dataset de manière assez conséquente mais on a au moins assez de texte par livre pour pouvoir faire un meilleur entrainement.

Maintenant, on veut enlever les balises html des textes pour éviter du bruit.

In [137]:
import html

books_final['description'] = books_final['description'].apply(lambda x: re.sub(r'<.*?>', '', html.unescape(x)))
books_final['title'] = books_final['title'].apply(lambda x: re.sub(r'<.*?>', '', html.unescape(x)))
books_final['author'] = books_final['author'].apply(lambda x: re.sub(r'<.*?>', '', html.unescape(x)))
books_final['publisher'] = books_final['publisher'].apply(lambda x: re.sub(r'<.*?>', '', html.unescape(x)))

In [138]:
books_final.head()

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[mythology, classical]",An excellent primer on classical mythology for...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"[york, ontario, teachers, sisters, fiction...","E-book extras: ""Hero of the Humdrum"": A profil..."
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[century, influenza, 20th, history]","The fascinating, true story of the world's dea..."
3,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,"[age, basin, prehistoric, tarim, zizhiqu, ...","In the museums of Urumchi, the wind-swept regi..."
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,"[imaginary, battles, histories, wars, ]",With its in-depth reflections on the monumenta...


In [139]:
save_df_to_csv(books_final, '../../datasets/preprocess/books_final.csv')
save_df_to_pickle(books_final, '../../datasets/preprocess/books_final.pkl')

In [140]:
books_final=pd.read_pickle('../../datasets/preprocess/books_final.pkl')

In [141]:
books_final.head()

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[mythology, classical]",An excellent primer on classical mythology for...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"[york, ontario, teachers, sisters, fiction...","E-book extras: ""Hero of the Humdrum"": A profil..."
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[century, influenza, 20th, history]","The fascinating, true story of the world's dea..."
3,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,"[age, basin, prehistoric, tarim, zizhiqu, ...","In the museums of Urumchi, the wind-swept regi..."
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,"[imaginary, battles, histories, wars, ]",With its in-depth reflections on the monumenta...


## Création du dataset pour la recommandation basée sur le contenu

In [142]:
content_df=books_final.copy() #dataset pour la recommandation basée sur le contenu

In [143]:
content_df.shape

(79341, 7)

In [144]:
content_df.head()

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[mythology, classical]",An excellent primer on classical mythology for...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"[york, ontario, teachers, sisters, fiction...","E-book extras: ""Hero of the Humdrum"": A profil..."
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[century, influenza, 20th, history]","The fascinating, true story of the world's dea..."
3,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,"[age, basin, prehistoric, tarim, zizhiqu, ...","In the museums of Urumchi, the wind-swept regi..."
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,"[imaginary, battles, histories, wars, ]",With its in-depth reflections on the monumenta...


In [145]:
save_df_to_pickle(content_df, '../../datasets/reco_datasets/content_dataset.pkl')

## Création du dataset pour la recommandation collaborative

Avant tout ça, on va ajouter une colonne 'is_implicit' pour faciliter les utilisations de modèles collaboratif plus tard. Dans la page kaggle, ils disent que les notes vont de 1 à 10 et que s'il y a un 0, c'est une interaction implicite.

In [23]:
ratings_explicit=ratings[ratings['rating'] != 0].copy()
ratings_implicit=ratings[ratings['rating'] == 0].copy()

In [24]:
save_df_to_csv(ratings_implicit, '../../datasets/reco_datasets/ratings_implicit.csv')
save_df_to_pickle(ratings_implicit, '../../datasets/reco_datasets/ratings_implicit.pkl')

In [25]:
ratings_explicit.head()

Unnamed: 0,user_id,ISBN,rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6


In [26]:
ratings_implicit.head()

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
2,276727,0446520802,0
5,276733,2080674722,0
10,276746,0425115801,0
11,276746,0449006522,0


In [137]:
ratings_explicit.sort_values(by='user_id', inplace=True)
ratings_explicit.reset_index(drop=True, inplace=True)
ratings_explicit.head()

Unnamed: 0,user_id,ISBN,rating
0,8,0002005018,5
1,8,1881320189,7
2,8,1575663937,6
3,8,074322678X,5
4,8,1552041778,5


In [138]:
ratings_implicit.sort_values(by='user_id', inplace=True)
ratings_implicit.reset_index(drop=True, inplace=True)
ratings_implicit.head()

Unnamed: 0,user_id,ISBN,rating
0,2,0195153448,0
1,7,034542252,0
2,8,0425176428,0
3,8,1558746218,0
4,8,080652121X,0


In [139]:
ratings_explicit['user_id'].value_counts()

user_id
11676     8524
98391     5802
153662    1969
189835    1906
23902     1395
          ... 
84129        1
165812       1
34231        1
165826       1
41658        1
Name: count, Length: 77805, dtype: int64

On voit que certains utilisateurs n'ont intéragi qu'avec un livre alors que d'autres l'ont fait avec beaucoup. On va prendre les utilisateurs qui ont intéragi au moins autant de fois que la moyenne d'interaction.

In [140]:
moy_nb_inter=ratings_explicit['user_id'].value_counts().mean()
moy_nb_inter

5.573819163292847

In [141]:
mask_rating = ratings_explicit['user_id'].value_counts() > 10

In [142]:
user_indexes=mask_rating[mask_rating].index

user_indexes

Index([ 11676,  98391, 153662, 189835,  23902,  76499, 171118, 235105,  16795,
       248718,
       ...
       147947, 254995, 154031, 107540, 167761,  60703,   6115, 162861, 104759,
       104925],
      dtype='int64', name='user_id', length=6655)

In [143]:
ratings_tmp=ratings_explicit[ratings_explicit['user_id'].isin(user_indexes)]

In [144]:
ratings_tmp.reset_index(drop=True, inplace=True)

In [145]:
ratings_tmp

Unnamed: 0,user_id,ISBN,rating
0,183,9724509524,8
1,183,9728440170,9
2,183,9728440138,9
3,183,9728440057,7
4,183,9728440030,9
...,...,...,...
288766,278851,067161746X,7
288767,278851,0440486599,5
288768,278851,0439050006,5
288769,278851,0380865610,5


Maintenant, on va faire pareil avec le nombre de livres qui a eu une interaction.

In [146]:
moy_books_inter=ratings_tmp['ISBN'].value_counts().mean()
moy_books_inter

2.0066222404436136

In [147]:
mask_books = ratings_tmp['ISBN'].value_counts() > 10
book_indexes=mask_books[mask_books].index

In [148]:
ratings_tmp=ratings_tmp[ratings_tmp['ISBN'].isin(book_indexes)].reset_index(drop=True).copy()

ratings_tmp

Unnamed: 0,user_id,ISBN,rating
0,242,0553278223,10
1,243,0446364800,9
2,243,155874262X,5
3,243,0449006522,6
4,243,0446606383,6
...,...,...,...
67400,278843,0060517794,7
67401,278843,0060173289,9
67402,278851,0894803700,5
67403,278851,1558531025,8


In [149]:
save_df_to_csv(ratings_tmp, '../../datasets/reco_datasets/collaborative_dataset_explicit.csv')
save_df_to_pickle(ratings_tmp, '../../datasets/reco_datasets/collaborative_dataset_explicit.pkl')

In [150]:
books_final = pd.read_csv(path+"Books.csv")
books_final.rename(columns={'Book-Title':'title', 'Book-Author': 'author','Year-Of-Publication':'year', 'Publisher':'publisher'}, inplace=True)
books_final.drop(columns=['Image-URL-S', 'Image-URL-M'], inplace=True)
books_final.head(10)

Unnamed: 0,ISBN,title,author,year,publisher,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...
7,0671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks,http://images.amazon.com/images/P/0671870432.0...
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,1996,Random House,http://images.amazon.com/images/P/0679425608.0...
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner,http://images.amazon.com/images/P/074322678X.0...


In [151]:
ratings_tmp_books=ratings_tmp.merge(books_final, on='ISBN')

In [152]:
ratings_tmp_books

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,Image-URL-L
0,242,0553278223,10,The Martian Chronicles,RAY BRADBURY,1984,Spectra,http://images.amazon.com/images/P/0553278223.0...
1,243,0446364800,9,The General's Daughter,Nelson DeMille,1993,Warner Books,http://images.amazon.com/images/P/0446364800.0...
2,243,155874262X,5,Chicken Soup for the Soul (Chicken Soup for th...,Jack Canfield,1993,Health Communications,http://images.amazon.com/images/P/155874262X.0...
3,243,0449006522,6,Manhattan Hunt Club,JOHN SAUL,2002,Ballantine Books,http://images.amazon.com/images/P/0449006522.0...
4,243,0446606383,6,The Midnight Club,James Patterson,1999,Warner Vision,http://images.amazon.com/images/P/0446606383.0...
...,...,...,...,...,...,...,...,...
66260,278843,0060517794,7,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,http://images.amazon.com/images/P/0060517794.0...
66261,278843,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...
66262,278851,0894803700,5,"14,000 Things to Be Happy About",Barbara Ann Kipfer,1990,Workman Publishing,http://images.amazon.com/images/P/0894803700.0...
66263,278851,1558531025,8,Life's Little Instruction Book (Life's Little ...,H. Jackson Brown,1991,Thomas Nelson,http://images.amazon.com/images/P/1558531025.0...


On a ainsi les notes et les livres correspondants avec l'image de couverture, ça pourra servir plus tard si on veut afficher le livre.

In [153]:
save_df_to_csv(ratings_tmp_books, '../../datasets/reco_datasets/ratings_with_books.csv')
save_df_to_pickle(ratings_tmp_books, '../../datasets/reco_datasets/ratings_with_books.pkl')

In [154]:
users.head()

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [155]:
user_stats = ratings_explicit.groupby('user_id')['rating'].mean().reset_index()
user_stats.columns = ['user_id', 'user_avg_rating']

user_stats

Unnamed: 0,user_id,user_avg_rating
0,8,5.571429
1,9,6.000000
2,10,6.000000
3,12,10.000000
4,14,5.333333
...,...,...
77800,278846,8.000000
77801,278849,9.000000
77802,278851,6.500000
77803,278852,8.000000


In [156]:
users_avg_rating=users.merge(user_stats, on='user_id')

users_avg_rating

Unnamed: 0,user_id,location,age,user_avg_rating
0,8,"timmins, ontario, canada",,5.571429
1,9,"germantown, tennessee, usa",,6.000000
2,10,"albacete, wisconsin, spain",26.0,6.000000
3,12,"fort bragg, california, usa",,10.000000
4,14,"mediapolis, iowa, usa",,5.333333
...,...,...,...,...
77800,278846,"toronto, ontario, canada",23.0,8.000000
77801,278849,"georgetown, ontario, canada",23.0,9.000000
77802,278851,"dallas, texas, usa",33.0,6.500000
77803,278852,"brisbane, queensland, australia",32.0,8.000000


In [157]:
df_avg=ratings_tmp.merge(users_avg_rating, on='user_id')
df_avg

Unnamed: 0,user_id,ISBN,rating,location,age,user_avg_rating
0,242,0553278223,10,"neuffen, baden-wuerttemberg, germany",37.0,8.545455
1,243,0446364800,9,"arden hills, minnesota, usa",,7.277778
2,243,155874262X,5,"arden hills, minnesota, usa",,7.277778
3,243,0449006522,6,"arden hills, minnesota, usa",,7.277778
4,243,0446606383,6,"arden hills, minnesota, usa",,7.277778
...,...,...,...,...,...,...
67400,278843,0060517794,7,"pismo beach, california, usa",28.0,7.700000
67401,278843,0060173289,9,"pismo beach, california, usa",28.0,7.700000
67402,278851,0894803700,5,"dallas, texas, usa",33.0,6.500000
67403,278851,1558531025,8,"dallas, texas, usa",33.0,6.500000


Ici on a les informations basiques des utilisateurs encore présents dans le dataset ratings.

In [48]:
save_df_to_csv(df_avg, '../../datasets/reco_datasets/users_stats.csv')
save_df_to_pickle(df_avg, '../../datasets/reco_datasets/users_stats.pkl')

# Recommandation basée sur le contenu

## Préparation des textes (TF-IDF, W2V, GloVe)

In [85]:
content_df=pd.read_pickle('../../datasets/reco_datasets/content_dataset.pkl')

In [86]:
# On nettoie légèrement le texte pour garder sa forme principale (dont accents etc)

content_df['title_clean'] = content_df['title'].apply(nettoyage_word2vec)
content_df['author_clean'] = content_df['author'].apply(nettoyage_word2vec)
content_df['publisher_clean'] = content_df['publisher'].apply(nettoyage_word2vec)
content_df['sujets_clean'] = content_df['sujets'].apply(nettoyage_word2vec)
content_df['description_clean'] = content_df['description'].apply(nettoyage_word2vec)

In [87]:
content_df['sujets_clean'] = content_df['sujets_clean'].apply(lambda x: " ".join(x) if isinstance(x, list) else "")

In [88]:
content_df.head()

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description,title_clean,author_clean,publisher_clean,sujets_clean,description_clean
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[mythology, classical]",An excellent primer on classical mythology for...,classical mythology,mark p o morford,oxford university press,mythology classical,an excellent primer on classical mythology for...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"[ontario, teachers, young, women, new, york, f...","E-book extras: ""Hero of the Humdrum"": A profil...",clara callan,richard bruce wright,harperflamingo canada,ontario teachers young women new york fiction ...,e book extras hero of the humdrum a profile of...
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[20th, history, century, influenza]","The fascinating, true story of the world's dea...",flu the story of the great influenza pandemic ...,gina bari kolata,farrar straus giroux,th history century influenza,the fascinating true story of the world s dead...
3,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,"[antiquities, fabrics, prehistoric, textile, b...","In the museums of Urumchi, the wind-swept regi...",the mummies of urumchi,e j w barber,w w norton company,antiquities fabrics prehistoric textile bronze...,in the museums of urumchi the wind swept regio...
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,"[histories, wars, imaginary, battles]",With its in-depth reflections on the monumenta...,what if the world s foremost military historia...,robert cowley,berkley publishing group,histories wars imaginary battles,with its in depth reflections on the monumenta...


In [89]:
save_df_to_pickle(content_df, '../../datasets/reco_datasets/content_dataset_w2v.pkl')

In [90]:
content_df['description_clean'].duplicated().sum()

21381

In [91]:
content_df.drop_duplicates(subset=['description_clean'], inplace=True)

In [92]:
content_df.reset_index(drop=True, inplace=True)

In [93]:
content_df

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description,title_clean,author_clean,publisher_clean,sujets_clean,description_clean
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[mythology, classical]",An excellent primer on classical mythology for...,classical mythology,mark p o morford,oxford university press,mythology classical,an excellent primer on classical mythology for...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"[ontario, teachers, young, women, new, york, f...","E-book extras: ""Hero of the Humdrum"": A profil...",clara callan,richard bruce wright,harperflamingo canada,ontario teachers young women new york fiction ...,e book extras hero of the humdrum a profile of...
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[20th, history, century, influenza]","The fascinating, true story of the world's dea...",flu the story of the great influenza pandemic ...,gina bari kolata,farrar straus giroux,th history century influenza,the fascinating true story of the world s dead...
3,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,"[antiquities, fabrics, prehistoric, textile, b...","In the museums of Urumchi, the wind-swept regi...",the mummies of urumchi,e j w barber,w w norton company,antiquities fabrics prehistoric textile bronze...,in the museums of urumchi the wind swept regio...
4,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,"[histories, wars, imaginary, battles]",With its in-depth reflections on the monumenta...,what if the world s foremost military historia...,robert cowley,berkley publishing group,histories wars imaginary battles,with its in depth reflections on the monumenta...
...,...,...,...,...,...,...,...,...,...,...,...,...
57955,3462017942,Die Massenpsychologie des Faschismus.,Wilhelm Reich,1986,Kiepenheuer & Witsch,[],**The Mass Psychology of Fascism** (German: *D...,die massenpsychologie des faschismus,wilhelm reich,kiepenheuer witsch,,the mass psychology of fascism german die mass...
57956,020130998X,The Unified Modeling Language Reference Manual...,James Rumbaugh,1998,Addison-Wesley Professional,"[software, computer, science, uml, development]",The Unified Modeling Language (UML) has rapidl...,the unified modeling language reference manual...,james rumbaugh,addison wesley professional,software computer science uml development,the unified modeling language uml has rapidly ...
57957,0231128444,Slow Food(The Case For Taste),Carlo Petrini,2003,Columbia University Press,"[food, slow, gaststättengewerbe, ess, trinksit...",Discusses the history and spread of the Intern...,slow food the case for taste,carlo petrini,columbia university press,food slow gaststattengewerbe ess trinksitte mo...,discusses the history and spread of the intern...
57958,014002803X,Anti Death League,Kingsley Amis,1975,Viking Press,"[general, modern, fiction]","Amid preparations for Operation Apollo, secret...",anti death league,kingsley amis,viking press,general modern fiction,amid preparations for operation apollo secret ...


In [None]:
save_df_to_pickle(content_df, '../../datasets/reco_datasets/content_dataset_sans_doublon_full.pkl')
save_df_to_csv(content_df, '../../datasets/reco_datasets/content_dataset_sans_doublon_full.csv')

In [95]:
conntent_df_sans_doublon=content_df.drop(columns=['title_clean', 'author_clean', 'publisher_clean', 'sujets_clean', 'description_clean'])

In [None]:
save_df_to_pickle(conntent_df_sans_doublon, '../../datasets/reco_datasets/content_dataset_sans_doublon.pkl')
save_df_to_csv(conntent_df_sans_doublon, '../../datasets/reco_datasets/content_dataset_sans_doublon.csv')

## Utilisation de TF-IDF (pour faire une recherche dans le dataset)

In [416]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
content_df=pd.read_pickle('../../datasets/reco_datasets/content_dataset_sans_doublon_full.pkl')

In [418]:
tfidf=TfidfVectorizer(ngram_range=(1, 3))
title_matrix=tfidf.fit_transform(content_df['title_clean'])

In [419]:
title_matrix.shape

(68631, 398073)

In [420]:
from sklearn.metrics.pairwise import cosine_similarity

def search(title):
    title = nettoyage_texte(title)
    vec = tfidf.transform([title])
    similarite = cosine_similarity(vec, title_matrix).flatten()
    indices = np.argpartition(similarite, -5)[-5:]
    res=content_df.iloc[indices][::-1][['ISBN',	'title', 'author', 'year', 'publisher']]

    return res

In [421]:
search('Harry Potter')

Unnamed: 0,ISBN,title,author,year,publisher
12059,043965548X,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,2004,Scholastic Paperbacks
22877,0439554896,Harry Potter and the Chamber of Secrets (Harry...,J. K. Rowling,2003,Arthur A. Levine Books
39974,1594130027,Harry Potter and the Prisoner of Azkaban (Harr...,J. K. Rowling,2003,Large Print Press
9505,2070541304,"Harry Potter, tome 3 : Harry Potter et le Pris...",Joanne K. Rowling,1999,Gallimard
41176,2070541290,"Harry Potter, tome 2 : Harry Potter et la Cham...",Joanne K. Rowling,1999,Gallimard


In [422]:
import ipywidgets as widgets
from IPython.display import display

book_input = widgets.Text(
    value = ' ',
    description = 'Book Title',
    disabled=False
)

book_list=widgets.Output()

def on_type(data):
    with book_list:
        book_list.clear_output()
        title = data["new"] 
        if len(title) > 5:
            display(search(title))

book_input.observe(on_type, names='value')

display(book_input, book_list)

Text(value=' ', description='Book Title')

Output()

In [423]:
tfidf.get_feature_names_out()[:20]

array(['00', '00 mystery', '00 mystery at', '00 on', '00 on houseboat',
       '001', '001 double', '001 double love', '001 with',
       '001 with cdrom', '002', '002 secrets', '002 secrets sweet', '003',
       '003 playing', '003 playing wfire', '004', '004 power',
       '004 power play', '005'], dtype=object)

## Entrainement de Word2Vec

In [156]:
content_df=pd.read_pickle('../../datasets/reco_datasets/content_dataset_sans_doublon_full.pkl')

On va créer les tokens des mots des textes.

In [19]:
tqdm.pandas(desc="Tokenisation des textes")

titles=content_df['title_clean'].progress_apply(gensim.utils.simple_preprocess)
subjects_proc=content_df['sujets_clean'].progress_apply(gensim.utils.simple_preprocess)
desc_proc=content_df['description_clean'].progress_apply(gensim.utils.simple_preprocess)

Tokenisation des textes: 100%|██████████| 57960/57960 [00:01<00:00, 55562.28it/s] 
Tokenisation des textes: 100%|██████████| 57960/57960 [00:00<00:00, 96390.35it/s] 
Tokenisation des textes: 100%|██████████| 57960/57960 [00:05<00:00, 10001.95it/s]


In [20]:
#ici on ajoute une séparation pour aider le modèle à différencier le type de contenu qu'il lit

text_fusion=[]

for t, s, d in zip(titles, subjects_proc, desc_proc):
    text_fusion.append(['<TITLE>'] + t + ['<SUBJECTS>'] + s + ['<DESC>'] + d)

In [21]:
w2v=gensim.models.Word2Vec(sentences=text_fusion, vector_size=150, window=10, workers=10, min_count=5)

In [23]:
w2v.save("../../models/word2vec_model_text_fusion.model")

In [27]:
fusion_token=[get_text_vector(book, w2v) for book in tqdm(text_fusion)]


100%|██████████| 57960/57960 [00:14<00:00, 3893.81it/s]


In [28]:
fusion_token_serie=pd.Series(fusion_token)

In [29]:
fusion_token_serie.head()

0    [-0.06411282, 0.22272427, -0.44902217, -0.2710...
1    [0.16263367, 0.1321265, 0.28438684, -0.3819983...
2    [0.56824416, 0.16446283, -0.12623402, 0.009544...
3    [0.16337354, 0.15990022, -0.34316146, -0.27643...
4    [0.5493796, 0.015232784, -0.13083039, -0.12401...
dtype: object

In [30]:
X=np.vstack(fusion_token_serie)

In [31]:
with open('../../datasets/embeddings_w2v.pkl', 'wb') as f:
    pickle.dump(X, f)

## Utilisation de GloVe

In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [33]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_fusion)

In [34]:
print("Nombre de mots dans le dictionnaire :", len(tokenizer.word_index))

Nombre de mots dans le dictionnaire : 120413


In [37]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

embedding_dim = 200

glove_path = f'../../models/glove/glove.6B.{embedding_dim}d.txt'
word2vec_output_file = f'../../models/glove/glove.6B.{embedding_dim}d.word2vec.txt'


In [38]:
glove2word2vec(glove_path, word2vec_output_file)

  glove2word2vec(glove_path, word2vec_output_file)


(400000, 200)

In [39]:
glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [42]:
fusion_token_glove=[sentence_vector(book, glove) for book in tqdm(text_fusion)]


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 57960/57960 [00:14<00:00, 3976.88it/s]


In [43]:
fusion_token_glove[0]

array([ 0.30755815,  0.10562591,  0.17970185, -0.3679312 ,  0.3016246 ,
       -0.02384786, -0.457119  ,  0.15751502,  0.18150266, -0.14326148,
        0.15395962,  0.33609253, -0.00537747, -0.19615272,  0.20035541,
        0.08767714,  0.1716869 ,  0.3529484 ,  0.02919114, -0.21337925,
        0.44143882,  2.0086691 ,  0.05645609, -0.3886385 ,  0.31383142,
        0.19095032, -0.02020439, -0.20434809,  0.16358308,  0.05781369,
       -0.08092288,  0.29954487, -0.12072979, -0.29630747,  0.29977554,
       -0.218659  , -0.40371227, -0.34025308,  0.12339522, -0.25631747,
        0.03560612, -0.40788114,  0.02046514,  0.06516082, -0.10348261,
        0.16734011,  0.3651881 , -0.10212114,  0.23175912,  0.2390988 ,
       -0.36878094,  0.07891554, -0.02370057,  0.27292237,  0.14671047,
        0.0394091 ,  0.12512803,  0.28383404, -0.02325648,  0.0538092 ,
       -0.33994094, -0.01931171, -0.36128017,  0.13571976,  0.12617655,
       -0.05602182,  0.1582334 ,  0.33777148,  0.22782296, -0.14

In [44]:
fusion_token_glove_serie=pd.Series(fusion_token_glove)
fusion_token_glove_serie

0        [0.30755815, 0.10562591, 0.17970185, -0.367931...
1        [0.017109556, 0.28028125, -0.0817375, -0.22292...
2        [0.03869363, 0.04833352, -0.04343567, -0.24260...
3        [0.18290083, 0.0658717, -0.09196495, -0.213143...
4        [0.059435833, 0.19990379, -0.04997457, -0.1633...
                               ...                        
57955    [-0.035772197, 0.13296166, 0.039128445, -0.106...
57956    [0.15818211, 0.27699795, 0.07344365, -0.109747...
57957    [0.055929236, 0.1144517, -0.1385344, -0.040413...
57958    [0.0904096, -0.018649003, -0.13404374, -0.1886...
57959    [0.027832823, 0.12822556, -0.16156122, -0.1185...
Length: 57960, dtype: object

In [45]:
X_glove=np.vstack(fusion_token_glove_serie)

X_glove

array([[ 0.30755815,  0.10562591,  0.17970185, ...,  0.12969396,
         0.00673639,  0.2008201 ],
       [ 0.01710956,  0.28028125, -0.0817375 , ..., -0.00422918,
         0.15208262,  0.13644333],
       [ 0.03869363,  0.04833352, -0.04343567, ...,  0.07369021,
        -0.08564171,  0.1066957 ],
       ...,
       [ 0.05592924,  0.1144517 , -0.1385344 , ...,  0.02251309,
         0.00438884,  0.18385674],
       [ 0.0904096 , -0.018649  , -0.13404374, ...,  0.03422218,
         0.01244916,  0.06070415],
       [ 0.02783282,  0.12822556, -0.16156122, ...,  0.20009831,
        -0.14493118,  0.19678748]])

In [46]:
with open('../../datasets/embeddings_glove.pkl', 'wb') as f:
    pickle.dump(X_glove, f)

## Utilisation de BERT

### Nettoyage des textes (pour garder plus de contexte)

In [49]:
content_df=pd.read_pickle('../../datasets/reco_datasets/content_dataset_sans_doublon_full.pkl')

In [50]:
content_df['title_clean'] = content_df['title'].apply(nettoyage_leger)
content_df['author_clean'] = content_df['author'].apply(nettoyage_leger)
content_df['publisher_clean'] = content_df['publisher'].apply(nettoyage_leger)
content_df['sujets_clean'] = content_df['sujets'].apply(nettoyage_leger)
content_df['description_clean'] = content_df['description'].apply(nettoyage_leger)

In [51]:
content_df.head()

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description,title_clean,author_clean,publisher_clean,sujets_clean,description_clean
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[mythology, classical]",An excellent primer on classical mythology for...,Classical Mythology,Mark P. O. Morford,Oxford University Press,"[mythology, classical]",An excellent primer on classical mythology for...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"[ontario, teachers, young, women, new, york, f...","E-book extras: ""Hero of the Humdrum"": A profil...",Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,"[ontario, teachers, young, women, new, york, f...","E-book extras: ""Hero of the Humdrum"": A profil..."
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[20th, history, century, influenza]","The fascinating, true story of the world's dea...",Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,Farrar Straus Giroux,"[20th, history, century, influenza]","The fascinating, true story of the world's dea..."
3,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,"[antiquities, fabrics, prehistoric, textile, b...","In the museums of Urumchi, the wind-swept regi...",The Mummies of Urumchi,E. J. W. Barber,W. W. Norton & Company,"[antiquities, fabrics, prehistoric, textile, b...","In the museums of Urumchi, the wind-swept regi..."
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,"[histories, wars, imaginary, battles]",With its in-depth reflections on the monumenta...,What If?: The World's Foremost Military Histor...,Robert Cowley,Berkley Publishing Group,"[histories, wars, imaginary, battles]",With its in-depth reflections on the monumenta...


In [52]:
content_df.shape

(57960, 12)

In [53]:
titles=content_df['title_clean'].tolist()

titles[:10]

['Classical Mythology',
 'Clara Callan',
 'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
 'The Mummies of Urumchi',
 "What If?: The World's Foremost Military Historians Imagine What Might Have Been",
 'PLEADING GUILTY',
 'Goodbye to the Buttermilk Sky',
 'Mary-Kate & Ashley Switching Goals (Mary-Kate and Ashley Starring in)',
 'Flood : Mississippi 1927',
 'Wild Animus']

In [54]:
subjects=content_df['sujets_clean'].tolist()

subjects[:10]

[['mythology', 'classical'],
 ['ontario',
  'teachers',
  'young',
  'women',
  'new',
  'york',
  'fiction',
  'sisters',
  'actresses'],
 ['20th', 'history', 'century', 'influenza'],
 ['antiquities',
  'fabrics',
  'prehistoric',
  'textile',
  'bronze',
  'age',
  'zizhiqu',
  'uygur',
  'tarim',
  'xinjiang',
  'china',
  'basin',
  'mummies'],
 ['histories', 'wars', 'imaginary', 'battles'],
 [],
 ['alabama', 'farm', 'married', 'depressions', 'life', 'fiction', 'women'],
 ['soccer',
  'twins',
  'tie',
  'movie',
  'books',
  'juvenile',
  'sports',
  'recreation',
  'general',
  'grades',
  'television',
  'fiction',
  'stories',
  'ages',
  'children'],
 ['survival', 'friends', 'americans', 'best', 'floods', 'fiction', 'african'],
 ['adventure',
  'alaska',
  'national',
  'preserve',
  'elias',
  'popular',
  'american',
  'saint',
  'general',
  'wrangell',
  'spiritual',
  'park',
  'literary',
  'fiction',
  'life',
  'modern',
  'metaphysics']]

In [55]:
subjects_text = [" ".join(tokens) for tokens in subjects]

subjects_text[:10]

['mythology classical',
 'ontario teachers young women new york fiction sisters actresses',
 '20th history century influenza',
 'antiquities fabrics prehistoric textile bronze age zizhiqu uygur tarim xinjiang china basin mummies',
 'histories wars imaginary battles',
 '',
 'alabama farm married depressions life fiction women',
 'soccer twins tie movie books juvenile sports recreation general grades television fiction stories ages children',
 'survival friends americans best floods fiction african',
 'adventure alaska national preserve elias popular american saint general wrangell spiritual park literary fiction life modern metaphysics']

In [56]:
descriptions = content_df['description_clean'].tolist()

descriptions[:10]

['An excellent primer on classical mythology for readers who have little or no background of classical knowledge.',
 'E-book extras: "Hero of the Humdrum": A profile of Richard B. Wright by John Bemrose; prize citations.It is 1934, and in a small town in Canada, Clara Callan reluctantly takes leave of her sister, Nora, who is bound for the show business world of New York. Richard B. Wright\'s acclaimed novel, winner in 2001 of Canada\'s two most prestigious literary awards, is a mesmerizing tribute to friendship and sisterhood, romance and redemption.Winner in 2001 of Canada\'s two most prestigious literary awards -- the Governor General\'s Award and the Giller Prize -- Richard B. Wright\'s celebrated novel Clara Callan is the powerful, moving story of two sisters and their life-changing experiences on the eve of World War II.',
 "The fascinating, true story of the world's deadliest disease. In 1918, the Great Flu Epidemic felled the young and healthy virtually overnight. An estimated 

In [57]:
authors=content_df['author_clean'].tolist()

authors[:10]

['Mark P. O. Morford',
 'Richard Bruce Wright',
 'Gina Bari Kolata',
 'E. J. W. Barber',
 'Robert Cowley',
 'Scott Turow',
 'Julia Oliver',
 'Mary-Kate & Ashley Olsen',
 'Kathleen Duey',
 'Rich Shapero']

In [58]:
text_fusion_bert=[]

for t, s, d in zip(titles, subjects_text, descriptions):
    fusion = f"Title : {t}, Subjects : {s}, Description : {d}"
    text_fusion_bert.append(fusion)

In [59]:
text_fusion_bert[:10]

['Title : Classical Mythology, Subjects : mythology classical, Description : An excellent primer on classical mythology for readers who have little or no background of classical knowledge.',
 'Title : Clara Callan, Subjects : ontario teachers young women new york fiction sisters actresses, Description : E-book extras: "Hero of the Humdrum": A profile of Richard B. Wright by John Bemrose; prize citations.It is 1934, and in a small town in Canada, Clara Callan reluctantly takes leave of her sister, Nora, who is bound for the show business world of New York. Richard B. Wright\'s acclaimed novel, winner in 2001 of Canada\'s two most prestigious literary awards, is a mesmerizing tribute to friendship and sisterhood, romance and redemption.Winner in 2001 of Canada\'s two most prestigious literary awards -- the Governor General\'s Award and the Giller Prize -- Richard B. Wright\'s celebrated novel Clara Callan is the powerful, moving story of two sisters and their life-changing experiences on

### Ajout d'un modèle TFIDF pour la recherche de livre

In [169]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [170]:
tfidf=TfidfVectorizer(ngram_range=(1, 3))
title_matrix=tfidf.fit_transform(content_df['title_clean'])

In [171]:
def search(title):
    title = nettoyage_texte(title)
    vec = tfidf.transform([title])
    similarite = cosine_similarity(vec, title_matrix).flatten()
    indices = np.argpartition(similarite, -1)[-1:]
    res=content_df.iloc[indices[0]]['title']
    return res

In [172]:
search('Harry Potter')

'Harry Potter and the Chamber of Secrets (Book 2)'

### Utilisation d'un modèle BERT pré-entrainé

In [60]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import pickle

In [61]:
!nvidia-smi

Tue Aug 19 13:56:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.97                 Driver Version: 580.97         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   48C    P0             15W /  105W |     910MiB /   8188MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [62]:
print("GPU dispo :", torch.cuda.is_available())
print(f"Version: {torch.__version__}, GPU: {torch.cuda.is_available()}, NUM_GPU: {torch.cuda.device_count()}")

GPU dispo : True
Version: 2.7.0+cu128, GPU: True, NUM_GPU: 1


In [63]:
sbert = SentenceTransformer('all-MiniLM-L6-v2', device='cuda' if torch.cuda.is_available() else 'cpu')

In [64]:
embeddings=sbert.encode(text_fusion_bert, batch_size=64, show_progress_bar=True) #avec dataset à 210k +

Batches:   0%|          | 0/906 [00:00<?, ?it/s]

In [65]:
sbert.save('../../models/sbert/')

In [66]:
with open('../../datasets/embeddings_sbert.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

In [188]:
def affichage_reco_bert(reco):
    print(f"Recommandations pour : {content_df.iloc[index]['title']}\n")
    for title in reco: 
        print(title[0])

In [191]:
import ipywidgets as widgets
from IPython.display import display

book_input = widgets.Text(
    value = '',
    description = 'Book Title',
    disabled=False
)

book_list=widgets.Output()

def on_type(data):
    with book_list:
        book_list.clear_output()
        title = data["new"] 
        if len(title) > 1:
            display(recommend_books_by_embedding(title, embeddings, knn_bert, content_df))

book_input.observe(on_type, names='value')

display(book_input, book_list)

Text(value='', description='Book Title')

Output()

## TF-IDF + Word2Vec

In [67]:
import nltk
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# === 2. Prétraitement ===
tokenized_docs = text_fusion.copy()

In [68]:
# === 3. Entraînement Word2Vec ===
w2v_tfidf = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)

In [69]:
# === 4. TF-IDF ===
# Ici, on doit joiner les tokens pour TF-IDF
joined_docs = [" ".join(doc) for doc in tokenized_docs]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(joined_docs)
tfidf_vocab = tfidf_vectorizer.vocabulary_

In [113]:
def tfidf_w2v_vector(doc_tokens, tfidf_row_sparse, model):
    vector_size = model.vector_size
    vec = np.zeros(vector_size)
    weight_sum = 0

    for word in doc_tokens:
        if word in model.wv and word in tfidf_vocab:
            tfidf_weight = tfidf_row_sparse[0, tfidf_vocab[word]]  # accès sparse
            vec += model.wv[word] * tfidf_weight
            weight_sum += tfidf_weight

    if weight_sum == 0:
        return vec  # vecteur nul
    else:
        return vec / weight_sum

In [114]:
fusion_token_tfidf = [
    tfidf_w2v_vector(tokenized_docs[i], tfidf_matrix[i], w2v_tfidf)
    for i in tqdm(range(len(tokenized_docs)))
]

# Stockage dans une Series
fusion_token_tfidf_serie = pd.Series(fusion_token_tfidf)


100%|██████████| 57960/57960 [02:30<00:00, 385.09it/s]


In [116]:
X_tfidf=np.vstack(fusion_token_tfidf_serie)

In [117]:
with open('../../datasets/embeddings_tfidf_w2v.pkl', 'wb') as f:
    pickle.dump(X_tfidf, f)

## Comparaison des recommandations des modèles

In [118]:
#chargement des emebeddings

with open('../../datasets/embeddings_w2v.pkl', 'rb') as f:
    X_w2v = pickle.load(f)

with open('../../datasets/embeddings_glove.pkl', 'rb') as f:
    X_glove = pickle.load(f)

with open('../../datasets/embeddings_sbert.pkl', 'rb') as f:
    embeddings = pickle.load(f)

with open('../../datasets/embeddings_tfidf_w2v.pkl', 'rb') as f:
    X_tfidf = pickle.load(f)

In [119]:
from sklearn.neighbors import NearestNeighbors
from joblib import dump, load
from sklearn.preprocessing import normalize

# Création des modèles KNN

knn_w2v = NearestNeighbors(n_neighbors=11, metric='cosine')
knn_glove = NearestNeighbors(n_neighbors=11, metric='cosine')
knn_sbert = NearestNeighbors(n_neighbors=11, metric='cosine')
knn_tfidf_w2v = NearestNeighbors(n_neighbors=11, metric='cosine')

In [120]:
# Entrainement des modèles

knn_w2v.fit(X_w2v)
knn_glove.fit(X_glove)
knn_sbert.fit(embeddings)
knn_tfidf_w2v.fit(X_tfidf)

0,1,2
,n_neighbors,11
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [166]:
# Choix du livre

book_index = np.where(content_df['title']=="Harry Potter and the Chamber of Secrets (Book 2)")[0][0]
book_title = content_df.iloc[book_index]['title']

book_index

1327

In [167]:
def get_recommendations(knn_model, embeddings_matrix, book_index, top_k=10):
    distances, indices = knn_model.kneighbors([embeddings_matrix[book_index]])
    recs = []
    for i, idx in enumerate(indices[0][1:top_k+1]):  # On ignore le livre lui-même
        recs.append({
            #"rank": i+1,
            "title": content_df.iloc[idx]['title']
            #"ISBN": content_df.iloc[idx]['ISBN'],
            #"distance": distances[0][i+1],
            #"similarity": 1 - distances[0][i+1]
            #"idx": idx
        })
    return recs

In [168]:
# Récupérer les recommandations pour chaque modèle
recs_w2v = get_recommendations(knn_w2v, X_w2v, book_index)
recs_glove = get_recommendations(knn_glove, X_glove, book_index)
recs_sbert = get_recommendations(knn_sbert, embeddings, book_index)
recs_tfidf_w2v = get_recommendations(knn_tfidf_w2v, X_tfidf, book_index)

In [169]:
# Transformer en DataFrame pour comparaison
df_w2v = pd.DataFrame(recs_w2v).add_prefix("w2v_")
df_glove = pd.DataFrame(recs_glove).add_prefix("glove_")
df_sbert = pd.DataFrame(recs_sbert).add_prefix("sbert_")
df_tfidf_w2v = pd.DataFrame(recs_tfidf_w2v).add_prefix("tfidf_w2v_")

In [170]:
# Concaténer les résultats pour avoir un tableau comparatif
df_compare = pd.concat([df_w2v, df_glove, df_sbert, df_tfidf_w2v], axis=1)
print(f"Recommandations pour : {book_title}\n")
display(df_compare)

Recommandations pour : Harry Potter and the Chamber of Secrets (Book 2)



Unnamed: 0,w2v_title,glove_title,sbert_title,tfidf_w2v_title
0,Harry Potter and the Prisoner of Azkaban (Book 3),Harry Potter and the Prisoner of Azkaban (Book 3),Horrible Harry and the Dungeon (Puffin Chapter...,Horrible Harry Moves Up to Third Grade (Puffin...
1,Ghosts Don't Eat Potato Chips (Adventures of t...,The Sorcerer's Companion: A Guide to the Magic...,Harry Potter and the Prisoner of Azkaban (Book 3),Horrible Harry's Secret (Horrible Harry)
2,The Mystery of the Stolen Bike #8 (Marc Brown ...,The Gospel According to Harry Potter: Spiritua...,"Danger In The Palace (Circle Of Magic, Book 4)",Horrible Harry and the Dungeon (Puffin Chapter...
3,The Mystery of the Homeless Treasure (The Home...,Support Your Local Wizard,Horrible Harry's Secret (Horrible Harry),Horrible Harry Goes to the Moon (Horrible Harry)
4,December Secrets (Kids of the Polk Street School),"Grave Peril (The Dresden Files, Book 3)",Horrible Harry in Room 2B (A Young Puffin),The Gospel According to Harry Potter: Spiritua...
5,"Stage Invader (Wishbone Mysteries, No. 15)",TOMORROWS MAGIC,The Sorcerer's Companion: A Guide to the Magic...,Horrible Harry and the Ant Invasion
6,GRADUATION (FINAL FRIENDS 3) : GRADUATION (Fin...,The New Kid at School (Dragon Slayers' Academy...,The Classroom at the End of the Hall,Barry Trotter and the Unauthorized Parody
7,Down a Dark Hall (Laurel Leaf Books),The Books of Magic #1: The Invitation (The Boo...,Horrible Harry and the Ant Invasion,Harry Potter and the Sorcerer's Stone Movie Po...
8,Dracula Doesn't Rock and Roll (Adventures of t...,Last Term at Malory Towers,Harry and the Terrible Whatzit,Horrible Harry's Secret (Horrible Harry)
9,The Return of the Third-Grade Ghosthunters,Harry Potter and the Bible: The Menace Behind...,Support Your Local Wizard,Horrible Harry in Room 2B (A Young Puffin)


# Recommandation Collaborative

In [160]:
collaborative_df = pd.read_csv('../../datasets/reco_datasets/collaborative_dataset_explicit.csv')

In [161]:
collaborative_df

Unnamed: 0,user_id,ISBN,rating
0,242,0553278223,10
1,243,0446364800,9
2,243,155874262X,5
3,243,0449006522,6
4,243,0446606383,6
...,...,...,...
67400,278843,0060517794,7
67401,278843,0060173289,9
67402,278851,0894803700,5
67403,278851,1558531025,8


In [162]:
collaborative_df.shape

(67405, 3)

In [163]:
collaborative_df.duplicated(['user_id', 'ISBN']).sum()

0

## Matrice User-Book

In [257]:
from scipy.sparse import coo_matrix

In [258]:
collaborative_df['rating'] = pd.to_numeric(collaborative_df['rating'])

In [259]:
collaborative_df['user_index']=collaborative_df['user_id'].astype("category").cat.codes

In [260]:
collaborative_df['book_index']=collaborative_df['ISBN'].astype("category").cat.codes

In [261]:
collaborative_df.head()

Unnamed: 0,user_id,ISBN,rating,title,user_index,book_index
0,276725,034545104X,0,Flesh Tones: A Novel,82917,39799
1,276726,0155061224,5,Rites of Passage,82918,19442
2,276727,0446520802,0,The Notebook,82919,73695
3,276729,052165615X,3,Help!: Level 1,82920,86750
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,82920,86782


In [262]:
ratings_mat_coo=coo_matrix((collaborative_df['rating'], (collaborative_df['user_index'], collaborative_df['book_index'])))

In [263]:
ratings_mat_coo

<83602x216961 sparse matrix of type '<class 'numpy.int64'>'
	with 874985 stored elements in COOrdinate format>

In [264]:
ratings_mat=ratings_mat_coo.tocsr()

In [131]:
np.where(collaborative_df['user_index']==15488)

(array([166996, 166997, 166998, 166999, 167000, 167001, 167002, 167003,
        167004, 167005, 167006, 167007, 167008, 167009, 167010, 167011,
        167012, 167013, 167014, 167015, 167016, 167017, 167018, 167019,
        167020, 167021, 167022, 167023, 167024, 167025, 167026, 167027,
        167028, 167029, 167030, 167031, 167032, 167033, 167034, 167035,
        167036, 167037, 167038, 167039, 167040, 167041, 167042, 167043,
        167044, 167045, 167046, 167047, 167048, 167049, 167050, 167051,
        167052, 167053, 167054, 167055, 167056, 167057, 167058, 167059,
        167060, 167061, 167062, 167063, 167064, 167065, 167066, 167067,
        167068, 167069, 167070, 167071, 167072, 167073, 167074, 167075,
        167076, 167077, 167078, 167079, 167080, 167081, 167082, 167083,
        167084, 167085, 167086, 167087, 167088, 167089, 167090, 167091,
        167092, 167093], dtype=int64),)

In [132]:
index=15488

In [133]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(ratings_mat[index, :], ratings_mat).flatten()

In [134]:
indices = np.argpartition(sim, -15)[-15:]

indices

array([55104, 70102, 79323, 42415, 48700, 28284, 14836, 67843, 61288,
       81010, 53494, 61646, 62281, 15488, 31848], dtype=int64)

In [135]:
similar_users = collaborative_df[collaborative_df['user_index'].isin(indices)].copy()

In [136]:
similar_users = similar_users[similar_users['user_id'] != 51742]

In [138]:
similar_users.head()

Unnamed: 0,user_id,ISBN,rating,title,user_index,book_index
161520,49587,385490445,9,Alias Grace : A Novel,14836,58655
296071,94418,385490445,9,Alias Grace : A Novel,28284,58655
337007,106369,66621054,8,Longaberger: An American Success Story,31848,11146
337008,106369,449219364,9,G Is for Gumshoe (Kinsey Millhone Mysteries (P...,31848,76106
337009,106369,449219461,9,H Is for Homicide (Kinsey Millhone Mysteries (...,31848,76111


In [139]:
book_recs=similar_users.groupby('ISBN').rating.agg(['count', 'mean'])

In [140]:
book_recs

Unnamed: 0_level_0,count,mean
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
0030018897,1,9.0
0060504072,1,10.0
0060928336,1,0.0
0066621054,1,8.0
0140502343,1,10.0
0142001740,1,9.0
015600125X,1,0.0
0316693863,1,7.0
0345308980,1,0.0
0385319207,1,9.0


### Création de la matrice Livre-Utilisateur (et Utilisateur-Livre)

In [16]:
user_book_matrix_u=collaborative_df.pivot_table(index='user_id', columns='ISBN', values='rating') #pour faire une recommandation basée sur les utilisateurs

In [17]:
user_book_matrix_b=collaborative_df.pivot_table(index='title', columns='user_id', values='rating') #pour faire une recommandation basée sur les livres

In [18]:
user_book_matrix_u.head()

ISBN,0001010565,0001046438,0001047663,0001047868,000104799X,0001048082,0001048473,0001049879,0001052039,0001053744,...,999750805X,9997508769,9997511263,9997519086,9997522052,9997542738,9998560802,9998914140,9999669972,9999984584
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,,,,,,,,,,,...,,,,,,,,,,
2276,,,,,,,,,,,...,,,,,,,,,,
2766,,,,,,,,,,,...,,,,,,,,,,
2977,,,,,,,,,,,...,,,,,,,,,,
3363,,,,,,,,,,,...,,,,,,,,,,


In [19]:
user_book_matrix_b.head()

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",,,,,,,,,,,...,,,,,,,,,,
Always Have Popsicles,,,,,,,,,,,...,,,,,,,,,,
Apple Magic (The Collector's series),,,,,,,,,,,...,,,,,,,,,,
Beyond IBM: Leadership Marketing and Finance for the 1990s,,,,,,,,,,,...,,,,,,,,,,
Clifford Visita El Hospital (Clifford El Gran Perro Colorado),,,,,,,,,,,...,,,,,,,,,,


In [16]:
user_book_matrix_u.fillna(0, inplace=True)

In [17]:
user_book_matrix_b.fillna(0, inplace=True) 

In [18]:
user_book_matrix_u.shape

(899, 145592)

In [19]:
user_book_matrix_b.shape

(133737, 899)

In [161]:
matrix = user_book_matrix.loc[(user_book_matrix != 0).any(axis=1), (user_book_matrix != 0).any(axis=0)]


In [162]:
matrix

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dark Justice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Final Fantasy Anthology: Official Strategy Guide (Brady Games),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
God's Little Promise Book,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
them (Modern Library),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
why I'm like this : True Stories,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Â¡Corre, perro, corre!",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Â¿Eres tu mi mamÃ¡?/Are You My Mother?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Sparsing

In [18]:
from scipy.sparse import csr_matrix

In [19]:
user_book_sparse=csr_matrix(user_book_matrix_b)

In [20]:
user_book_sparse

<133737x899 sparse matrix of type '<class 'numpy.float64'>'
	with 103377 stored elements in Compressed Sparse Row format>

### Utilisation de KNN

In [21]:
from sklearn.neighbors import NearestNeighbors
from joblib import dump, load

In [22]:
knn_collab=NearestNeighbors(algorithm='brute')

In [23]:
knn_collab.fit(user_book_sparse)

In [24]:
dump(knn_collab, './models/knn_model_collab.joblib')

['./models/knn_model_collab.joblib']

In [25]:
book_names=user_book_matrix_b.index

In [26]:
book_names[:10]

Index([' A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)',
       ' Always Have Popsicles', ' Apple Magic (The Collector's series)',
       ' Beyond IBM: Leadership Marketing and Finance for the 1990s',
       ' Clifford Visita El Hospital (Clifford El Gran Perro Colorado)',
       ' Dark Justice', ' Deceived',
       ' Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth',
       ' Final Fantasy Anthology: Official Strategy Guide (Brady Games)',
       ' God's Little Promise Book'],
      dtype='object', name='title')

In [27]:
np.where(book_names=='Harry Potter and the Chamber of Secrets')

(array([43745], dtype=int64),)

In [28]:
index=43747

In [29]:
user_book_matrix_b.iloc[index, :]

user_id
254       9.0
2276      0.0
2766      0.0
2977      0.0
3363      0.0
         ... 
275970    9.0
277427    0.0
277478    0.0
277639    0.0
278418    0.0
Name: Harry Potter and the Chamber of Secrets (Book 2), Length: 899, dtype: float64

In [30]:
distances, reco = knn_collab.kneighbors(user_book_matrix_b.iloc[index, :].values.reshape(1, -1), n_neighbors=6)

In [31]:
distances

array([[ 0.        , 67.73129098, 69.57210488, 72.22091879, 74.67224682,
        75.03073444]])

In [32]:
reco[0]

array([ 43747,  43759,  43752,  43763, 114542,  98769], dtype=int64)

In [33]:
for i in reco[0]:
    print(user_book_matrix_b.index[i])

Harry Potter and the Chamber of Secrets (Book 2)
Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Sorcerer's Stone (Book 1)
The Secret of the Indian (Indian in the Cupboard)
The Canterbury Tales (Bantam Classics)


## Utilisation de SVD (via Surprise)

In [237]:
collaborative_df_explicit = pd.read_csv('../../datasets/reco_datasets/collaborative_dataset_explicit.csv')

In [238]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(collaborative_df_explicit[['user_id', 'ISBN', 'rating']], reader)

### GridSearch et RandomSearch pour sélection d'hyper-paramètres

In [24]:
param_grid = {
    'n_factors': [20, 50, 75, 125, 150, 225],
    'lr_all': [0.0005, 0.001, 0.002, 0.005],
    'reg_all': [0.005, 0.01, 0.015, 0.02, 0.03, 0.05],
    'n_epochs': [10, 20, 30, 50, 70, 100]
}

In [417]:
rs = RandomizedSearchCV(algo_class=SVD, param_distributions=param_grid, cv=10)

In [418]:
rs.fit(data)

In [26]:
gs = GridSearchCV(algo_class=SVD, param_grid=param_grid, cv=5, n_jobs=-1, joblib_verbose=1)

In [27]:
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 49.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 74.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 109.3min
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 123.1min finished


In [29]:
print("Meilleurs hyperparamètres (RMSE) :", gs.best_params['rmse'])
print("Score RMSE moyen :", gs.best_score['rmse'])
print("Score MAE moyen :", gs.best_score['mae'])

Meilleurs hyperparamètres (RMSE) : {'n_factors': 20, 'lr_all': 0.002, 'reg_all': 0.05, 'n_epochs': 50}
Score RMSE moyen : 1.5746239454070294
Score MAE moyen : 1.2100412026701315


In [30]:
summary_df = pd.DataFrame({'Model': 'SVD',
                           'RMSE': gs.best_score['rmse'],
                           'Model Configuration':[f'{gs.best_params["rmse"]}']})

In [31]:
summary_df.to_csv('./param/best_svd_params.csv', index=False)

### Utilisation de SVD avec les meilleurs paramètres

In [241]:
best_params_df=pd.read_csv('../../param/best_svd_params.csv')

In [242]:
best_params_df

Unnamed: 0,Model,RMSE,Model Configuration
0,SVD,1.574624,"{'n_factors': 20, 'lr_all': 0.002, 'reg_all': ..."


In [243]:
best_params_df['Model Configuration'][0]

"{'n_factors': 20, 'lr_all': 0.002, 'reg_all': 0.05, 'n_epochs': 50}"

In [244]:
trainset, testset = train_test_split(data, test_size=0.2)

In [245]:
svd = SVD(n_factors=20, lr_all=0.002, reg_all=0.05, n_epochs=50)

In [246]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cf2e35c370>

In [247]:
predictions = svd.test(testset)

In [248]:
rmse = accuracy.rmse(predictions)

RMSE: 1.5415


In [249]:
full_data = data.build_full_trainset()

In [250]:
svd.fit(full_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cf2e35c370>

In [252]:
dump.dump('../../models/svd/svd_model_best', algo=svd)

### Prédictions avec SVD

In [253]:
%load_ext autoreload
%autoreload 2

from package.reco_collab_initial import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [254]:
# Recommandation aléatoire d'un livre que l'utilisateur n'a pas noté

recommandation_collaborative(2, svd, collaborative_df_explicit, books)

'Thinner'

In [263]:
user=32773

In [264]:
ids = set(collaborative_df_explicit['user_id'])
ids

{32773,
 262151,
 163851,
 65549,
 196621,
 98322,
 262169,
 131099,
 229409,
 32802,
 163876,
 98344,
 262186,
 65584,
 196656,
 98356,
 131126,
 262208,
 131154,
 98391,
 98428,
 229502,
 65663,
 65665,
 196738,
 163973,
 98440,
 131238,
 196775,
 262311,
 65710,
 229551,
 98484,
 164027,
 32964,
 164036,
 196821,
 65752,
 164061,
 65769,
 33004,
 242,
 243,
 98547,
 131317,
 262391,
 254,
 65791,
 164096,
 229632,
 33026,
 262399,
 262407,
 98575,
 229656,
 196891,
 131363,
 229681,
 262459,
 196930,
 98628,
 229703,
 131402,
 229707,
 65877,
 98647,
 33114,
 98655,
 33124,
 229741,
 229742,
 131441,
 65913,
 98686,
 388,
 197000,
 229768,
 262541,
 131476,
 197012,
 65946,
 33179,
 98722,
 98723,
 33200,
 98741,
 131510,
 229816,
 446,
 98758,
 98783,
 98787,
 164323,
 262634,
 503,
 505,
 507,
 33283,
 131594,
 262666,
 66073,
 33307,
 131622,
 33319,
 164418,
 98887,
 66129,
 197206,
 98904,
 131675,
 131686,
 164465,
 98930,
 230013,
 638,
 98943,
 33408,
 164480,
 643,
 651,
 1

In [265]:
# Top k recommandations à un utilisateur

recommandation_collaborative_top_k(5, 32773, svd, collaborative_df_explicit, books, ids)

Unnamed: 0,user_id,ISBN,note_predite,title
2669,32773,836213319,9.334601,Dilbert: A Book of Postcards
2545,32773,439425220,9.306763,Harry Potter and the Chamber of Secrets Postca...
341,32773,743454529,9.289037,"My Sister's Keeper : A Novel (Picoult, Jodi)"
575,32773,618002235,9.189893,"The Two Towers (The Lord of the Rings, Part 2)"
2092,32773,345339738,9.174055,"The Return of the King (The Lord of the Rings,..."


In [266]:
predicted_ratings=predict_unrated_books(user, svd, get_unrated_item(user, collaborative_df_explicit))

In [267]:
predicted_ratings

Unnamed: 0,user_id,ISBN,note_predite
2669,32773,0836213319,9.334601
2545,32773,0439425220,9.306763
341,32773,0743454529,9.289037
575,32773,0618002235,9.189893
2092,32773,0345339738,9.174055
...,...,...,...
2116,32773,0802139868,5.977992
1781,32773,039914739X,5.906538
443,32773,0441359175,5.890505
2185,32773,0425182908,5.723018


## Utilisation de NMF (via Surprise)

In [150]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(collaborative_df_explicit[['user_id', 'ISBN', 'rating']], reader)

In [151]:
trainset, testset = train_test_split(data, test_size=0.2)

In [154]:
nmf_surprise=NMF(n_factors=20)
nmf_surprise.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x18896066500>

In [155]:
predictions = nmf_surprise.test(testset)
rmse = accuracy.rmse(predictions)

RMSE: 2.2408


In [156]:
param_grid_gs_nmf = {
    'n_factors': [20, 50, 70, 100, 125],
    'n_epochs': [10, 20, 30, 50, 100],
    'reg_pu':[0.02, 0.04, 0.06, 0.08, 0.1],
    'reg_qi':[0.02, 0.04, 0.06, 0.08, 0.1]
}
gs_nmf = GridSearchCV(algo_class=NMF, param_grid=param_grid_gs_nmf, cv=3, n_jobs=-1, joblib_verbose=1)

In [157]:
gs_nmf.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.9min


KeyboardInterrupt: 

In [None]:
# Meilleurs paramètres trouvés
print("Meilleurs hyperparamètres (RMSE) :", gs_nmf.best_params['rmse'])
print("Score RMSE moyen :", gs_nmf.best_score['rmse'])
print("Score MAE moyen :", gs_nmf.best_score['mae'])

In [76]:
nmf_surprise=NMF(n_factors=125, n_epochs=50)
full_data=data.build_full_trainset
nmf_surprise.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x233a8623820>

In [77]:
recommandation_collaborative_top_k(5, user, nmf_surprise, collaborative_df_explicit, books)

Unnamed: 0,ISBN,title,author,year,publisher
0,0345450736,"Between Sisters (Hannah, Kristin)",KRISTIN HANNAH,2003,Ballantine Books
1,0006375952,City of Djinns,William Dalrymple,1994,Harpercollins
2,031226481X,We Love Harry Potter!,Sharon Moore,1999,St. Martin's Press
3,0380721457,Mrs. Dewinter,Susan Hill Long,1994,Avon
4,0671552473,Him With His Foot in His Mouth,Saul Bellow,1985,Pocket


In [67]:
ids = collaborative_df_explicit['user_id'].unique()

## Comparaison entre SVD et NMF

In [None]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(collaborative_df_explicit[['user_id', 'ISBN', 'rating']], reader)

In [63]:
collaborative_df_explicit.shape

(197413, 4)

In [81]:
svd = SVD(n_factors=100, n_epochs=50) 
nmf_surprise=NMF(n_factors=100, n_epochs=50)

In [82]:
trainset, testset = train_test_split(data, test_size=0.3)

In [83]:
nmf_surprise.fit(trainset)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2b06d741060>

In [84]:
pred_svd = svd.test(testset)
pred_nmf = nmf_surprise.test(testset)

In [85]:
print('RMSE (SVD) : ', accuracy.rmse(pred_svd))
print('RMSE (NMF) : ', accuracy.rmse(pred_nmf))

RMSE: 1.6171
RMSE (SVD) :  1.617071498768352
RMSE: 2.4342
RMSE (NMF) :  2.4341947271774353


Avec les mêmes paramètres, le RMSE de NMF est plus grand d'environ 1 par rapport à SVD.

In [87]:
full_data=data.build_full_trainset()

nmf_surprise.fit(full_data)
svd.fit(full_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2b06d741060>

In [88]:
user_ids = collaborative_df_explicit['user_id'].unique()

user_ids

array([     8,     99,    183, ..., 278637, 278843, 278851], dtype=int64)

In [89]:
recommandation_collaborative_top_k(5, 8, svd, collaborative_df_explicit, books)

Unnamed: 0,ISBN,title,author,year,publisher
0,0142001740,The Secret Life of Bees,Sue Monk Kidd,2003,Penguin Books
1,193156146X,The Time Traveler's Wife,Audrey Niffenegger,2003,MacAdam/Cage Publishing
2,0743454529,"My Sister's Keeper : A Novel (Picoult, Jodi)",Jodi Picoult,2004,Atria
3,0836213319,Dilbert: A Book of Postcards,Scott Adams,1996,Andrews McMeel Pub
4,0439425220,Harry Potter and the Chamber of Secrets Postca...,J. K. Rowling,2002,Scholastic


In [90]:
recommandation_collaborative_top_k(5, 8, nmf_surprise, collaborative_df_explicit, books)

Unnamed: 0,ISBN,title,author,year,publisher
0,843760494X,Cien AÃ±os de Soledad,Gabriel GarcÃ­a MÃ¡rquez,1994,Ediciones Catedra S.A.
1,3404701704,Die Regeln des GlÃ?Â¼cks.,Dalai Lama,2001,LÃ?Â¼bbe
2,0380701006,Inside-Outside Book of New York City,Herman Wouk,1986,Avon Books
3,0373196148,Caught By Surprise (The Tale Of The Sea) (Sil...,Sandra Paul,2002,Silhouette
4,1559210354,"Imperial Woman (Buck, Pearl S. Oriental Novels...",Pearl S. Buck,1991,Moyer Bell Ltd.


On obtient donc des recommandations totalement différentes. Comme SVD a le meilleur RMSE, on va choisir ce type de modèle pour la recommandation collaborative.

## Utilisation de ALS pour la recommandation collaborative implicite

In [49]:
collaborative_df_implicit = pd.read_pickle('../../datasets/reco_datasets/ratings_implicit.pkl')

In [51]:
collaborative_df_implicit.reset_index(drop=True, inplace=True)
collaborative_df_implicit

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276727,0446520802,0
2,276733,2080674722,0
3,276746,0425115801,0
4,276746,0449006522,0
...,...,...,...
716104,276704,059032120X,0
716105,276704,0679752714,0
716106,276704,080410526X,0
716107,276704,0876044011,0


In [56]:
collaborative_df_implicit['interaction'] = collaborative_df_implicit['rating'].apply(lambda x: 1 if x == 0 else x)

In [57]:
collaborative_df_implicit.head()

Unnamed: 0,user_id,ISBN,rating,interaction
0,276725,034545104X,0,1
1,276727,0446520802,0,1
2,276733,2080674722,0,1
3,276746,0425115801,0,1
4,276746,0449006522,0,1


In [58]:
import pandas as pd
import scipy.sparse as sp

def build_user_item_matrix_implicit(ratings: pd.DataFrame):
    """
    Construit une matrice user-item pour ALS implicite à partir d'un DataFrame
    avec colonnes ['user_id', 'ISBN', 'rating'].
    
    Les ratings == 0 sont considérés comme interactions implicites et
    transformés en valeur 1. Les ratings > 0 conservent leur poids.
    """
    # Transformer les ratings en interactions implicites
    ratings['interaction'] = ratings['rating'].apply(lambda x: 1 if x == 0 else x)

    # Encodage des IDs utilisateurs et livres en indices numériques continus
    user_ids = ratings["user_id"].astype("category")
    book_ids = ratings["ISBN"].astype("category")

    ratings["user_idx"] = user_ids.cat.codes
    ratings["book_idx"] = book_ids.cat.codes

    # Construire la matrice utilisateur-item sparse
    user_item_matrix = sp.csr_matrix(
        (ratings["interaction"], (ratings["user_idx"], ratings["book_idx"]))
    )

    # Mappings pour retrouver les vrais IDs ensuite
    user_mapping = dict(enumerate(user_ids.cat.categories))
    book_mapping = dict(enumerate(book_ids.cat.categories))

    return user_item_matrix, user_mapping, book_mapping


In [101]:
collaborative_df_implicit.sort_values('user_id', inplace=True)

In [102]:
from implicit.als import AlternatingLeastSquares

# Construire la matrice
user_item_matrix, user_mapping, book_mapping = build_user_item_matrix_implicit(collaborative_df_implicit)

# Entraîner ALS
als = AlternatingLeastSquares(
    factors=50,
    regularization=0.01,
    iterations=20,
    random_state=42
)

als.fit(user_item_matrix.T)  # important: items x users




  0%|          | 0/20 [00:00<?, ?it/s]

In [105]:
# Recommandations pour un utilisateur
user_idx = 0
recommended_items, scores = als.recommend(
    user_idx,
    user_item_matrix[user_idx],
    N=5
)

# Affichage avec mapping ISBN
for book_idx, score in zip(recommended_items, scores):
    print(book_mapping[book_idx], score)


0330360434 0.0010230065
0060191414 0.0007004945
0205288626 0.00060519046
0020442106 0.0006031282
0310442508 0.00058559637


# Recommandation hybride

On a deux manières différentes de faire des recommandations. On veut maintenant les combiner pour pouvoir recommander les livres les plus pertinents à l'utilisateur. Pour ça, on va calculer un score combinant à la fois la similarité cosinus et la prediction de note. Un poids alpha sera utilisé pour pouvoir gérer l'influence des types de recommandations.

In [43]:
# chargment des modèles

# basé sur le contenu

w2v=gensim.models.Word2Vec.load("./models/word2vec_model_text_fusion.model")
sbert=SentenceTransformer('./models/sbert_gpu/')

# collaboratif

_, svd= dump.load('./models/svd/svd_model_best')

In [44]:
# chargement des datasets

content_df=pd.read_pickle('./datasets/reco_datasets/content_dataset_sans_doublon_full.pkl')
collaborative_df_explicit = pd.read_csv('./datasets/reco_datasets/collaborative_dataset_explicit.csv')

In [45]:
content_df.drop(columns=['title_clean', 'description_clean', 'sujets_clean', 'author_clean', 'publisher_clean'], inplace=True)

In [42]:
# pour la recherche des titres

titles_clean=content_df['title'].apply(nettoyage_leger)

tfidf=TfidfVectorizer(ngram_range=(1, 5), lowercase=True, stop_words='english')

title_matrix=tfidf.fit_transform(titles_clean)


def search(title):
    title = nettoyage_leger(title)
    vec = tfidf.transform([title])
    similarite = cosine_similarity(vec, title_matrix).flatten()
    indice = np.argmax(similarite)
    res=content_df.iloc[indice]['title']
    return res

KeyboardInterrupt: 

In [46]:
# chargement des embeddings

with open('./datasets/embeddings_w2v.pkl', 'rb') as f:
    embedding_w2v = pickle.load(f)

with open('./datasets/embeddings.pkl', 'rb') as f:
    embeddings_bert = pickle.load(f)

In [47]:
content_df.head()

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[classical, mythology]",An excellent primer on classical mythology for...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"[sisters, actresses, young, new, ontario, teac...","E-book extras: ""Hero of the Humdrum"": A profil..."
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"[20th, history, century, influenza]","The fascinating, true story of the world's dea..."
3,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,"[xinjiang, mummies, age, uygur, china, prehist...","In the museums of Urumchi, the wind-swept regi..."
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,"[histories, battles, wars, imaginary]",With its in-depth reflections on the monumenta...


In [48]:
collaborative_df_explicit.head()

Unnamed: 0,user_id,ISBN,rating
0,8,2005018,5
1,99,312252617,8
2,99,312261594,8
3,99,316748641,7
4,99,446677450,10


In [49]:
ids=sorted(set(collaborative_df_explicit['user_id']))
ids

[8,
 99,
 114,
 183,
 242,
 243,
 244,
 254,
 300,
 383,
 388,
 424,
 446,
 487,
 503,
 505,
 507,
 566,
 595,
 625,
 626,
 638,
 643,
 651,
 709,
 735,
 741,
 753,
 776,
 805,
 834,
 850,
 882,
 900,
 901,
 929,
 1025,
 1063,
 1075,
 1083,
 1116,
 1129,
 1131,
 1155,
 1167,
 1211,
 1248,
 1343,
 1409,
 1412,
 1424,
 1435,
 1548,
 1558,
 1585,
 1660,
 1674,
 1706,
 1725,
 1733,
 1790,
 1805,
 1848,
 1850,
 1903,
 1928,
 2010,
 2012,
 2024,
 2030,
 2033,
 2041,
 2084,
 2103,
 2110,
 2132,
 2134,
 2139,
 2179,
 2189,
 2197,
 2276,
 2287,
 2288,
 2313,
 2337,
 2358,
 2363,
 2399,
 2406,
 2411,
 2437,
 2439,
 2442,
 2462,
 2549,
 2589,
 2622,
 2685,
 2719,
 2766,
 2769,
 2789,
 2793,
 2799,
 2891,
 2954,
 2977,
 3017,
 3145,
 3167,
 3192,
 3282,
 3326,
 3329,
 3331,
 3346,
 3363,
 3371,
 3373,
 3416,
 3418,
 3462,
 3471,
 3538,
 3556,
 3569,
 3578,
 3594,
 3640,
 3669,
 3675,
 3693,
 3719,
 3757,
 3814,
 3817,
 3827,
 3840,
 3917,
 3923,
 3965,
 3983,
 4017,
 4091,
 4092,
 4098,
 4114,
 41

In [50]:
# Version basée uniquement sur les livres du dataset

def calcul_score_content(book_title, embeddings, books, n=5):

    close_title=search(book_title)
    print(close_title)
    book_index=np.where(close_title==books['title'])[0][0]

    similarity = cosine_similarity([embeddings[book_index]], embeddings)[0]
    similarity[book_index] = -1

    id_recos = np.argsort(similarity)[-n:][::-1] 
    scores = similarity[id_recos]

    return scores, id_recos

L'utilisateur pourra créer son profile avec les livres qu'il aime même s'ils ne sont pas dans le dataset. Les fonctions d'après servent à faire des recommandations basées sur le contenu même de livres non présents dans le dataset.

In [51]:
def get_vector(text, model):
    """Calcule l'embedding moyen d'un texte avec un modèle Word2Vec."""
    words = text.lower().split()
    word_vecs = [model.wv[w] for w in words if w in model.wv]
    return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(model.vector_size)

def calcul_score_content(book_title, embeddings, model, n=5):
    """
    Calcule les similarités entre un livre donné et la base `books` via Word2Vec.
    book_title : str -> titre du livre à comparer (même s'il n'est pas dans books)
    embeddings : np.array -> embeddings pré-calculés pour books
    books : DataFrame avec au moins une colonne 'title'
    model : Word2Vec entraîné sur le même espace
    n : nombre de recommandations à retourner
    """
    
    # Embedding du livre donné
    query_vec = get_vector(book_title, model)

    # Calcul des similarités
    similarity = cosine_similarity([query_vec], embeddings)[0]

    # Top n indices
    id_recos = np.argsort(similarity)[-n:][::-1] 
    scores = similarity[id_recos]

    return scores, id_recos


In [52]:
# Recommandations basées sur SBERT
def calcul_score_content_sbert(book_title, books, embeddings, model, n=5):
    # Encode le livre recherché
    book_embedding = model.encode([book_title], convert_to_numpy=True, normalize_embeddings=True)
    
    # Similarité cosinus avec tous les livres
    similarity = cosine_similarity(book_embedding, embeddings)[0]
    
    # Exclure le livre lui-même s'il est dans le dataset
    match_idx = books.index[books['title'] == book_title]
    if len(match_idx) > 0:
        similarity[match_idx[0]] = -1
    
    # Top-n recommandations
    id_recos = np.argsort(similarity)[-n:][::-1]
    scores = similarity[id_recos]
    
    return scores, id_recos


In [55]:
scores, id_recos = calcul_score_content('Harry Potter', embedding_w2v, w2v)
content_df.iloc[id_recos]

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description
52942,439193966,Horrible Harry Goes to the Moon (Horrible Harry),Suzy Kline,2001,Scholastic Inc.,"[juvenile, fictitious, schools, harry, fiction...",Inspired by his third-grade class's study of t...
1329,439064872,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,"[juvenile, fictitious, wizards, schools, magic...",Throughout the summer holidays after his first...
33591,590466372,Horrible Harry's Secret (Horrible Harry),Suzy Kline,1993,Scholastic Inc.,"[fiction, friendship, schools]","Horrible Harry falls in love with Song Lee, an..."
34401,689714769,Harry Houdini : Young Magician,Kathryn Kilby Borland,1991,Aladdin,"[literature, juvenile, artists, escape, 1926, ...",A biography of Harry Houdini concentrating on ...
25460,590424025,Harry Houdini: Master of Magic,Robert Kraske,1995,Scholastic,"[arts, books, children, juvenile, nonfiction, ...","A brief biography of Ehrich Weiss, who gained ..."


In [56]:
scores, id_recos = calcul_score_content_sbert('Harry Potter', content_df, embeddings_bert, sbert)
content_df.iloc[id_recos]

Unnamed: 0,ISBN,title,author,year,publisher,sujets,description
25239,312282540,"Harry Potter, You're the Best: A Tribute from ...",Sharon Moore,2001,St. Martin's Press,"[books, contemporary, children, juvenile, crit...",this book sucks dont read
12291,415933749,Harry Potter's World: Multidisciplinary Critic...,Elizabeth E. Heilman,2003,Falmer Press,"[books, children, criticism, magic, character,...",The Harry Potter books have become ubiquitous ...
353,767908473,The Sorcerer's Companion: A Guide to the Magic...,ALLAN ZOLA KRONZEK,2001,Broadway,"[manuals, children, criticism, etc, magic, cha...",Who was the real Nicholas Flamel? How did the ...
7225,439286239,Harry Potter and the Sorcerer's Stone Movie Po...,Inc Staff Scholastic,2001,Scholastic,"[book, pictures, poster]",Harry Potter fans of all ages will treasure th...
46041,803718314,"The Bell, the Book and the Spellbinder",Brad Strickland,1997,Dial Books,"[magic, wizards, fiction]",Johnny Dixon's best friend Fergie steals an en...


In [59]:
recommandation_collaborative_top_k(5, ids[0], svd, collaborative_df_explicit, books, ids)

Unnamed: 0,user_id,ISBN,note_predite,title
9881,8,0439425220,9.126137,Harry Potter and the Chamber of Secrets Postca...
5068,8,0743454529,9.114931,"My Sister's Keeper : A Novel (Picoult, Jodi)"
14143,8,0836213319,8.984506,Dilbert: A Book of Postcards
17854,8,067168390X,8.915766,Lonesome Dove
9203,8,0836220889,8.896413,Calvin and Hobbes


In [60]:
def recommandation_hybride(user_id, collaborative_model, content_df, collaborative_df, books, embeddings, ids, alpha=0.5, n=5):

    recos_collab = recommandation_collaborative_top_k(n, user_id, collaborative_model, collaborative_df, books, ids)
    if recos_collab is None:
        return
    
    print(recos_collab)
 
    score_collab = np.array(recos_collab['note_predite'].tolist())
    isbn_collab = recos_collab['ISBN'].to_list()

    score_content_global = np.zeros(len(content_df))

    for isbn, s_collab in tqdm(zip(isbn_collab, score_collab), total=len(isbn_collab)):
        print(isbn, s_collab)
        
        ref_index = np.where(isbn == content_df['ISBN'])[0]
        if len(ref_index) == 0:
            continue
        ref_index = ref_index[0]

        sim = cosine_similarity([embeddings[ref_index]], embeddings)[0]
        sim[ref_index] = 0

        score_content_global += sim * s_collab

    score_content_norm = (score_content_global - score_content_global.min()) / (score_content_global.max() - score_content_global.min() + 1e-8)

    score_collab_global = np.zeros(len(content_df))
    for title, s_collab in zip(isbn_collab, score_collab):
        idx = np.where(isbn == content_df['ISBN'])[0]
        if len(idx) > 0:
            score_collab_global[idx[0]] = s_collab

    score_collab_norm = (score_collab_global - score_collab_global.min()) / (score_collab_global.max() - score_collab_global.min() + 1e-8)

    score_final = alpha * score_collab_norm + (1 - alpha) * score_content_norm

    result_df = content_df.copy()
    result_df['score_hybride'] = score_final

    livres_lus = set(isbn_collab)
    result_df = result_df[~result_df['title'].isin(livres_lus)]
    result_df = result_df.sort_values('score_hybride', ascending=False)

    return result_df.head(n)

In [61]:
user_id=ids[0]

In [62]:
recos=recommandation_hybride(user_id, svd, content_df, collaborative_df_explicit, books, embeddings_bert, ids)
print(f"Recommandation pour l'utilisateur n°{user_id} :\n")

recos

       user_id        ISBN  note_predite  \
9881         8  0439425220      9.126137   
5068         8  0743454529      9.114931   
14143        8  0836213319      8.984506   
17854        8  067168390X      8.915766   
9203         8  0836220889      8.896413   

                                                   title  
9881   Harry Potter and the Chamber of Secrets Postca...  
5068        My Sister's Keeper : A Novel (Picoult, Jodi)  
14143                       Dilbert: A Book of Postcards  
17854                                      Lonesome Dove  
9203                                   Calvin and Hobbes  


 40%|████      | 2/5 [00:00<00:00, 10.63it/s]

0439425220 9.12613716850456
0743454529 9.114930743799865


100%|██████████| 5/5 [00:00<00:00, 23.10it/s]

0836213319 8.984506384093908
067168390X 8.915766497102478
0836220889 8.896412602406246
Recommandation pour l'utilisateur n°8 :






Unnamed: 0,ISBN,title,author,year,publisher,sujets,description,score_hybride
3378,553280074,Goodbye Doesn't Mean Forever,LURLENE MCDANIEL,1989,Bantam,"[leukemia, teenagers, death, fiction]",Jory Delaney has always had lots of money. But...,0.5
23640,451207009,The Language of Sisters,Amy Yurk,2002,New American Library,"[sisters, victims, disabilities, people, young...","""Ten years ago, Nicole Hunter left her trouble...",0.490701
14518,553562657,A Season for Goodbye (One Last Wish),LURLENE MCDANIEL,1995,Laurel Leaf,"[books, camps, children, teenagers, juvenile, ...","It's been a year since Katie O'Roark, Chelsea ...",0.484193
29925,440147433,Little Sister,Patricia MacDonald,1986,Dell Publishing Company,[],Beth Pearson has never much liked her younger ...,0.482792
10325,671776126,Plain Truth,Jodi Picoult,2000,Atria,"[mystery, detective, legal, infanticide, trial...",From the bestselling author of My Sister's Kee...,0.481316
