# Système de recommandation de livres basé sur le contenu

# Cahier des charges

Projet : Système de recommandation de livres

Objectif : 
- Prédire les livres qu'un utilisateur pourrait aimer en fonction de ses préférences (recommandation collaborative) et des similarités avec d'autres livres (recommandation basée sur le contenu)

Techniques Utilisées :
- Filtrage collaboratif (basé-mémoire/basé-modèle)
- Recommandation basée sur le contenu (TF-IDF, embeddings : Word2Vec, GloVe, etc)
- Modèles : KNN, SVD, NMF, etc

Étapes :

Dataset
- Utiliser : https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset
- Tester d’autres datasets de recommandation de livres

Prétraitement
- Nettoyage des données (valeurs manquantes, doublons)
- Feature engineering (genres, auteurs, notes moyennes)
- Vectorisation des textes (titres, descriptions) avec TF-IDF, Word2Vec, GloVe, etc

In [34]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD

In [35]:
path = "./datasets/"

books = pd.read_csv(path+"Books.csv")
users= pd.read_csv(path+"Users.csv")
ratings = pd.read_csv(path+"Ratings.csv")

## Tests sur les données

In [12]:
books.rename(columns={'Book-Title':'title', 'Book-Author': 'author'}, inplace=True)
books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], inplace=True)
books.head(10)

Unnamed: 0,ISBN,title,author,Year-Of-Publication,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group
7,0671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,1996,Random House
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner


In [5]:
def show_image(url, width=100):
    """ Affiche les images du dataset"""
    return f'<img src="{url}" width="{width}">'

def url_to_img(df, func):
    return df.style.format({'Image-URL-S':func, 'Image-URL-M':func, 'Image-URL-L':func}, escape=False)

In [13]:
mask=np.where(books.isna())
books_nan=books.iloc[mask[0]]

books_nan

Unnamed: 0,ISBN,title,author,Year-Of-Publication,Publisher
118033,0751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing


In [14]:
books.fillna('Unknown', inplace=True)

In [15]:
books.iloc[mask[0]]

Unnamed: 0,ISBN,title,author,Year-Of-Publication,Publisher
118033,0751352497,A+ Quiz Masters:01 Earth,Unknown,1999,Dorling Kindersley
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,Unknown
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,Unknown
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,Unknown,1995,Edinburgh Financial Publishing


lignes a verif :

- 118033
- 128890
- 129037
- 187689
- 209540
- 220731
- 221678

In [16]:
books.duplicated().sum()

0

In [17]:
ratings.rename(columns={'Book-Rating':'rating'}, inplace=True)
ratings.head()

Unnamed: 0,User-ID,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [18]:
ratings.isna().sum()

User-ID    0
ISBN       0
rating     0
dtype: int64

In [19]:
ratings.duplicated().sum()

0

In [20]:
ratings.head()

Unnamed: 0,User-ID,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [21]:
ratings['ISBN'].value_counts()

ISBN
0971880107     2502
0316666343     1295
0385504209      883
0060928336      732
0312195516      723
               ... 
1568656386        1
1568656408        1
1569551553        1
1570081808        1
05162443314       1
Name: count, Length: 340556, dtype: int64

In [22]:
X=ratings['User-ID'].value_counts() > 200
X

User-ID
11676      True
198711     True
153662     True
98391      True
35859      True
          ...  
116180    False
116166    False
116154    False
116137    False
276723    False
Name: count, Length: 105283, dtype: bool

In [23]:
X[X].shape

(899,)

In [24]:
Y=X[X].index
Y

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727, 268622,
       188951],
      dtype='int64', name='User-ID', length=899)

In [25]:
ratings_test=ratings[ratings['User-ID'].isin(Y)]
ratings_test.head()

Unnamed: 0,User-ID,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


## Ajout des notes moyennes de chaque livre

In [26]:
book_rating_stats = ratings.groupby('ISBN')['rating'].mean().reset_index()
book_rating_stats

Unnamed: 0,ISBN,rating
0,0330299891,3.0
1,0375404120,1.5
2,0586045007,0.0
3,9022906116,3.5
4,9032803328,0.0
...,...,...
340551,cn113107,0.0
340552,ooo7156103,7.0
340553,§423350229,0.0
340554,´3499128624,8.0


In [27]:
np.where(book_rating_stats['ISBN']=='0000913154')

(array([107], dtype=int64),)

In [28]:
book_rating_stats.iloc[107]

ISBN      0000913154
rating           8.0
Name: 107, dtype: object

In [29]:
books_w_ratings=book_rating_stats.merge(books, on='ISBN')
books_w_ratings.head()

Unnamed: 0,ISBN,rating,title,author,Year-Of-Publication,Publisher
0,0000913154,8.0,The Way Things Work: An Illustrated Encycloped...,C. van Amerongen (translator),1967,Simon &amp; Schuster
1,0001010565,0.0,Mog's Christmas,Judith Kerr,1992,Collins
2,0001046438,9.0,Liar,Stephen Fry,0,Harpercollins Uk
3,0001046713,0.0,Twopence to Cross the Mersey,Helen Forrester,1992,HarperCollins Publishers
4,000104687X,6.0,"T.S. Eliot Reading \The Wasteland\"" and Other ...",T.S. Eliot,1993,HarperCollins Publishers


## Dataset User

In [30]:
users.isna().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [31]:
users.duplicated().sum()

0

## Ajout des genres en utilisant un autre dataset + propagation via KNN

## Ajout des genres et descriptions en utilisant un autre dataset

In [4]:
import requests

def get_book_genres(title):
    url = f"https://openlibrary.org/search.json?title={title}"
    response = requests.get(url)
    data = response.json()
    if data['docs']:
        book = data['docs'][0]
        genres = book.get('subject', [])
        return genres
    return []

# Exemple
genres = get_book_genres("Harry Potter")
print(genres)


[]


In [None]:
import requests

API_URL = "https://api.hardcover.app/v1/graphql"
API_TOKEN = ""  # Remplace par ton token

def test_api_connection():
    query = """
    query {
      me {
        id
        username
      }
    }
    """
    headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json"
    }
    response = requests.post(API_URL, json={"query": query}, headers=headers)
    if response.status_code == 200:
        data = response.json()
        if "errors" in data:
            print("Erreur dans la requête GraphQL :", data["errors"])
        else:
            print("Connexion réussie ! Infos utilisateur :")
            print(data["data"]["me"])
    else:
        print(f"Erreur HTTP {response.status_code}: {response.text}")

test_api_connection()


Connexion réussie ! Infos utilisateur :
[{'id': 39049, 'username': 'redyydm'}]


In [None]:
import requests
import json

API_URL = "https://api.hardcover.app/v1/graphql"
API_TOKEN = ""  # Remplace par ton token

def get_book_genres(slug):
    query = """
    query {
      search(
        query: "$title",
        query_type: "Book",
        per_page: 5,
        page: 1
      ) {
        book {
          title
        }
      }
    }
    """
    headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json"
    }
    variables = {"title":slug}

    response = requests.post(API_URL, json={"query": query}, headers=headers)
    if response.status_code == 200:
        data = response.json()
        with open('fichier_export.json', 'w', encoding='utf-8') as f:
          json.dump(data, f, ensure_ascii=False, indent=4)
        docs=data['data']['search']['results']['hits']
        for doc in docs:
            print(doc['document']['genres'])

        
    else:
        print(f"Erreur API: {response.status_code} - {response.text}")
        return None, []
        
get_book_genres("les miserables")


KeyError: 'data'

In [None]:
def search_book_slug(title):
    query = """
    query ($search: String!) {
      search(query: $search, first: 5) {
        edges {
          node {
            slug
            title
            author_names
          }
        }
      }
    }
    """
    variables = {"search": title}
    headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json"
    }

    response = requests.post(API_URL, json={"query": query, "variables": variables}, headers=headers)
    if response.status_code == 200:
        data = response.json()
        results = data.get("data", {}).get("search", {}).get("edges", [])
        # Retourne le premier slug trouvé
        if results:
            return results[0]["node"]["slug"]
        else:
            print(f"Aucun résultat pour '{title}'")
            return None
    else:
        print(f"Erreur API: {response.status_code} - {response.text}")
        return None

## Ajout des genres et descriptions des livres via API google books (si possible)

In [4]:
import requests
import time

def get_book_info(isbn, api_key=None):
    base_url = 'https://www.googleapis.com/books/v1/volumes'
    params = {
        'q': f'isbn:{isbn}',
    }
    if api_key:
        params['key'] = api_key

    response = requests.get(base_url, params=params)
    if response.status_code != 200:
        return None

    data = response.json()
    items = data.get('items')
    if not items:
            return {
                 'title': None,
                'author': None,
                'categories': None,
                'description': None,
            }

    volume_info = items[0]['volumeInfo']
    return {
        'title': volume_info.get('title'),
        'author': volume_info.get('authors'),
        'categories': volume_info.get('categories'),
        'description': volume_info.get('description'),
    }

In [None]:
books_df = pd.read_csv(path+'Books.csv')
books_df.rename(columns={'Book-Title':'title', 'Book-Author': 'author'}, inplace=True)
books_df.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], inplace=True)
enriched_data = []

for idx, row in books_df.head(10).iterrows():
    isbn = row['ISBN']
    info = get_book_info(isbn, '') 
    enriched_data.append(info)
    time.sleep(0.1) 

enriched_df = pd.DataFrame(enriched_data)

enriched_df

Unnamed: 0,0
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


In [33]:
books_df

Unnamed: 0,ISBN,title,author,Year-Of-Publication,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [34]:
books_df_enriched=books_df.merge(enriched_df, on='title')
books_df_enriched

Unnamed: 0,ISBN,title,author_x,Year-Of-Publication,Publisher,author_y,categories,description
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"[Mark P. O. Morford, Robert J. Lenardon]",[Social Science],Provides an introduction to classical myths pl...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,[Richard Bruce Wright],[Actresses],"In a small town in Canada, Clara Callan reluct..."
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,[Carlo D'Este],[History],"Here, for the first time in paperback, is an o..."
3,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,[Amy Tan],[Fiction],An absorbing narrative of Winnie Louie's life.
4,0771074670,Nights Below Station Street,David Adams Richards,1988,Emblem Editions,[David Adams Richards],[Fiction],Another story based in the fictional rural tow...
...,...,...,...,...,...,...,...,...
203,0872203166,The Prince,Niccolo Machiavelli,1995,Hackett Publishing Company,[Niccolò Machiavelli],[Political ethics],"Notes, a 2-page map, an index, and an altogeth..."
204,0226500446,The Prince,Niccolo Machiavelli,1998,Press,[Niccolò Machiavelli],[Political ethics],"Notes, a 2-page map, an index, and an altogeth..."
205,0897110080,Black Beauty,Anna Sewell,1975,Worlds Great Classics,"[Anna Sewell, Ron Huron]",[Juvenile Fiction],A horse in nineteenth-century England recounts...
206,0895310619,Black Beauty,Anna Sewell,1981,Sharon Pubns,"[Anna Sewell, Ron Huron]",[Juvenile Fiction],A horse in nineteenth-century England recounts...


### API openlibrary

In [None]:
import pandas as pd
import requests
import time

# Utiliser les 10 premiers ISBNs pour test
isbns = books['ISBN'].dropna().unique()

# Stocker les genres extraits
extracted_genres = []

for isbn in isbns:
    try:
        url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=details&format=json"
        response = requests.get(url)
        data = response.json()

        key = f"ISBN:{isbn}"
        genres = data.get(key, {}).get('details', {}).get('subjects', [])
        
        # Extraire les noms des genres
        genres_list = [g['name'] if isinstance(g, dict) else g for g in genres]

        extracted_genres.append({
            'ISBN': isbn,
            'genres_from_api': ', '.join(genres_list)
        })

        # Pause pour éviter d’être bloqué par l’API
        time.sleep(1)

    except Exception as e:
        extracted_genres.append({'ISBN': isbn, 'genres_from_api': None})
        print(f"Erreur avec ISBN {isbn} : {e}")

# Convertir en DataFrame et enregistrer
genres_df = pd.DataFrame(extracted_genres)
genres_df.to_csv("genres_from_openlibrary.csv", index=False)

print("✅ Extraction terminée !")
