In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px
import plotly.graph_objects as go


# Load the book catalogue from a CSV expected next to the notebook.
# Per the info() output further down, 'average_rating' arrives as an object
# (string) column here; it is coerced to float in a later cell.
books_csv_path = "books_data.csv"
data = pd.read_csv(books_csv_path)

# Peek at the last 60 rows to sanity-check the end of the file.
data.tail(60)

Unnamed: 0,bookID,title,authors,average_rating
11067,45420,The Home Front,Brian Braithwaite/Noelle Walsh/Glyn Davies,3.4
11068,45431,Half Moon Investigations,Eoin Colfer,3.77
11069,45432,The Supernaturalist,Eoin Colfer,3.86
11070,45436,The Wish List,Eoin Colfer,3.77
11071,45438,Legend of the Worst Boy in the World,Eoin Colfer/Glenn McCoy,3.76
11072,45440,The Legend of Spud Murphy,Eoin Colfer/Glenn McCoy,3.88
11073,45442,La venganza de Opal (Artemis Fowl #4),Eoin Colfer/Ana Alcaina,4.05
11074,45444,Going Potty,Eoin Colfer,3.14
11075,45449,Artemis Fowl (Artemis Fowl #1),Eoin Colfer/Claudia Feldmann,3.84
11076,45450,La última oportunidad,Eoin Colfer,3.77


In [None]:
# Summarise column dtypes and non-null counts of the freshly loaded frame
# (this is where 'average_rating' shows up as object rather than float).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   bookID          11127 non-null  int64 
 1   title           11127 non-null  object
 2   authors         11127 non-null  object
 3   average_rating  11127 non-null  object
dtypes: int64(1), object(3)
memory usage: 347.8+ KB


In [None]:
# 'average_rating' is still an object (string) column when this cell runs in
# notebook order -- it is only converted to float in a later cell. Plotting the
# raw strings makes plotly treat the x axis as categorical, so `nbins` would
# have no effect. Coerce a numeric copy just for the plot; `data` itself is
# left untouched so the later conversion cell behaves exactly as before.
# Values that cannot be parsed become NaN.
ratings_numeric = data.assign(
    average_rating=pd.to_numeric(data['average_rating'], errors='coerce')
)

fig = px.histogram(
    ratings_numeric,
    x='average_rating',
    nbins=30,
    title='Distribution of Average Ratings',
)
fig.update_xaxes(title_text='Average Rating')
fig.update_yaxes(title_text='Frequency')
fig.show()

In [None]:
# Count rows per exact 'authors' string and keep the five most frequent.
# Co-author strings such as "Eoin Colfer/Glenn McCoy" are counted separately
# from "Eoin Colfer" because the column is not split on '/'.
author_counts = data['authors'].value_counts()
top_authors = author_counts.head(5)

# Horizontal bar chart: one bar per author string, bar length = number of rows.
fig = px.bar(
    top_authors,
    x=top_authors.values,
    y=top_authors.index,
    orientation='h',
    labels={'x': 'Number of Books', 'y': 'Author'},
    title='Number of Books per Author',
)
fig.show()

In [None]:
# Parse 'average_rating' as numbers. errors='coerce' turns any unparseable
# entry into NaN instead of raising (the info() output below shows the
# non-null count dropping from 11127 to 11123).
numeric_ratings = pd.to_numeric(data['average_rating'], errors='coerce')
data['average_rating'] = numeric_ratings

# Confirm the column is now float64.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bookID          11127 non-null  int64  
 1   title           11127 non-null  object 
 2   authors         11127 non-null  object 
 3   average_rating  11123 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 347.8+ KB


In [None]:
# Build the text used for similarity: "<title> <authors>" for every row.
# NOTE: the column is named 'bookContet' (sic); the spelling is kept because
# the TF-IDF cell below reads the column by that exact name.
title_and_authors = data['title'] + ' ' + data['authors']
data['bookContet'] = title_and_authors

data.head()

Unnamed: 0,bookID,title,authors,average_rating,bookContet
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,Harry Potter and the Half-Blood Prince (Harry ...
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,Harry Potter and the Order of the Phoenix (Har...
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,Harry Potter and the Chamber of Secrets (Harry...
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,Harry Potter and the Prisoner of Azkaban (Harr...
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,Harry Potter Boxed Set Books 1-5 (Harry Potte...


In [None]:
# Vectorise the combined title/author text with TF-IDF, dropping English
# stop words. NOTE(review): the catalogue also contains Spanish/Portuguese
# titles, whose stop words are not removed by this setting.
book_texts = data['bookContet']
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# One row per book, one column per vocabulary term.
tfidf_matrix = tfidf_vectorizer.fit_transform(book_texts)

In [None]:
# Pairwise similarity between every pair of books. TfidfVectorizer is built
# above without a `norm` argument, so its documented default (L2 row
# normalisation) applies and the plain dot product computed by linear_kernel
# equals cosine similarity.
# NOTE(review): the result is a dense n_books x n_books array (~11k x 11k
# float64 here, roughly 1 GB) -- confirm this fits in memory for larger data.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [None]:
def recommend_books(book_title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the books most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Exact value to look up in ``data['title']``. If several rows share
        the title, the first one is used as the query.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` is expected to hold the scores
        of the ``i``-th row of ``data`` against every other row.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles from ``data['title']`` ordered from most to least similar,
        with the query row itself excluded.

    Raises
    ------
    IndexError
        If no row of ``data`` has exactly this title.
    """
    # Work with the *positional* row number: cosine_sim and .iloc are both
    # positional, so this stays correct even if data's index is not 0..n-1.
    matches = np.flatnonzero((data['title'] == book_title).to_numpy())
    if matches.size == 0:
        raise IndexError(
            f"No book titled {book_title!r} found in data['title']"
        )
    idx = matches[0]

    scores = np.asarray(cosine_sim[idx]).ravel()

    # Stable descending sort: ties keep ascending row order, matching the
    # behaviour of sorted(..., reverse=True) on enumerate(scores).
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly rather than assuming it sorts first.
    # With tied scores (e.g. duplicate title/author rows, or a text made only
    # of stop words) the query is not guaranteed to be at position 0.
    ranked = ranked[ranked != idx][:max(int(top_n), 0)]

    return data['title'].iloc[ranked]

In [None]:
# Example query: the title must match a value in data['title'] exactly.
book_title = "Las Crónicas de Narnia"

# Look up the ten closest books by TF-IDF similarity and print their titles.
recommended_books = recommend_books(book_title=book_title)
print(recommended_books)

3018     The Chronicles of Narnia (The Chronicles of Na...
9495         The Lion  the Witch and the Wardrobe (Narnia)
9494     The Lion  the Witch and the Wardrobe (Chronicl...
5014     The Voyage of the “Dawn Treader” (The Chronicl...
11120       O Príncipe Caspian (As Crónicas de Nárnia  #4)
11116         A Última Batalha (As Crónicas de Nárnia  #7)
11118     O Sobrinho do Mágico (As Crónicas de Nárnia  #1)
3878                  The Lion  the Witch and the Wardrobe
11117    O Cavalo e o Seu Rapaz (As Crónicas de Nárnia ...
11119    A Viagem do Caminheiro da Alvorada (As Crónica...
Name: title, dtype: object
