# Web scraping le site goodreads.com

### Extraction des titres auteurs couvertures et liens de chaque livre

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pymongo
import pandas as pd 
import re 
from IPython.display import display, HTML


firefox_options = webdriver.FirefoxOptions()
firefox_options.headless = True  
driver = webdriver.Firefox(options=firefox_options)


start_url = 'https://www.goodreads.com/list/show/1938.What_To_Read_Next'
additional_pages = [ 
    #'https://www.goodreads.com/list/show/1938.What_To_Read_Next?page=2',
    #'https://www.goodreads.com/list/show/1938.What_To_Read_Next?page=3',
    #'https://www.goodreads.com/list/show/1938.What_To_Read_Next?page=4',
    #'https://www.goodreads.com/list/show/1938.What_To_Read_Next?page=5'
]

def scroll_and_load_more():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(10)  

title_author_link = []


for page_url in [start_url] + additional_pages:
    driver.get(page_url)
    
    for _ in range(1):
        scroll_and_load_more()

    books = driver.find_elements(By.CSS_SELECTOR, 'tr')
    for book in books[0:]: 
        
        cover = book.find_element(By.CSS_SELECTOR,'img.bookCover').get_attribute('src')
        title = book.find_element(By.CSS_SELECTOR, 'a.bookTitle').text
        author = book.find_element(By.CSS_SELECTOR, 'a.authorName span').text
        link = book.find_element(By.CSS_SELECTOR, 'a.bookTitle').get_attribute('href')
        title_author_link.append({'Cover_URL' : cover, 'Title': title, 'Author': author, 'book_link': link})
    

df_title_author_link = pd.DataFrame(title_author_link)

def display_image(url):
    return f'<img src="{url}" width="50" height="50" />'


df_title_author_link['Cover'] = df_title_author_link['Cover_URL'].apply(display_image)

# On supprime de la colonne 'Cover_URL' 
df_title_author_link = df_title_author_link.drop('Cover_URL', axis=1)
columns = ['Cover', 'Title', 'Author','book_link']
df_title_author_link = df_title_author_link[columns]

display(HTML(df_title_author_link[columns].head().to_html(escape=False)))

driver.quit()

  firefox_options.headless = True


Unnamed: 0,Cover,Title,Author,book_link
0,,Life of Pi,Yann Martel,https://www.goodreads.com/book/show/4214.Life_of_Pi
1,,"The Hobbit (The Lord of the Rings, #0)",J.R.R. Tolkien,https://www.goodreads.com/book/show/5907.The_Hobbit
2,,"City of Bones (The Mortal Instruments, #1)",Cassandra Clare,https://www.goodreads.com/book/show/256683.City_of_Bones
3,,Wuthering Heights,Emily Brontë,https://www.goodreads.com/book/show/6185.Wuthering_Heights
4,,"Eragon (The Inheritance Cycle, #1)",Christopher Paolini,https://www.goodreads.com/book/show/113436.Eragon


### Extraction des résumés des notes et des genres à partir des liens de chaque livre

In [7]:
firefox_options = webdriver.FirefoxOptions()
firefox_options.headless = True 
driver = webdriver.Firefox(options=firefox_options)

summary_rating_genre = []

def scrape_book_page(url):
    driver.get(url)
    time.sleep(8)
    
    summary_element = driver.find_element(By.XPATH, '//span[@class="Formatted"]')
    summary = summary_element.text
   
    rating_element = driver.find_element(By.XPATH, '//div[@class="RatingStatistics__rating"]')
    rating = rating_element.text
    
    genre_element = driver.find_elements(By.XPATH,'//span[@class="BookPageMetadataSection__genreButton"]')
    genre = [element.text for element in genre_element]
    
    return {'Summary': summary, 'Rating': rating, 'Genre': genre}

for link in df_title_author_link['book_link']:
    scraped_data = scrape_book_page(link)
    summary_rating_genre.append(scraped_data)


driver.quit()


df_summary_rating_genre = pd.DataFrame(summary_rating_genre)


#print(df_summary_rating_genre)

  firefox_options.headless = True


### Nettoyage et fusion des data frames dans un seul 'total_data'

In [8]:
import pandas as pd 
import re 

df_summary_rating_genre['Summary'] = df_summary_rating_genre['Summary'].str.replace('\n', ' ').str.replace('\n\n', ' ')
df_summary_rating_genre['Genre'] = df_summary_rating_genre['Genre'].apply(lambda x: '|'.join(x))

#merge des data frames 
total_data = pd.concat([df_title_author_link, df_summary_rating_genre], axis=1, ignore_index=False)

pd.set_option('display.max_colwidth', None)
def supprimer_contenu_parentheses(texte):
    return re.sub(r'\([^)]*\)','', texte).strip()

total_data['Title'] = total_data['Title'].apply(supprimer_contenu_parentheses)


### Importation depuis la bibliothèque sur goodreads du csv qui contient les livres lus

In [9]:
import pandas as pd 
import re 

aplibrary = pd.read_csv("/Users/Samira/Downloads/goodreads_library_export_ap.csv")

#On filtre la librairie pour extraire les livres lu des pas lu
desired_data = 'read'
readbooks = aplibrary[aplibrary['Exclusive Shelf'] == desired_data]
readbooks.columns

# suppression des colonnes inutiles 
columns_to_drop = ['Book Id', 'Additional Authors', 'ISBN', 'ISBN13', 'My Rating', 'Publisher', 'Date Read', 'Date Added', 'Bookshelves', 'Bookshelves with positions', 'My Review', 'Spoiler', 'Private Notes', 'Read Count', 'Owned Copies', 'Number of Pages','Year Published', 'Original Publication Year']
columns_to_drop = [col for col in columns_to_drop if col in readbooks.columns]
readbooks.drop(columns=columns_to_drop, inplace=True)

#dataframe avec les livres lu
mybooks = pd.DataFrame(readbooks['Title'])

#suppressions des caractères entre parenthèses pour pluss de facilité a reconnaitre les livres
def supprimer_contenu_parentheses(texte):
    return re.sub(r'\([^)]*\)','', texte).strip()

mybooks['Title'] = mybooks['Title'].apply(supprimer_contenu_parentheses)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  readbooks.drop(columns=columns_to_drop, inplace=True)


### Système de recommendation et interface tkinter

In [10]:
import ipywidgets as widgets
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pandas as pd 
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk


# Initialisation des TF-IDF Vectorizer et Creation des matrices

tfidf_vectorizer_summ = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tfidf_vectorizer_genre = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tfidf_matrix_summ = tfidf_vectorizer_summ.fit_transform(total_data['Summary'])
tfidf_matrix_genre = tfidf_vectorizer_genre.fit_transform(total_data['Genre'])

# Calcul des cosine similarity entre les différents livres selon leur résumées et leur genres 

cosine_sim_summ = cosine_similarity(tfidf_matrix_summ, tfidf_matrix_summ)
cosine_sim_genre = cosine_similarity(tfidf_matrix_genre, tfidf_matrix_genre)

alpha = 0.5  
beta = 0.5  
cosine_sim_combined = alpha * cosine_sim_summ + beta * cosine_sim_genre


def get_recommendations(title, mybooks, total_data, cosine_sim=cosine_sim_combined):  

    idx = total_data[total_data['Title'].str.contains(title, case=False, regex=True)].index[0]  
    
    mybooks_titles = mybooks['Title'].values
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = [score for score in sim_scores if total_data['Title'].iloc[score[0]] not in mybooks_titles]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = [score for score in sim_scores if total_data['Title'].iloc[score[0]] != title]
    
    sim_scores = sim_scores[1:6]  
    book_indices = [i[0] for i in sim_scores]
    return total_data.iloc[book_indices]    

# Fonction pour gérer la recherche et l'affichage des recommandations

def get_books_recommendations():
    title = book_entry.get()
    recommended_books = get_recommendations(title, mybooks, total_data)
    recommended_books_df = pd.DataFrame(recommended_books)
    selected_columns = ['Title', 'Author', 'Rating', 'Genre']
    selected_recommendations_df = recommended_books_df[selected_columns]
    recommendation_output.delete(1.0, tk.END)
    recommendation_output.insert(tk.END, selected_recommendations_df.to_string(index=False, header=True, justify='right'))

# Fonction pour l'affichage des résumé

def get_book_summary():
    title = book_entry.get().strip()  
    if title:
        matching_books = total_data[total_data['Summary'].str.contains(title, case=False)]
        if not matching_books.empty:
            book_summary = matching_books.iloc[0]['Summary']
            df_summary = pd.DataFrame({'Summary': [book_summary]})
            pd.set_option('display.max_colwidth', None)
            summary_text.config(state="normal")
            summary_text.delete("1.0", tk.END)
            summary_text.insert(tk.END, df_summary.to_string(index=False, header=False, justify='right'))
            summary_text.config(state="disabled")
        else:
            summary_text.config(state="normal")
            summary_text.delete("1.0", tk.END)
            summary_text.insert(tk.END, "No matching book found.")
            summary_text.config(state="disabled")


# Création de l'interface avec Tkinter
root = tk.Tk()
root.title("Book Recommendation")

title_label = ttk.Label(root, text="Enter Book Title:")
title_label.pack()
book_entry = ttk.Entry(root)
book_entry.pack()

search_button = ttk.Button(root, text="Search", command=get_books_recommendations)
search_button.pack()

def next_page():
    recommendation_output.pack_forget()
    search_button.pack_forget()
    book_entry.pack()
    search_summary_button.pack()
    summary_text.pack()
    next_button.pack_forget()
    

def quit_application():
    root.quit()

next_button = ttk.Button(root, text="Next", command=next_page)
next_button.pack()

recommendation_output = tk.Text(root, wrap=tk.WORD, width=100, height=20)
recommendation_output.pack()

summary_text = tk.Text(root, wrap=tk.WORD, width=100, height=20)
summary_text.config(state="disabled")

search_summary_button = ttk.Button(root, text="Search Summary", command=get_book_summary)

root.mainloop()
