### Imports

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException, StaleElementReferenceException, InvalidArgumentException
import json
import logging
import time

import pandas as pd 

## Data Collection

### Collection of Book Page URLs

The book page URLs were collected first and saved to a JSON file; the detailed book information was then scraped from those URLs. This two-step approach is more efficient and easier to troubleshoot: if an error occurs during the data extraction phase, it can be fixed and the extraction rerun without repeating the initial URL collection.


In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Goodreads Choice Awards 2023 page: the entry point of the crawl and the page
# from which the links to the previous years' award pages are read.
AWARDS_2023_URL = "https://www.goodreads.com/choiceawards/best-books-2023?ref=nav_brws_gca"

# Only the first N entries of `years` are scraped: 2023 (added manually) plus
# the first N-1 previous-year links, in the order the page lists them.
YEARS_TO_SCRAPE = 3

# Initialize the WebDriver
driver = webdriver.Chrome()

years = []   # one {'Year', 'url'} dict per awards year
genres = []  # one {'Year', 'Genre', 'Genre_link'} dict per category of a scraped year
books = []   # one {'Year', 'Genre', 'Votes', 'Book link'} dict per book entry

# try/finally guarantees the browser is closed even when a page load or an
# element lookup raises part-way through the crawl.
try:
    # Open the Goodreads Choice Awards 2023 page
    driver.get(AWARDS_2023_URL)

    # Wait for the page to load fully by waiting for a known element on the page
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.previousYears__link"))
        )
    except Exception as e:
        logger.error("An error occurred while waiting for the page to load: %s", e)
        # exit() is a helper meant for the interactive shell; raising SystemExit
        # stops the run and still lets the finally clause close the browser.
        raise SystemExit(1)

    # Add the 2023 URL manually
    years.append({'Year': '2023', 'url': AWARDS_2023_URL})

    # Find the previous years' links; the year is the first word of the link text
    for link in driver.find_elements(By.CSS_SELECTOR, "a.previousYears__link"):
        link_url = link.get_attribute("href")
        link_words = link.text.split()
        if not link_words:
            # An empty link text would otherwise raise IndexError on [0].
            logger.warning("Skipping a previous-year link without text: %s", link_url)
            continue
        years.append({'Year': link_words[0], 'url': link_url})

    # Collect the name and URL of every category listed for each selected year
    for year_entry in years[:YEARS_TO_SCRAPE]:
        driver.get(year_entry['url'])
        for category in driver.find_elements(By.CLASS_NAME, "category"):
            genre_name = category.find_element(By.TAG_NAME, "h4").text
            genre_link = category.find_element(By.TAG_NAME, "a").get_attribute("href")
            genres.append({'Year': year_entry['Year'], 'Genre': genre_name, 'Genre_link': genre_link})

    # Visit every category page and record the vote count and link of each book entry
    for genre_entry in genres:
        driver.get(genre_entry['Genre_link'])

        # Find all book entries
        book_entries = driver.find_elements(By.CSS_SELECTOR, "div.inlineblock.pollAnswer.resultShown, div.inlineblock.pollAnswer.resultShown.pollAnswer--last")

        # Extract necessary information from each book entry
        for entry in book_entries:
            votes = entry.find_element(By.CSS_SELECTOR, "strong.uitext.result").text
            book_link = entry.find_element(By.CSS_SELECTOR, "a.pollAnswer__bookLink").get_attribute("href")
            books.append({'Year': genre_entry['Year'], 'Genre': genre_entry['Genre'], 'Votes': votes, 'Book link': book_link})
finally:
    # Close the driver
    driver.quit()

# Save book links to a file (only reached when the crawl completed)
with open('book_urls_final.json', 'w') as f:
    json.dump(books, f, indent=4)

### Scraping Book Information and Initial 120 Reviews

Selenium was used to scrape reviews from Goodreads.com because the site loads its content dynamically. Clicks had to be automated to navigate the pages and reach the required content.

In [None]:
# Function to read and parse JSON data from a file
def read_json_file(file_path):
    """Read a JSON file and return the parsed data.

    The file is decoded as UTF-8 explicitly. Without an explicit encoding,
    ``open`` falls back to the platform default (for example cp1252 on
    Windows), which garbles or rejects non-ASCII characters in UTF-8 JSON.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to retry finding an element
def get_element_with_retry(driver, by, value, retries=3, delay=2):
    """Wait for an element to be present, retrying a limited number of times.

    Each attempt waits up to ``delay`` seconds for the element located by
    ``(by, value)``. After a failed attempt that is not the last one, the
    function sleeps ``delay`` seconds before trying again.

    Args:
        driver: WebDriver instance used for the lookup.
        by: Locator strategy (a ``By`` constant).
        value: Locator value matching the strategy.
        retries: Maximum number of attempts.
        delay: Seconds used both as the wait timeout of one attempt and as
            the pause between attempts.

    Returns:
        The located element, or None when every attempt failed (or when
        ``retries`` allows no attempt at all).
    """
    # Count down so that 0 marks the final attempt.
    for attempts_left in range(retries - 1, -1, -1):
        try:
            element_present = EC.presence_of_element_located((by, value))
            return WebDriverWait(driver, delay).until(element_present)
        except (StaleElementReferenceException, NoSuchElementException, TimeoutException):
            if attempts_left == 0:
                return None
            time.sleep(delay)
    return None

file_path = r'C:\Users\filepath\book_urls_3_years.json'  # A generic path has been used in this presentation file
json_data = read_json_file(file_path)

# Extracting book links and other details from JSON data.
# Two layouts are accepted:
#   * a flat list of book dicts, which is what the URL collection step writes
#     with json.dump(books, ...);
#   * a list of lists of book dicts (the layout the original nested loop assumed).
# Iterating a flat list as if it were nested walks over the *keys* of each
# dict, and the substring test "Book link" in key then appends key strings
# instead of books, so both layouts are handled explicitly here.
book_details = []
for item in json_data:
    candidates = [item] if isinstance(item, dict) else item
    for book in candidates:
        # Keep only real book records that carry a link to scrape.
        if isinstance(book, dict) and "Book link" in book:
            book_details.append(book)

# Initialize the WebDriver
driver = webdriver.Chrome()

# Every element is obtained through an explicit wait: the sign-in flow moves
# across several pages, and looking an element up immediately after a click
# or navigation can fail simply because the next page has not rendered yet.
# If any step fails, the browser is closed before the error is re-raised so
# that a failed login does not leave a Chrome session running.
try:
    login_wait = WebDriverWait(driver, 10)

    # Open the Goodreads homepage
    driver.get("https://www.goodreads.com/")

    # Find the "Sign In" link and click it
    sign_in_link = login_wait.until(
        EC.element_to_be_clickable((By.XPATH, "//a[@class='gr-hyperlink' and @href='/user/sign_in']"))
    )
    sign_in_link.click()

    # Find the "Sign in with email" link and click it
    sign_in_with_email_link = login_wait.until(
        EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, 'https://www.goodreads.com/ap/signin') and .//button[contains(text(), 'Sign in with email')]]"))
    )
    sign_in_with_email_link.click()

    # Find the email input field and enter your email
    email_input = login_wait.until(EC.visibility_of_element_located((By.ID, 'ap_email')))
    email_input.send_keys('example@gmail.com')  # A generic email address has been used in this presentation file

    # Find the password input field and enter your password
    password_input = login_wait.until(EC.visibility_of_element_located((By.ID, 'ap_password')))
    password_input.send_keys('password')  # A generic password has been used in this presentation file

    # Find the "Sign in" button and click it
    sign_in_button = login_wait.until(EC.element_to_be_clickable((By.ID, 'signInSubmit')))
    sign_in_button.click()

    # Wait until the homepage is loaded after signing in
    login_wait.until(
        EC.presence_of_element_located((By.XPATH, "//a[contains(@href, '/user/show')]"))
    )
except Exception:
    driver.quit()
    raise

book_info = []

# XPath of the genre labels on a book page; kept in one place because it is
# needed again when a label goes stale while its text is being read.
GENRE_LABEL_XPATH = "//span[contains(@class, 'BookPageMetadataSection__genreButton')]//span[contains(@class, 'Button__labelItem')]"


def _read_review_text(driver, review_element):
    """Return the text of one review section, or "NaN" if it cannot be read.

    If the section contains a "show the hidden review" button, the button is
    scrolled into view and clicked first so the text becomes available.
    """
    try:
        try:
            spoiler_button = review_element.find_element(By.XPATH, ".//div[contains(@class, 'Button__container')]/button[@aria-label='Activate this button to show the hidden review.']")
        except NoSuchElementException:
            spoiler_button = None

        if spoiler_button is not None:
            driver.execute_script("arguments[0].scrollIntoView(true);", spoiler_button)
            time.sleep(1)
            driver.execute_script("arguments[0].click();", spoiler_button)
            time.sleep(1)

        review_text_element = review_element.find_element(By.XPATH, ".//div[contains(@class, 'TruncatedContent__text--large')]/span[contains(@class, 'Formatted')]")
        return review_text_element.text
    except (NoSuchElementException, StaleElementReferenceException):
        # A missing text node or a section re-rendered while it was being
        # read is recorded as a placeholder instead of aborting the run.
        return "NaN"


def get_reviews(start=0):
    """Append review texts from the current page to the module-level ``reviews``.

    Uses the module-level ``driver``. Called without arguments it reads every
    review section on the page.

    Args:
        start: Number of leading review sections to skip because they were
            already collected by an earlier call on the same page.

    Returns:
        The total number of review sections found on the page, which can be
        passed as ``start`` to the next call.
    """
    review_elements = driver.find_elements(By.XPATH, "//section[contains(@class, 'ReviewText')]")
    for review_element in review_elements[start:]:
        reviews.append(_read_review_text(driver, review_element))
    return len(review_elements)


for book in book_details:
    book_link = book["Book link"]
    try:
        driver.get(book_link)
    except (TimeoutException, InvalidArgumentException):
        # InvalidArgumentException covers a missing or malformed link.
        print(f"Page not found for link: {book_link}")
        continue

    # Find the book title
    book_title_element = get_element_with_retry(driver, By.XPATH, "//h1[contains(@class, 'Text__title1') and @data-testid='bookTitle']")
    book_title = book_title_element.text if book_title_element else "NaN"

    # Retrieve year and votes directly from the book dictionary
    book_year = book.get("Year", "NaN")
    votes = book.get("Votes", "NaN")
    genre = book.get("Genre", "NaN")

    # Find the author name
    book_author_element = get_element_with_retry(driver, By.XPATH, "//a[contains(@class, 'ContributorLink')]//span[@data-testid='name']")
    book_author = book_author_element.text if book_author_element else "NaN"

    # Find the book rating
    book_rating_element = get_element_with_retry(driver, By.XPATH, "//div[contains(@class, 'RatingStatistics__rating')]")
    book_rating = book_rating_element.text if book_rating_element else "NaN"

    # Find the rating count
    rating_count_element = get_element_with_retry(driver, By.XPATH, "//span[@data-testid='ratingsCount']")
    rating_count = rating_count_element.text if rating_count_element else "NaN"

    # Find the genres
    genres = []
    genre_elements = driver.find_elements(By.XPATH, GENRE_LABEL_XPATH)
    for position, genre_element in enumerate(genre_elements):
        try:
            genres.append(genre_element.text)
        except StaleElementReferenceException:
            # Re-locate the label at the SAME position; looking up only the
            # first match would record the first genre in place of this one.
            try:
                genres.append(driver.find_elements(By.XPATH, GENRE_LABEL_XPATH)[position].text)
            except (IndexError, StaleElementReferenceException):
                genres.append("NaN")

    # Collect reviews
    reviews = []

    try:
        # Ensure the Filters button is visible and interactable
        filters_button = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, "//button[contains(., 'Filters')]"))
        )

        # Scroll to the Filters button to ensure it's fully in view
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", filters_button)

        # Use JavaScript to click the Filters button to bypass any overlays or other issues
        driver.execute_script("arguments[0].click();", filters_button)

        # Select the English language option
        english_radio_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//input[@id='en']"))
        )
        driver.execute_script("arguments[0].click();", english_radio_button)

        # Apply the filter
        apply_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Apply')]"))
        )
        driver.execute_script("arguments[0].click();", apply_button)

    except Exception as e:
        # Best effort: reviews are still collected when the filter cannot be set.
        print(f"An error occurred while setting filters: {str(e)}")

    # Reviews visible on the book page itself
    try:
        seen_count = get_reviews()
    except Exception as e:
        print(f"An error occurred while collecting reviews: {str(e)}")
        reviews = ["NaN"]
        seen_count = 0

    # Follow the "show more reviews and ratings" link, if there is one
    try:
        more_reviews_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[contains(@href, '/book/show/') and contains(@aria-label, 'Tap to show more reviews and ratings')]"))
        )
        url_before_click = driver.current_url
        driver.execute_script("arguments[0].scrollIntoView(true);", more_reviews_button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", more_reviews_button)
        time.sleep(2)
        # If the click led to another page, all review sections there are new
        # elements; if the page did not change, skip the ones already read.
        # NOTE(review): reviews on the new page may repeat those already taken
        # from the book page - confirm and drop one of the two passes if so.
        if driver.current_url != url_before_click:
            seen_count = 0
        seen_count = get_reviews(seen_count)
    except (NoSuchElementException, ElementClickInterceptedException, TimeoutException, StaleElementReferenceException):
        pass

    # Load up to three further batches of reviews
    click_count = 0
    while click_count < 3:
        try:
            show_more_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'Button__container')]//button[contains(@class, 'Button--secondary') and .//span[@data-testid='loadMore']]"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            time.sleep(1)
            driver.execute_script("arguments[0].click();", show_more_button)
            time.sleep(2)
            # Read only the sections added by this click; re-reading the whole
            # page after every click would append earlier reviews again.
            # Assumes the button appends to the list rather than replacing it
            # (suggested by the 'loadMore' test id) - TODO confirm.
            seen_count = get_reviews(seen_count)
            click_count += 1
        except (NoSuchElementException, ElementClickInterceptedException, TimeoutException, StaleElementReferenceException):
            break

    book_data = {
        'Year': book_year,
        'Genre': genre,
        'Votes': votes,
        'Title': book_title,
        'Author': book_author,
        'Overall Rating': book_rating,
        'Ratings': rating_count,
        'Genres': genres,
        'Reviews': reviews,
    }

    book_info.append(book_data)

# Save the results in a JSON file. The browser is closed afterwards even if
# writing fails, so a failed save does not leave a Chrome session running.
try:
    with open('book_reviews_finale.json', 'w', encoding='utf-8') as f:
        json.dump(book_info, f, ensure_ascii=False, indent=4)
finally:
    # Close the WebDriver
    driver.quit()
