In [1]:
import pandas as pd

from selenium import webdriver # Will use to automate Chrome browser to open a specified URL with Selenium.
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException

import time
import random
import numpy as np
import re
import math

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from bs4 import BeautifulSoup # Using beautiful soup to parse pages for better performance and scalability
import uuid # Create a random id
import csv # Used to output batched data

import os # Had issue with repeatedly writing data to csv during testing, importing way to clear csv upon scraper initialization

import concurrent.futures # When I was researching parallelism, this was the most optimal option for our task
from fake_useragent import UserAgent # Needed to add fake_useragent so that I could run multiple processes headlessly

import hashlib # Used for comparing review pages

class GoodreadsScraper:
    """Base GoodreadsScraper that will hold methods used in both review and book link scraping"""
    
    def load_csv(self, file_path):
        """Loads the given CSV file into a pandas DataFrame"""
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            print(f"Error reading CSV: {e}")
            return pd.DataFrame() # Return empty df if file fails to load

    def setup_driver(self):
        """Sets up the driver options and web driver service"""
        options = Options()

        # Used to increase efficiency
        options.add_argument("--blink-settings=imagesEnabled=false")
        options.add_argument("--disable-web-security")
        options.add_argument("--disable-extensions")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-infobars") 
        options.add_argument("--start-maximized") # Make us look human
        # Used to bypass bot detection
        options.add_argument(f"--user-agent={self.user_agent}")
        options.add_argument("--disable-blink-features=AutomationControlled") # Don't let chrome flag us as a bot
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False) # hide selenium automation extension
                
        # Make it run headless for efficiency
        # Keep option to run headful for testing
        if self.headless: 
            options.add_argument("--headless=new")
            options.add_argument("--window-size=1920,1080") # Set window size

        PATH = "/opt/homebrew/bin/chromedriver"
        service = Service(PATH)
        
        self.driver = webdriver.Chrome(service=service, options=options)

        # Fingerprint evasion
        # Overide webdriver, set language, make fake list of plugins
        self.apply_stealth(self.driver)
        
    def open_url(self, url):
        """Opens the given url on the driver"""
        self.driver.get(url)

    def quit_driver(self):
        """Quits the driver"""
        if self.driver:
            self.driver.quit()

    def random_sleep(self):
        """Sleep the program for a randomly rounded non-integer amount of time, will be used to avoid being rate-limited"""
        # Generate a random value between 1-2 seconds
        sleep_time = random.uniform(1, 2)
        
        # Rounding level to prevent repetition
        rounding = np.random.choice(list(range(2, 7)))
        
        time.sleep(round(sleep_time, rounding))
        
    def longer_sleep(self):
        """Sleep the program for a random interval, that is larger than polite rate limiting interval, used between major actions
        such as loading a new book url."""
        # Generate a random value between 1-2 seconds
        sleep_time = random.uniform(2, 3)

        # Rounding level to prevent repetition
        rounding = np.random.choice(list(range(2, 7)))
        
        time.sleep(round(sleep_time, rounding))
    
    def take_screenshot(self):
        """Take a screenshot of the page and save it to /screenshots/"""
        self.driver.get_screenshot_as_file("screenshots/screenshot.png")

    def apply_stealth(self, driver):
        """Injects stealth JavaScript to override fingerprinting properties"""
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            // Hide webdriver
            delete Object.getPrototypeOf(navigator).webdriver;

            // Fake Chrome runtime
            window.navigator.chrome = {
                runtime: {}
            };

            // Fake languages
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });

            // Fake plugins with proper structure
            Object.defineProperty(navigator, 'plugins', {
                get: () => {
                // Create real Plugin objects
                    function createPlugin(name, filename, description) {
                        return {
                            name,
                            filename,
                            description,
                            __proto__: Plugin.prototype
                        };
                    }

                    const fakePlugins = [
                        createPlugin("Chrome PDF Plugin", "internal-pdf-viewer", "Portable Document Format"),
                        createPlugin("Chrome PDF Viewer", "mhjfbmdgcfjbbpaeojofohoefgiehjai", ""),
                        createPlugin("Native Client", "internal-nacl-plugin", "")
                    ];

                    const pluginArray = {
                        0: fakePlugins[0],
                        1: fakePlugins[1],
                        2: fakePlugins[2],
                        length: 3,
                        item: function(index) { return this[index]; },
                        namedItem: function(name) {
                            for (let i = 0; i < this.length; i++) {
                                if (this[i].name === name) return this[i];
                            }
                        },
                        [Symbol.iterator]: function* () {
                            for (let i = 0; i < this.length; i++) {
                                yield this[i];
                            }
                        },
                        toString: () => "[object PluginArray]",
                        __proto__: PluginArray.prototype
                    };

                    return pluginArray;
                }
            });
        """
    })
    

In [2]:
class GoodreadsBookLinkScraper(GoodreadsScraper):
    """Doc strings here for GoodreadsBookLinkScraper"""
    
    def __init__(self, user_agent=None, headless=True, csv_file_path="data/goodreads_list.csv"):
        """Initializes GoodreadsBookLinkScraper with an optional user_agent, boolean if headless, csv_file_path"""
        self.user_agent = user_agent or "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
        self.headless = headless
        self.driver = None
        self.popup_handled = False
        self.initial_search = True
        self.df = self.load_csv(csv_file_path)

    # Defining functions that will clean and normalize the title/author allowing for easy fuzzy matching to books listed on Goodreads

    def clean_title(self, raw_title):
        """Cleans unneccessary characters and strings from the title"""
        # Remove edition, format, date, and extra author mentions in brackets
        title = re.sub(r"\[.*?\]", "", raw_title)  
        title = re.sub(r"\(.*?\)", "", title)
        title = re.sub(r",\s*book.*", "", title, flags=re.IGNORECASE) # remove book number
        title = re.sub(r"\s+", " ", title).strip()  # collapse multiple spaces
        return title

    def clean_author(self, raw_author):
        """Cleans unneccessary characters and strings from the author"""
        # Split by commas or 'and' to separate names
        parts = re.split(r",| and ", raw_author)

        # Filter out parts with roles
        possible_authors = [part for part in parts if not re.search(r"\(.*?\)", part)]
    
        # Use first valid name
        if possible_authors:
            author = possible_authors[0]
        else:
            author = re.sub(r"\(.*?\)", "", parts[0])

        # Clean up whitespace and punctuation
        author = re.sub(r"\s+", " ", author).strip().strip(",")
        return author

    def normalize_for_matching(self, text):
        """ Normalizes text to be easily compared against during fuzzy-matching """
        text = text.lower()
        text = text.replace("-", " ") # replace - with " "
        text = re.sub(r"[^\w\s&]", "", text)  # remove punctuation
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def add_normalized_columns(self):
        """Creates two additional columns in the DataFrame for the normalized titles and authors"""
        self.df["Normalized Title"] = self.df["Title"].apply(
            lambda x: self.normalize_for_matching(self.clean_title(x)) # Clean then normalize titles
        )
        self.df["Normalized Author"] = self.df["Author"].apply(
            lambda x: self.normalize_for_matching(self.clean_author(x)) # Clean then normalize authors
        )

    def close_pop_up_if_present(self):
        """If a pop-up is present on the search page, automatically close it"""
        try:
            dismiss_button = WebDriverWait(self.driver, 3).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.modal.modal--centered button.gr-iconButton'))
            )
            self.driver.execute_script("arguments[0].click();", dismiss_button)
            print("Pop-up closed")

        except TimeoutException:
            print("Specific pop-up not found")
    
        except Exception as e:
            print(f"Error while trying to close the specific pop-up: {e}")

    def search_normalized_book(self, normalized_title, normalized_author, just_title=False):
        """Given a normalized title, automatically inputs and searches for the book, added exponential backoff for any network erros"""
        
        # Values for exponential backoff
        max_retries = 5
        base_delay = 2

        for attempt in range(max_retries):
            try:
                # Case for intial search
                if self.initial_search:
                    search = WebDriverWait(self.driver, 5).until(
                        EC.presence_of_element_located((By.ID, 'sitesearch_field'))
                    )
                    self.initial_search = False
                else: # All subsequent searches
                    search = WebDriverWait(self.driver, 5).until(
                        EC.presence_of_element_located((By.ID, 'search_query_main'))
                    )
                    search.clear()
                    self.random_sleep()
            
                # Send search request
                if just_title: 
                    search.send_keys(normalized_title) # If the title + author didn't return any results
                else:
                    search.send_keys(f"{normalized_title} by {normalized_author}")
                search.send_keys(Keys.RETURN)
                self.random_sleep()
                return # Successfully search, return
                
            except Exception as e:
                print(f"[Attempt {attempt+1}] Error during search: {e}")
                if attempt < max_retries - 1:
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    print(f"Retrying in {delay:.2f} seconds...")
                    time.sleep(delay)
                else:
                    print("Max retries reached. Moving on.")

    def gather_search_results(self):
        """Acquire the title, author, links, and number of reviews from the given search results"""
        
        # Empty arrays to be filled with the respective data as gathered from the given search results
        titles, authors, links = [], [], []

        try:
            table = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.CLASS_NAME, "tableList"))
            )
        except TimeoutException:
            return [] # No results were found from the search
    
        found_titles = table.find_elements(By.CLASS_NAME, "bookTitle")
        found_authors = table.find_elements(By.CLASS_NAME, "authorName")
        
        for title_elem, author_elem in zip(found_titles, found_authors):
            title = title_elem.text.strip()
            titles.append(self.normalize_for_matching(title))
    
            author = author_elem.text.strip()
            authors.append(self.normalize_for_matching(author))

            link = title_elem.get_attribute("href")
            links.append(link)

        return list(zip(titles, authors, links))

    def fuzzy_match_for_link(self, normalized_title, normalized_author, search_results):
        """Fuzzy match the search results against our normalized title and author,
        return the link of the book that best matches our normalized input."""
        
        best_link = None
        best_link_reviews = None
        highest_score = 0

        for i in range(len(search_results)):
            title, author, link = search_results[i][0], search_results[i][1], search_results[i][2]
    
            title_score = fuzz.ratio(title, normalized_title)
            author_score = fuzz.ratio(author, normalized_author)

            # Combine scores, treat as equally weighted
            total_score = (title_score + author_score) / 2

            if (total_score > highest_score) :
                highest_score = total_score
                best_link = link
                
        return best_link

    def find_best_books_links(self):
        """Given a DataFrame with normalized titles and authors, finds the best book match and adds the link to the DataFrame."""

        # Initialize book_links list
        book_links = []

        for index, row in self.df.iterrows():

            try:
                normalized_title = row["Normalized Title"]
                normalized_author = row["Normalized Author"]
                
                # Search and gather results (title + author), we account for polite rate limiting in this call
                self.search_normalized_book(normalized_title, normalized_author)

                if not self.popup_handled:
                    # Try and close pop-up so long as its there
                    self.close_pop_up_if_present()
                    self.popup_handled = True

                search_results = self.gather_search_results()

                # Search and gather results (title), append to existing search_results list

                self.search_normalized_book(normalized_title, normalized_author, just_title=True)

                search_results += self.gather_search_results()
                
                # Find best matching book link and its respective review count
                book_link = self.fuzzy_match_for_link(normalized_title, normalized_author, search_results)
                
                # Add the book link to the list if it exists
                book_links.append(book_link if book_link else None)
                
                
            except Exception as e:
                print(f"Error finding book for {normalized_title} by {normalized_author}: {e}")
                book_links.append(None)
            
        # Add the links to the DataFrame
        self.df["Book Link"] = book_links

    def remove_qid_from_url(self, url):
        """Helper function to remove qid from book links, returning link to be used in remove_duplicates"""
        
        return re.sub(r"([&?]qid=[^&]*)", "", url)
        
    def remove_duplicates(self, link_column="Book Link"):
        """Remove duplicate rows which use the same book link, so there is no repeated scraping"""
        
        # Create our temporary column
        self.df["Cleaned Link"] = self.df[link_column].apply(self.remove_qid_from_url)

        # Drop the duplicate column, keeping the first instance, update the indexing
        self.df = self.df.drop_duplicates(subset="Cleaned Link", keep="first").reset_index(drop=True)

        # Drop the temporary column
        self.df = self.df.drop(columns=["Cleaned Link"])

        return self.df


In [3]:
# Code used to generate book links stored in my_goodreads_list.csv
# scraper = GoodreadsBookLinkScraper(headless=False)

# scraper.setup_driver()

# scraper.open_url("https://goodreads.com/")

# scraper.add_normalized_columns()

# # Call the function to update the DataFrame with book links
# scraper.find_best_books_links()

# num_book_links = len(scraper.df)
# print(f"Book links found: {num_book_links}")

# scraper.remove_duplicates()

# duplicates_removed = num_book_links - len(scraper.df)
# print(f"Duplicates removed: {duplicates_removed}")
# print(f"Remaing book count: {len(scraper.df)}")

# # Save the updated dataframe
# scraper.df.to_csv("data/my_goodreads_list.csv", index=False)

# scraper.quit_driver()

# Output:
# Pop-up closed
# Book links found: 471
# Duplicates removed: 3
# Remaing book count: 468

In [8]:
class GoodreadsReviewScraper(GoodreadsScraper):
    """Doc string for GoodreadsReviewScraper here"""

    def __init__(self, scraper_id=None, user_agent=None, headless=True, csv_file_path="data/my_goodreads_list.csv", output_csv=None):
        """Initializes GoodreadsReviewScraper with an optional scraper_id/user_agent, boolean if headless, csv_file_path, and optional output_csv file path"""
        self.scraper_id = scraper_id or str(uuid.uuid4()) # Set to given/autogenerate id
        
        self.user_agent = user_agent or "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
        self.headless = headless
        self.driver = None
        
        self.popup_handled = False
        self.first_review_page = True
        
        self.df = self.load_csv(csv_file_path)
        self.start_index = 0 # Index of where to begin parsing in csv_file_path
        self.end_index = len(self.df) # Index of where to stop parsing in csv_file_path
        
        self.output_csv = output_csv or f"data/reviews_{self.scraper_id}.csv" # The CSV file path for where this scraper will output all parsed review data
        
        if os.path.exists(self.output_csv): # Clears the file content if file exists
            with open(self.output_csv, 'w', encoding='utf-8') as f:
                f.truncate(0)  
        
        self.review_buffer = [] # Temporary storage for our review data
        self.BATCH_SIZE = 1000 # Number of reviews to accumulate before writing to output_csv
        self.reviews_skipped = 0 # Count of the number of reviews skipped during parsing
        self.reviews_parsed = 0 # Count of the number of reviews parsed

        self.last_page_hash = None # Used to check if we are stuck on the same review page


    def set_range(self, start_index, end_index):
        """Set the index range of the CSV for the review scraper to parse over"""
        try:
            # Validate types
            if not isinstance(start_index, int) or not isinstance(end_index, int):
                raise TypeError("start_index and end_index must be integers.")

            # Validate values
            if start_index < 0 or end_index < 0:
                raise ValueError("start_index and end_index must be non-negative integers.")

            if start_index > end_index:
                raise ValueError("start_index must be less than or equal to end_index.")

            # Set values if all checks pass
            self.start_index = start_index
            self.end_index = end_index
    
        except (TypeError, ValueError) as e:
            print(f"{self.scraper_id}: Invalid range: start_index={start_index}, end_index={end_index} | Error: {e}")
            raise
        
    def close_pop_up_if_present(self):
        """Close a the pop-up if it is present on the review webpage"""
        try:
            dismiss_button = WebDriverWait(self.driver, 3).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.Button[aria-label="Close"]')) # Different pop-up than the one encountered in during book links
            )
            self.driver.execute_script("arguments[0].click();", dismiss_button)
            self.popup_handled = True
            print(f"{self.scraper_id}: Pop-up closed")
            self.random_sleep()

        except TimeoutException:
            print(f"{self.scraper_id}: Specific pop-up not found")
    
        except Exception as e:
            print(f"{self.scraper_id}: Error while trying to close the specific pop-up: {e}")

    def fast_scroll(self):
        """Scroll down the webpage in 1-2 seconds to mimic human behavior"""
        scroll_height = self.driver.execute_script("return document.body.scrollHeight")
    
        # Target scroll range: 75% to 90% of page height
        target_scroll = random.uniform(0.75, 0.9) * scroll_height

        # Break into realistic steps
        steps = random.randint(4, 6)
        rounding = np.random.choice(list(range(2, 7)))
        pause_time = random.uniform(0.1, 0.3)

        for i in range(1, steps + 1):
            intermediate_scroll = int((i / steps) * target_scroll)
            self.driver.execute_script(f"window.scrollTo(0, {intermediate_scroll});")
            time.sleep(round(pause_time, rounding))
    
        self.random_sleep()  # Final small delay

    def parse_review_page(self, book_id, title, author):
        """Parse over the page of loaded reviews gathering specified information after the page loads"""
        try:
            # Wait for reviews to load
            try:
                WebDriverWait(self.driver, 5).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "ReviewCard"))
                )
            except TimeoutException:
                print(f"{self.scraper_id}: No reviews to load for {title} by {author}")

            # Scroll to mimic behavior
            self.fast_scroll()
            
            # Store the page source
            page_source = self.driver.page_source

            # Switch to BeatutifulSoup to parse page faster, since its now loaded
            soup = BeautifulSoup(page_source, "html.parser")
            
            try:
                # Get all the review cards
                review_cards = soup.find_all("article", class_="ReviewCard")
            except Exception as e:
                print(f"{self.scraper_id}: Error accumulating review card list: {e}")

            # List will store all data on the page
            all_review_data_on_page = []
            
            # Loop through each review on the page
            for review_card in review_cards:
                try:
                    review_data = {
                        "book_id": book_id,
                        "author": author,
                        "title": title,
                    }
                    # Parse the element's text
                    review_text = self.parse_review_text(review_card)
                
                    # Parse the element's profile link
                    review_link = self.parse_profile_link(review_card)
                
                    # Parse the element's upvotes and comments
                    likes_and_comments = self.parse_likes_and_comments(review_card)
                
                    review_upvotes = likes_and_comments[0] 
                    review_comments = likes_and_comments[1]

                    # Parse the element's rating
                    review_rating = self.parse_rating(review_card)
                
                    # Parse the element's date
                    review_date = self.parse_date(review_card)
                
                    # Parse the element's shelf tags
                    review_shelf_tags = self.parse_shelf_tags(review_card)
                
                    review_data.update({
                        "review_text": review_text,
                        "review_rating": review_rating,
                        "reviewer_ID": review_link,
                        "review_upvotes": review_upvotes,
                        "review_date": review_date,
                        "review_comments": review_comments,
                        "review_shelf_tags": review_shelf_tags
                    })

                    all_review_data_on_page.append(review_data)
                    self.reviews_parsed += 1
                
                except Exception as e:
                    print(f"{self.scraper_id}: Unable to parse review: {e}")
                    self.reviews_skipped += 1
            
            return all_review_data_on_page
        
        except Exception as e:
            print(f"{self.scraper_id}: Error loading review page for {title} by {author}: {e}")
            
    def parse_review_text(self, review_card):
        """Parse the text from the review card using BeautifulSoup"""
        try:
            text_elem = review_card.find("span", class_="Formatted")
            # Parse review text
            return text_elem.text.strip()
        except Exception:
            return None # No text found

    def parse_profile_link(self, review_card):
        """Parse the profile link from the review card using BeautifulSoup"""
        try:
            link_elem = review_card.find("div", class_="ReviewerProfile__name")
            # Parse review profile link
            return link_elem.find("a")["href"]
        except Exception:
            return None # No link found

    def parse_likes_and_comments(self, review_card):
        """Parse the likes and comments from the review card using BeautifulSoup"""
        try:
            social_footer_elem = review_card.find("footer", class_="SocialFooter")
            # Parse element for text
            footer_text = social_footer_elem.text
            
            # Use regex to find likes and comments
            likes_match = re.search(r"(\d[\d,]*)\s+likes?", footer_text)
            comments_match = re.search(r"(\d[\d,]*)\s+comments?", footer_text)

            # Set to their number if they exist otherwise return 0
            likes = int(likes_match.group(1).replace(',', '')) if likes_match else 0
            comments = int(comments_match.group(1).replace(',', '')) if comments_match else 0
            
            return likes, comments
            
        except Exception as e:
            print(f"{self.scraper_id}: Error parsing likes and comments: {e}")
            return 0, 0

    def parse_rating(self, review_card):
        """Parse the rating element from the review card using BeautifulSoup"""
        try:
            date_and_rating_elem = review_card.find("section", class_="ReviewCard__row")

            # Parse parent element for rating element
            rating_elem = date_and_rating_elem.find("span", class_="RatingStars RatingStars__small")
            
            # Get aria_label attribute
            aria_label = rating_elem["aria-label"]
            
            # Use regex to find start rating
            rating = re.search(r'\d+', aria_label).group()

            return rating
            
        except Exception as e:
            return None # No rating found

    def parse_date(self, review_card):
        """Parse the date element from the review card using BeautifulSoup"""
        try:
            date_and_rating_elem = review_card.find("section", class_="ReviewCard__row")
            
            # Parse parent element for date element
            date_elem = date_and_rating_elem.find("span", class_="Text Text__body3")

            date = date_elem.text.strip()
            
            return date
            
        except Exception as e:
            return None # No date found

    def parse_shelf_tags(self, review_card): # Only getting tags viewable without opening collapsible list
        """Parse the shelf tags from the review card using BeautifulSoup"""
        try:
            shelf_tag_elem = review_card.find("section", class_="ReviewCard__tags")
            
            # Get all the tag elements
            tag_elems = shelf_tag_elem.find_all("span", class_="Button__labelItem")
            
            # Gather text of all visible tags
            tags = [tag_elem.text for tag_elem in tag_elems if tag_elem.text != "...more"]
            
            # Join list delimit with a ", "
            shelf_tags = ", ".join(tags)

            return shelf_tags
            
        except Exception as e:
            return None # No shelf tags found

    def load_all_review_pages(self, book_id, title, author, book_url):
        """Load all pages of a given book_url and call parse_review_page on each page, storing collected data in output_csv, added exponential backoff"""
        
        # Values for exponential backoff
        max_retries = 5
        base_delay = 2
        
        for attempt in range(max_retries):
            try:
                # Open the given book's URL
                self.open_url(book_url) 
                self.longer_sleep()
                
                first_book_reload = True # Will allow the scraper to look for the correct button when loading more reviews

                while True:
                
                    # Pop-up only appears on first/second review page
                    if not first_book_reload and not self.popup_handled:
                        self.close_pop_up_if_present()

                    # Compare hashing to see if we are blocked by site from loading more (stuck on same review page)
                    current_hash = hashlib.md5(self.driver.page_source.encode("utf-8")).hexdigest()
                    
                    if current_hash == self.last_page_hash:
                        print(f"{self.scraper_id}: Duplicate page detected for {title} by {author} stopping.")
                        self.write_to_output_csv()
                        break
                         
                    self.last_page_hash = current_hash # Update last_page for later comparison
        
                    # Gather all information on the page
                    self.review_buffer.extend(self.parse_review_page(book_id, title, author))

                    # Check if we have reached our batching limit
                    if len(self.review_buffer) > self.BATCH_SIZE:
                        # Write reviews to CSV and clear review_buffer
                        self.write_to_output_csv()
                
                    # Click correct load button (if it exists)
                    try:
                        if first_book_reload:
                            load_button = WebDriverWait(self.driver, 3).until(
                                EC.element_to_be_clickable((By.CSS_SELECTOR, "div.Divider--contents a.Button--transparent"))
                            )
                            self.driver.execute_script("arguments[0].click();", load_button) # Only 1 button exists, we click the first
                            first_book_reload = False
                        else:
                            load_button = WebDriverWait(self.driver, 3).until(
                                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.Divider--contents div.Button__container button.Button--secondary"))
                            )
                            self.driver.execute_script("arguments[0].click();", load_button[1]) # Two buttons exist, we want to click the second (which is load more)

                    except ElementNotInteractableException:
                        print(f"{self.scraper_id}: Load button exists but is not interactable.")
                        break
                
                    except IndexError: # We have parsed all review pages, only the previous review load button exists
                        self.write_to_output_csv() # Writing remaining elements to CSV
                        print(f"{self.scraper_id}: Finished loading pages for: {title} by {author}")
                        break

                    except TimeoutException:
                        self.write_to_output_csv() # Writing remaining elements to CSV
                        print(f"{self.scraper_id}: No more pages to load for {title} by {author}")
                        break
                
                    except Exception as e:
                        self.write_to_output_csv() # Writing remaining elements to CSV
                        print(f"{self.scraper_id}: Error loading new page: {e}")
                        break

                    if self.first_review_page:
                        self.first_review_page = False
            
                    self.random_sleep() # Avoid rate limiting
            
            except Exception as e:
                print(f"{self.scraper_id}: [Attempt {attempt+1}] Error during search: {e}")
                if attempt < max_retries - 1:
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    print(f"{self.scraper_id}: Retrying in {delay:.2f} seconds...")
                    time.sleep(delay)
                    continue
                else:
                    print(f"{self.scraper_id}: Max retries reached. Unable to open book link for {title} by {author}. Moving on.")
                    return # Continue to next book

            return # All pages are loaded
    
    def write_to_output_csv(self):
        """Write accumulated reviews to CSV and update the output_csv"""

        fieldnames = [
            "book_id", "author", "title", "review_text", "review_rating",
            "reviewer_ID", "review_upvotes", "review_date", "review_comments", "review_shelf_tags"
        ]
        try: 
            with open(self.output_csv, mode="a", newline="", encoding="utf-8") as output:
                writer = csv.DictWriter(output, fieldnames=fieldnames)
            
                # If this is the intial write, add the fieldnames
                if output.tell() == 0:
                    writer.writeheader()

                # Write each review to the CSV
                for review in self.review_buffer:
                    writer.writerow(review)
                    
        except Exception as e:
            print(f"{self.scraper_id}: Error writing to {self.output_csv}: {e}")

        # Print our how many reviews we appended
        print(f"{self.scraper_id}: Successfully exported {len(self.review_buffer)} to {self.output_csv}")
        
        # Clear the buffer after exporting all inputs, allows us to free memory
        self.review_buffer.clear()

    def run_scraper_over_range(self):
        """Scrapes over all review pages of all books in defined range (start_index -> end_index, inclusive) from csv_file_path adding to output_csv"""
        for i in range(self.start_index, self.end_index + 1):
            book = self.df.iloc[i] # Load the book we are on
            book_id = book["Book ID"]
            title = book["Title"]
            author = book["Author"]
            book_url = book["Book Link"]

            print(f"{self.scraper_id}: Processing book {i}: {title} by {author}")

            try:
                self.load_all_review_pages(book_id, title, author, book_url)
            except Exception as e:
                print(f"{self.scraper_id}: Failed to process book at index {i}: {e}")
                continue # Try to process next book

        print(f"{self.scraper_id}: Processed {(self.end_index + 1) - self.start_index} books. Reviews skipped: {self.reviews_skipped}, Reviews Parsed: {self.reviews_parsed}")

In [5]:
# Test on index 59 (The Return of the King) (~22,000 reviews) 
# scraper = GoodreadsReviewScraper(scraper_id="test", headless=False)

# scraper.setup_driver()

# scraper.set_range(59, 59)

# scraper.run_scraper_over_range()

# scraper.quit_driver()

# Testing fingerprinting
# ua = UserAgent()

# scraper = GoodreadsReviewScraper(user_agent=ua.random, scraper_id="test", headless=False)

# scraper.setup_driver()

# scraper.open_url("https://bot.sannysoft.com/")

# scraper.random_sleep()

# scraper.take_screenshot()

# scraper.quit_driver()

In [6]:
def divide_workload(total_books, num_workers):
    """Divides the number of books into somewhat equal index ranges based on the number of workers"""
    range_size = total_books // num_workers
    remainder = total_books % num_workers

    ranges = []
    start = 0 # 0 based index
    for i in range(num_workers):
        extra = 1 if i < remainder else 0 # Distribute remainder amongst first few workers
        end = start + range_size + extra - 1 
        ranges.append((start, end))
        start = end + 1

    return ranges

def run_goodreads_review_scraper(start_index, end_index, scraper_id, user_agent=None):
    """Worker function to run an instance of GoodreadsReviewScraper"""
    print(f"Starting scraper {scraper_id} for range {start_index}-{end_index}")
    try:
        if scraper_id == "worker_1" : scraper = GoodreadsReviewScraper(user_agent=user_agent, scraper_id=scraper_id, headless=False)
        else: 
            scraper = GoodreadsReviewScraper(user_agent=user_agent, scraper_id=scraper_id)
        scraper.setup_driver()
        scraper.set_range(start_index, end_index)
        scraper.run_scraper_over_range()
        scraper.quit_driver()
        print(f"Finished scraping for scraper {scraper_id} for range {start_index}-{end_index}")
    except Exception as e:
        print(f"Error in scraper {scraper_id} for range {start_index}-{end_index}: {e}")

def launch_parallel_review_scrapers(total_books=468, num_workers=8):
    ranges = divide_workload(total_books, num_workers)
    ua = UserAgent()

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for i, (start, end) in enumerate(ranges):
            user_agent = ua.random
            scraper_id = f"worker_{i}"
            futures.append(executor.submit(run_goodreads_review_scraper, start, end, scraper_id, user_agent))

        concurrent.futures.wait(futures)
        print(f"Finished scraping {total_books} books")

In [7]:
launch_parallel_review_scrapers()

Starting scraper worker_0 for range 0-58Starting scraper worker_1 for range 59-117

Starting scraper worker_2 for range 118-176
Starting scraper worker_3 for range 177-235
Starting scraper worker_4 for range 236-293
Starting scraper worker_5 for range 294-351
Starting scraper worker_6 for range 352-409
Starting scraper worker_7 for range 410-467
worker_6: Processing book 352: The Light Princess by George MacDonaldworker_0: Processing book 0: Seven Husbands of Evelyn Hugo by Taylor Jenkins Reid
worker_7: Processing book 410: The Little Lady of the Big House by Jack London
worker_2: Processing book 118: Darius the Great Is Not Okay by Adib Khorram

worker_5: Processing book 294: Jane Eyre by Charlotte Bronte
worker_4: Processing book 236: How Sweet It Is by Dylan Newton
worker_3: Processing book 177: Panic by Lauren Oliver
worker_1: Processing book 59: The Return of the King by J.R.R. Tolkien
worker_3: Pop-up closed
worker_6: Pop-up closed
worker_5: Pop-up closed
worker_4: Pop-up closed


In [20]:
import glob 

def export_to_main_csv():
    """Export all collected data from workers into a combined reviews_output.csv"""

    # Get all worker csv files
    worker_outputs = glob.glob("data/reviews_worker_*")

    # Load all csv files into DataFrames
    dfs = [pd.read_csv(worker_output) for worker_output in worker_outputs]

    # Combine all DataFrames into one
    reviews_output = pd.concat(dfs, ignore_index=True)

    # Export our final DF to a csv
    reviews_output.to_csv("data/reviews_output.csv", index=False)

    print(f"Successfully exported reviews_output to csv. {len(reviews_)} total reviews.")

In [21]:
export_to_main_csv()

Successfully exported reviews_output to csv. 1360671 total reviews.


In [22]:
def view_main_csv_in_chunks():
    """Inorder to view our file, and make sure it is correct must load in chunks to not exceed memory limits"""
    chunk_size = 250000  # 250000 rows
    chunks = pd.read_csv("data/reviews_output.csv", chunksize=chunk_size)

    for chunk in chunks:
        print(chunk.head())  # Load each chunk

In [23]:
view_main_csv_in_chunks()

   book_id       author                             title  \
0   100161  Jack London  The Little Lady of the Big House   
1   100161  Jack London  The Little Lady of the Big House   
2   100161  Jack London  The Little Lady of the Big House   
3   100161  Jack London  The Little Lady of the Big House   
4   100161  Jack London  The Little Lady of the Big House   

                                         review_text  review_rating  \
0        Ενδιαφέρουσα ιστορία μ΄ ένα απρόβλητο τέλος            4.0   
1  Love triangles stories are always interesting ...            2.0   
2  Джек Лондон - "Малката стопанка на голямата къ...            NaN   
3  3.5Интересното беше, че книгата ми хареса, мак...            4.0   
4  I’ve always admired Jack London for the person...            2.0   

                                         reviewer_ID  review_upvotes  \
0  https://www.goodreads.com/user/show/33767336-s...              15   
1  https://www.goodreads.com/user/show/33049679-j...          