In [None]:
import regex as re
import requests as rq
import time
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import openpyxl
import pandas as pd
import os
import json

### Reading the links of books from the Excel.

In [15]:
links_df = pd.read_excel("Goodreads Book Links.xlsx", engine="openpyxl", dtype=str)
links_df = links_df.iloc[2:,:]

In [38]:
# Function to check if a value is a valid URL
def is_link(value):
    if isinstance(value, str):
        return re.match(r'^(https?://\S+)$', value)  # Match HTTP/HTTPS links
    return False

Creating a list of links from the DataFrame

In [17]:
links = [cell for col in links_df.columns for cell in links_df[col] if is_link(cell)]
links

['https://www.goodreads.com/book/show/62334530-none-of-this-is-true?ref=trend_2_7',
 'https://www.goodreads.com/book/show/210690230-what-the-wife-knew',
 'https://www.goodreads.com/book/show/210365385-the-resurrectionist',
 'https://www.goodreads.com/book/show/203578707-what-the-woods-took',
 'https://www.goodreads.com/book/show/210331628-i-might-be-in-trouble',
 'https://www.goodreads.com/book/show/203578712-trouble-island',
 'https://www.goodreads.com/book/show/210366432-after-the-ocean',
 'https://www.goodreads.com/book/show/209594864-havoc',
 'https://www.goodreads.com/book/show/203579269-booked-for-murder',
 'https://www.goodreads.com/book/show/203579103-alter-ego',
 'https://www.goodreads.com/book/show/209455852-the-rivals',
 'https://www.goodreads.com/book/show/195790797-nobody-s-hero',
 'https://www.goodreads.com/book/show/157896130-agent-vs-assassin',
 'https://www.goodreads.com/book/show/204253885-fireman',
 'https://www.goodreads.com/book/show/209455840-bellevue',
 'https://

This function handles the number followers written in different format to numeric form

In [52]:
def convert_followers(text):
    """Extracts and converts followers count to an integer using regex."""
    match = re.search(r'([\d,.]+)([KM]?)', text.strip(), re.IGNORECASE)  # Extracts number + suffix

    if match:
        number = float(match.group(1).replace(',', ''))  # Convert to float (handles 77.7)
        suffix = match.group(2).upper()  # K or M

        if suffix == "K":
            number *= 1000  # Convert K to actual number
        elif suffix == "M":
            number *= 1000000  # Convert M to actual number

        return int(number)  # Convert final number to integer
    
    return 0  # Return 0 if no number found

The ```save_to_json``` function is triggered everytime we fetch the all the required detalis from the web page and the JSON file is updated simultaneously. 

In [53]:
def save_to_json(data):
    file_path = "books.json"

    if os.path.exists(file_path):  # If file already exists
        try:
            # Read existing data
            with open(file_path, "r", encoding="utf-8") as existing_data:
                book_data = json.load(existing_data)

            # Ensure the file contains a list
            if not isinstance(book_data, list):
                book_data = []

        except (json.JSONDecodeError, TypeError):
            book_data = []  # Reset to empty list if file is corrupted

    else:  # If file does not exist, create it
        book_data = []
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(book_data, file, ensure_ascii=False, indent=4)
        print("📄 File 'books.json' was missing, so it was created!")

    # Append new data to the list
    book_data.append(data)

    # Write the updated data back to the JSON file
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(book_data, file, ensure_ascii=False, indent=4)

    print("✅ New book added successfully!")
    return True

### Scraping the Book details 

The book details are scrapped using ```BeautifulSoup```.

In [92]:
def get_book_details(page):
    soup = bs(page.text, 'html')

    details = dict()

    # Book Name
    details['book_name'] = soup.find('div', class_ = "BookPageTitleSection__title").h1.text

    # Author Name
    details['author_name'] = soup.find('span', class_ = "ContributorLink__name").text

    # Number of books
    details['number_of_books_by_author'] = int(re.sub(r'[^\d]', "",soup.find('div', class_ = "FeaturedPerson__infoPrimary").find('span', class_='Text Text__body3 Text__subdued').text[:5]))

    # Number of Followers
    try:
        followers_element = soup.find('div', class_="FeaturedPerson__infoPrimary").find('span', class_='u-dot-before')
        details['number_of_followers'] = convert_followers(followers_element.text) if followers_element else 0
    except AttributeError:
        details['number_of_followers'] = 0  # ✅ Assign 0 if the element is missing

    # Publish date
    details['publish_date'] = soup.find("div", class_="FeaturedDetails").find("p",{'data-testid': 'publicationInfo'}).text.replace("First published ", "").strip()

    # Number of Pages
    try:
        details['number_of_pages'] = int(re.sub(r'[^\d]', "",soup.find("div", class_="FeaturedDetails").find("p",{'data-testid': 'pagesFormat'}).text))
    except ValueError:
        details['number_of_pages'] = 0
        
    # Ratings 
    details['overall_rating'] = float(soup.find('div', class_ = "RatingStatistics__rating").text)

    # Number of ratings
    details['total_num_rating'] = int(re.sub(r'[^\d]', "",soup.find('div', class_ = "RatingStatistics__meta").find("span", {"data-testid" : "ratingsCount"}).text))

    # Number of reviews
    details['total_num_reviews'] = int(re.sub(r'[^\d]', "",soup.find('div', class_ = "RatingStatistics__meta").find("span", class_ = "u-dot-before").text))

    # Genre
    details['genre'] = soup.find("div", class_="BookPageMetadataSection__genres").find("span", {"tabindex":"-1"}).find("span",class_="BookPageMetadataSection__genreButton").text

    return details


Since the ```DOM``` elements on the website are lazy loaded. ```Selenium``` is more effecient option to scrap the website, as we can run the ```JS``` script on the website and fetch the ```html```.

In [None]:
def get_reviews(driver):
    # Scroll down to load reviews
    for _ in range(5):
        driver.execute_script("window.scrollBy(0, 1000);")
        time.sleep(2)

    # source = driver.page_source
    reviews = driver.execute_script("""
        return Array.from(document.getElementsByClassName("ReviewText__content"))
            .slice(0, 10).map(e => e.innerText);""")

    # Store reviews in a dictionary
    review_dict = {f"Review {i+1}": text for i, text in enumerate(reviews)}

    # Print dictionary
    print(review_dict)
    driver.quit()  # Close the browser
    
    return review_dict

In [72]:
def main(iteration, url):
    driver = webdriver.Chrome()
    driver.set_page_load_timeout(120)
    driver.get(url)

    page = rq.get(url)
    # Checking the status
    if page.status_code == 200:
        print(f"Current Iteration: {iteration}......\n")

        # Getting the all the details of the book        
        details = get_book_details(page)
        reviews = get_reviews(driver)

        # Appending the two dictinary
        data = {**details, **reviews}

        if save_to_json(data):
            print("Book Added to Json!")
            return True
        else:
            print("Something Went Wrong!")
            return False

    else:
        print("Something Went Wrong!")
        return False


In [None]:
from selenium import webdriver

for i, l in enumerate(links):
    main(i, l)

Current Iteration: 0......

{'Review 1': 'I was enthralled with Helen Garner’s quest to connect with her grandson, turning from boy to man, across the season of AFL. The training, the games, the team, and their camaraderie. The OBSERVATION of this. Who better to bear witness than this author, I think I’m quite enamoured by HG. Not at all a worry to be the furthest thing from a footy fan.\n\nAsking questions, uncaring if they are simple, inane or something that any fan would already know. I love this about her, it’s what I do. Question, question, question. "Why does mullet equal footy?\' Is this not the best?!\n\nBeing a neighbour to her kin, this family is lovely to observe. The honesty appealing and welcomed, entirely clear their love is strong and real. Her grandson WANTS to be with her. So often they hang out.\n\nI feel old and deaf and awed, in the back seat with the dog.\n\nI sense her awareness of ageing, stating, not emoting. I was fully absorbed in the sparseness of writing. A 