In [1]:
# Import all the required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

In [3]:
base_url = "https://www.goodreads.com/"

In [4]:
response = requests.get(base_url,headers=headers)

In [5]:
response

<Response [200]>

In [6]:
soup = BeautifulSoup(response.text,'html')

In [7]:
genre_links = soup.find_all('a', class_='gr-hyperlink')

In [8]:
#genre_links

In [9]:
genres_dict = {}  
for i in genre_links:
    genre_name = i.text.strip()
    genre_url = 'https://www.goodreads.com' + i['href']
    genres_dict[genre_name] = genre_url

In [10]:
titles = []
authors = []
links = []
images = []
genres = []
ratings = []

In [11]:
#Function to scrape the data related to each book
def books_data(genre_url,genre_name):
    response = requests.get(genre_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    books = soup.find_all('div', class_='coverWrapper')
    print(f'Found {len(books)} books on this page.')
        
    for book in books:
            # Extract the link
            book_link_tag = book.find('a', href=True)
            link = 'https://www.goodreads.com' + book_link_tag['href'] 
            links.append(link)
            
            # Extract the title
            img_tag = book.find('img')
            title = img_tag['alt'] 
            titles.append(title)
            
            # Extract the author (if available)
            author_tag = book.find_next('a', class_='authorName')
            author = author_tag.get_text(strip=True) if author_tag else 'Unknown Author'
            authors.append(author)
            
            # Extract the image URL
            image_tag = book.find('img', src=True)
            image = image_tag['src'] 
            images.append(image)

            # Genres
            genres.append(genre_name)
    


In [12]:
for genre_name, genre_url in genres_dict.items():
        print(f"Scraping books for genre: {genre_name}")
        books_data(genre_url, genre_name)
        time.sleep(1)  


Scraping books for genre: Terms of Service
Found 0 books on this page.
Scraping books for genre: Privacy Policy
Found 0 books on this page.
Scraping books for genre: Sign In
Found 0 books on this page.
Scraping books for genre: Art
Found 36 books on this page.
Scraping books for genre: Biography
Found 45 books on this page.
Scraping books for genre: Business
Found 33 books on this page.
Scraping books for genre: Children's
Found 44 books on this page.
Scraping books for genre: Christian
Found 43 books on this page.
Scraping books for genre: Classics
Found 31 books on this page.
Scraping books for genre: Comics
Found 45 books on this page.
Scraping books for genre: Cookbooks
Found 30 books on this page.
Scraping books for genre: Ebooks
Found 15 books on this page.
Scraping books for genre: Fantasy
Found 45 books on this page.
Scraping books for genre: Fiction
Found 45 books on this page.
Scraping books for genre: Graphic Novels
Found 44 books on this page.
Scraping books for genre: Hist

In [13]:
# Convert lists to DataFrame
books_df = pd.DataFrame({
        'Title': titles,
        'Genre': genres,
        'Author': authors,
        'Link': links,
        'Image': images,
        
    })
books_df.head(100)


Unnamed: 0,Title,Genre,Author,Link,Image
0,The Color of Everything: A Journey to Quiet th...,Art,Jackie Wullschläger,https://www.goodreads.com/book/show/199373502-...,https://i.gr-assets.com/images/S/compressed.ph...
1,The Nude,Art,Jackie Wullschläger,https://www.goodreads.com/book/show/199532684-...,https://i.gr-assets.com/images/S/compressed.ph...
2,"The Future Was Now: Madmen, Mavericks, and the...",Art,Jackie Wullschläger,https://www.goodreads.com/book/show/58725001-t...,https://i.gr-assets.com/images/S/compressed.ph...
3,Do Something: Coming of Age Amid the Glitter a...,Art,Jackie Wullschläger,https://www.goodreads.com/book/show/121312704-...,https://i.gr-assets.com/images/S/compressed.ph...
4,The Last Sane Woman,Art,Jackie Wullschläger,https://www.goodreads.com/book/show/199775178-...,https://i.gr-assets.com/images/S/compressed.ph...
...,...,...,...,...,...
95,Slow Productivity: The Lost Art of Accomplishm...,Business,Tony Blair,https://www.goodreads.com/book/show/197773418-...,https://i.gr-assets.com/images/S/compressed.ph...
96,Be Useful: Seven Tools for Life,Business,Tony Blair,https://www.goodreads.com/book/show/125063314-...,https://i.gr-assets.com/images/S/compressed.ph...
97,Digital Minimalism: Choosing a Focused Life in...,Business,Tony Blair,https://www.goodreads.com/book/show/40672036-d...,https://i.gr-assets.com/images/S/compressed.ph...
98,Chip War: The Fight for the World's Most Criti...,Business,Tony Blair,https://www.goodreads.com/book/show/60321447-c...,https://i.gr-assets.com/images/S/compressed.ph...


In [14]:
books_df.to_csv('books.csv', index=False)
print('Data saved to books.csv')

Data saved to books.csv


In [17]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Create a session with retry strategy
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

ratings = []
for one_book_url in books_df['Link']:
    try:
        a = session.get(one_book_url, timeout=10)
        b = BeautifulSoup(a.text, 'html.parser')
        rating_tag = b.find('div', {'class': 'RatingStatistics__rating'}).text
        ratings.append(rating_tag)
        print(f"Successfully fetched rating from: {one_book_url}")
    except requests.exceptions.Timeout:
        print(f"Timeout occurred for URL: {one_book_url}")
        ratings.append('Timeout')
    except requests.exceptions.RequestException as e:
        print(f"Error occurred for URL: {one_book_url} - {e}")
        ratings.append('Error')
    
    time.sleep(1)  # Adding a delay between requests


Successfully fetched rating from: https://www.goodreads.com/book/show/199373502-the-color-of-everything
Successfully fetched rating from: https://www.goodreads.com/book/show/199532684-the-nude
Successfully fetched rating from: https://www.goodreads.com/book/show/58725001-the-future-was-now
Successfully fetched rating from: https://www.goodreads.com/book/show/121312704-do-something
Successfully fetched rating from: https://www.goodreads.com/book/show/199775178-the-last-sane-woman
Successfully fetched rating from: https://www.goodreads.com/book/show/180723951-henna-is
Successfully fetched rating from: https://www.goodreads.com/book/show/199981527-a-death-in-cornwall
Successfully fetched rating from: https://www.goodreads.com/book/show/59109077-horse
Successfully fetched rating from: https://www.goodreads.com/book/show/203315037-the-book-of-bill
Successfully fetched rating from: https://www.goodreads.com/book/show/195430688-the-paris-novel
Successfully fetched rating from: https://www.goo

In [18]:
books_df['Rating'] = ratings


In [19]:
books_df.head()

Unnamed: 0,Title,Genre,Author,Link,Image,Rating
0,The Color of Everything: A Journey to Quiet th...,Art,Jackie Wullschläger,https://www.goodreads.com/book/show/199373502-...,https://i.gr-assets.com/images/S/compressed.ph...,4.21
1,The Nude,Art,Jackie Wullschläger,https://www.goodreads.com/book/show/199532684-...,https://i.gr-assets.com/images/S/compressed.ph...,3.72
2,"The Future Was Now: Madmen, Mavericks, and the...",Art,Jackie Wullschläger,https://www.goodreads.com/book/show/58725001-t...,https://i.gr-assets.com/images/S/compressed.ph...,4.26
3,Do Something: Coming of Age Amid the Glitter a...,Art,Jackie Wullschläger,https://www.goodreads.com/book/show/121312704-...,https://i.gr-assets.com/images/S/compressed.ph...,3.81
4,The Last Sane Woman,Art,Jackie Wullschläger,https://www.goodreads.com/book/show/199775178-...,https://i.gr-assets.com/images/S/compressed.ph...,3.68


In [20]:
books_df.to_csv('books_final.csv', index=False)
print('Data saved to books_final.csv')

Data saved to books_final.csv
