<a href="https://www.kaggle.com/code/nigamshitij/batch-processing-of-genres?scriptVersionId=194009019" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_16.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_21.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_31.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_19.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_32.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_12.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_4.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_43.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_42.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_30.csv
/kaggle/input/genres-no-duplicates-processed-files/goodreads_genres_no_duplicates_29.csv
/kaggle/input/genres-n

In [2]:
pip install beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


In [6]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time
import os

In [7]:
# genre setup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

items_per_page = 100
# total_pages = (total_items + items_per_page - 1)
total_pages = 14
# max_pages_to_scrape = 14

In [10]:
csv_path = '/kaggle/input/book-genres/'
csv_no_dup = '/kaggle/input/genres-no-duplicates-processed-files'
csv_output_path = '/kaggle/working/'

In [12]:
def scrape_goodreads_genre(genre, base_url, max_pages_to_scrape=25, max_retries=3, retry_delay=5):
    books_data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    
    base_url = "https://www.goodreads.com/shelf/show/"+genre
    
    for page in range(1, max_pages_to_scrape + 1):
        url = f"{base_url}?page={page}"
        retries = 0
        while retries < max_retries:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
                book_elements = soup.find_all("div", "elementList")
                
                if not book_elements:
                    print(f"No more books found for {genre} after page {page-1}")
                    return pd.DataFrame(books_data)
                
                for book_element in book_elements:
                    try:
                        book_title = book_element.find("a", "bookTitle").text.strip()
                        book_url = "https://www.goodreads.com" + book_element.find("a", "bookTitle").get("href")
                        author = book_element.find("a", "authorName").text.strip()
                        rating_text = book_element.find("span", "greyText smallText").text.split()
                        avg_rating = rating_text[2]
                        ratings = rating_text[4]
                        published_year = rating_text[-1] if len(rating_text) == 9 else ""
                
                        books_data.append({
                            "Title": book_title,
                            "URL": book_url,
                            "Authors": author,
                            "Avg Ratings": avg_rating,
                            "Rating": ratings,
                            "Published_year": published_year,
                            "Genre": genre
                        })
                    except AttributeError:
                        pass
                
                break  # If successful, break out of the retry loop
            
            except RequestException as e:
                retries += 1
                print(f"Error on page {page} of {genre}: {str(e)}. Retry {retries}/{max_retries}")
                if retries == max_retries:
                    print(f"Failed to retrieve page {page} for {genre} after {max_retries} attempts. Moving to next page.")
                    break
                time.sleep(retry_delay)
        
        time.sleep(1)  # Delay between successful page requests
    
    return pd.DataFrame(books_data)

def process_csv_files(start_file=75, num_files=125):
    all_genres_data = []
    
    for i in range(start_file, num_files + 1):
        input_file = f"split_dataframe_{i}.csv"
        input_file_path = os.path.join(csv_path, input_file)
        print(f"Processing file: {input_file}")
        
        # Read the genres from the CSV file
        genres_df = pd.read_csv(input_file_path)
        
        # Loop through genres and scrape data
        for index, row in genres_df.iterrows():
            genre = row['Genre']
            url = row['URL']
            
            print(f"Scraping data for {genre} genre...")
            df = scrape_goodreads_genre(genre, url)
            all_genres_data.append(df)
            print(f"Finished scraping {genre}. DataFrame shape: {df.shape}")
            print("\n" + "="*50 + "\n")
        
        # Combine all genre DataFrames into a single DataFrame
        combined_df = pd.concat(all_genres_data, ignore_index=True)
        
        # Remove duplicates based on Title and Authors
        combined_df.drop_duplicates(subset=['Title', 'Authors'], keep='first', inplace=True)
        
        # Save the combined DataFrame to a CSV file
        output_file = f"goodreads_genres_no_duplicates_{i}.csv"
        output_file_path = os.path.join(csv_output_path, output_file)
        combined_df.to_csv(output_file_path, index=False)
        print(f"Data from file {input_file} has been processed and saved to '{output_file_path}'.")
        print(f"Total number of unique books scraped so far: {len(combined_df)}")
        print("\n" + "="*50 + "\n")

# Run the process starting from file 75
process_csv_files(start_file=75)

Processing file: split_dataframe_75.csv
Scraping data for mira genre...
Finished scraping mira. DataFrame shape: (1250, 7)


Scraping data for mixed-martial-arts genre...
Finished scraping mixed-martial-arts. DataFrame shape: (1250, 7)


Scraping data for mmorpg genre...
Finished scraping mmorpg. DataFrame shape: (1250, 7)


Scraping data for modern genre...
Finished scraping modern. DataFrame shape: (1250, 7)


Scraping data for modern-classics genre...
Finished scraping modern-classics. DataFrame shape: (1250, 7)


Scraping data for mogadishu genre...
Finished scraping mogadishu. DataFrame shape: (100, 7)


Scraping data for mombasa genre...
Finished scraping mombasa. DataFrame shape: (125, 7)


Scraping data for money genre...
Error on page 11 of money: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out. (read timeout=10). Retry 1/3
Error on page 16 of money: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out. (read timeout=10). Retry 1/3
Fi

In [13]:
# After processing all files, combine them into a single CSV
all_data = []

# Read files from 1 to 74 from the output_path directory
for i in range(1, 75):
    file_name = f"goodreads_genres_no_duplicates_{i}.csv"
    file_path = os.path.join(csv_output_path, file_name)
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        all_data.append(df)

# Then, read files from 75 onwards from the output_path directory
for i in range(75, 126):
    file_name = f"goodreads_genres_no_duplicates_{i}.csv"
    file_path = os.path.join(csv_output_path, file_name)
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        all_data.append(df)

final_df = pd.concat(all_data, ignore_index=True)
final_df.drop_duplicates(subset=['Title', 'Authors'], keep='first', inplace=True)
final_output_path = os.path.join(csv_output_path, "goodreads_all_genres_final.csv")
final_df.to_csv(final_output_path, index=False)

print("All CSV files have been processed and combined.")
print(f"Final data has been saved to '{final_output_path}'.")
print(f"Total number of unique books in the final dataset: {len(final_df)}")



All CSV files have been processed and combined.
Final data has been saved to '/kaggle/working/goodreads_all_genres_final.csv'.
Total number of unique books in the final dataset: 35597
