In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list the read-only input directory, printing one full file path per line.
for path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/book-review-samples/goodreads_fantasy.csv
/kaggle/input/book-review-samples/goodreads_all_genres.csv
/kaggle/input/book-review-samples/Darth_Plagueis_reviews(4).csv
/kaggle/input/book-review-samples/goodreads_all_genres_final.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews(4).csv
/kaggle/input/book-review-samples/goodreads_genres_complete.csv


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time
import os

In [3]:
def scrape_book_data(url, timeout=10):
    """Download a book page and extract supplementary metadata from its HTML.

    Args:
        url: Address of the book page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a stalled connection would block the calling worker
            indefinitely.

    Returns:
        A dict with the keys 'genres' (genre labels joined with ';'),
        'cover_image_id' (the 'src' attribute of the cover <img>), 'isbn',
        'asin' and 'total_pages'. A field that cannot be located is ''.
        Returns None if the request fails, the server replies with an HTTP
        error status, or parsing raises; the error is printed in that case.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # An error page (403/404/429/5xx) would otherwise be parsed "successfully"
        # into a record of empty strings and stored as if it were real data.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract genres
        genre_tags = soup.find_all('span', class_='BookPageMetadataSection__genreButton')
        genres = ';'.join(tag.text.strip() for tag in genre_tags)

        # Extract the cover image source. A cover container without an <img>
        # (or an <img> without 'src') must not throw away the other fields.
        cover_image_id = ''
        cover_image = soup.find('div', class_='BookCover__image')
        if cover_image is not None:
            img = cover_image.find('img')
            if img is not None:
                cover_image_id = img.get('src', '')

        # Extract ISBN and ASIN: the value is whatever follows the last ':'
        isbn = ''
        asin = ''
        edition_details = soup.find('div', class_='EditionDetails')
        if edition_details is not None:
            for detail in edition_details.find_all('div'):
                if 'ISBN' in detail.text:
                    isbn = detail.text.split(':')[-1].strip()
                elif 'ASIN' in detail.text:
                    asin = detail.text.split(':')[-1].strip()

        # Extract total pages: the text before the first ',' of the first
        # string mentioning "pages" inside the featured-details section.
        total_pages = ''
        featured_details = soup.find('div', class_='FeaturedDetails')
        if featured_details is not None:
            pages_info = featured_details.find(
                string=lambda text: bool(text) and 'pages' in text.lower()
            )
            if pages_info:
                total_pages = pages_info.split(',')[0].strip()

        return {
            'genres': genres,
            'cover_image_id': cover_image_id,
            'isbn': isbn,
            'asin': asin,
            'total_pages': total_pages
        }
    except Exception as e:
        # Best effort by design: one bad page must not abort the whole run.
        print(f"Error scraping {url}: {str(e)}")
        return None

In [6]:
def process_batch(batch):
    """Scrape every row of a DataFrame slice and merge the scraped fields in.

    Each row's 'URL' value is passed to scrape_book_data. Rows for which the
    scraper returns a falsy value (e.g. None on failure) are left out of the
    output.

    Returns:
        A list of dicts, one per successfully scraped row, holding the row's
        own columns overlaid with the scraped fields.
    """
    merged_rows = []
    for _, record in batch.iterrows():
        scraped = scrape_book_data(record['URL'])
        if not scraped:
            continue
        combined = record.to_dict()
        combined.update(scraped)
        merged_rows.append(combined)
    return merged_rows

In [7]:
# Source CSV read by main(); process_batch() looks up each row's 'URL' column.
input_csv = '/kaggle/input/book-review-samples/goodreads_all_genres_final.csv'

In [8]:
def main(batch_size=10, max_workers=5, output_csv='updated_books.csv'):
    """Scrape metadata for every book in `input_csv` and write the merged CSV.

    The input is split into slices of `batch_size` rows. All slices are handed
    to a thread pool up front so that up to `max_workers` of them are scraped
    concurrently; results are then collected in submission order, so the
    output keeps the input's row order.

    After each batch completes, that batch's rows are written to
    'batch_<n>.csv' as a checkpoint. On success the checkpoints written by
    this run are removed; on failure they are left in place for inspection.

    Args:
        batch_size: Number of input rows per batch (must be >= 1).
        max_workers: Maximum number of batches scraped at the same time. This
            is the cap on simultaneous requests to the server.
        output_csv: Path of the combined result file.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Read the original CSV
    df = pd.read_csv(input_csv)

    all_results = []
    checkpoint_files = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything first. Calling result() right after each submit()
        # would wait for that batch before queuing the next one and leave all
        # but one worker idle.
        futures = [
            executor.submit(process_batch, df.iloc[start:start + batch_size])
            for start in range(0, len(df), batch_size)
        ]
        try:
            for batch_number, future in enumerate(futures):
                batch_results = future.result()
                all_results.extend(batch_results)

                # Checkpoint only this batch's rows; rewriting the cumulative
                # results every time makes total I/O grow quadratically.
                checkpoint = f'batch_{batch_number}.csv'
                pd.DataFrame(batch_results).to_csv(checkpoint, index=False)
                checkpoint_files.append(checkpoint)

                print(f"Processed {len(all_results)} books")
        except BaseException:
            # Drop queued work so an error or interrupt does not have to wait
            # for every remaining batch before the pool shuts down.
            for pending in futures:
                pending.cancel()
            raise

    # Combine all batches into a single CSV
    pd.DataFrame(all_results).to_csv(output_csv, index=False)

    # Remove only the checkpoints this run created, not every file in the
    # working directory that happens to match 'batch_*.csv'.
    for checkpoint in checkpoint_files:
        if os.path.exists(checkpoint):
            os.remove(checkpoint)

In [None]:
# Start the scraping pipeline only when this code runs as the entry point,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Processed 10 books
Processed 20 books
Processed 30 books
