In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/book-review-samples/goodreads_fantasy.csv
/kaggle/input/book-review-samples/goodreads_all_genres.csv
/kaggle/input/book-review-samples/Darth_Plagueis_reviews(4).csv
/kaggle/input/book-review-samples/goodreads_all_genres_final.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews(4).csv
/kaggle/input/book-review-samples/goodreads_genres_complete.csv


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time
import os
import glob

In [3]:
def scrape_book_data_old(url):
    """Download one book page and extract genres, cover image, ISBN/ASIN and page count.

    NOTE(review): the ``_old`` suffix suggests this is a legacy copy of
    ``scrape_book_data``; no visible cell calls it -- confirm before relying on it.

    Args:
        url: Address of the book page to fetch.

    Returns:
        A dict with the string-valued keys ``genres`` (names joined with ';'),
        ``cover_image_id`` (the cover ``<img>`` ``src``), ``isbn``, ``asin`` and
        ``total_pages``; fields that are not found stay empty strings. Returns
        ``None`` if anything raises while fetching or parsing (the error is printed).
    """
    try:
        page = BeautifulSoup(requests.get(url).content, 'html.parser')

        # Genre buttons -> one ';'-separated string.
        genre_names = []
        for button in page.find_all('span', class_='BookPageMetadataSection__genreButton'):
            genre_names.append(button.text.strip())

        # Cover image: the src of the <img> inside the cover container, if the container exists.
        cover_box = page.find('div', class_='BookCover__image')
        if cover_box:
            cover_src = cover_box.find('img')['src']
        else:
            cover_src = ''

        # ISBN / ASIN: text after the last ':' of any detail div mentioning the label.
        # Later matches overwrite earlier ones.
        isbn_value = ''
        asin_value = ''
        details_box = page.find('div', class_='EditionDetails')
        if details_box:
            for entry in details_box.find_all('div'):
                entry_text = entry.text
                if 'ISBN' in entry_text:
                    isbn_value = entry_text.split(':')[-1].strip()
                elif 'ASIN' in entry_text:
                    asin_value = entry_text.split(':')[-1].strip()

        # Page count: the part before the first ',' of the first string containing "pages".
        page_count = ''
        featured_box = page.find('div', class_='FeaturedDetails')
        if featured_box:
            def mentions_pages(text):
                return 'pages' in text.lower()

            pages_text = featured_box.find(string=mentions_pages)
            if pages_text:
                page_count = pages_text.split(',')[0].strip()

        return {
            'genres': ';'.join(genre_names),
            'cover_image_id': cover_src,
            'isbn': isbn_value,
            'asin': asin_value,
            'total_pages': page_count
        }
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None

In [4]:
def scrape_book_data(url, timeout=30):
    """Download one book page and extract genres, cover image, ISBN/ASIN and page count.

    Args:
        url: Address of the book page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        A dict with the string-valued keys ``genres`` (names joined with ';'),
        ``cover_image_id`` (the cover ``<img>`` ``src``), ``isbn``, ``asin`` and
        ``total_pages``; fields that are not found stay empty strings. Returns
        ``None`` when the request fails, the server answers with an HTTP error
        status, or parsing raises (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page
        # into a record whose fields are all empty.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract genres
        genre_tags = soup.find_all('span', class_='BookPageMetadataSection__genreButton')
        genres = ';'.join(tag.text.strip() for tag in genre_tags)

        # Extract cover image ID. A cover container without an <img>/src yields
        # an empty value rather than discarding the whole record.
        cover_image = soup.find('div', class_='BookCover__image')
        cover_img = cover_image.find('img') if cover_image else None
        cover_image_id = cover_img.get('src', '') if cover_img else ''

        # Extract ISBN and ASIN: text after the last ':' of any detail div that
        # mentions the label; later matches overwrite earlier ones.
        edition_details = soup.find('div', class_='EditionDetails')
        isbn = ''
        asin = ''
        if edition_details:
            for detail in edition_details.find_all('div'):
                detail_text = detail.text
                if 'ISBN' in detail_text:
                    isbn = detail_text.split(':')[-1].strip()
                elif 'ASIN' in detail_text:
                    asin = detail_text.split(':')[-1].strip()

        # Extract total pages: the part before the first ',' of the first
        # string that contains "pages".
        featured_details = soup.find('div', class_='FeaturedDetails')
        total_pages = ''
        if featured_details:
            pages_info = featured_details.find(
                string=lambda text: bool(text) and 'pages' in text.lower()
            )
            if pages_info:
                total_pages = pages_info.split(',')[0].strip()

        return {
            'genres': genres,
            'cover_image_id': cover_image_id,
            'isbn': isbn,
            'asin': asin,
            'total_pages': total_pages
        }
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this book.
        print(f"Error scraping {url}: {str(e)}")
        return None

In [5]:
def process_batch(batch):
    """Scrape every row of ``batch`` and merge the scraped fields into the row.

    Args:
        batch: DataFrame slice whose rows carry a ``'URL'`` column.

    Returns:
        A list of dicts, one per successfully scraped row, holding the row's
        original columns followed by the scraped fields (scraped values win on
        key collisions). Rows for which ``scrape_book_data`` returns a falsy
        result are left out.
    """
    merged_rows = []
    for _, record in batch.iterrows():
        scraped = scrape_book_data(record['URL'])
        if not scraped:
            continue
        merged = record.to_dict()
        merged.update(scraped)
        merged_rows.append(merged)
    return merged_rows

In [6]:
# Source CSV of books to enrich. main() and main_old() load it with pd.read_csv,
# and process_batch reads each row's 'URL' column from the resulting frame.
input_csv = '/kaggle/input/book-review-samples/goodreads_all_genres_final.csv'

In [7]:
def get_last_processed_batch():
    """Return the highest batch number already saved in the working directory.

    Looks for files named ``batch_<number>.csv`` in the current directory.
    Files that match the ``batch_*.csv`` glob but whose middle part is not a
    plain decimal number (for example ``batch_notes.csv``) are ignored instead
    of aborting the resume with a ``ValueError``.

    Returns:
        The largest batch number found, or ``-1`` when there is none.
    """
    prefix, suffix = 'batch_', '.csv'
    batch_numbers = []
    for file in glob.glob('batch_*.csv'):
        stem = file[len(prefix):-len(suffix)]
        # isdecimal() is False for '' and for anything int() could not parse as digits.
        if stem.isdecimal():
            batch_numbers.append(int(stem))
    return max(batch_numbers, default=-1)

In [8]:
def main_old():
    """Scrape every book in ``input_csv`` from scratch and write 'updated_books.csv'.

    NOTE(review): the ``_old`` suffix suggests this is a legacy driver; it has no
    resume logic and always starts at the first row.

    Rows are handled ten at a time. Each batch is handed to a thread pool, but
    its result is awaited before the next batch is submitted. After every batch
    the results accumulated so far are written to ``batch_<n>.csv``. At the end
    the full result set is written to 'updated_books.csv' and every
    ``batch_*.csv`` file in the working directory is removed.
    """
    books = pd.read_csv(input_csv)

    batch_size = 10
    all_results = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        i = 0
        while i < len(books):
            chunk = books.iloc[i:i+batch_size]
            all_results += pool.submit(process_batch, chunk).result()

            # Checkpoint: everything collected so far, not just this batch.
            pd.DataFrame(all_results).to_csv(f'batch_{i//batch_size}.csv', index=False)

            print(f"Processed {len(all_results)} books")
            time.sleep(1)  # To avoid overwhelming the server
            i += batch_size

    # Final output holds every collected row.
    pd.DataFrame(all_results).to_csv('updated_books.csv', index=False)

    # Remove the checkpoint files now that the final CSV exists.
    leftovers = [
        name for name in os.listdir()
        if name.startswith('batch_') and name.endswith('.csv')
    ]
    for name in leftovers:
        os.remove(name)

In [9]:
def main():
    """Scrape every book in ``input_csv`` resumably and write 'updated_books.csv'.

    Rows are handled ten at a time. Each finished batch is saved as
    ``batch_<n>.csv`` in the working directory; on start, processing resumes
    after the highest batch number reported by ``get_last_processed_batch``.
    When all batches are done, the batch files are concatenated in numeric
    order into 'updated_books.csv' and then deleted.

    Batches run one after another with a one-second pause between them. The
    earlier thread pool awaited each future immediately, so it never ran
    batches in parallel; calling ``process_batch`` directly keeps the same
    request rate without the misleading pool.
    """
    # Read the original CSV
    df = pd.read_csv(input_csv)

    # Process in batches of 10
    batch_size = 10
    all_results = []

    # Find the last processed batch
    last_batch = get_last_processed_batch()
    start_index = (last_batch + 1) * batch_size

    print(f"Resuming from batch {last_batch + 1}")

    for i in range(start_index, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        batch_results = process_batch(batch)
        all_results.extend(batch_results)

        # Save intermediate results. Write to a temporary name first and rename,
        # so an interrupted write never leaves a truncated batch_<n>.csv that a
        # later resume would mistake for a finished batch. The '.tmp' name does
        # not match the 'batch_*.csv' glob.
        batch_path = f'batch_{i//batch_size}.csv'
        tmp_path = batch_path + '.tmp'
        pd.DataFrame(batch_results).to_csv(tmp_path, index=False)
        os.replace(tmp_path, batch_path)

        print(f"Processed batch {i//batch_size} ({len(all_results)} books total)")
        time.sleep(1)  # To avoid overwhelming the server

    # Collect batch files in numeric order, ignoring stray files whose name
    # matches the glob but carries no plain batch number.
    numbered_files = []
    for name in glob.glob('batch_*.csv'):
        stem = name[len('batch_'):-len('.csv')]
        if stem.isdecimal():
            numbered_files.append((int(stem), name))
    all_batch_files = [name for _, name in sorted(numbered_files)]

    # Combine all batches into a single CSV. A batch in which every scrape
    # failed is stored as a column-less file; pd.read_csv raises EmptyDataError
    # on it, so such files are skipped instead of aborting the combine step.
    frames = []
    for name in all_batch_files:
        try:
            frames.append(pd.read_csv(name))
        except pd.errors.EmptyDataError:
            continue

    if frames:
        final_df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat([]) raises; fall back to an empty result.
        final_df = pd.DataFrame()
        print("No scraped rows found; writing an empty updated_books.csv")
    final_df.to_csv('updated_books.csv', index=False)

    # Clean up intermediate batch files
    for file in all_batch_files:
        os.remove(file)

In [None]:
# Entry point: start the resumable scrape when this code runs as the main module.
if __name__ == "__main__":
    main()

Resuming from batch 1088
Processed batch 1088 (10 books total)
Processed batch 1089 (20 books total)
Processed batch 1090 (30 books total)
Processed batch 1091 (40 books total)
Processed batch 1092 (50 books total)
Processed batch 1093 (60 books total)
Processed batch 1094 (70 books total)
Processed batch 1095 (80 books total)
Processed batch 1096 (90 books total)
Processed batch 1097 (100 books total)
Processed batch 1098 (110 books total)
Processed batch 1099 (120 books total)
Processed batch 1100 (130 books total)
Processed batch 1101 (140 books total)
Processed batch 1102 (150 books total)
Processed batch 1103 (160 books total)
Processed batch 1104 (170 books total)
Processed batch 1105 (180 books total)
Processed batch 1106 (190 books total)
Processed batch 1107 (200 books total)
Processed batch 1108 (210 books total)
Processed batch 1109 (220 books total)
Processed batch 1110 (230 books total)
Processed batch 1111 (240 books total)
Processed batch 1112 (250 books total)
Processed

In [None]:
# NOTE(review): looks like a leftover scratch cell used to check the kernel responds -- consider removing.
print("test")

In [None]:
# NOTE(review): looks like a second leftover scratch cell -- consider removing.
print("test2")