<a href="https://www.kaggle.com/code/nigamshitij/parse-book-reviews-per-book?scriptVersionId=193274685" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/book-review-samples/goodreads_fantasy.csv
/kaggle/input/book-review-samples/Darth_Plagueis_reviews(4).csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews(4).csv


In [2]:
import requests
from bs4 import BeautifulSoup
# import pandas as pd
from datetime import datetime
import time
import random
import re

In [3]:
def remove_boilerplate(text):
    boilerplate = ['cookie policy', 'privacy policy', 'terms of service', 'all rights reserved', '\n']
    for phrase in boilerplate:
        text = re.sub(r'(?i)' + re.escape(phrase) + r'.*', '', text)
    return text

In [4]:
def truncate_at_comments(review_text, threshold_percentage=20, min_review_length=100):
    comment_identifiers = [
        'Comments', 'Leave a comment', 'Reader comments', 
        'What do you think?', 'Join the discussion', 'Add a comment',
        'Post a comment', 'Write a comment', 'Show comments'
    ]
    
    lower_text = review_text.lower()
    text_length = len(lower_text)
    threshold = max(int(text_length * (threshold_percentage / 100)), min_review_length)

    # Check for comment identifiers
    for identifier in comment_identifiers:
        index = lower_text.find(identifier.lower())
        if index != -1 and index > threshold:
            return review_text[:index].strip()
    
    # If no identifiers found, try to detect comment-like structures
    paragraphs = review_text.split('\n\n')
    filtered_paragraphs = []
    
    for paragraph in paragraphs:
        # Skip short paragraphs that might be comments
        if len(paragraph) < 50:
            continue
        
        # Skip paragraphs that start with common comment patterns
        if re.match(r'^(Posted by|From|User|Anonymous|[\d/]+:)', paragraph.strip()):
            continue
        
        filtered_paragraphs.append(paragraph)
    
    # If we've removed some paragraphs, join the remaining ones
    if len(filtered_paragraphs) < len(paragraphs):
        return '\n\n'.join(filtered_paragraphs).strip()
    
    # If we haven't removed any paragraphs, return at least the first part of the text
    return review_text[:max(threshold, len(review_text) // 2)].strip()

In [5]:
def remove_comments(review_soup):
    # Common class names for comment sections
    comment_classes = ['comment', 'comments-list', 'comments-area', 'comments', 'comment-section', 'user-comments', 'disqus_thread']
    
    for class_name in comment_classes:
        comment_section = review_soup.find('div', class_=class_name)
        if comment_section:
            comment_section.decompose()  # This removes the element from the soup
    
    return review_soup

In [6]:
def score_paragraph(paragraph):
    review_keywords = ['review', 'book', 'read', 'author', 'story', 'character', 'plot', 'recommend']
    return sum(keyword in paragraph.lower() for keyword in review_keywords)

In [7]:
def search_book_reviews(book_name, author):
    # Combine book name and author for search query
    search_query = f"{book_name} {author} book review -site:goodreads.com -site:amazon.* -site:reddit.com -site:thestorygraph.* -site:youtube.* -site:*.tv -site:barnesandnoble.com -site:wikipedia.* -site:quora.com&start=0"
    
    # Perform Google search
    url = f"https://www.google.com/search?q={search_query.replace(' ', '+')}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    print(f"URL being looked at: {url}")
    
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract search results
    search_results = soup.find_all('div', class_='g')
    
    reviews = []
    
    for result in search_results:
        link = result.find('a')
        if link:
            review_url = link['href']

            # Skip non-http links and known non-review sites
            if not review_url.startswith('http') or any(site in review_url for site in ['google.com', 'wikipedia.org']):
                continue
            
            try:
                # review_url = 'https://booksatruestory.com/2013/03/27/book-review-darth-plagueis-james-luceno/'
                
                # Fetch the review page
                review_response = requests.get(review_url, headers=headers, timeout=10)
                review_soup = BeautifulSoup(review_response.text, 'html.parser')
                
                # Remove Comments
                review_soup = remove_comments(review_soup)
                
                # Review elements
                review_elements = review_soup.find_all(['main','p'])
                scored_paragraphs = [(elem, score_paragraph(elem.text)) for elem in review_elements]
                
                # Extract review text (this is a simplistic approach and may need refinement)
                review_paragraphs = [elem.text for elem, score in sorted(scored_paragraphs, key=lambda x: x[1], reverse=True)[:3]]
                # review_paragraphs = review_soup.find_all('main')
                # review_list_items = review_soup.find_all('li')
                
                # Search for relevant text
                review_text = ' '.join(review_paragraphs)
                # review_text = ' '.join([p.text for p in review_paragraphs if re.search(r'\b(review|book|read|author|story|character)\b', p.text, re.IGNORECASE)])
                
                # Remove boilerplate
                review_text = review_text.replace('\n', ' ')
                review_text = remove_boilerplate(review_text)
                # print(review_text)
                
                # Truncate at Comments
                review_text = truncate_at_comments(review_text)
                # print(review_text)
                
                # original
                # review_text = ' '.join([p.text for p in review_soup.find_all('p')])
                
                # Extract date (this is a placeholder, as date formats vary widely)
                # date = review_soup.find('time')
                # review_date = date['datetime'] if date else 'Unknown'
                review_date = 'Unknown'
                
                reviews.append({
                    'review_text': review_text[:5000],  # Limit to first 500 (original) characters
                    'review_date': review_date,
                    'review_website': review_url
                })
                
            except Exception as e:
                print(f"Error processing {review_url}: {str(e)}")
            
            # Be polite to servers
            time.sleep(random.uniform(1, 3))
    
    # Create DataFrame
    df = pd.DataFrame(reviews)
    
    return df

# Example usage
book_name = "Darth Plagueis"
author = "James Luceno"

df = search_book_reviews(book_name, author)

# Save to CSV
df.to_csv(f"{book_name.replace(' ', '_')}_reviews.csv", index=False)
print(f"Reviews saved to {book_name.replace(' ', '_')}_reviews.csv")

URL being looked at: https://www.google.com/search?q=Darth+Plagueis+James+Luceno+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0
Reviews saved to Darth_Plagueis_reviews.csv


In [9]:
from tqdm import tqdm

# scrap - won't work.

def process_books_csv(input_file, output_file):
    books_df = pd.read_csv(input_file)
    
    all_reviews = []
    
    for _, book in tqdm(books_df.iterrows(), total=len(books_df), desc="Processing books"):
        book_reviews = search_book_reviews(book['Title'], book['Authors'])
        
        if not book_reviews.empty:
            book_reviews['book_title'] = book['Title']
            book_reviews['book_authors'] = book['Authors']
            book_reviews['book_avg_rating'] = book['Avg Ratings']
            book_reviews['book_rating'] = book['Rating']
            book_reviews['book_published_year'] = book['Published_year']
            book_reviews['book_url'] = book['URL']
            
            all_reviews.append(book_reviews)
        
        time.sleep(random.uniform(5, 10))  # Longer delay between books to avoid detection
    
    if all_reviews:
        final_df = pd.concat(all_reviews, ignore_index=True)
        final_df.to_csv(output_file, index=False)
        print(f"All reviews saved to {output_file}")
    else:
        print("No reviews were found.")

# Usage
input_file = "/kaggle/input/book-review-samples/goodreads_fantasy.csv"  # Your input CSV file
output_file = "book_reviews.csv"  # The output file for reviews

process_books_csv(input_file, output_file)

Processing books:   0%|          | 0/1250 [00:00<?, ?it/s]

URL being looked at: https://www.google.com/search?q=Harry+Potter+and+the+Sorcerer’s+Stone+(Harry+Potter,+#1)+J.K.+Rowling+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   0%|          | 1/1250 [00:24<8:35:23, 24.76s/it]

URL being looked at: https://www.google.com/search?q=Harry+Potter+and+the+Chamber+of+Secrets+(Harry+Potter,+#2)+J.K.+Rowling+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   0%|          | 2/1250 [01:01<11:03:21, 31.89s/it]

URL being looked at: https://www.google.com/search?q=Harry+Potter+and+the+Prisoner+of+Azkaban+(Harry+Potter,+#3)+J.K.+Rowling+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   0%|          | 3/1250 [01:31<10:39:40, 30.78s/it]

URL being looked at: https://www.google.com/search?q=The+Hobbit+(The+Lord+of+the+Rings,+#0)+J.R.R.+Tolkien+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   0%|          | 4/1250 [02:08<11:36:15, 33.53s/it]

URL being looked at: https://www.google.com/search?q=Harry+Potter+and+the+Goblet+of+Fire+(Harry+Potter,+#4)+J.K.+Rowling+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   0%|          | 5/1250 [02:37<10:56:50, 31.66s/it]

URL being looked at: https://www.google.com/search?q=Harry+Potter+and+the+Order+of+the+Phoenix+(Harry+Potter,+#5)+J.K.+Rowling+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   0%|          | 6/1250 [03:06<10:40:58, 30.92s/it]

URL being looked at: https://www.google.com/search?q=Harry+Potter+and+the+Half-Blood+Prince+(Harry+Potter,+#6)+J.K.+Rowling+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|          | 7/1250 [03:43<11:19:41, 32.81s/it]

URL being looked at: https://www.google.com/search?q=Harry+Potter+and+the+Deathly+Hallows+(Harry+Potter,+#7)+J.K.+Rowling+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|          | 8/1250 [04:13<11:04:35, 32.11s/it]

URL being looked at: https://www.google.com/search?q=A+Game+of+Thrones+(A+Song+of+Ice+and+Fire,+#1)+George+R.R.+Martin+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|          | 9/1250 [04:43<10:48:36, 31.36s/it]

URL being looked at: https://www.google.com/search?q=The+Fellowship+of+the+Ring+(Middle+Earth,+#1)+J.R.R.+Tolkien+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|          | 10/1250 [05:17<11:04:32, 32.16s/it]

URL being looked at: https://www.google.com/search?q=A+Court+of+Thorns+and+Roses+(A+Court+of+Thorns+and+Roses,+#1)+Sarah+J.+Maas+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|          | 11/1250 [05:51<11:17:07, 32.79s/it]

URL being looked at: https://www.google.com/search?q=A+Clash+of+Kings++(A+Song+of+Ice+and+Fire,+#2)+George+R.R.+Martin+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0
Error processing https://www.barnesandnoble.com/w/a-clash-of-kings-george-r-r-martin/1100179853: HTTPSConnectionPool(host='www.barnesandnoble.com', port=443): Read timed out. (read timeout=10)


Processing books:   1%|          | 12/1250 [06:38<12:44:08, 37.03s/it]

URL being looked at: https://www.google.com/search?q=The+Two+Towers+(Middle+Earth,+#3)+J.R.R.+Tolkien+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|          | 13/1250 [07:14<12:35:20, 36.64s/it]

URL being looked at: https://www.google.com/search?q=Six+of+Crows+(Six+of+Crows,+#1)+Leigh+Bardugo+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|          | 14/1250 [07:49<12:25:30, 36.19s/it]

URL being looked at: https://www.google.com/search?q=The+Name+of+the+Wind+(The+Kingkiller+Chronicle,+#1)+Patrick+Rothfuss+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|          | 15/1250 [08:24<12:20:44, 35.99s/it]

URL being looked at: https://www.google.com/search?q=The+Final+Empire+(Mistborn,+#1)+Brandon+Sanderson+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0
Error processing https://www.barnesandnoble.com/w/mistborn-brandon-sanderson/1100833141: HTTPSConnectionPool(host='www.barnesandnoble.com', port=443): Read timed out. (read timeout=10)


Processing books:   1%|▏         | 16/1250 [09:09<13:13:32, 38.58s/it]

URL being looked at: https://www.google.com/search?q=The+Return+of+the+King+(Middle+Earth,+#4)+J.R.R.+Tolkien+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0


Processing books:   1%|▏         | 17/1250 [09:43<12:43:08, 37.14s/it]

URL being looked at: https://www.google.com/search?q=A+Court+of+Mist+and+Fury+(A+Court+of+Thorns+and+Roses,+#2)+Sarah+J.+Maas+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0
Error processing https://www.barnesandnoble.com/w/a-court-of-mist-and-fury-sarah-j-maas/1123052907: HTTPSConnectionPool(host='www.barnesandnoble.com', port=443): Read timed out. (read timeout=10)


Processing books:   1%|▏         | 17/1250 [10:13<12:22:11, 36.12s/it]


KeyboardInterrupt: 

In [None]:
df

In [None]:
add