<a href="https://www.kaggle.com/code/nigamshitij/parse-book-reviews-per-book?scriptVersionId=193208071" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import random
import re
import time
from datetime import datetime
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
# import pandas as pd

In [3]:
def remove_boilerplate(text):
    """Cut site boilerplate out of scraped text.

    For every marker phrase (matched case-insensitively) the phrase and
    everything after it up to the end of that line is deleted. The final
    marker is the newline character itself, so for multi-line input every
    line after the first is dropped as well.

    Args:
        text: Raw text to clean.

    Returns:
        The text with each marker and the remainder of its line removed.
    """
    markers = ('cookie policy', 'privacy policy', 'terms of service', 'all rights reserved', '\n')
    cleaned = text
    for marker in markers:
        # '.' does not match a newline, so each hit is removed only up to the end of its line.
        cutter = re.compile(re.escape(marker) + r'.*', flags=re.IGNORECASE)
        cleaned = cutter.sub('', cleaned)
    return cleaned

In [4]:
def truncate_at_comments(review_text, threshold_percentage=20, min_review_length=100):
    """Cut a scraped review off where the reader-comment section appears to begin.

    The text is searched, case-insensitively, for the earliest comment-section
    marker (e.g. "Leave a comment") located after a protected prefix. The
    prefix is the larger of ``threshold_percentage`` percent of the text and
    ``min_review_length`` characters, so a marker-like phrase near the top of
    the review does not trigger a cut. The search starts after that prefix, so
    an early mention of a marker no longer hides a later one, and the match
    position is taken from the original string rather than a lower-cased copy.

    If no marker is found, paragraphs (split on blank lines) that are shorter
    than 50 characters or that start with a comment-like prefix are dropped.
    If that removes nothing either, the first half of the text (or the
    protected prefix, whichever is longer) is returned.

    Args:
        review_text: Text of the review page.
        threshold_percentage: Percentage of the text protected from truncation.
        min_review_length: Minimum number of protected leading characters.

    Returns:
        The truncated, whitespace-stripped review text.
    """
    comment_identifiers = [
        'Comments', 'Leave a comment', 'Reader comments',
        'What do you think?', 'Join the discussion', 'Add a comment',
        'Post a comment', 'Write a comment', 'Show comments'
    ]

    text_length = len(review_text)
    threshold = max(int(text_length * (threshold_percentage / 100)), min_review_length)

    # One alternation finds the leftmost marker regardless of list order.
    marker_pattern = re.compile(
        '|'.join(re.escape(identifier) for identifier in comment_identifiers),
        re.IGNORECASE,
    )
    # Only positions strictly after the protected prefix may trigger a cut.
    marker = marker_pattern.search(review_text, threshold + 1)
    if marker:
        return review_text[:marker.start()].strip()

    # If no identifiers found, try to detect comment-like structures
    paragraphs = review_text.split('\n\n')
    filtered_paragraphs = []

    for paragraph in paragraphs:
        # Skip short paragraphs that might be comments
        if len(paragraph) < 50:
            continue

        # Skip paragraphs that start with common comment patterns
        if re.match(r'^(Posted by|From|User|Anonymous|[\d/]+:)', paragraph.strip()):
            continue

        filtered_paragraphs.append(paragraph)

    # If we've removed some paragraphs, join the remaining ones
    if len(filtered_paragraphs) < len(paragraphs):
        return '\n\n'.join(filtered_paragraphs).strip()

    # If we haven't removed any paragraphs, return at least the first part of the text
    return review_text[:max(threshold, len(review_text) // 2)].strip()

In [5]:
def remove_comments(review_soup):
    """Strip reader-comment containers from a parsed review page, in place.

    Every ``<div>`` whose class list contains one of the known comment class
    names is removed, not just the first one, so pages that wrap each comment
    in its own ``div.comment`` are fully cleaned. ``<div>`` elements whose
    ``id`` is a known comment-container id are removed as well, because the
    Disqus embed marks its container with ``id="disqus_thread"`` rather than a
    class.

    Args:
        review_soup: Parsed page (BeautifulSoup object); it is modified in place.

    Returns:
        The same ``review_soup`` object, with comment sections removed.
    """
    # Common class names for comment sections
    comment_classes = ['comment', 'comments-list', 'comments-area', 'comments', 'comment-section', 'user-comments', 'disqus_thread']
    # Common element ids for comment sections
    comment_ids = ['comments', 'disqus_thread']

    for class_name in comment_classes:
        # Search again after each removal so an element already destroyed
        # together with its parent is never touched a second time.
        while True:
            comment_section = review_soup.find('div', class_=class_name)
            if comment_section is None:
                break
            comment_section.decompose()  # This removes the element from the soup

    for id_name in comment_ids:
        while True:
            comment_section = review_soup.find('div', id=id_name)
            if comment_section is None:
                break
            comment_section.decompose()

    return review_soup

In [6]:
def score_paragraph(paragraph):
    """Rate how review-like a piece of text is.

    The score is the number of distinct review-related keywords that occur
    anywhere in the text as case-insensitive substrings (so 'read' also
    counts inside 'already').

    Args:
        paragraph: Text to score.

    Returns:
        Integer between 0 and 8: the count of keywords present.
    """
    lowered = paragraph.lower()
    hits = 0
    for term in ('review', 'book', 'read', 'author', 'story', 'character', 'plot', 'recommend'):
        if term in lowered:
            hits += 1
    return hits

In [7]:
def search_book_reviews(book_name, author):
    """Search Google for third-party reviews of a book and scrape an excerpt from each hit.

    A query is built from the title and author, excluding a fixed list of
    large retail/social sites. Each result page is fetched, its comment
    sections are stripped with ``remove_comments``, and the three
    highest-scoring ``<main>``/``<p>`` elements (per ``score_paragraph``) are
    joined and cleaned with ``remove_boilerplate`` and
    ``truncate_at_comments``.

    Args:
        book_name: Title of the book.
        author: Author name, appended to the title in the search query.

    Returns:
        pandas.DataFrame with columns ``review_text`` (at most 5000
        characters), ``review_date`` (currently always ``'Unknown'``) and
        ``review_website`` (the result URL). The columns are present even
        when no review was collected.

    Raises:
        requests.RequestException: if the Google search request itself fails
            (connection error or timeout). Failures on individual review
            pages are printed and that page is skipped.
    """
    columns = ['review_text', 'review_date', 'review_website']
    max_review_chars = 5000
    request_timeout = 10  # seconds, for the search request and each review page

    # Combine book name and author for search query
    search_query = (
        f"{book_name} {author} book review "
        "-site:goodreads.com -site:amazon.* -site:reddit.com -site:thestorygraph.* "
        "-site:youtube.* -site:*.tv -site:barnesandnoble.com -site:wikipedia.* -site:quora.com"
    )

    # URL-encode the query so titles containing '&', '#', '+', etc. cannot
    # break the URL; 'start' is sent as a real parameter instead of being
    # embedded in the query text.
    url = "https://www.google.com/search?" + urlencode({'q': search_query, 'start': 0})
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    print(f"URL being looked at: {url}")

    # Perform Google search (with a timeout so a stalled connection cannot hang the notebook)
    response = requests.get(url, headers=headers, timeout=request_timeout)
    if not response.ok:
        print(f"Search request returned HTTP {response.status_code}; results may be empty")
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract search results
    search_results = soup.find_all('div', class_='g')

    reviews = []

    for result in search_results:
        link = result.find('a')
        if not link:
            continue

        # An <a> without href must not abort the whole run.
        review_url = link.get('href')

        # Skip missing/non-http links and known non-review sites
        if (
            not review_url
            or not review_url.startswith('http')
            or any(site in review_url for site in ['google.com', 'wikipedia.org'])
        ):
            continue

        try:
            # Fetch the review page; HTTP error pages are reported and skipped
            # instead of being scraped as if they were reviews.
            review_response = requests.get(review_url, headers=headers, timeout=request_timeout)
            review_response.raise_for_status()
            review_soup = BeautifulSoup(review_response.text, 'html.parser')

            # Remove Comments
            review_soup = remove_comments(review_soup)

            # Score candidate elements and keep the three most review-like.
            # NOTE: a <main> element can contain the page's <p> elements, so
            # its text may repeat paragraph text in the joined excerpt.
            review_elements = review_soup.find_all(['main', 'p'])
            scored_paragraphs = [(elem, score_paragraph(elem.text)) for elem in review_elements]
            top_scored = sorted(scored_paragraphs, key=lambda x: x[1], reverse=True)[:3]
            review_text = ' '.join(elem.text for elem, _score in top_scored)

            # Remove boilerplate (flatten newlines first)
            review_text = review_text.replace('\n', ' ')
            review_text = remove_boilerplate(review_text)

            # Truncate at Comments
            review_text = truncate_at_comments(review_text)

            # Date extraction is a placeholder, as date formats vary widely
            review_date = 'Unknown'

            reviews.append({
                'review_text': review_text[:max_review_chars],  # Limit to first 5000 characters
                'review_date': review_date,
                'review_website': review_url
            })

        except Exception as e:
            # Best effort: one bad page must not stop the remaining results.
            print(f"Error processing {review_url}: {str(e)}")

        # Be polite to servers
        time.sleep(random.uniform(1, 3))

    # Create DataFrame (explicit columns keep the schema stable when nothing was scraped)
    df = pd.DataFrame(reviews, columns=columns)

    return df

# Example usage: scrape reviews for one title and persist them as CSV.
book_name = "Darth Plagueis"
author = "James Luceno"

df = search_book_reviews(book_name, author)

# Save to CSV (file name is the title with spaces turned into underscores)
output_csv = f"{book_name.replace(' ', '_')}_reviews.csv"
df.to_csv(output_csv, index=False)
print(f"Reviews saved to {output_csv}")

URL being looked at: https://www.google.com/search?q=Darth+Plagueis+James+Luceno+book+review+-site:goodreads.com+-site:amazon.*+-site:reddit.com+-site:thestorygraph.*+-site:youtube.*+-site:*.tv+-site:barnesandnoble.com+-site:wikipedia.*+-site:quora.com&start=0
Reviews saved to Darth_Plagueis_reviews.csv


In [33]:
# Display the scraped reviews DataFrame (the cell's last expression is rendered by the notebook).
df

Unnamed: 0,review_text,review_date,review_website
0,Star Wars: Darth PlagueisStar Wars: Darth Plag...,Unknown,https://www.ign.com/articles/2012/02/03/star-w...
1,The performance by Daniel Davis was superb too...,Unknown,https://jeffreydebris.com/2020/02/15/review-da...
2,Throwback Thursday: Star Wars: Darth Plagueis ...,Unknown,https://unseenlibrary.com/2022/04/15/throwback...
3,Book Review: Darth Plagueis by James Luceno M...,Unknown,https://booksatruestory.com/2013/03/27/book-re...
4,"Luceno’s word structure, or prose, is incredib...",Unknown,https://thegeeksattic.com/darthplagueis/
5,I loved reading about the relationship between...,Unknown,https://starwars.fandom.com/f/p/44000000000037...
6,Book Review: Darth Plagueis Posted on January...,Unknown,https://starwarsreport.com/2012/01/10/book-rev...
7,"James Luceno’s 2012 novel Darth Plagueis, one ...",Unknown,https://greatbooksguy.com/2023/06/17/book-revi...
8,Luceno writes Darth Plagueis in the same clear...,Unknown,http://fangirlblog.com/reviews/star-wars-darth...


In [None]:
# TODO: placeholder cell ("add") - fill in the next step or delete this cell.
# A bare `add` is an undefined name and raises NameError on Restart & Run All.