In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/book-review-samples/goodreads_fantasy.csv
/kaggle/input/book-review-samples/goodreads_all_genres.csv
/kaggle/input/book-review-samples/Darth_Plagueis_reviews(4).csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews(4).csv


In [2]:
pip install duckduckgo-search

Collecting duckduckgo-search
  Downloading duckduckgo_search-6.2.10-py3-none-any.whl.metadata (25 kB)
Collecting primp>=0.6.1 (from duckduckgo-search)
  Downloading primp-0.6.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading duckduckgo_search-6.2.10-py3-none-any.whl (27 kB)
Downloading primp-0.6.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: primp, duckduckgo-search
Successfully installed duckduckgo-search-6.2.10 primp-0.6.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup
# import pandas as pd
from datetime import datetime
import time
import random
import re
import html
from duckduckgo_search import DDGS

In [4]:
# boilerplate removal

def remove_boilerplate(text):
    """Strip boilerplate phrases, and everything after them on the same line, from text.

    Each marker is matched case-insensitively and removed together with the
    rest of its line. The last marker is a newline, so a newline is removed
    together with the whole line that follows it; in effect only the first
    line of a multi-line input survives.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    markers = ('cookie policy', 'privacy policy', 'terms of service', 'all rights reserved', '\n')
    cleaned = text
    for marker in markers:
        # Escape the marker so it is matched literally, then swallow the rest of the line.
        pattern = re.compile(re.escape(marker) + r'.*', flags=re.IGNORECASE)
        cleaned = pattern.sub('', cleaned)
    return cleaned

In [5]:
# comment truncation (20% threshold default)

def truncate_at_comments(review_text, threshold_percentage=20, min_review_length=100):
    """Cut reader-comment content off the end of a scraped review.

    The function tries three strategies in order:

    1. Look for a comment-section marker (e.g. "Leave a comment") located
       past the protected prefix and cut the text at the earliest one.
    2. Otherwise split the text on blank lines and drop paragraphs that are
       shorter than 50 characters or start with a comment-like prefix
       ("Posted by", "From", "User", "Anonymous", or digits/slashes followed
       by a colon). If anything was dropped, return what is left.
    3. Otherwise return the leading part of the text: the first half, or the
       protected prefix if that is longer.

    Args:
        review_text: The review text to truncate.
        threshold_percentage: Percentage of the text length that is protected
            from marker-based truncation.
        min_review_length: Minimum number of protected leading characters.

    Returns:
        The truncated text with surrounding whitespace stripped.
    """
    comment_identifiers = [
        'Comments', 'Leave a comment', 'Reader comments',
        'What do you think?', 'Join the discussion', 'Add a comment',
        'Post a comment', 'Write a comment', 'Show comments'
    ]

    lower_text = review_text.lower()
    text_length = len(lower_text)
    threshold = max(int(text_length * (threshold_percentage / 100)), min_review_length)

    # Start every search just past the protected prefix. Looking only at the
    # first occurrence in the whole text would let an early mention of e.g.
    # "comments" inside the review hide the real comment header further down.
    search_start = max(threshold + 1, 0)
    marker_positions = [
        lower_text.find(identifier.lower(), search_start)
        for identifier in comment_identifiers
    ]
    marker_positions = [position for position in marker_positions if position != -1]

    # Cut at the earliest marker in the text, regardless of its order in the list.
    if marker_positions:
        return review_text[:min(marker_positions)].strip()

    # If no identifiers found, try to detect comment-like structures
    paragraphs = review_text.split('\n\n')
    filtered_paragraphs = [
        paragraph
        for paragraph in paragraphs
        # Keep paragraphs that are long enough and do not open with a comment-style prefix.
        if len(paragraph) >= 50
        and not re.match(r'^(Posted by|From|User|Anonymous|[\d/]+:)', paragraph.strip())
    ]

    # If we've removed some paragraphs, join the remaining ones
    if len(filtered_paragraphs) < len(paragraphs):
        return '\n\n'.join(filtered_paragraphs).strip()

    # If we haven't removed any paragraphs, return at least the first part of the text
    return review_text[:max(threshold, len(review_text) // 2)].strip()

In [6]:
# remove comment classes

def remove_comments(review_soup):
    """Remove comment-section <div> elements from a parsed page, in place.

    Every <div> whose class or id equals one of the known comment-section
    names is decomposed. All matches are removed, not only the first, so
    pages that render each comment in its own container are fully cleaned.
    Ids are checked as well because embeds such as Disqus identify their
    container by id ("disqus_thread") rather than by class.

    Args:
        review_soup: Parsed document exposing BeautifulSoup's `find` API.

    Returns:
        The same `review_soup` object, with the comment sections removed.
    """
    # Common class/id names for comment sections
    comment_markers = ['comment', 'comments-list', 'comments-area', 'comments', 'comment-section', 'user-comments', 'disqus_thread']

    for marker in comment_markers:
        for lookup in ({'class_': marker}, {'id': marker}):
            # Search again after each removal instead of iterating a pre-built
            # list, so a <div> nested in an already removed one is never touched.
            comment_section = review_soup.find('div', **lookup)
            while comment_section is not None:
                comment_section.decompose()  # This removes the element from the soup
                comment_section = review_soup.find('div', **lookup)

    return review_soup

In [7]:
# paragraph scoring for review content

def score_paragraph(paragraph):
    """Score a paragraph by how many review-related keywords it contains.

    Matching is case-insensitive and substring-based, and each keyword
    counts at most once however often it occurs.

    Args:
        paragraph: The paragraph text to score.

    Returns:
        The number of distinct keywords found (0 to 8).
    """
    lowered = paragraph.lower()
    hits = 0
    for keyword in ('review', 'book', 'read', 'author', 'story', 'character', 'plot', 'recommend'):
        if keyword in lowered:
            hits += 1
    return hits

In [8]:
# add retries with timeouts selectively

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retries():
    """Build a requests session whose HTTP and HTTPS adapters retry failed requests.

    The retry policy allows 3 retries in total with a backoff factor of 1,
    and also retries responses with status 429, 500, 502, 503 or 504.

    Returns:
        A `requests.Session` with the retrying adapter mounted for both schemes.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

In [26]:
# URL substrings of sites that process_review_url skips without fetching.
# A URL is skipped when its lower-cased form contains any of these.
exceptions = (
    'google.',
    'goodreads.com',
    'amazon.',
    'reddit.com',
    'thestorygraph',
    'youtube',
    '.tv',
    'barnesandnoble.com',
    'wikipedia.',
    'quora.com',
    'sparknotes.com',
    'grimdarkmagazine.',
)

def process_review_url(review_url, headers):
    """Fetch one page and extract a single review record from it.

    URLs containing any substring listed in `exceptions` are skipped without
    a request. Otherwise the page is downloaded, comment sections are
    removed, the three <main>/<p> elements with the highest keyword score
    are joined, and the result is cleaned and truncated.

    Any error while fetching or parsing is printed and results in an empty
    list; it is never raised to the caller.

    Args:
        review_url: Address of the candidate review page.
        headers: HTTP headers sent with the request.

    Returns:
        A list holding zero or one dict with the keys 'review_text',
        'review_date' (always 'Unknown') and 'review_website'.
    """
    reviews = []
    # Skip known non-review sites
    if any(site in review_url.lower() for site in exceptions):
        return reviews

    try:
        # The context manager closes the session's pooled connections; a new
        # session is created for every URL, so leaving them open leaks sockets
        # over a long crawl.
        with create_session_with_retries() as session:
            review_response = session.get(review_url, headers=headers, timeout=20)
            # Treat HTTP error pages (403, 404, ...) as failures instead of
            # storing their text as if it were a review.
            review_response.raise_for_status()
            page_html = review_response.text

        review_soup = BeautifulSoup(page_html, 'html.parser')

        # Remove Comments
        review_soup = remove_comments(review_soup)

        # Score candidate elements and keep the text of the three best ones
        review_elements = review_soup.find_all(['main', 'p'])
        scored_paragraphs = [(elem, score_paragraph(elem.text)) for elem in review_elements]
        best_scored = sorted(scored_paragraphs, key=lambda x: x[1], reverse=True)[:3]
        review_paragraphs = [elem.text for elem, score in best_scored]

        review_text = ' '.join(review_paragraphs)

        # Flatten to one line, then remove boilerplate
        review_text = review_text.replace('\n', ' ')
        review_text = remove_boilerplate(review_text)

        # Truncate at Comments
        review_text = truncate_at_comments(review_text)

        review_date = 'Unknown'

        reviews.append({
            'review_text': review_text[:5000],  # Limit to first 5000 characters
            'review_date': review_date,
            'review_website': review_url
        })

    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the whole crawl.
        print(f"Error processing {review_url}: {str(e)}")

    # Be polite to servers
    time.sleep(random.uniform(1, 3))

    return reviews

In [27]:
# search_book_review test

def search_book_reviews(book_name, author, max_results=10):
    """Search DuckDuckGo for reviews of a book and scrape each result page.

    Args:
        book_name: Title of the book.
        author: Author name(s), appended to the search query.
        max_results: Maximum number of search results to process.

    Returns:
        A DataFrame with the columns 'review_text', 'review_date' and
        'review_website', one row per page a review was extracted from.
        The columns are present even when no review was collected, so the
        frame can always be written to CSV with a header row.
    """
    # Combine book name and author for search query
    search_query = f"{book_name} {author} book review"

    reviews = []

    # Define headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    # Use DDGS for searching
    with DDGS() as ddgs:
        results = ddgs.text(search_query, max_results=max_results)

        for result in results or []:
            # Skip malformed results instead of failing the whole search on a missing link.
            review_url = result.get('href')
            if not review_url:
                continue
            reviews.extend(process_review_url(review_url, headers))

    # Fix the schema explicitly so an empty result still carries the expected columns.
    return pd.DataFrame(reviews, columns=['review_text', 'review_date', 'review_website'])

In [28]:
# Demo: scrape reviews for a single title/author pair
book_name, author = "Darth Plagueis", "James Luceno"

df = search_book_reviews(book_name=book_name, author=author)

In [29]:
# Save to CSV. The file name is built once so the path that is written and the
# path that is reported cannot drift apart.
reviews_csv = f"{book_name.replace(' ', '_')}_reviews.csv"
df.to_csv(reviews_csv, index=False)
print(f"Reviews saved to {reviews_csv}")

Reviews saved to Darth_Plagueis_reviews.csv


In [30]:
# Preview the first rows of the scraped reviews DataFrame
df.head()

Unnamed: 0,review_text,review_date,review_website
0,Pre-publication book reviews and features keep...,Unknown,https://www.kirkusreviews.com/book-reviews/jam...
1,Read more of our book reviews . But perhaps...,Unknown,https://www.gamesradar.com/star-wars-darth-pla...
2,"James Luceno’s 2012 novel Darth Plagueis, one ...",Unknown,https://greatbooksguy.com/2023/06/17/book-revi...
3,Star Wars: Darth PlagueisStar Wars: Darth Plag...,Unknown,https://www.ign.com/articles/2012/02/03/star-w...
4,Add to Bookshelf Read An Excerpt Buy Lo...,Unknown,https://www.penguinrandomhouse.com/books/7859/...


In [31]:
import csv
from tqdm import tqdm

In [32]:
# Locations of the book list that is read and of the review CSV that is written
input_csv = '/kaggle/input/book-review-samples/goodreads_all_genres.csv'
output_csv = 'book_reviews.csv'

# Load the list and keep only the first row of every (Title, Authors) pair
df = pd.read_csv(input_csv)
df_unique = df.drop_duplicates(subset=['Title', 'Authors'], keep='first')

# Report how many rows the de-duplication dropped
num_total = len(df)
num_duplicates = num_total - len(df_unique)
print(f"Removed {num_duplicates} duplicate entries out of {num_total}")

Removed 115123 duplicate entries out of 118700


In [35]:
def process_book_list(input_csv, output_csv):
    """Scrape reviews for every unique book in a CSV and write them to another CSV.

    The input is de-duplicated on ('Title', 'Authors'). For each remaining
    book the reviews found by `search_book_reviews` are tagged with the
    book's 'Title', 'Authors', 'Avg Ratings', 'Rating' and 'Published_year'
    values and appended to `output_csv` immediately. Writing per book rather
    than once at the end means an interrupted or timed-out run keeps
    everything collected so far.

    Errors for an individual book are printed and the book is skipped.

    Args:
        input_csv: Path of the book list; must contain the columns named above.
        output_csv: Path of the CSV to write. It is overwritten by the first
            batch of reviews and left untouched if no review is collected.

    Returns:
        None.
    """
    # Read the input CSV
    df = pd.read_csv(input_csv)

    # remove duplicates
    df_unique = df.drop_duplicates(subset=['Title', 'Authors'], keep='first')

    # Print information about removed duplicates
    num_total = len(df)
    num_duplicates = num_total - len(df_unique)
    print(f"Removed {num_duplicates} duplicate entries out of {num_total}")

    # Becomes True once the output file has been created with its header row.
    wrote_any = False

    # Iterate through each row in the dataframe
    for _, row in tqdm(df_unique.iterrows(), total=df_unique.shape[0], desc="Processing books"):
        title = row['Title']
        authors = row['Authors']

        try:
            # Get reviews for this book
            reviews_df = search_book_reviews(title, authors)

            # Nothing found for this book: do not write a header-only chunk.
            if reviews_df.empty:
                continue

            # Add book information to each review
            reviews_df['Title'] = title
            reviews_df['Authors'] = authors
            reviews_df['Avg Ratings'] = row['Avg Ratings']
            reviews_df['Rating'] = row['Rating']
            reviews_df['Published_year'] = row['Published_year']

            # The first chunk creates (or overwrites) the file with a header;
            # later chunks are appended without repeating the header.
            reviews_df.to_csv(
                output_csv,
                mode='a' if wrote_any else 'w',
                header=not wrote_any,
                index=False,
                quoting=csv.QUOTE_ALL,
            )
            wrote_any = True

        except Exception as e:
            # Deliberate best-effort: one failing book must not stop the run.
            print(f"Error processing {title} by {authors}: {str(e)}")

    if wrote_any:
        print(f"Reviews saved to {output_csv}")
    else:
        print("No reviews were collected.")

In [None]:
# Usage: run the full pipeline over the book list configured above.
# NOTE(review): the recorded progress bar shows 3577 books at roughly 12-33 s each,
# i.e. an estimated 11-31 hours in total, so expect a very long run.
process_book_list(input_csv, output_csv)

Removed 115123 duplicate entries out of 118700


Processing books:   1%|          | 36/3577 [05:33<11:27:53, 11.66s/it]

Error processing https://gavthorpe.co.uk/books/deliverance-lost-by-gav-thorpe/: HTTPSConnectionPool(host='gavthorpe.co.uk', port=443): Max retries exceeded with url: /books/deliverance-lost-by-gav-thorpe/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   2%|▏         | 54/3577 [09:36<21:50:08, 22.31s/it]

Error processing https://www.suzannecollinsbooks.com/the_hunger_games_69765.htm: HTTPSConnectionPool(host='www.suzannecollinsbooks.com', port=443): Max retries exceeded with url: /the_hunger_games_69765.htm (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))


Processing books:   2%|▏         | 59/3577 [10:59<18:19:58, 18.76s/it]

Error processing https://www.washingtonpost.com/entertainment/books/book-review-the-goldfinch-by-donna-tartt/2013/10/22/39b556ac-3837-11e3-ae46-e4248e75c8ea_story.html: HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Max retries exceeded with url: /entertainment/books/book-review-the-goldfinch-by-donna-tartt/2013/10/22/39b556ac-3837-11e3-ae46-e4248e75c8ea_story.html (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Read timed out. (read timeout=20)"))


Processing books:   2%|▏         | 64/3577 [14:02<21:35:03, 22.12s/it]

Error processing https://www.suzannecollinsbooks.com/mockingjay_102797.htm: HTTPSConnectionPool(host='www.suzannecollinsbooks.com', port=443): Max retries exceeded with url: /mockingjay_102797.htm (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))


Processing books:   3%|▎         | 105/3577 [26:47<11:41:43, 12.13s/it]

Error processing https://www.thatartsyreadergirl.com/2020/05/beach-read-by-emily-henry-book-review/: HTTPSConnectionPool(host='www.thatartsyreadergirl.com', port=443): Max retries exceeded with url: /2020/05/beach-read-by-emily-henry-book-review/ (Caused by ResponseError('too many 500 error responses'))


Processing books:   3%|▎         | 109/3577 [29:14<22:48:40, 23.68s/it]

Error processing https://yourbookcorner.com/the-love-hypothesis-summary-review/: HTTPSConnectionPool(host='yourbookcorner.com', port=443): Max retries exceeded with url: /the-love-hypothesis-summary-review/ (Caused by ResponseError('too many 500 error responses'))


Processing books:   3%|▎         | 120/3577 [35:19<31:14:49, 32.54s/it]

Error processing https://www.washingtonpost.com/entertainment/books/people-we-meet-on-vacation/2021/05/11/0773422a-b274-11eb-ab43-bebddc5a0f65_story.html: HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Max retries exceeded with url: /entertainment/books/people-we-meet-on-vacation/2021/05/11/0773422a-b274-11eb-ab43-bebddc5a0f65_story.html (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Read timed out. (read timeout=20)"))


Processing books:   4%|▍         | 137/3577 [42:09<13:53:05, 14.53s/it]

Error processing https://stieglarsson.com/The-Girl-With-The-Dragon-Tattoo/: HTTPSConnectionPool(host='stieglarsson.com', port=443): Max retries exceeded with url: /The-Girl-With-The-Dragon-Tattoo/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   4%|▍         | 138/3577 [42:25<14:15:06, 14.92s/it]

Error processing https://yourbookcorner.com/verity-book-summary-review/: HTTPSConnectionPool(host='yourbookcorner.com', port=443): Max retries exceeded with url: /verity-book-summary-review/ (Caused by ResponseError('too many 500 error responses'))


Processing books:   4%|▍         | 140/3577 [43:39<25:01:44, 26.22s/it]

Error processing https://www.bloomsbury.com/us/house-of-earth-and-blood-9781635574043/: HTTPSConnectionPool(host='www.bloomsbury.com', port=443): Max retries exceeded with url: /us/house-of-earth-and-blood-9781635574043/ (Caused by ResponseError('too many 500 error responses'))


Processing books:   4%|▍         | 141/3577 [44:08<25:42:22, 26.93s/it]

Error processing https://www.washingtonpost.com/entertainment/books/fifty-shades-has-come-to-an-end-what-have-we-learned-from-this-provocative-romance/2021/06/28/942213e2-d680-11eb-9f29-e9e6c9e843c6_story.html: HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Max retries exceeded with url: /entertainment/books/fifty-shades-has-come-to-an-end-what-have-we-learned-from-this-provocative-romance/2021/06/28/942213e2-d680-11eb-9f29-e9e6c9e843c6_story.html (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Read timed out. (read timeout=20)"))


Processing books:   4%|▍         | 149/3577 [48:07<26:50:28, 28.19s/it]