<a href="https://www.kaggle.com/code/nigamshitij/parse-book-reviews-using-duckduckgo?scriptVersionId=195259701" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/book-review-samples/goodreads_fantasy.csv
/kaggle/input/book-review-samples/goodreads_all_genres.csv
/kaggle/input/book-review-samples/Darth_Plagueis_reviews(4).csv
/kaggle/input/book-review-samples/goodreads_all_genres_final.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews(4).csv
/kaggle/input/book-review-samples/goodreads_genres_complete.csv


In [2]:
pip install duckduckgo-search

Collecting duckduckgo-search
  Downloading duckduckgo_search-6.2.11-py3-none-any.whl.metadata (24 kB)
Collecting primp>=0.6.1 (from duckduckgo-search)
  Downloading primp-0.6.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading duckduckgo_search-6.2.11-py3-none-any.whl (27 kB)
Downloading primp-0.6.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: primp, duckduckgo-search
Successfully installed duckduckgo-search-6.2.11 primp-0.6.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import random
import re
import html
from duckduckgo_search import DDGS
import glob
import csv
from tqdm import tqdm

In [4]:
# boilerplate removal

def remove_boilerplate(text):
    """Strip boilerplate markers and whatever follows them on the same line.

    Each marker is matched case-insensitively and removed together with the
    rest of its line (``.`` does not cross a newline). The final marker is the
    newline character itself, so every newline is removed along with the line
    that follows it: multi-line input is reduced to its (trimmed) first line.

    Args:
        text: Raw text to clean.

    Returns:
        The text with each marker-to-end-of-line span removed.
    """
    markers = ('cookie policy', 'privacy policy', 'terms of service', 'all rights reserved', '\n')
    cleaned = text
    for marker in markers:
        # Markers are applied one after another, in the order listed above.
        marker_to_eol = re.compile(re.escape(marker) + r'.*', flags=re.IGNORECASE)
        cleaned = marker_to_eol.sub('', cleaned)
    return cleaned

In [5]:
# comment truncation (20% threshold default)

def truncate_at_comments(review_text, threshold_percentage=20, min_review_length=100):
    """Cut a scraped review off where a reader-comment section appears to begin.

    Strategy, in order:
      1. Look for a comment-section marker ("Comments", "Leave a comment", ...)
         located after the protected prefix and truncate at the earliest one.
      2. Otherwise drop paragraphs (split on blank lines) that are shorter than
         50 characters or start with a comment-like prefix; if any were dropped,
         return the remaining paragraphs.
      3. Otherwise return only the leading part of the text: the first
         ``max(threshold, half the length)`` characters.

    Args:
        review_text: Text extracted from a review page.
        threshold_percentage: Share of the text (in percent) that is never
            truncated by a marker.
        min_review_length: Minimum number of leading characters protected from
            marker truncation, regardless of the percentage.

    Returns:
        The truncated text with surrounding whitespace stripped.
    """
    comment_identifiers = [
        'Comments', 'Leave a comment', 'Reader comments',
        'What do you think?', 'Join the discussion', 'Add a comment',
        'Post a comment', 'Write a comment', 'Show comments'
    ]

    threshold = max(int(len(review_text) * (threshold_percentage / 100)), min_review_length)

    # A single case-insensitive scan that starts just past the protected prefix.
    # This finds the earliest marker by position (not by its order in the list
    # above) and is not blocked by an early mention of e.g. "comments" inside
    # the review itself. Searching the original string keeps the slice index valid.
    marker_pattern = re.compile(
        '|'.join(re.escape(identifier) for identifier in comment_identifiers),
        re.IGNORECASE,
    )
    marker = marker_pattern.search(review_text, threshold + 1)
    if marker:
        return review_text[:marker.start()].strip()

    # If no identifiers found, try to detect comment-like structures
    paragraphs = review_text.split('\n\n')
    filtered_paragraphs = []

    for paragraph in paragraphs:
        # Skip short paragraphs that might be comments
        if len(paragraph) < 50:
            continue

        # Skip paragraphs that start with common comment patterns
        if re.match(r'^(Posted by|From|User|Anonymous|[\d/]+:)', paragraph.strip()):
            continue

        filtered_paragraphs.append(paragraph)

    # If we've removed some paragraphs, join the remaining ones
    if len(filtered_paragraphs) < len(paragraphs):
        return '\n\n'.join(filtered_paragraphs).strip()

    # Nothing was recognised as a comment: keep only the leading part of the text
    return review_text[:max(threshold, len(review_text) // 2)].strip()

In [6]:
# remove comment classes

def remove_comments(review_soup):
    """Remove comment-section <div> elements from a parsed page, in place.

    Every <div> whose class or id is one of the known comment-section names is
    decomposed, not just the first match per name. Ids are checked as well
    because some embeds (e.g. ``disqus_thread``) mark their container with an
    id rather than a class.

    Args:
        review_soup: Parsed document exposing ``find`` (BeautifulSoup-style).

    Returns:
        The same ``review_soup`` object, with matching elements removed.
    """
    # Common class/id names for comment sections
    comment_markers = ['comment', 'comments-list', 'comments-area', 'comments',
                       'comment-section', 'user-comments', 'disqus_thread']

    for attribute in ('class_', 'id'):
        # Re-query after every removal instead of iterating a pre-built result
        # list: a nested match may already be gone once its parent is decomposed.
        while True:
            comment_section = review_soup.find('div', **{attribute: comment_markers})
            if comment_section is None:
                break
            comment_section.decompose()  # This removes the element from the soup

    return review_soup

In [7]:
# paragraph scoring for review content

def score_paragraph(paragraph):
    """Count how many review-related keywords occur in ``paragraph``.

    Matching is a case-insensitive substring test, so each keyword contributes
    at most 1 and also matches inside longer words (e.g. "read" in "already").

    Args:
        paragraph: Text to score.

    Returns:
        Integer score between 0 and the number of keywords (8).
    """
    review_keywords = ('review', 'book', 'read', 'author', 'story', 'character', 'plot', 'recommend')
    lowered = paragraph.lower()
    hits = 0
    for keyword in review_keywords:
        if keyword in lowered:
            hits += 1
    return hits

In [8]:
# add retries with timeouts selectively

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib3.exceptions import ReadTimeoutError
from bs4 import BeautifulSoup
import time
import random

def create_session_with_retries():
    """Build a ``requests.Session`` that retries transient failures.

    The retry policy allows up to 5 retries with a backoff factor of 1, retries
    on HTTP 429/500/502/503/504, and only for HEAD, GET and OPTIONS requests.
    The same adapter is mounted for both ``http://`` and ``https://`` URLs.

    Returns:
        The configured ``requests.Session``.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],
    )
    session = requests.Session()
    shared_adapter = HTTPAdapter(max_retries=retry_policy)
    for scheme in ('http://', 'https://'):
        session.mount(scheme, shared_adapter)
    return session

In [9]:
# URL fragments of sites that are never fetched; a URL is skipped when its
# lower-cased form contains any of these substrings.
exceptions = (
    'google.',
    'goodreads.com',
    'amazon.',
    'reddit.com',
    'thestorygraph',
    'youtube',
    '.tv',
    'barnesandnoble.com',
    'wikipedia.',
    'quora.com',
    'sparknotes.com',
    'grimdarkmagazine.',
    'washingtonpost.',
)

def process_review_url(review_url, headers):
    """Download one search-result page and extract a best-effort review from it.

    URLs containing any fragment listed in ``exceptions`` are skipped without a
    request. Otherwise the page is fetched through a retrying session, comment
    sections are removed, the three highest-scoring ``<main>``/``<p>`` elements
    are joined, boilerplate is stripped and the text is truncated at comments.

    Args:
        review_url: URL of the candidate review page.
        headers: HTTP headers to send with the request.

    Returns:
        A list with zero or one dict holding ``review_text`` (at most 5000
        characters), ``review_date`` (always ``'Unknown'``) and
        ``review_website``. The list is empty when the URL is skipped, the
        request fails or returns an HTTP error status, or no text is extracted.
        Errors are printed, never raised.
    """
    reviews = []
    # Skip known non-review sites
    if any(site in review_url.lower() for site in exceptions):
        return reviews

    session = create_session_with_retries()

    try:
        # Fetch the review page (10-second timeout per attempt; the session's
        # retry policy may repeat the attempt).
        review_response = session.get(review_url, headers=headers, timeout=10)
        # Without this, 403/404 error pages would be scored and stored as reviews.
        review_response.raise_for_status()
        review_soup = BeautifulSoup(review_response.text, 'html.parser')

        # Remove Comments
        review_soup = remove_comments(review_soup)

        # Score candidate elements and keep the three best.
        # NOTE(review): a <main> element normally contains the <p> elements that
        # are also collected here, so the selected texts can overlap - confirm
        # whether that duplication is intended.
        review_elements = review_soup.find_all(['main', 'p'])
        top_elements = sorted(
            review_elements,
            key=lambda elem: score_paragraph(elem.text),
            reverse=True,
        )[:3]
        review_text = ' '.join(elem.text for elem in top_elements)

        # Remove boilerplate
        review_text = review_text.replace('\n', ' ')
        review_text = remove_boilerplate(review_text)

        # Truncate at Comments
        review_text = truncate_at_comments(review_text)

        # A page that yields no text is not a review; do not emit an empty row.
        if review_text:
            reviews.append({
                'review_text': review_text[:5000],  # Limit to first 5000 characters
                'review_date': 'Unknown',
                'review_website': review_url
            })

    except ReadTimeoutError as e:
        print(f"Read timeout error for {review_url}: {str(e)}")
    except requests.exceptions.RequestException as e:
        print(f"Error processing {review_url}: {str(e)}")
    except Exception as e:
        print(f"Unexpected error processing {review_url}: {str(e)}")
    finally:
        # One session is created per URL; close it so its connection pool is released.
        session.close()

    # Be polite to servers
    time.sleep(random.uniform(1, 3))

    return reviews

In [10]:
# search_book_review test

def search_book_reviews(book_name, author):
    """Search DuckDuckGo for reviews of a book and scrape each result page.

    The query is "<book_name> <author> book review"; up to 10 text results are
    requested and every result URL is passed to ``process_review_url``.

    Args:
        book_name: Title of the book.
        author: Author name(s) appended to the query.

    Returns:
        A ``pandas.DataFrame`` built from the collected review dicts (one row
        per extracted review; no rows when nothing was collected).
    """
    # Browser-like User-Agent sent with every page request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    search_query = f"{book_name} {author} book review"

    collected = []
    with DDGS() as ddgs:
        # Adjust max_results as needed
        for hit in ddgs.text(search_query, max_results=10):
            collected += process_review_url(hit['href'], headers)

    return pd.DataFrame(collected)

In [11]:
# # Example usage
# book_name = "Darth Plagueis"
# author = "James Luceno"

# df = search_book_reviews(book_name, author)

# # Save to CSV
# df.to_csv(f"{book_name.replace(' ', '_')}_reviews.csv", index=False)
# print(f"Reviews saved to {book_name.replace(' ', '_')}_reviews.csv")

# df.head()

In [12]:
# input_csv = '/kaggle/input/book-review-samples/goodreads_all_genres_final.csv'
# output_csv = 'book_reviews.csv'

# Locations used by the scraping run
output_directory = '/kaggle/working/'
output_csv = 'all_book_reviews.csv'
input_csv = '/kaggle/input/book-review-samples/goodreads_all_genres_final.csv'

# Load the book list and report how many (Title, Authors) duplicates it contains
df = pd.read_csv(input_csv)
num_total = len(df)
df_unique = df.drop_duplicates(subset=['Title', 'Authors'], keep='first')
num_duplicates = num_total - len(df_unique)
print(f"Removed {num_duplicates} duplicate entries out of {num_total}")

Removed 0 duplicate entries out of 35597


In [13]:
def process_book_list_old(input_csv, output_csv):
    """Scrape reviews for every book in ``input_csv`` and write one CSV at the end.

    Rows are de-duplicated on (Title, Authors). For each book the reviews
    returned by ``search_book_reviews`` are tagged with the book's Title,
    Authors, Avg Ratings, Rating and Published_year. All results are held in
    memory and written to ``output_csv`` only after the last book.

    Args:
        input_csv: Path of the book list CSV.
        output_csv: Path of the CSV to write.

    Returns:
        None. Prints progress, per-book errors and the final status.
    """
    df = pd.read_csv(input_csv)
    df_unique = df.drop_duplicates(subset=['Title', 'Authors'], keep='first')

    # Report how many duplicate rows were dropped
    num_total = len(df)
    num_duplicates = num_total - len(df_unique)
    print(f"Removed {num_duplicates} duplicate entries out of {num_total}")

    collected_frames = []
    progress = tqdm(df_unique.iterrows(), total=df_unique.shape[0], desc="Processing books")

    for _, book in progress:
        title = book['Title']
        authors = book['Authors']

        try:
            book_reviews = search_book_reviews(title, authors)

            # Tag every review row with the book it belongs to
            book_reviews['Title'] = title
            book_reviews['Authors'] = authors
            for column in ('Avg Ratings', 'Rating', 'Published_year'):
                book_reviews[column] = book[column]

            collected_frames.append(book_reviews)

        except Exception as e:
            print(f"Error processing {title} by {authors}: {str(e)}")

    if not collected_frames:
        print("No reviews were collected.")
        return

    # Combine everything and save in one go
    final_df = pd.concat(collected_frames, ignore_index=True)
    final_df.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL)
    print(f"Reviews saved to {output_csv}")

In [14]:
def process_book_list_old_2(input_csv, output_csv):
    """Scrape reviews for every book, saving a batch file after every 10 successful books.

    Rows are de-duplicated on (Title, Authors). Review frames are accumulated
    and flushed to ``/kaggle/working/batch_<n>.csv`` each time 10 books have
    been collected; leftovers are flushed at the end. All batch files found in
    that directory are then combined into ``output_csv`` and deleted.

    Every batch file and the combined file use the same CSV dialect (all fields
    quoted, quotes escaped with a backslash), and batches are read back with
    that dialect so escaped quotes round-trip correctly.

    Args:
        input_csv: Path of the book list CSV.
        output_csv: Path of the combined CSV to write.

    Returns:
        None. Prints progress, per-book errors and the final status.
    """
    # Single source of truth for the on-disk dialect (write side / read side)
    write_options = dict(index=False, quoting=csv.QUOTE_ALL, escapechar='\\', doublequote=False)
    read_options = dict(escapechar='\\', doublequote=False)

    # Read the input CSV
    df = pd.read_csv(input_csv)

    # remove duplicates
    df_unique = df.drop_duplicates(subset=['Title', 'Authors'], keep='first')

    # Print information about removed duplicates
    num_duplicates = len(df) - len(df_unique)
    num_total = len(df)
    print(f"Removed {num_duplicates} duplicate entries out of {num_total}")

    all_reviews = []
    batch_size = 10
    batch_count = 0

    # Iterate through each row in the dataframe
    for index, row in tqdm(df_unique.iterrows(), total=df_unique.shape[0], desc="Processing books"):
        title = row['Title']
        authors = row['Authors']

        try:
            # Get reviews for this book
            reviews_df = search_book_reviews(title, authors)

            # Add book information to each review
            reviews_df['Title'] = title
            reviews_df['Authors'] = authors
            reviews_df['Avg Ratings'] = row['Avg Ratings']
            reviews_df['Rating'] = row['Rating']
            reviews_df['Published_year'] = row['Published_year']

            all_reviews.append(reviews_df)

        except Exception as e:
            print(f"Error processing {title} by {authors}: {str(e)}")

        # Save batch to CSV every 10 books
        if len(all_reviews) == batch_size:
            batch_df = pd.concat(all_reviews, ignore_index=True)
            batch_filename = f"/kaggle/working/batch_{batch_count}.csv"
            batch_df.to_csv(batch_filename, **write_options)
            print(f"Batch {batch_count} saved to {batch_filename}")
            all_reviews = []
            batch_count += 1

    # Save any remaining reviews (same dialect as the full batches)
    if all_reviews:
        batch_df = pd.concat(all_reviews, ignore_index=True)
        batch_filename = f"/kaggle/working/batch_{batch_count}.csv"
        batch_df.to_csv(batch_filename, **write_options)
        print(f"Final batch saved to {batch_filename}")

    # Combine all batches in batch-number order: shorter names first, then
    # lexicographic, so batch_2 sorts before batch_10.
    all_files = sorted(glob.glob("/kaggle/working/batch_*.csv"), key=lambda path: (len(path), path))
    if not all_files:
        # pd.concat raises on an empty sequence; nothing was scraped at all.
        print("No reviews were collected.")
        return
    combined_df = pd.concat((pd.read_csv(f, **read_options) for f in all_files), ignore_index=True)

    # Save combined results
    combined_df.to_csv(output_csv, **write_options)
    print(f"All reviews combined and saved to {output_csv}")

    # Optionally, remove batch files
    for f in all_files:
        os.remove(f)
    print("Batch files removed")

In [15]:
def process_book_list(input_csv, output_csv, batch_size=10, batch_dir='/kaggle/working'):
    """Scrape reviews for every book in ``input_csv`` with resumable batch files.

    Rows are de-duplicated on (Title, Authors) and processed in order. Reviews
    of all books in a batch are accumulated and written together to
    ``<batch_dir>/batch_<n>.csv``, where ``n == row_index // batch_size``. On
    start, the highest existing batch number determines the row to resume from.
    After the last book, all batch files are combined into ``output_csv`` and
    deleted.

    A batch boundary is handled even when the boundary book itself fails, so
    batch numbers always stay aligned with row indices (which resuming relies
    on). A batch in which no book produced reviews writes no file.

    Args:
        input_csv: Path of the book list CSV; must provide the columns Title,
            Authors, Avg Ratings, Rating and Published_year.
        output_csv: Path of the combined CSV to write.
        batch_size: Number of books per batch file.
        batch_dir: Directory that holds the intermediate batch files.

    Returns:
        None. Prints progress, per-book errors and the final status.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Single source of truth for the on-disk dialect (write side / read side);
    # reading without the escape settings would misparse backslash-escaped quotes.
    write_options = dict(index=False, quoting=csv.QUOTE_ALL, escapechar='\\', doublequote=False)
    read_options = dict(escapechar='\\', doublequote=False)

    def batch_number(path):
        """Return the number encoded in a batch file name, or -1 if it has none."""
        match = re.search(r'batch_(\d+)\.csv$', os.path.basename(path))
        return int(match.group(1)) if match else -1

    # Read the input CSV
    df = pd.read_csv(input_csv)

    # Remove duplicates
    df_unique = df.drop_duplicates(subset=['Title', 'Authors'], keep='first')

    # Print information about removed duplicates
    num_duplicates = len(df) - len(df_unique)
    num_total = len(df)
    print(f"Removed {num_duplicates} duplicate entries out of {num_total}")

    # Find the last processed batch by its number (file timestamps are not reliable)
    batch_pattern = os.path.join(batch_dir, "batch_*.csv")
    existing_numbers = [n for n in map(batch_number, glob.glob(batch_pattern)) if n >= 0]
    start_index = (max(existing_numbers) + 1) * batch_size if existing_numbers else 0

    print(f"Resuming processing from index {start_index}")

    batch_frames = []
    last_index = len(df_unique) - 1
    remaining = max(len(df_unique) - start_index, 0)

    # Iterate through each row in the dataframe, starting from start_index
    for index in tqdm(range(start_index, len(df_unique)), total=remaining, desc="Processing books"):
        row = df_unique.iloc[index]
        title = row['Title']
        authors = row['Authors']

        try:
            # Get reviews for this book
            reviews_df = search_book_reviews(title, authors)

            # Add book information to each review
            reviews_df['Title'] = title
            reviews_df['Authors'] = authors
            reviews_df['Avg Ratings'] = row['Avg Ratings']
            reviews_df['Rating'] = row['Rating']
            reviews_df['Published_year'] = row['Published_year']

            # Keep every book's reviews for the batch, not only the boundary book's
            if not reviews_df.empty:
                batch_frames.append(reviews_df)

        except Exception as e:
            print(f"Error processing {title} by {authors}: {str(e)}")

        # Flush at every batch boundary, outside the try block so that a failure
        # on the boundary book cannot skip the save or shift the numbering.
        if (index + 1) % batch_size == 0 or index == last_index:
            batch_count = index // batch_size
            if batch_frames:
                batch_filename = os.path.join(batch_dir, f"batch_{batch_count}.csv")
                pd.concat(batch_frames, ignore_index=True).to_csv(batch_filename, **write_options)
                print(f"Batch {batch_count} saved to {batch_filename}")
            batch_frames = []

    # Combine all batches in batch-number order
    all_files = sorted(glob.glob(batch_pattern), key=batch_number)
    if not all_files:
        # pd.concat raises on an empty sequence; nothing was scraped at all.
        print("No reviews were collected.")
        return
    combined_df = pd.concat((pd.read_csv(f, **read_options) for f in all_files), ignore_index=True)

    # Save combined results
    combined_df.to_csv(output_csv, **write_options)
    print(f"All reviews combined and saved to {output_csv}")

    # Optionally, remove batch files
    for f in all_files:
        os.remove(f)
    print("Batch files removed")

In [None]:
# Usage: run the scraping pipeline over the book list at input_csv and write
# the combined reviews to output_csv.
process_book_list(input_csv, output_csv)

Removed 0 duplicate entries out of 35597
Resuming processing from index 1560


Processing books:   0%|          | 10/34037 [01:47<108:19:53, 11.46s/it]

Batch 156 saved to /kaggle/working/batch_156.csv


Processing books:   0%|          | 12/34037 [02:13<121:12:57, 12.83s/it]

Error processing https://www.kirkusreviews.com/book-reviews/kim-mclarin/james-baldwins-another-country/: HTTPSConnectionPool(host='www.kirkusreviews.com', port=443): Max retries exceeded with url: /book-reviews/kim-mclarin/james-baldwins-another-country/ (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.kirkusreviews.com', port=443): Read timed out. (read timeout=10)"))


Processing books:   0%|          | 18/34037 [05:29<203:35:21, 21.54s/it]

Error processing https://www.kirandellimore.com/book-review-notes-of-a-native-son-james-baldwin/: HTTPSConnectionPool(host='www.kirandellimore.com', port=443): Max retries exceeded with url: /book-review-notes-of-a-native-son-james-baldwin/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 19/34037 [06:20<286:17:00, 30.30s/it]

Error processing https://paperbacksocial.com/2021/01/30/the-women-of-brewster-place-by-gloria-naylor-review/: HTTPSConnectionPool(host='paperbacksocial.com', port=443): Max retries exceeded with url: /2021/01/30/the-women-of-brewster-place-by-gloria-naylor-review/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 20/34037 [07:11<347:17:10, 36.75s/it]

Batch 157 saved to /kaggle/working/batch_157.csv


Processing books:   0%|          | 27/34037 [09:47<232:19:55, 24.59s/it]

Error processing https://www.beverlyjenkins.net/books/old-west-series/forbidden/: HTTPSConnectionPool(host='www.beverlyjenkins.net', port=443): Max retries exceeded with url: /books/old-west-series/forbidden/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 30/34037 [11:08<221:53:28, 23.49s/it]

Batch 158 saved to /kaggle/working/batch_158.csv


Processing books:   0%|          | 40/34037 [12:43<94:53:11, 10.05s/it] 

Batch 159 saved to /kaggle/working/batch_159.csv


Processing books:   0%|          | 50/34037 [15:28<215:34:42, 22.83s/it]

Batch 160 saved to /kaggle/working/batch_160.csv


Processing books:   0%|          | 52/34037 [15:50<161:31:13, 17.11s/it]

Error processing https://www.beverlyjenkins.net/books/destiny-trilogy/destinys-embrace/: HTTPSConnectionPool(host='www.beverlyjenkins.net', port=443): Max retries exceeded with url: /books/destiny-trilogy/destinys-embrace/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 60/34037 [17:48<130:31:37, 13.83s/it]

Batch 161 saved to /kaggle/working/batch_161.csv


Processing books:   0%|          | 68/34037 [19:08<93:04:29,  9.86s/it] 

Error processing https://www.beverlyjenkins.net/books/women-who-dare-series/rebel/: HTTPSConnectionPool(host='www.beverlyjenkins.net', port=443): Max retries exceeded with url: /books/women-who-dare-series/rebel/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))
Error processing https://www.beverlyjenkins.net/books/women-who-dare-series/: HTTPSConnectionPool(host='www.beverlyjenkins.net', port=443): Max retries exceeded with url: /books/women-who-dare-series/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 70/34037 [20:26<204:48:18, 21.71s/it]

Batch 162 saved to /kaggle/working/batch_162.csv


Processing books:   0%|          | 76/34037 [21:32<100:51:47, 10.69s/it]

Error processing https://www.readinggroupguides.com/reviews/the-secret-lives-of-baba-segis-wives/guide: HTTPSConnectionPool(host='www.readinggroupguides.com', port=443): Max retries exceeded with url: /reviews/the-secret-lives-of-baba-segis-wives/guide (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.readinggroupguides.com', port=443): Read timed out. (read timeout=10)"))


Processing books:   0%|          | 80/34037 [23:57<200:22:42, 21.24s/it]

Batch 163 saved to /kaggle/working/batch_163.csv


Processing books:   0%|          | 87/34037 [25:13<113:55:36, 12.08s/it]

Error processing https://thebibliophage.com/book-review-freshwater-akwaeke-emezi/: HTTPSConnectionPool(host='thebibliophage.com', port=443): Max retries exceeded with url: /book-review-freshwater-akwaeke-emezi/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 90/34037 [26:59<224:41:15, 23.83s/it]

Batch 164 saved to /kaggle/working/batch_164.csv
Error processing https://www.kirandellimore.com/book-review-julys-people-nadine-gordimer/: HTTPSConnectionPool(host='www.kirandellimore.com', port=443): Max retries exceeded with url: /book-review-julys-people-nadine-gordimer/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 96/34037 [28:56<154:08:35, 16.35s/it]

Error processing https://paperbacksocial.com/2022/01/19/woman-at-point-zero-by-nawal-el-saadawi-book-review/: HTTPSConnectionPool(host='paperbacksocial.com', port=443): Max retries exceeded with url: /2022/01/19/woman-at-point-zero-by-nawal-el-saadawi-book-review/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 100/34037 [30:20<155:38:20, 16.51s/it]

Batch 165 saved to /kaggle/working/batch_165.csv


Processing books:   0%|          | 106/34037 [32:02<163:15:32, 17.32s/it]

Error processing https://novelfables.com/artificial-condition: HTTPSConnectionPool(host='novelfables.com', port=443): Max retries exceeded with url: /artificial-condition (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7a03e3dabdf0>: Failed to resolve 'novelfables.com' ([Errno -5] No address associated with hostname)"))


Processing books:   0%|          | 110/34037 [33:43<188:28:48, 20.00s/it]

Batch 166 saved to /kaggle/working/batch_166.csv


Processing books:   0%|          | 117/34037 [35:21<137:49:40, 14.63s/it]

Error processing https://diversebooks.org/qa-with-mark-oshiro-the-insiders/: HTTPSConnectionPool(host='diversebooks.org', port=443): Max retries exceeded with url: /qa-with-mark-oshiro-the-insiders/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 120/34037 [36:42<197:38:09, 20.98s/it]

Batch 167 saved to /kaggle/working/batch_167.csv


Processing books:   0%|          | 126/34037 [38:24<154:40:32, 16.42s/it]

Error processing https://candidceillie.com/review-baker-thief-claudie-arseneault/: HTTPSConnectionPool(host='candidceillie.com', port=443): Max retries exceeded with url: /review-baker-thief-claudie-arseneault/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   0%|          | 130/34037 [39:52<161:29:42, 17.15s/it]

Batch 168 saved to /kaggle/working/batch_168.csv


Processing books:   0%|          | 140/34037 [41:48<110:23:58, 11.72s/it]

Batch 169 saved to /kaggle/working/batch_169.csv


Processing books:   0%|          | 150/34037 [44:25<191:07:31, 20.30s/it]

Batch 170 saved to /kaggle/working/batch_170.csv


Processing books:   0%|          | 160/34037 [45:58<90:20:10,  9.60s/it] 

Batch 171 saved to /kaggle/working/batch_171.csv


Processing books:   0%|          | 170/34037 [47:36<99:31:52, 10.58s/it] 

Batch 172 saved to /kaggle/working/batch_172.csv


Processing books:   1%|          | 180/34037 [48:53<89:01:21,  9.47s/it]

Batch 173 saved to /kaggle/working/batch_173.csv


Processing books:   1%|          | 190/34037 [50:56<107:28:50, 11.43s/it]

Batch 174 saved to /kaggle/working/batch_174.csv


Processing books:   1%|          | 193/34037 [51:04<50:27:05,  5.37s/it] 

Error processing For the Love of Soil: Strategies to Regenerate Our Food Production Systems (Kindle Edition) by Nicole Masters: https://links.duckduckgo.com/d.js?q=For+the+Love+of+Soil%3A+Strategies+to+Regenerate+Our+Food+Production+Systems+%28Kindle+Edition%29+Nicole+Masters+book+review&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-102985106534766914171393745815591343429&bing_market=wt-WT&ex=-1 202 Ratelimit


Processing books:   1%|          | 200/34037 [52:03<94:56:20, 10.10s/it]

Batch 175 saved to /kaggle/working/batch_175.csv


Processing books:   1%|          | 210/34037 [54:11<127:26:50, 13.56s/it]

Batch 176 saved to /kaggle/working/batch_176.csv


Processing books:   1%|          | 216/34037 [54:51<58:43:03,  6.25s/it] 

Error processing Wings of the Luftwaffe: Flying German Aircraft of the Second World War (Paperback) by Eric M.  Brown: https://links.duckduckgo.com/d.js?q=Wings+of+the+Luftwaffe%3A+Flying+German+Aircraft+of+the+Second+World+War+%28Paperback%29+Eric+M.++Brown+book+review&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-177837932682667247505135637314895882528&bing_market=wt-WT&ex=-1 202 Ratelimit


Processing books:   1%|          | 220/34037 [55:25<85:18:07,  9.08s/it]

Batch 177 saved to /kaggle/working/batch_177.csv


Processing books:   1%|          | 230/34037 [57:19<120:19:21, 12.81s/it]

Batch 178 saved to /kaggle/working/batch_178.csv


Processing books:   1%|          | 231/34037 [57:37<134:22:12, 14.31s/it]

Error processing https://www.cybermodeler.com/hobby/ref/osp/book_osp_xp02.shtml: HTTPSConnectionPool(host='www.cybermodeler.com', port=443): Max retries exceeded with url: /hobby/ref/osp/book_osp_xp02.shtml (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))


Processing books:   1%|          | 237/34037 [59:25<99:16:05, 10.57s/it] 

Error processing Luftwaffe Fighter Aces: The Jagdflieger and Their Combat Tactics and Techniques (Paperback) by Mike Spick: https://links.duckduckgo.com/d.js?q=Luftwaffe+Fighter+Aces%3A+The+Jagdflieger+and+Their+Combat+Tactics+and+Techniques+%28Paperback%29+Mike+Spick+book+review&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-106257053064897860316510485128978703256&bing_market=wt-WT&ex=-1 202 Ratelimit


Processing books:   1%|          | 238/34037 [59:25<70:28:14,  7.51s/it]

Error processing Horrido! Fighter Aces Of The Luftwaffe (Mass Market Paperback) by Raymond F. Toliver: https://links.duckduckgo.com/d.js?q=Horrido%21+Fighter+Aces+Of+The+Luftwaffe+%28Mass+Market+Paperback%29+Raymond+F.+Toliver+book+review&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-335900581739264176511763004637029815360&bing_market=wt-WT&ex=-1 202 Ratelimit


Processing books:   1%|          | 240/34037 [59:50<99:05:50, 10.56s/it]

Batch 179 saved to /kaggle/working/batch_179.csv


Processing books:   1%|          | 241/34037 [59:51<70:12:44,  7.48s/it]

Error processing Graphic War: The Secret Aviation Drawings and Illustrations of World War II (Hardcover) by Donald Nijboer: https://links.duckduckgo.com/d.js?q=Graphic+War%3A+The+Secret+Aviation+Drawings+and+Illustrations+of+World+War+II+%28Hardcover%29+Donald+Nijboer+book+review&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-119610666732831612494744400935419261271&bing_market=wt-WT&ex=-1 202 Ratelimit


Processing books:   1%|          | 250/34037 [1:01:21<98:46:15, 10.52s/it]

Batch 180 saved to /kaggle/working/batch_180.csv


Processing books:   1%|          | 259/34037 [1:03:37<145:01:20, 15.46s/it]

Error processing https://escapetotheseventies.com/70s-films/the-incredible-melting-man/: HTTPSConnectionPool(host='escapetotheseventies.com', port=443): Max retries exceeded with url: /70s-films/the-incredible-melting-man/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate (_ssl.c:1007)')))


Processing books:   1%|          | 260/34037 [1:04:29<248:27:00, 26.48s/it]

Batch 181 saved to /kaggle/working/batch_181.csv


Processing books:   1%|          | 270/34037 [1:07:47<184:25:41, 19.66s/it]

Batch 182 saved to /kaggle/working/batch_182.csv


Processing books:   1%|          | 280/34037 [1:11:08<138:59:24, 14.82s/it]

Batch 183 saved to /kaggle/working/batch_183.csv


Processing books:   1%|          | 283/34037 [1:11:37<100:46:28, 10.75s/it]

Error processing https://www.flipkart.com/physics-module-v-optics-modern/p/itmetpv2fuzsczvh?pid=9789352605279&lid=LSTBOK9789352605279ZLEZMR&marketplace=FLIPKART: HTTPSConnectionPool(host='www.flipkart.com', port=443): Max retries exceeded with url: /physics-module-v-optics-modern/p/itmetpv2fuzsczvh?pid=9789352605279&lid=LSTBOK9789352605279ZLEZMR&marketplace=FLIPKART (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.flipkart.com', port=443): Read timed out. (read timeout=10)"))
Error processing https://www.flipkart.com/optics-modern-physics-neet-module-v/p/itmf5rthz3hgehcm?pid=9789387432536&marketplace=FLIPKART: HTTPSConnectionPool(host='www.flipkart.com', port=443): Max retries exceeded with url: /optics-modern-physics-neet-module-v/p/itmf5rthz3hgehcm?pid=9789387432536&marketplace=FLIPKART (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.flipkart.com', port=443): Read timed out. (read timeout=10)"))


Processing books:   1%|          | 290/34037 [1:16:11<187:42:03, 20.02s/it]

Batch 184 saved to /kaggle/working/batch_184.csv


Processing books:   1%|          | 292/34037 [1:16:41<159:36:33, 17.03s/it]

Error processing https://seedstosuccess.com/know/the-two-bite-club-by-team-nutrition-2012/: HTTPSConnectionPool(host='seedstosuccess.com', port=443): Max retries exceeded with url: /know/the-two-bite-club-by-team-nutrition-2012/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   1%|          | 293/34037 [1:17:32<254:26:21, 27.15s/it]

Error processing https://blackmaskmagazine.com/blog/w-t-ballard-an-interview/: HTTPSConnectionPool(host='blackmaskmagazine.com', port=443): Max retries exceeded with url: /blog/w-t-ballard-an-interview/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   1%|          | 300/34037 [1:19:58<163:49:17, 17.48s/it]

Batch 185 saved to /kaggle/working/batch_185.csv


Processing books:   1%|          | 310/34037 [1:21:43<101:52:43, 10.87s/it]

Batch 186 saved to /kaggle/working/batch_186.csv


Processing books:   1%|          | 319/34037 [1:23:02<76:52:59,  8.21s/it] 

Error processing https://stuffedpuffin.eu/2021/12/16/book-review-terminal-world-by-alastair-reynolds/: HTTPSConnectionPool(host='stuffedpuffin.eu', port=443): Max retries exceeded with url: /2021/12/16/book-review-terminal-world-by-alastair-reynolds/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   1%|          | 320/34037 [1:23:52<194:31:37, 20.77s/it]

Batch 187 saved to /kaggle/working/batch_187.csv


Processing books:   1%|          | 327/34037 [1:25:19<89:48:56,  9.59s/it] 

Error processing The Golden Age of the Great Passenger Airships: Graf Zeppelin and Hindenburg (Paperback) by Harold G. Dick: https://links.duckduckgo.com/d.js?q=The+Golden+Age+of+the+Great+Passenger+Airships%3A+Graf+Zeppelin+and+Hindenburg+%28Paperback%29+Harold+G.+Dick+book+review&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-255965238779898816532550939081565538751&bing_market=wt-WT&ex=-1 202 Ratelimit


Processing books:   1%|          | 330/34037 [1:26:02<115:28:36, 12.33s/it]

Batch 188 saved to /kaggle/working/batch_188.csv


Processing books:   1%|          | 340/34037 [1:27:38<96:02:31, 10.26s/it] 

Batch 189 saved to /kaggle/working/batch_189.csv


Processing books:   1%|          | 350/34037 [1:29:43<142:31:27, 15.23s/it]

Batch 190 saved to /kaggle/working/batch_190.csv


Processing books:   1%|          | 351/34037 [1:30:00<147:20:20, 15.75s/it]

Error processing https://www.curledup.com/chronsto.htm: HTTPSConnectionPool(host='www.curledup.com', port=443): Max retries exceeded with url: /chronsto.htm (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7a03f1abdea0>, 'Connection to www.curledup.com timed out. (connect timeout=10)'))


Processing books:   1%|          | 356/34037 [1:32:27<183:09:13, 19.58s/it]

Error processing https://albas.al/botime/shpella-e-pirateve/: HTTPSConnectionPool(host='albas.al', port=443): Max retries exceeded with url: /botime/shpella-e-pirateve/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7a03f1aff880>: Failed to resolve 'albas.al' ([Errno -2] Name or service not known)"))


Processing books:   1%|          | 360/34037 [1:34:37<218:42:55, 23.38s/it]

Batch 191 saved to /kaggle/working/batch_191.csv


Processing books:   1%|          | 370/34037 [1:37:02<142:15:18, 15.21s/it]

Batch 192 saved to /kaggle/working/batch_192.csv


Processing books:   1%|          | 375/34037 [1:38:41<192:42:31, 20.61s/it]

Error processing https://albanianhistory.org/albanianliterature/authors_classical/koliqi.html: HTTPSConnectionPool(host='albanianhistory.org', port=443): Max retries exceeded with url: /albanianliterature/authors_classical/koliqi.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)')))


Processing books:   1%|          | 380/34037 [1:40:43<192:41:02, 20.61s/it]

Batch 193 saved to /kaggle/working/batch_193.csv


Processing books:   1%|          | 381/34037 [1:41:06<199:36:25, 21.35s/it]

Error processing https://albanianhistory.org/albanianliterature/authors/classical/spasse/index.html: HTTPSConnectionPool(host='albanianhistory.org', port=443): Max retries exceeded with url: /albanianliterature/authors/classical/spasse/index.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)')))


Processing books:   1%|          | 390/34037 [1:44:27<206:50:56, 22.13s/it]

Batch 194 saved to /kaggle/working/batch_194.csv
Error processing https://fjalashqip.com/naim-frasheri-fjalet-e-qiririt/: HTTPSConnectionPool(host='fjalashqip.com', port=443): Max retries exceeded with url: /naim-frasheri-fjalet-e-qiririt/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   1%|          | 400/34037 [1:48:03<179:30:39, 19.21s/it]

Batch 195 saved to /kaggle/working/batch_195.csv


Processing books:   1%|          | 410/34037 [1:49:22<89:10:43,  9.55s/it] 

Batch 196 saved to /kaggle/working/batch_196.csv


Processing books:   1%|          | 420/34037 [1:50:52<78:28:03,  8.40s/it] 

Batch 197 saved to /kaggle/working/batch_197.csv


Processing books:   1%|▏         | 430/34037 [1:52:29<86:31:45,  9.27s/it] 

Batch 198 saved to /kaggle/working/batch_198.csv
Error processing https://cuddlebuggery.com/blog/2011/11/18/review-liesl-and-po/: HTTPSConnectionPool(host='cuddlebuggery.com', port=443): Max retries exceeded with url: /blog/2011/11/18/review-liesl-and-po/ (Caused by ResponseError('too many 502 error responses'))


Processing books:   1%|▏         | 440/34037 [1:55:22<110:38:11, 11.85s/it]

Batch 199 saved to /kaggle/working/batch_199.csv


Processing books:   1%|▏         | 447/34037 [1:56:16<74:59:25,  8.04s/it] 

Error processing https://www.amystewart.com/books/drunkenbotanist/: HTTPSConnectionPool(host='www.amystewart.com', port=443): Max retries exceeded with url: /books/drunkenbotanist/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   1%|▏         | 448/34037 [1:56:50<144:55:36, 15.53s/it]

Error processing https://alcoholexplained.com/: HTTPSConnectionPool(host='alcoholexplained.com', port=443): Max retries exceeded with url: / (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   1%|▏         | 450/34037 [1:57:33<164:06:52, 17.59s/it]

Batch 200 saved to /kaggle/working/batch_200.csv


Processing books:   1%|▏         | 460/34037 [1:59:18<71:53:07,  7.71s/it] 

Batch 201 saved to /kaggle/working/batch_201.csv


Processing books:   1%|▏         | 470/34037 [2:00:36<67:42:14,  7.26s/it]

Batch 202 saved to /kaggle/working/batch_202.csv


Processing books:   1%|▏         | 480/34037 [2:02:10<121:54:47, 13.08s/it]

Batch 203 saved to /kaggle/working/batch_203.csv
Error processing https://varianjohnson.com/books/playing-the-cards-youre-dealt/: HTTPSConnectionPool(host='varianjohnson.com', port=443): Max retries exceeded with url: /books/playing-the-cards-youre-dealt/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   1%|▏         | 490/34037 [2:04:06<82:51:40,  8.89s/it] 

Batch 204 saved to /kaggle/working/batch_204.csv


Processing books:   1%|▏         | 492/34037 [2:04:49<145:56:50, 15.66s/it]

Error processing https://www.fullofbooks.com/flow-down-like-silver-hypatia-of-alexandria-by-ki-longfellow-review/: HTTPSConnectionPool(host='www.fullofbooks.com', port=443): Max retries exceeded with url: /flow-down-like-silver-hypatia-of-alexandria-by-ki-longfellow-review/ (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.fullofbooks.com', port=443): Read timed out. (read timeout=10)"))


Processing books:   1%|▏         | 493/34037 [2:06:27<374:07:44, 40.15s/it]

Error processing https://reviews.metaphorosis.com/review/justine-lawrence-durrell/: HTTPSConnectionPool(host='reviews.metaphorosis.com', port=443): Max retries exceeded with url: /review/justine-lawrence-durrell/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   1%|▏         | 500/34037 [2:07:57<120:04:08, 12.89s/it]

Batch 205 saved to /kaggle/working/batch_205.csv


Processing books:   1%|▏         | 509/34037 [2:10:15<74:49:50,  8.03s/it] 

Error processing https://ib.beaconhouse.net/a/pdf/goto/CD/the_rediscovery_of_man_cordwainer_smith.pdf: HTTPSConnectionPool(host='ib.beaconhouse.net', port=443): Max retries exceeded with url: /a/pdf/goto/CD/the_rediscovery_of_man_cordwainer_smith.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7a03f1ac16c0>: Failed to establish a new connection: [Errno 111] Connection refused'))


Processing books:   1%|▏         | 510/34037 [2:11:09<204:59:39, 22.01s/it]

Batch 206 saved to /kaggle/working/batch_206.csv


Processing books:   2%|▏         | 520/34037 [2:12:36<79:37:14,  8.55s/it] 

Batch 207 saved to /kaggle/working/batch_207.csv


Processing books:   2%|▏         | 530/34037 [2:14:29<132:29:02, 14.23s/it]

Batch 208 saved to /kaggle/working/batch_208.csv


Processing books:   2%|▏         | 540/34037 [2:16:25<81:13:02,  8.73s/it] 

Batch 209 saved to /kaggle/working/batch_209.csv


Processing books:   2%|▏         | 550/34037 [2:18:22<115:09:14, 12.38s/it]

Batch 210 saved to /kaggle/working/batch_210.csv


Processing books:   2%|▏         | 560/34037 [2:21:07<227:53:22, 24.51s/it]

Batch 211 saved to /kaggle/working/batch_211.csv


Processing books:   2%|▏         | 570/34037 [2:23:31<102:17:05, 11.00s/it]

Batch 212 saved to /kaggle/working/batch_212.csv


Processing books:   2%|▏         | 580/34037 [2:24:55<99:10:45, 10.67s/it] 

Batch 213 saved to /kaggle/working/batch_213.csv
Error processing https://www.logicmatters.net/2008/06/08/awodeys-category-theory-ch-1/: HTTPSConnectionPool(host='www.logicmatters.net', port=443): Max retries exceeded with url: /2008/06/08/awodeys-category-theory-ch-1/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   2%|▏         | 590/34037 [2:27:34<113:50:10, 12.25s/it]

Batch 214 saved to /kaggle/working/batch_214.csv


Processing books:   2%|▏         | 597/34037 [2:28:57<104:03:27, 11.20s/it]

Error processing https://www.beeartless.com/blog/2019/book-reviews/the-fall-albert-camus/: HTTPSConnectionPool(host='www.beeartless.com', port=443): Max retries exceeded with url: /blog/2019/book-reviews/the-fall-albert-camus/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7a03e3ddaf80>: Failed to resolve 'www.beeartless.com' ([Errno -2] Name or service not known)"))


Processing books:   2%|▏         | 600/34037 [2:30:58<263:26:29, 28.36s/it]

Batch 215 saved to /kaggle/working/batch_215.csv
Error processing https://www.kirkus.us-east-1.elasticbeanstalk.com/book-reviews/boualem-sansal/the-german-mujahid/: HTTPSConnectionPool(host='www.kirkus.us-east-1.elasticbeanstalk.com', port=443): Max retries exceeded with url: /book-reviews/boualem-sansal/the-german-mujahid/ (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'www.kirkus.us-east-1.elasticbeanstalk.com'. (_ssl.c:1007)")))


Processing books:   2%|▏         | 610/34037 [2:34:03<113:01:19, 12.17s/it]

Batch 216 saved to /kaggle/working/batch_216.csv


Processing books:   2%|▏         | 613/34037 [2:34:53<129:27:52, 13.94s/it]

Error processing https://www.fullofbooks.com/the-eight-by-katherine-neville-review/: HTTPSConnectionPool(host='www.fullofbooks.com', port=443): Max retries exceeded with url: /the-eight-by-katherine-neville-review/ (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.fullofbooks.com', port=443): Read timed out. (read timeout=10)"))


Processing books:   2%|▏         | 619/34037 [2:37:47<184:40:11, 19.89s/it]

Error processing https://coalhillreview.com/book-review-the-sheltering-sky-by-paul-bowles/: HTTPSConnectionPool(host='coalhillreview.com', port=443): Max retries exceeded with url: /book-review-the-sheltering-sky-by-paul-bowles/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Processing books:   2%|▏         | 620/34037 [2:38:40<276:09:16, 29.75s/it]

Batch 217 saved to /kaggle/working/batch_217.csv


Processing books:   2%|▏         | 630/34037 [2:41:00<117:43:00, 12.69s/it]

Batch 218 saved to /kaggle/working/batch_218.csv


Processing books:   2%|▏         | 632/34037 [2:41:44<161:42:16, 17.43s/it]

Error processing https://blogs.lse.ac.uk/mec/2019/06/26/book-review-a-history-of-algeria-by-james-mcdougall/: HTTPSConnectionPool(host='blogs.lse.ac.uk', port=443): Max retries exceeded with url: /mec/2019/06/26/book-review-a-history-of-algeria-by-james-mcdougall/ (Caused by ReadTimeoutError("HTTPSConnectionPool(host='blogs.lse.ac.uk', port=443): Read timed out. (read timeout=10)"))


Processing books:   2%|▏         | 635/34037 [2:44:02<291:00:29, 31.36s/it]

Error processing https://islamicliterarysociety.com/2020/08/desert-encounter/: HTTPSConnectionPool(host='islamicliterarysociety.com', port=443): Max retries exceeded with url: /2020/08/desert-encounter/ (Caused by ReadTimeoutError("HTTPSConnectionPool(host='islamicliterarysociety.com', port=443): Read timed out. (read timeout=10)"))


Processing books:   2%|▏         | 640/34037 [2:46:48<215:50:20, 23.27s/it]

Batch 219 saved to /kaggle/working/batch_219.csv


Processing books:   2%|▏         | 650/34037 [2:49:24<114:29:17, 12.34s/it]

Batch 220 saved to /kaggle/working/batch_220.csv


Processing books:   2%|▏         | 660/34037 [2:51:22<90:02:07,  9.71s/it] 

Batch 221 saved to /kaggle/working/batch_221.csv


Processing books:   2%|▏         | 663/34037 [2:52:06<132:17:23, 14.27s/it]

Error processing https://www.flipkart.com/data-structures-algorithms-made-easy/p/itmezunqtqmhwwcf?pid=9788193245279&lid=LSTBOK9788193245279EOP6SE&marketplace=FLIPKART: HTTPSConnectionPool(host='www.flipkart.com', port=443): Max retries exceeded with url: /data-structures-algorithms-made-easy/p/itmezunqtqmhwwcf?pid=9788193245279&lid=LSTBOK9788193245279EOP6SE&marketplace=FLIPKART (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.flipkart.com', port=443): Read timed out. (read timeout=10)"))


Processing books:   2%|▏         | 664/34037 [2:53:45<366:05:58, 39.49s/it]

Error processing https://prod2.galleries.thebarnyardstore.com/files/uploaded-files/Documents/introduction_to_algorithms_a_creative_approach_by_udi_manber.pdf: HTTPSConnectionPool(host='prod2.galleries.thebarnyardstore.com', port=443): Max retries exceeded with url: /files/uploaded-files/Documents/introduction_to_algorithms_a_creative_approach_by_udi_manber.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7a03f2a2ca90>, 'Connection to prod2.galleries.thebarnyardstore.com timed out. (connect timeout=10)'))


Processing books:   2%|▏         | 670/34037 [2:56:57<220:47:19, 23.82s/it]

Batch 222 saved to /kaggle/working/batch_222.csv


Processing books:   2%|▏         | 674/34037 [2:57:31<110:30:04, 11.92s/it]

Error processing https://www.semanticscholar.org/paper/Problem-solving-with-algorithms-and-data-structures-Miller-Ranum/23da04d438068639f39d8b7871881ad03a48c0a0: HTTPSConnectionPool(host='www.semanticscholar.org', port=443): Max retries exceeded with url: /paper/Problem-solving-with-algorithms-and-data-structures-Miller-Ranum/23da04d438068639f39d8b7871881ad03a48c0a0 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.semanticscholar.org', port=443): Read timed out. (read timeout=10)"))


Processing books:   2%|▏         | 680/34037 [3:00:43<179:43:54, 19.40s/it]

Batch 223 saved to /kaggle/working/batch_223.csv


Processing books:   2%|▏         | 690/34037 [3:02:04<104:06:50, 11.24s/it]

Batch 224 saved to /kaggle/working/batch_224.csv


Processing books:   2%|▏         | 700/34037 [3:04:09<89:30:51,  9.67s/it] 

Batch 225 saved to /kaggle/working/batch_225.csv


Processing books:   2%|▏         | 710/34037 [3:05:48<52:07:05,  5.63s/it] 

Batch 226 saved to /kaggle/working/batch_226.csv


Processing books:   2%|▏         | 720/34037 [3:07:44<77:53:17,  8.42s/it] 

Batch 227 saved to /kaggle/working/batch_227.csv


Processing books:   2%|▏         | 730/34037 [3:09:12<91:25:28,  9.88s/it]

Batch 228 saved to /kaggle/working/batch_228.csv


Processing books:   2%|▏         | 731/34037 [3:10:15<238:10:20, 25.74s/it]

Error processing https://www.fullofbooks.com/the-last-hour-of-gann-by-r-lee-smith-review/: HTTPSConnectionPool(host='www.fullofbooks.com', port=443): Max retries exceeded with url: /the-last-hour-of-gann-by-r-lee-smith-review/ (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.fullofbooks.com', port=443): Read timed out. (read timeout=10)"))
Error processing https://theghastlygrimoire.com/2019/01/31/book-review-the-last-hour-of-gann-by-r-lee-smith/: HTTPSConnectionPool(host='theghastlygrimoire.com', port=443): Max retries exceeded with url: /2019/01/31/book-review-the-last-hour-of-gann-by-r-lee-smith/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7a03f16b8f70>: Failed to resolve 'theghastlygrimoire.com' ([Errno -2] Name or service not known)"))


Processing books:   2%|▏         | 736/34037 [3:13:05<184:40:21, 19.96s/it]

Error processing https://mademoisellesmut.com/2020/04/26/barbarians-heart-ice-planet-barbarians-9-by-ruby-dixon/: HTTPSConnectionPool(host='mademoisellesmut.com', port=443): Max retries exceeded with url: /2020/04/26/barbarians-heart-ice-planet-barbarians-9-by-ruby-dixon/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7a03e3d4b760>: Failed to resolve 'mademoisellesmut.com' ([Errno -2] Name or service not known)"))


Processing books:   2%|▏         | 740/34037 [3:14:13<137:37:10, 14.88s/it]

Batch 229 saved to /kaggle/working/batch_229.csv


Processing books:   2%|▏         | 750/34037 [3:16:46<206:06:43, 22.29s/it]

Batch 230 saved to /kaggle/working/batch_230.csv


Processing books:   2%|▏         | 751/34037 [3:16:46<145:05:58, 15.69s/it]

Error processing Fatherland (Paperback) by Robert   Harris: https://links.duckduckgo.com/d.js?q=Fatherland+%28Paperback%29+Robert+++Harris+book+review&kl=wt-wt&l=wt-wt&p=&s=0&df=&vqd=4-91934871852219133148408823060287659694&bing_market=wt-WT&ex=-1 202 Ratelimit


Processing books:   2%|▏         | 755/34037 [3:17:34<134:47:12, 14.58s/it]

In [None]:
# Prints a fixed marker string; the cell has no execution count, so it was not run in this saved version.
# NOTE(review): presumably a hand-written reminder of where the long scraping run
# was left off (possibly book index 2707 / batch 361) — confirm the meaning, then
# consider recording it in a markdown cell instead of a print, or removing the cell.
print("2707 batch 361")

In [None]:
# Prints the literal "test2"; the cell has no execution count, so it was not run in this saved version.
# NOTE(review): looks like a leftover smoke-test/debug cell with no effect on the
# analysis — confirm it is not needed and consider deleting it before sharing.
print("test2")