<a href="https://www.kaggle.com/code/nigamshitij/extract-amazon-ratings?scriptVersionId=193713960" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [50]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found,
# so the exact dataset filenames are visible before a later cell hard-codes one.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # dirname is the directory currently being visited; join it with the
        # bare filename to get a path that can be pasted into open()/read_csv.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/book-review-samples/goodreads_fantasy.csv
/kaggle/input/book-review-samples/goodreads_all_genres.csv
/kaggle/input/book-review-samples/Darth_Plagueis_reviews(4).csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews(4).csv


In [58]:
import csv
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import re

In [59]:
def get_amazon_link(goodreads_url):
    """Build the Goodreads "compare prices" redirect URL for a book page.

    No network request is made here: the numeric book id is read from the
    ``/show/<id>`` segment of ``goodreads_url`` and substituted into the
    ``book_link/follow/1`` redirect URL.
    NOTE(review): presumably store id ``1`` in that URL selects Amazon --
    confirm against Goodreads before relying on it.

    Args:
        goodreads_url: URL of a Goodreads book page containing ``/show/<digits>``.

    Returns:
        The redirect URL as a string (also printed), or ``None`` when the URL
        has no ``/show/<digits>`` segment or is not a string. This function
        reports problems by printing and does not raise for those cases.
    """
    try:
        id_match = re.search(r'/show/(\d+)', goodreads_url)
    except TypeError as e:
        # Non-string input (e.g. None from a missing CSV cell) cannot be searched.
        print(f"Error fetching Amazon link for {goodreads_url}: {str(e)}")
        return None

    if id_match is None:
        # Without a book id there is nothing to build the redirect from.
        print(f"No Amazon link found for {goodreads_url}")
        return None

    book_id = id_match.group(1)
    amazon_link = f"https://www.goodreads.com/book_link/follow/1?book_id={book_id}&source=compareprices"
    print(amazon_link)
    return amazon_link

In [100]:
def get_amazon_ratings_old(url):
    """Scrape rating data from the ``averageCustomerReviews`` block of a page.

    The page at ``url`` is fetched with a browser-like User-Agent and parsed
    with BeautifulSoup. Only the ``div`` with id ``averageCustomerReviews`` is
    inspected; if it is absent both values are reported as ``"N/A"``.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(star_rating, total_ratings)`` tuple of strings. Either element is
        ``"N/A"`` when it could not be located, and both are ``"N/A"`` when the
        request or the parsing raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        reviews_div = soup.find('div', {'id': 'averageCustomerReviews'})
        if not reviews_div:
            print("reviews_div didn't work")
            return "N/A", "N/A"

        # Star rating: prefer the visible text span, then fall back to the
        # "<rating> out of <max>" text stored in a title attribute.
        star_rating = "N/A"
        rating_text_span = reviews_div.find('span', {'class': 'a-size-base a-color-base'})
        if rating_text_span:
            star_rating = rating_text_span.text.strip()
        else:
            rating_span = reviews_div.find('span', {'class': 'reviewCountTextLinkedHistogram', 'title': True})
            if rating_span:
                star_rating = rating_span['title'].split(' out of ')[0]

        # Total count: cut at " rating" so both "1 rating" and "N ratings"
        # are handled, then drop thousands separators and padding.
        total_ratings = "N/A"
        count_span = reviews_div.find('span', {'id': 'acrCustomerReviewText'})
        if count_span:
            total_ratings = count_span.text.split(' rating')[0].replace(',', '').strip()

        return star_rating, total_ratings

    except Exception as e:
        # Best-effort scraper: report the failure and keep the caller running.
        print(f"Error fetching data for {url}: {str(e)}")
        return "N/A", "N/A"

In [101]:
def get_amazon_ratings(url):
    """Scrape the star rating and the number of ratings from a product page.

    The page at ``url`` is fetched with a browser-like User-Agent and parsed
    with BeautifulSoup. Several element lookups are tried in order for each
    value; the first lookup that yields a non-empty value wins.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(star_rating, total_ratings)`` tuple of strings. ``star_rating`` is
        the text before ``" out of "`` in the matched element (or its ``title``
        attribute); ``total_ratings`` is the first run of digits in the matched
        element with commas removed. Either element is ``"N/A"`` when nothing
        usable was found, and both are ``"N/A"`` when the request or the
        parsing raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Candidate lookups for the star rating, most specific text first.
        star_rating = None
        rating_methods = [
            lambda: soup.find('span', {'class': 'a-icon-alt'}),
            lambda: soup.find('span', {'data-hook': 'rating-out-of-text'}),
            lambda: soup.find('span', {'class': 'a-size-base a-color-base'}),
            lambda: soup.find('span', {'class': 'reviewCountTextLinkedHistogram', 'title': True})
        ]

        for method in rating_methods:
            rating_elem = method()
            if not rating_elem:
                continue
            if 'title' in rating_elem.attrs:
                raw_rating = rating_elem['title']
            else:
                raw_rating = rating_elem.text
            candidate = raw_rating.split(' out of ')[0].strip()
            # An element with empty text must not stop the search; keep
            # trying the remaining lookups instead.
            if candidate:
                star_rating = candidate
                break

        # Candidate lookups for the total number of ratings.
        total_ratings = None
        total_methods = [
            lambda: soup.find('span', {'id': 'acrCustomerReviewText'}),
            lambda: soup.find('span', {'data-hook': 'total-review-count'})
        ]

        for method in total_methods:
            total_elem = method()
            if not total_elem:
                continue
            # Take only the comma-grouped number so surrounding words or
            # punctuation do not leak into the value.
            count_match = re.search(r'\d[\d,]*', total_elem.text)
            if count_match:
                total_ratings = count_match.group(0).replace(',', '')
                break

        return star_rating or "N/A", total_ratings or "N/A"

    except Exception as e:
        # Best-effort scraper: report the failure and keep the caller running.
        print(f"Error fetching data for {url}: {str(e)}")
        return "N/A", "N/A"

In [102]:
def remove_duplicates(input_file, output_file):
    """Copy a CSV file, keeping only the first row for each (Title, Authors).

    The header of ``input_file`` is reused unchanged for ``output_file``, and
    surviving rows keep their original relative order.

    Args:
        input_file: Path of the UTF-8 CSV to read; it must have ``Title`` and
            ``Authors`` columns.
        output_file: Path of the UTF-8 CSV to create (overwritten if present).
    """
    emitted_keys = set()

    with open(input_file, 'r', newline='', encoding='utf-8') as source:
        with open(output_file, 'w', newline='', encoding='utf-8') as target:
            records = csv.DictReader(source)
            sink = csv.DictWriter(target, fieldnames=records.fieldnames)
            sink.writeheader()

            for record in records:
                # Two rows are the same book when title and authors both match.
                identity = (record['Title'], record['Authors'])
                if identity in emitted_keys:
                    continue
                emitted_keys.add(identity)
                sink.writerow(record)

In [103]:
def process_csv(input_file, output_file):
    """Write a de-duplicated copy of a book CSV enriched with Amazon columns.

    Steps:
      1. ``remove_duplicates`` writes a de-duplicated copy of ``input_file``
         to a temporary file in the current working directory.
      2. For every remaining row, ``get_amazon_link`` is called on the ``URL``
         column and, when it returns a link, ``get_amazon_ratings`` is called
         on that link.
      3. The row is written to ``output_file`` with three extra columns:
         ``Amazon URL``, ``Amazon Star Rating`` and ``Amazon Total Ratings``
         (``"N/A"`` where a value is unavailable).

    A random pause of 2-4 seconds follows every row. The temporary file is
    removed even if processing fails or is interrupted.

    Args:
        input_file: Path of the UTF-8 source CSV; it must have a ``URL`` column
            plus the columns ``remove_duplicates`` needs.
        output_file: Path of the UTF-8 CSV to create (overwritten if present).
    """
    temp_file = 'temp_no_duplicates.csv'

    try:
        # First, remove duplicates
        remove_duplicates(input_file, temp_file)

        with open(temp_file, 'r', newline='', encoding='utf-8') as infile, \
             open(output_file, 'w', newline='', encoding='utf-8') as outfile:

            reader = csv.DictReader(infile)
            fieldnames = reader.fieldnames + ['Amazon URL', 'Amazon Star Rating', 'Amazon Total Ratings']
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()

            for row in reader:
                goodreads_url = row['URL']
                amazon_url = get_amazon_link(goodreads_url)
                row['Amazon URL'] = amazon_url if amazon_url else "N/A"

                if amazon_url:
                    star_rating, total_ratings = get_amazon_ratings(amazon_url)
                    row['Amazon Star Rating'] = star_rating
                    row['Amazon Total Ratings'] = total_ratings
                else:
                    row['Amazon Star Rating'] = "N/A"
                    row['Amazon Total Ratings'] = "N/A"

                writer.writerow(row)
                # Each row costs several seconds; push it to disk right away
                # so an interrupted run keeps everything scraped so far.
                outfile.flush()

                # Add a delay to avoid overwhelming the servers
                time.sleep(random.uniform(2, 4))
    finally:
        # Clean up the temporary file on success, error, or interrupt alike.
        if os.path.exists(temp_file):
            os.remove(temp_file)

In [104]:
# Source CSV to enrich. It is one of the files listed by the first cell, and the
# functions above read its 'Title', 'Authors' and 'URL' columns.
path = '/kaggle/input/book-review-samples/goodreads_all_genres.csv'

In [105]:
# Entry point: run the full de-duplicate + scrape pipeline on `path`.
# process_csv pauses 2-4 seconds after every row, so the run time grows
# linearly with the number of rows in the input.
if __name__ == "__main__":
    input_file = path
    # Relative path, so the result is written to the current working directory.
    output_file = 'output_books_with_amazon_ratings.csv'
    process_csv(input_file, output_file)
    print("Processing complete. Check the output file.")

https://www.goodreads.com/book_link/follow/1?book_id=625603&source=compareprices
None ,  None


KeyboardInterrupt: 