<a href="https://colab.research.google.com/github/ranwiththecode/high-fantasy-data-analysis/blob/main/mining_current.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install requests pandas tqdm

import copy
import json
import os
import time

import pandas as pd
import requests
from google.colab import drive
from tqdm import tqdm

# Base name for the output files written at the end of the notebook; any
# spaces are turned into underscores when the file names are built.
BOOK_TITLE = "eragon_paolini"  # Replace with your book's title

# Mount Google Drive at /content/drive so the results can be saved under it
drive.mount('/content/drive')

def get_all_reviews(api_url, initial_payload, headers, *, session=None,
                    max_retries=3, page_delay=1.5, retry_delay=5):
    """Collect every review from a paginated GraphQL ``getReviews`` query.

    The payload is POSTed to ``api_url`` repeatedly; after each page the
    ``nextPageToken`` from the response is written to
    ``variables.pagination.after`` and the request is sent again, until the
    server stops returning a token.

    Args:
        api_url: URL of the GraphQL endpoint.
        initial_payload: Request body for the first page. It is deep-copied,
            so the caller's dict is never modified.
        headers: HTTP headers sent with every request.
        session: Optional object with a ``requests``-compatible ``post``
            method (for example a ``requests.Session``). Defaults to the
            ``requests`` module itself.
        max_retries: Number of consecutive failed requests after which
            collection stops.
        page_delay: Seconds to wait between successful page requests.
        retry_delay: Seconds to wait before retrying a failed request.

    Returns:
        A list of dicts, one per review: the fields of the review's ``node``
        with ``creator`` replaced by an "Anonymous" placeholder when the
        API returned no creator. If the API reports errors or the retry
        budget is exhausted, whatever was collected so far is returned.
    """
    http = requests if session is None else session

    # Work on a private copy so the caller's template is not left pointing
    # at a later page after this call.
    payload = copy.deepcopy(initial_payload)
    pagination = payload.setdefault('variables', {}).setdefault('pagination', {})

    all_reviews = []
    failures = 0

    while True:
        # Only the request and decoding are retried; nothing has been
        # appended yet at this point, so a retry cannot duplicate reviews.
        try:
            response = http.post(api_url, headers=headers, json=payload, timeout=15)
            data = response.json()
            if not isinstance(data, dict):
                raise ValueError(
                    f"unexpected response body of type {type(data).__name__}"
                )
        except Exception as e:
            failures += 1
            if failures >= max_retries:
                print(f"Error: {e}. Giving up after {failures} failed attempts.")
                break
            print(f"Error: {e}. Retrying...")
            time.sleep(retry_delay)
            continue
        failures = 0

        # Skip if error in response
        if 'errors' in data:
            print(f"Skipping page due to error: {data['errors']}")
            break

        # `or {}` also covers keys that are present but null.
        connection = (data.get('data') or {}).get('getReviews') or {}

        for edge in connection.get('edges') or []:
            try:
                node = edge['node']
                # Handle missing creator data
                creator = node['creator'] or {
                    'name': 'Anonymous',
                    'imageUrlSquare': None,
                }
            except (KeyError, TypeError) as e:
                print(f"Skipping malformed review: {e}")
                continue
            all_reviews.append({**node, 'creator': creator})

        # Pagination: no token (or no pageInfo at all) means the last page.
        next_token = (connection.get('pageInfo') or {}).get('nextPageToken')
        if not next_token:
            break

        pagination['after'] = next_token
        time.sleep(page_delay)

    return all_reviews

# Configuration
# GraphQL document sent with every page request. It asks for the total count,
# the review nodes and the token that points at the next page.
_REVIEWS_QUERY = """query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {
            getReviews(filters: $filters, pagination: $pagination) {
                totalCount
                edges {
                    node {
                        id
                        creator { name imageUrlSquare }
                        text
                        rating
                        createdAt
                        updatedAt
                    }
                }
                pageInfo { nextPageToken }
            }
        }"""

# Everything get_all_reviews() needs: endpoint, HTTP headers and the request
# body for the first page.
# NOTE(review): the API key is committed in plain text here.
config = dict(
    api_url="https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql",
    headers={
        "Content-Type": "application/json",
        "X-Api-Key": "da2-xpgsdydkbregjhpr6ejzqdhuwy",
    },
    payload_template=dict(
        operationName="getReviews",
        query=_REVIEWS_QUERY,
        variables=dict(
            filters=dict(
                resourceType="WORK",
                resourceId="",  # Enter resource ID
            ),
            pagination=dict(limit=30),
        ),
    ),
)

# Run the scraper
print("🚀 Starting review collection...")
reviews = get_all_reviews(
    config["api_url"],
    config["payload_template"],
    config["headers"],
)

# Process and save results. Everything that needs `df` stays inside this
# branch so an empty result does not fail on an undefined name.
if reviews:
    print(f"\n📊 Success! Collected {len(reviews)} reviews.")

    # get_all_reviews() returns the review nodes already flattened (there is
    # no 'node' wrapper), so the fields are read directly from each item.
    df = pd.json_normalize([
        {
            **r,
            'creator_name': r['creator'].get('name'),
            'creator_image': r['creator'].get('imageUrlSquare'),
        }
        for r in reviews
    ])

    # Save to Drive
    save_path = '/content/drive/MyDrive/Goodreads_Data/'
    os.makedirs(save_path, exist_ok=True)

    # Clean the title (replace spaces with underscores)
    clean_title = BOOK_TITLE.replace(" ", "_")

    csv_path = f'{save_path}{clean_title}_reviews.csv'
    json_path = f'{save_path}{clean_title}_reviews.json'
    df.to_csv(csv_path, index=False)
    df.to_json(json_path, indent=2)

    print(f"Saved as:\n{csv_path}\n{json_path}")
else:
    print("No reviews were collected; nothing was saved.")
