<a href="https://colab.research.google.com/github/ranwiththecode/high-fantasy-data-analysis/blob/main/mining_current.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Install required packages.
# NOTE: the leading "!" is an IPython/Colab shell escape, so this line only
# works when the file is run as a notebook cell, not as a plain Python script.
!pip install requests pandas tqdm

import copy
import json
import os
import time

import pandas as pd
import requests
from google.colab import drive
from tqdm import tqdm

# Title of the book being scraped. It is only used at the end of the script to
# build the output file names (spaces are replaced with underscores there).
BOOK_TITLE = "tricksters_choice"  # Replace with your book's title

# Mount Google Drive at /content/drive; the results are later saved under it.
drive.mount('/content/drive')

def get_all_reviews(api_url, initial_payload, headers, max_retries=5):
    """Collect every review from a paginated GraphQL ``getReviews`` endpoint.

    Pages are requested one at a time with ``requests.post``. The
    ``nextPageToken`` of each response is sent back as the ``after`` cursor of
    the next request, and the loop stops when no token is returned.

    Args:
        api_url: URL of the GraphQL endpoint.
        initial_payload: Request body for the first page. It is deep-copied,
            so the caller's dict is never modified.
        headers: HTTP headers sent with every request.
        max_retries: Number of consecutive failed attempts (network error or
            undecodable response body) that are retried before giving up.

    Returns:
        A list of ``{'rating': ..., 'text': ...}`` dicts, one per review.
        Collection is best-effort: when the API reports errors, returns an
        unexpected structure, or keeps failing, the reviews gathered so far
        are returned instead of raising.
    """
    # Work on a private copy so the pagination cursor does not leak into the
    # caller's template (which would make a second run start mid-way).
    payload = copy.deepcopy(initial_payload)
    pagination = payload.setdefault('variables', {}).setdefault('pagination', {})

    all_reviews = []
    total_reviews = None
    failures = 0  # consecutive transport/decoding failures for the current page

    # Initialize progress bar
    with tqdm(desc="Collecting reviews", unit="page") as pbar:
        while True:
            # Only the network call and the JSON decoding are retried; errors
            # in the response *content* would repeat on every retry.
            try:
                response = requests.post(api_url, headers=headers, json=payload, timeout=15)
                data = response.json()
            except requests.exceptions.RequestException as e:
                problem = f"Request failed: {e}"
            except ValueError as e:  # json.JSONDecodeError is a ValueError
                problem = f"Failed to decode JSON: {e}"
            else:
                problem = None

            if problem is not None:
                failures += 1
                if failures > max_retries:
                    print(f"\n{problem}. Giving up after {max_retries} retries.")
                    break
                print(f"\n{problem}. Retrying...")
                time.sleep(5)
                continue
            failures = 0

            if not isinstance(data, dict):
                print("\nUnexpected response (not a JSON object). Stopping.")
                break

            # Stop if the API reported an error for this page
            if 'errors' in data:
                print(f"\nSkipping page due to error: {data['errors']}")
                break

            result = (data.get('data') or {}).get('getReviews')
            if not isinstance(result, dict):
                print("\nUnexpected response (no 'getReviews' data). Stopping.")
                break

            # Get total count on first page and size the progress bar from it
            if total_reviews is None:
                total_reviews = result.get('totalCount') or 0
                limit = pagination.get('limit')
                if isinstance(limit, int) and limit > 0:
                    # Ceiling division; at least one page is always fetched.
                    pbar.total = max(1, (total_reviews + limit - 1) // limit)
                print(f"\nTotal reviews to fetch: ~{total_reviews}")

            edges = result.get('edges') or []

            # Process reviews
            for edge in edges:
                try:
                    node = edge['node']
                    all_reviews.append({
                        'rating': node['rating'],
                        'text': node['text']
                    })
                except (KeyError, TypeError) as e:
                    print(f"\nSkipping malformed review: {e}")

            # Update progress bar
            pbar.update(1)
            pbar.set_postfix({
                'reviews': len(all_reviews),
                'last_page': len(edges)
            })

            # Pagination logic: no token means this was the last page
            next_token = (result.get('pageInfo') or {}).get('nextPageToken')
            if not next_token:
                break

            pagination['after'] = next_token
            time.sleep(1.5)  # Be gentle with the API

    return all_reviews

# Configuration: endpoint, HTTP headers and the GraphQL request body used for
# the first page of reviews.
# NOTE(review): the API key is hard-coded here; consider whether it should be
# kept out of version control.
config = dict(
    api_url="https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql",
    headers={
        "Content-Type": "application/json",
        "X-Api-Key": "da2-xpgsdydkbregjhpr6ejzqdhuwy",
    },
    payload_template=dict(
        operationName="getReviews",
        # Requests the total count, each review's text and rating, and the
        # token needed to ask for the following page.
        query="""query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {
            getReviews(filters: $filters, pagination: $pagination) {
                totalCount
                edges {
                    node {
                        text
                        rating
                    }
                }
                pageInfo { nextPageToken }
            }
        }""",
        variables=dict(
            filters=dict(
                resourceType="WORK",
                resourceId="",  # Enter resource ID
            ),
            pagination=dict(limit=30),  # reviews requested per page
        ),
    ),
)

# Start the collection, handing the scraper the endpoint, the first-page
# request body and the HTTP headers defined in `config`.
print("🚀 Starting review collection...")
reviews = get_all_reviews(
    api_url=config["api_url"],
    initial_payload=config["payload_template"],
    headers=config["headers"],
)

# Process and save results: write the collected reviews to Google Drive as
# both CSV and JSON, or report that nothing was collected.
if reviews:
    print(f"\n📊 Success! Collected {len(reviews)} reviews.")

    # Create DataFrame directly from the collected reviews
    df = pd.DataFrame(reviews)

    # Make sure the target folder on Drive exists. os.makedirs is used instead
    # of a "!mkdir" shell escape so the path is never interpreted by a shell
    # and the code is valid outside a notebook cell as well.
    save_path = '/content/drive/MyDrive/Goodreads_Data/'
    os.makedirs(save_path, exist_ok=True)

    # Clean the title (replace spaces with underscores)
    clean_title = BOOK_TITLE.replace(" ", "_")

    # Build each output path once so the saved and the reported paths cannot
    # drift apart.
    csv_path = f'{save_path}{clean_title}_reviews.csv'
    json_path = f'{save_path}{clean_title}_reviews.json'

    df.to_csv(csv_path, index=False)
    df.to_json(json_path, indent=2)

    print(f"Saved as:\n{csv_path}\n{json_path}")
else:
    print("\n❌ No reviews were collected. Check the resource ID and API connection.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🚀 Starting review collection...


Collecting reviews:   2%|▏         | 1/58 [00:00<00:26,  2.17page/s, reviews=30, last_page=30]


Total reviews to fetch: ~1738


Collecting reviews: 100%|██████████| 58/58 [01:45<00:00,  1.81s/page, reviews=1738, last_page=28]


📊 Success! Collected 1738 reviews.
Saved as:
/content/drive/MyDrive/Goodreads_Data/tricksters_choice_reviews.csv
/content/drive/MyDrive/Goodreads_Data/tricksters_choice_reviews.json



