<a href="https://colab.research.google.com/github/ranwiththecode/high-fantasy-data-analysis/blob/main/mining_current.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install requests pandas tqdm

import requests
import json
import time
import pandas as pd
from tqdm import tqdm
from google.colab import drive

# Label for the book being scraped; it is used later in this cell as the
# prefix of the output CSV/JSON file names.
BOOK_TITLE = "eragon_paolini"  # Replace with your book's title

# Mount Google Drive at /content/drive (results are later saved under this mount point)
drive.mount('/content/drive')

def get_all_reviews(api_url, initial_payload, headers, max_retries=5):
    """Collect every review exposed by a paginated ``getReviews`` GraphQL query.

    Pages are requested one after another, following ``pageInfo.nextPageToken``
    until the server stops returning a token.

    Args:
        api_url: URL of the GraphQL endpoint that receives the POST requests.
        initial_payload: Request body (``operationName``, ``query``,
            ``variables``). It is deep-copied, so the caller's dict is never
            modified while paginating.
        headers: HTTP headers sent with every request.
        max_retries: Number of consecutive failed attempts (network error,
            non-JSON body, body that is not a GraphQL result) tolerated for a
            single page before giving up.

    Returns:
        A list of review dicts. Each one is the ``node`` of an edge, with
        ``creator`` replaced by an anonymous placeholder when the API returned
        no creator. If collection stops early, the reviews gathered so far are
        still returned.
    """
    from copy import deepcopy  # local import: not part of the notebook's import cell

    # Work on a private copy so the pagination cursor never leaks into the
    # caller's payload template (which would make a second run start mid-way).
    payload = deepcopy(initial_payload)
    pagination = payload.setdefault('variables', {}).setdefault('pagination', {})

    all_reviews = []
    failures = 0

    while True:
        # --- Fetch one page; only transport/decoding problems are retried ---
        try:
            response = requests.post(api_url, headers=headers, json=payload, timeout=15)
            data = response.json()
            if not isinstance(data, dict) or not ('data' in data or 'errors' in data):
                raise ValueError(f"response is not a GraphQL result: {data!r}")
        except Exception as e:
            failures += 1
            if failures > max_retries:
                # Bounded retries: a permanently failing endpoint must not
                # spin forever.
                print(f"Error: {e}. Giving up after {max_retries} retries.")
                break
            print(f"Error: {e}. Retrying...")
            time.sleep(5)
            continue
        failures = 0

        errors = data.get('errors')
        page = (data.get('data') or {}).get('getReviews')

        # No usable data at all: nothing to process and no cursor to follow.
        if not isinstance(page, dict):
            print(f"Skipping page due to error: {errors}")
            break

        # GraphQL may return field-level errors next to valid data (e.g. a
        # review whose creator could not be resolved). Keep the data.
        if errors:
            print(f"Partial API errors on this page (continuing): {errors}")

        # --- Process reviews; a bad entry is skipped, never retried ---
        for edge in page.get('edges') or []:
            try:
                node = edge['node']
                # Handle missing creator data
                creator = node['creator'] or {
                    'name': 'Anonymous',
                    'imageUrlSquare': None
                }
                all_reviews.append({**node, 'creator': creator})
            except (KeyError, TypeError) as e:
                print(f"Skipping malformed review: {e}")

        # --- Pagination ---
        next_token = (page.get('pageInfo') or {}).get('nextPageToken')
        if not next_token:
            break

        pagination['after'] = next_token
        time.sleep(1.5)  # pause between pages

    return all_reviews

# Configuration
# GraphQL document sent with every request: asks for the total count, one page
# of review nodes, and the token that identifies the next page.
_REVIEWS_QUERY = """query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {
            getReviews(filters: $filters, pagination: $pagination) {
                totalCount
                edges {
                    node {
                        id
                        creator { name imageUrlSquare }
                        text
                        rating
                        createdAt
                        updatedAt
                    }
                }
                pageInfo { nextPageToken }
            }
        }"""

# Everything the scraper needs: endpoint, HTTP headers, and the request body
# template (the book is selected through `variables.filters.resourceId`).
# NOTE(review): the API key is committed in plain text — consider loading it
# from an environment variable or a secrets store instead.
config = dict(
    api_url="https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql",
    headers={
        "Content-Type": "application/json",
        "X-Api-Key": "da2-xpgsdydkbregjhpr6ejzqdhuwy",
    },
    payload_template={
        "operationName": "getReviews",
        "query": _REVIEWS_QUERY,
        "variables": {
            "filters": {
                "resourceType": "WORK",
                "resourceId": "kca://work/amzn1.gr.work.v1.xpxwQN_UlodQc9lPpilxhg",
            },
            "pagination": {"limit": 30},
        },
    },
)

# Run the scraper
import os  # local to this step: used to create the output folder

print("üöÄ Starting review collection...")
reviews = get_all_reviews(
    config["api_url"],
    config["payload_template"],
    config["headers"]
)

# Process and save results. Everything that needs `df` lives inside the
# `if`, so an empty crawl ends with a message instead of a NameError.
if reviews:
    print(f"\nüìä Success! Collected {len(reviews)} reviews.")

    # get_all_reviews returns the review nodes themselves (already unwrapped
    # from their edges), so read the fields directly; a still-wrapped
    # {'node': ...} record is accepted as well.
    records = []
    for r in reviews:
        node = r.get('node', r)
        creator = node.get('creator') or {}
        records.append({
            **node,
            'creator_name': creator.get('name'),
            'creator_image': creator.get('imageUrlSquare')
        })
    df = pd.json_normalize(records)

    # Save to Drive
    save_path = '/content/drive/MyDrive/Goodreads_Data/'
    os.makedirs(save_path, exist_ok=True)

    # Clean the title (replace spaces with underscores)
    clean_title = BOOK_TITLE.replace(" ", "_")

    df.to_csv(f'{save_path}{clean_title}_reviews.csv', index=False)
    df.to_json(f'{save_path}{clean_title}_reviews.json', indent=2)

    print(f"Saved as:\n{save_path}{clean_title}_reviews.csv\n{save_path}{clean_title}_reviews.json")
else:
    print("No reviews were collected; nothing was saved.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üöÄ Starting review collection...


Collecting reviews:   2%|‚ñè         | 600/31195 [00:49<47:44, 10.68review/s]

üìñ Collected 600 reviews so far...


Collecting reviews:   4%|‚ñç         | 1200/31195 [01:42<48:16, 10.35review/s]

üìñ Collected 1200 reviews so far...


Collecting reviews:   6%|‚ñå         | 1800/31195 [02:35<47:27, 10.32review/s]

üìñ Collected 1800 reviews so far...


Collecting reviews:   8%|‚ñä         | 2400/31195 [03:29<46:52, 10.24review/s]

üìñ Collected 2400 reviews so far...


Collecting reviews:  10%|‚ñâ         | 3000/31195 [04:24<48:02,  9.78review/s]

üìñ Collected 3000 reviews so far...


Collecting reviews:  12%|‚ñà‚ñè        | 3600/31195 [05:18<44:07, 10.42review/s]

üìñ Collected 3600 reviews so far...


Collecting reviews:  13%|‚ñà‚ñé        | 4200/31195 [06:13<46:31,  9.67review/s]

üìñ Collected 4200 reviews so far...


Collecting reviews:  15%|‚ñà‚ñå        | 4800/31195 [07:11<44:25,  9.90review/s]

üìñ Collected 4800 reviews so far...


Collecting reviews:  17%|‚ñà‚ñã        | 5400/31195 [08:06<43:21,  9.92review/s]

üìñ Collected 5400 reviews so far...


Collecting reviews:  19%|‚ñà‚ñâ        | 6000/31195 [09:02<42:04,  9.98review/s]

üìñ Collected 6000 reviews so far...


Collecting reviews:  21%|‚ñà‚ñà        | 6600/31195 [09:56<40:01, 10.24review/s]

üìñ Collected 6600 reviews so far...


Collecting reviews:  23%|‚ñà‚ñà‚ñé       | 7199/31195 [10:50<38:57, 10.27review/s]

üìñ Collected 7199 reviews so far...


Collecting reviews:  25%|‚ñà‚ñà‚ñå       | 7799/31195 [11:45<38:46, 10.06review/s]

üìñ Collected 7799 reviews so far...


Collecting reviews:  27%|‚ñà‚ñà‚ñã       | 8399/31195 [12:38<37:33, 10.12review/s]

üìñ Collected 8399 reviews so far...


Collecting reviews:  29%|‚ñà‚ñà‚ñâ       | 8999/31195 [13:33<36:03, 10.26review/s]

üìñ Collected 8999 reviews so far...


Collecting reviews:  31%|‚ñà‚ñà‚ñà       | 9598/31195 [14:26<34:48, 10.34review/s]

üìñ Collected 9598 reviews so far...


Collecting reviews:  33%|‚ñà‚ñà‚ñà‚ñé      | 10198/31195 [15:19<33:46, 10.36review/s]

üìñ Collected 10198 reviews so far...


Collecting reviews:  35%|‚ñà‚ñà‚ñà‚ñç      | 10798/31195 [16:13<33:05, 10.27review/s]

üìñ Collected 10798 reviews so far...


Collecting reviews:  37%|‚ñà‚ñà‚ñà‚ñã      | 11398/31195 [17:06<32:05, 10.28review/s]

üìñ Collected 11398 reviews so far...


Collecting reviews:  38%|‚ñà‚ñà‚ñà‚ñä      | 11998/31195 [18:00<30:59, 10.32review/s]

üìñ Collected 11998 reviews so far...


Collecting reviews:  40%|‚ñà‚ñà‚ñà‚ñà      | 12598/31195 [18:53<31:06,  9.96review/s]

üìñ Collected 12598 reviews so far...


Collecting reviews:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 13198/31195 [19:48<30:30,  9.83review/s]

üìñ Collected 13198 reviews so far...


Collecting reviews:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 13798/31195 [20:45<28:37, 10.13review/s]

üìñ Collected 13798 reviews so far...


Collecting reviews:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 14398/31195 [21:41<28:03,  9.98review/s]

üìñ Collected 14398 reviews so far...


Collecting reviews:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 14998/31195 [22:37<26:28, 10.20review/s]

üìñ Collected 14998 reviews so far...


Collecting reviews:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 15598/31195 [23:32<26:14,  9.91review/s]

üìñ Collected 15598 reviews so far...


Collecting reviews:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 16198/31195 [24:26<23:48, 10.50review/s]

üìñ Collected 16198 reviews so far...


Collecting reviews:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 16798/31195 [25:22<23:55, 10.03review/s]

üìñ Collected 16798 reviews so far...


Collecting reviews:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 17398/31195 [26:16<22:54, 10.04review/s]

üìñ Collected 17398 reviews so far...


Collecting reviews:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 17998/31195 [27:10<21:33, 10.20review/s]

üìñ Collected 17998 reviews so far...


Collecting reviews:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 18598/31195 [28:04<20:17, 10.35review/s]

üìñ Collected 18598 reviews so far...


Collecting reviews:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 19198/31195 [28:58<19:28, 10.27review/s]

üìñ Collected 19198 reviews so far...


Collecting reviews:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19798/31195 [29:52<18:38, 10.19review/s]

üìñ Collected 19798 reviews so far...


Collecting reviews:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 20398/31195 [30:47<18:23,  9.79review/s]

üìñ Collected 20398 reviews so far...


Collecting reviews:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 20998/31195 [31:42<17:00, 10.00review/s]

üìñ Collected 20998 reviews so far...


Collecting reviews:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 21598/31195 [32:36<15:26, 10.36review/s]

üìñ Collected 21598 reviews so far...


Collecting reviews:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 22198/31195 [33:29<14:25, 10.39review/s]

üìñ Collected 22198 reviews so far...


Collecting reviews:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 22798/31195 [34:23<13:36, 10.29review/s]

üìñ Collected 22798 reviews so far...


Collecting reviews:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 23398/31195 [35:16<12:49, 10.13review/s]

üìñ Collected 23398 reviews so far...


Collecting reviews:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23998/31195 [36:10<11:34, 10.36review/s]

üìñ Collected 23998 reviews so far...


Collecting reviews:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 24598/31195 [37:03<10:53, 10.10review/s]

üìñ Collected 24598 reviews so far...


Collecting reviews:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 24838/31195 [37:26<09:35, 11.05review/s]


‚ö†Ô∏è API Error: [{'path': ['getReviews', 'edges', 21, 'node', 'creator'], 'data': None, 'errorType': 'MappingTemplate', 'errorInfo': None, 'locations': [{'line': 7, 'column': 25, 'sourceName': None}], 'message': 'Template transformation yielded an empty response.'}]

üìä Success! Collected 24838 reviews.
Saved as:
/content/drive/MyDrive/Goodreads_Data/eragon_paolini_reviews.csv
/content/drive/MyDrive/Goodreads_Data/eragon_paolini_reviews.json


In [None]:
# ===== 1. RESET ENVIRONMENT =====
# Clean slate installation: force-reinstall a pinned set of packages over
# whatever versions are already present.
# NOTE(review): a later cell in this notebook pins numpy==1.26.4 instead and
# its output shows `from scipy.linalg import triu` failing under
# scipy 1.13.1 — confirm which set of pins is actually intended.
!pip install --upgrade --force-reinstall \
    numpy==2.0.2 \
    scipy==1.13.1 \
    pandas==2.2.2 \
    google-cloud-bigquery==3.31.0 \
    beautifulsoup4==4.12.3 \
    emoji==2.11.0 \
    textblob==0.17.1 \
    vaderSentiment==3.3.2 \
    gensim==4.3.2 \
    pyLDAvis==3.4.1 \
    --quiet

# Verify core packages by printing the versions the kernel sees.
# NOTE(review): modules imported before the reinstall presumably stay loaded
# at their old version until the runtime is restarted — confirm.
import numpy, scipy, pandas
print(f"Versions: numpy {numpy.__version__}, scipy {scipy.__version__}, pandas {pandas.__version__}")

# ===== 2. FIX SPACY CONFLICT =====
# Isolate our environment (optional but recommended)
# NOTE(review): each `!` line runs in its own shell process, so the
# `source .../activate` below presumably has no effect on later lines or on
# the running kernel — confirm whether this virtualenv is used at all.
!python -m pip install --user --ignore-installed virtualenv
!python -m virtualenv colab_env
!source colab_env/bin/activate

# ===== 3. DRIVE MOUNT =====
# Remount Google Drive at /content/drive, forcing a remount if it is
# already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
print("‚úÖ Environment ready")

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m60.9/60.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m40.5/40.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m57.7/57.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m19.5/19.5 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m13.0/13.0 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
# ===== 1. CLEAN INSTALLATION =====
# First uninstall problematic packages
!pip uninstall -y numpy scipy pandas gensim

# Install stable versions
!pip install \
    numpy==1.26.4 \
    scipy==1.13.1 \
    pandas==2.2.2 \
    google-cloud-bigquery==3.31.0 \
    beautifulsoup4==4.12.3 \
    emoji==2.11.0 \
    textblob==0.17.1 \
    vaderSentiment==3.3.2 \
    gensim==4.3.2 \
    pyLDAvis==3.4.1 \
    --quiet

# ===== 2. VERIFY INSTALLS =====
import numpy as np
import scipy
import pandas as pd
from scipy.linalg import triu  # Previously problematic import

print(f"Versions: numpy {np.__version__}, scipy {scipy.__version__}, pandas {pd.__version__}")
print("‚úÖ All packages installed correctly")

# ===== 3. MOUNT DRIVE =====
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scipy 1.13.1
Uninstalling scipy-1.13.1:
  Successfully uninstalled scipy-1.13.1
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy 3.8.7 requires thinc<8.4.0,>=8.3.4, which is not installed.
google-colab 1.0.0 requires google-auth==2.38.0, but you have google-auth 2.40.3 which is incompatible.
google-colab 1.0.0 requires requests==2.32.3, but you have requests 2.32.4 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.0 which is incompatible.
tensorflow 2.18.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.31.1 which is incompatible.
ydf 0.12.0 requires protobuf<6.0.0,>=5.29.1, but you have pr

ImportError: cannot import name 'triu' from 'scipy.linalg' (/usr/local/lib/python3.11/dist-packages/scipy/linalg/__init__.py)