In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import duckdb

import io
import os
import json

from datetime import datetime

import pprint
from tabulate import tabulate

from plydata import define, query, select, head, do, group_by, summarize, arrange, left_join, distinct

import mysql.connector
from sqlalchemy import create_engine

In [2]:
def show_table(table):
    """Print `table` as a presto-style text grid.

    Column keys are used as the header row and the index column is omitted.
    """
    rendered = tabulate(
        table,
        headers='keys',
        showindex=False,
        tablefmt='presto',
    )
    print(rendered)

In [3]:
import requests
from bs4 import BeautifulSoup

def get_request_url(url, timeout=30):
    """Fetch `url` with an HTTP GET and report the outcome as a dict instead of raising.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30. Without a timeout ``requests`` may block indefinitely
        on an unresponsive host.

    Returns
    -------
    dict
        ``{'success': True, 'content': <response text>}`` when the status code
        is below 400, otherwise ``{'success': False, 'error': <message>}``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.ok:  # True for any status code below 400
            return {'success': True, 'content': response.text}
        # More detailed error handling based on the status code can be added here
        return {'success': False, 'error': f"Error: Received status code {response.status_code}"}
    except requests.exceptions.HTTPError as http_err:
        return {'success': False, 'error': f'HTTP error occurred: {http_err}'}
    except requests.exceptions.Timeout as timeout_err:
        # Must precede ConnectionError: ConnectTimeout derives from both classes
        # and would otherwise be reported as a generic connection error.
        return {'success': False, 'error': f'Timeout error occurred: {timeout_err}'}
    except requests.exceptions.ConnectionError as conn_err:
        return {'success': False, 'error': f'Connection error occurred: {conn_err}'}
    except requests.exceptions.RequestException as req_err:
        return {'success': False, 'error': f'Error occurred: {req_err}'}

In [4]:
# Catalog record page to fetch; its last path segment is the record id.
book_url = 'https://ccclib.bibliocommons.com/v2/record/S154C1815241'

In [5]:
# The record id is whatever follows the final '/' in the record URL.
book_id = book_url.rsplit('/', 1)[-1]
book_id

'S154C1815241'

In [20]:
%%time
# Fetch the record page. Fail loudly on error so later cells never run against
# a missing (or stale, from an earlier run) `book_html_content`.
response = get_request_url(book_url)

if not response['success']:
    raise RuntimeError(f"Failed to fetch {book_url}: {response['error']}")

book_html_content = response['content']

CPU times: user 62.7 ms, sys: 5.42 ms, total: 68.2 ms
Wall time: 1.1 s


In [7]:
# Show only the start of the page; dumping the full HTML bloats the notebook output.
book_html_content[:500]



In [17]:
def _join_or_none(values, sep=','):
    """Join a non-empty list of strings with `sep`; return None for anything else."""
    if isinstance(values, list) and values:
        return sep.join(values)
    return None


def _parse_rating(bs):
    """Return ``(rating, reviews)`` from the star-rating screen-reader text.

    Either value is None when the rating element is absent or its text does
    not have the expected "<rating> out of ... based on <count> ..." shape.
    """
    rating_tag = bs.select_one('span.cp-rating-stars > span.cp-screen-reader-message')
    if rating_tag is None:
        return None, None

    rating_text = rating_tag.text
    try:
        # The rating is the last token before "out of".
        rating = float(rating_text.split('out of')[0].split()[-1])
    except (IndexError, ValueError):
        rating = None
    try:
        # The review count is the first token after "based on".
        reviews = int(rating_text.split('based on')[-1].split()[0])
    except (IndexError, ValueError):
        reviews = None
    return rating, reviews


def get_book_info(book_html_content, url=None):
    """Extract a flat dict of book details from a catalog record page.

    Parameters
    ----------
    book_html_content : str
        HTML of the record page.
    url : str, optional
        URL the HTML was fetched from; its last path segment is used as the
        record id. When omitted, the notebook-level ``book_url`` is used, which
        keeps existing ``get_book_info(book_html_content)`` calls working.

    Returns
    -------
    dict
        Keys: id, title, subtitle, format, authors, audiences, genres, rating,
        reviews, description, primary_language, total_copies, available_copies,
        on_hold_copies, on_order_copies, isbn, isbns, book_format_info (JSON
        text mapping format label -> record id), all_book_ids (comma-joined
        record ids), book_url and the related availability / comments /
        recommendations / lists URLs. ``rating`` and ``reviews`` are None when
        the page has no parsable rating text.
    """
    if url is None:
        # Fall back to the notebook global for backward compatibility.
        url = book_url

    book_id = url.split('/')[-1]
    bs = BeautifulSoup(book_html_content, 'lxml')

    rating, reviews = _parse_rating(bs)

    title = bs.select_one('div.title-wrapper > h1.cp-heading > span').text
    subtitle_tag = bs.select_one('div.title-wrapper > div.sub-title')
    subtitle = subtitle_tag.text if subtitle_tag else None
    if subtitle:
        title = title + ' ' + subtitle

    # The page embeds its data as JSON inside a <script> tag.
    script_tag = bs.find('script', {'type': 'application/json', 'data-iso-key': '_0'})
    json_data = json.loads(script_tag.string.strip())

    bibs = json_data['entities']['bibs'][book_id]
    brief_info = bibs['briefInfo']
    availability = bibs['availability']

    description = brief_info['description']
    primary_language = brief_info['primaryLanguage']

    total_copies = availability['totalCopies']
    available_copies = availability['availableCopies']
    on_hold_copies = availability['heldCopies'] if availability['heldCopies'] is not None else 0
    on_order_copies = availability['onOrderCopies'] if availability['onOrderCopies'] is not None else 0

    audiences = _join_or_none(brief_info['audiences'])

    isbns_list = brief_info['isbns']
    isbns = _join_or_none(isbns_list)
    isbn = isbns_list[0] if isbns is not None else None

    authors = _join_or_none(brief_info['authors'], sep=';')
    genres = _join_or_none(brief_info['genreForm'])

    book_format_tags = bs.select('div.format-chooser-capsule-desktop > div[data-key=format-chooser-capsule]')

    # The first capsule is treated as the record being viewed; only the part of
    # its label before the first comma is kept as the format name.
    book_format = book_format_tags[0].select_one('span.bib-details').text.split(',')[0]

    book_format_json = {book_format: book_id}

    # Remaining capsules: map each label to the record id taken from its link.
    for book_format_tag in book_format_tags[1:]:
        book_format_text = book_format_tag.select_one('div.manifestation-overview > span.bib-details').text
        book_format_a_tag = book_format_tag.select_one('a.format-chooser-capsule-content')
        book_format_id = (
            book_format_a_tag['href'].replace('https://ccclib.bibliocommons.com/v2/record/', '').strip()
            if book_format_a_tag
            else None
        )
        book_format_json[book_format_text] = book_format_id

    book_format_json = dict(sorted(book_format_json.items()))
    book_format_json_text = json.dumps(book_format_json)

    # A capsule without a link yields a None id; drop those so that sorting and
    # joining cannot raise TypeError.
    book_format_ids = sorted(fid for fid in book_format_json.values() if fid is not None)
    book_format_ids_text = ','.join(book_format_ids)

    return {
        'id': book_id,
        'title': title,
        'subtitle': subtitle,
        'format': book_format,

        'authors': authors,
        'audiences': audiences,
        'genres': genres,

        'rating': rating,
        'reviews': reviews,

        'description': description,
        'primary_language': primary_language,
        'total_copies': total_copies,
        'available_copies': available_copies,
        'on_hold_copies': on_hold_copies,
        'on_order_copies': on_order_copies,

        'isbn': isbn,
        'isbns': isbns,

        'book_format_info': book_format_json_text,
        'all_book_ids': book_format_ids_text,

        'book_url': url,
        'availability_details_url': f"https://gateway.bibliocommons.com/v2/libraries/ccclib/bibs/{book_id}/availability",
        'comments_url': f"https://ccclib.bibliocommons.com/v2/record/{book_id}/comments",
        'recommendations_url': f"https://ccclib.bibliocommons.com/v2/record/{book_id}/recommendations",
        'lists_url': f"https://ccclib.bibliocommons.com/v2/record/{book_id}/lists",
    }

In [19]:
%%time

# Parse the fetched record page into a flat dict and display it.
book_json = get_book_info(book_html_content)
book_json

CPU times: user 39.4 ms, sys: 2.57 ms, total: 42 ms
Wall time: 41 ms


{'id': 'S154C1815241',
 'title': 'Pete the Cat I Love My White Shoes',
 'subtitle': 'I Love My White Shoes',
 'format': 'eBook',
 'authors': 'Litwin, Eric',
 'audiences': 'JUVENILE',
 'genres': 'Electronic books,Fiction',
 'rating': 4.45,
 'reviews': 975,
 'description': 'As he walks down the street, Pete the cat sings about his brand new white shoes as they change from red to blue to brown to wet.',
 'primary_language': 'eng',
 'total_copies': 2,
 'available_copies': 0,
 'on_hold_copies': 1,
 'on_order_copies': 0,
 'isbn': '9780062065605',
 'isbns': '9780062065605',
 'book_format_info': '{"Picture Book": "S154C1210734", "eBook": "S154C1815241"}',
 'all_book_ids': 'S154C1210734,S154C1815241',
 'book_url': 'https://ccclib.bibliocommons.com/v2/record/S154C1815241',
 'availability_details_url': 'https://gateway.bibliocommons.com/v2/libraries/ccclib/bibs/S154C1815241/availability',
 'comments_url': 'https://ccclib.bibliocommons.com/v2/record/S154C1815241/comments',
 'recommendations_url': 