In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import duckdb

import io
import os
import json

from datetime import datetime

import pprint
from tabulate import tabulate

from plydata import define, query, select, head, do, group_by, summarize, arrange, left_join, distinct

import mysql.connector
from sqlalchemy import create_engine

In [6]:
def show_table(table):
    """Print `table` as plain text in tabulate's 'presto' layout.

    Column headers are taken from the table's keys and the row index is
    left out of the rendering.
    """
    rendered = tabulate(
        table,
        headers='keys',
        showindex=False,
        tablefmt='presto',
    )
    print(rendered)

In [19]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from retrying import retry
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [20]:
# Create a custom session with a retry strategy
def create_session_with_retry(total=3, backoff_factor=0.3, status_forcelist=(500, 502, 503, 504)):
    """Build a `requests.Session` whose HTTP and HTTPS requests are retried.

    Args:
        total: Value passed to `Retry(total=...)`.
        backoff_factor: Value passed to `Retry(backoff_factor=...)`.
        status_forcelist: HTTP status codes passed to
            `Retry(status_forcelist=...)`.

    Returns:
        A `requests.Session` with one `HTTPAdapter` carrying the retry policy
        mounted for both the 'http://' and 'https://' prefixes.
    """
    session = requests.Session()
    retries = Retry(
        total=total,
        backoff_factor=backoff_factor,
        # Copy into a list so the default matches what was passed historically.
        status_forcelist=list(status_forcelist),
    )
    adapter = HTTPAdapter(max_retries=retries)
    # The same adapter instance serves both schemes.
    for prefix in ('http://', 'https://'):
        session.mount(prefix, adapter)
    return session

In [21]:
def get_session_content(url, content_type='html', session=None, timeout=None):
    """Send a GET request and return the response body as text or parsed JSON.

    Args:
        url: Address to request.
        content_type: 'html' returns `response.text`; 'json' returns
            `response.json()`.
        session: Optional object exposing a `get` method (for example a
            `requests.Session`). When None, `requests.get` is used.
        timeout: Optional timeout forwarded to the GET call. When None (the
            default) no timeout argument is passed at all.

    Returns:
        The response body (str, or the decoded JSON value) when the status
        code is 200. None when the status code is anything else or when
        `content_type` is not supported; a message is printed in both cases.
    """
    # Reject unknown content types before spending a network round trip;
    # previously this case fell through and returned None without any message.
    if content_type not in ('html', 'json'):
        print(f"Unsupported content_type {content_type!r}; expected 'html' or 'json'")
        return None

    # Only pass `timeout` when the caller asked for one, so session-like
    # objects whose `get` takes just a URL keep working.
    request_kwargs = {} if timeout is None else {'timeout': timeout}
    if session is None:
        response = requests.get(url, **request_kwargs)
    else:
        response = session.get(url, **request_kwargs)

    status_code = response.status_code
    if status_code != 200:
        print(f"GET request to {url} failed with status code: {status_code}")
        return None

    return response.text if content_type == 'html' else response.json()

In [22]:
def get_request_content(url, content_type='html'):
    """Fetch `url` without a shared session.

    Delegates to `get_session_content` with `session=None` and returns
    whatever that call returns.
    """
    content = get_session_content(url, content_type, session=None)
    return content

In [23]:
def get_page_content(url, content_type='html', browser=False):
    """Return the content of `url`, fetched over HTTP or via headless Chrome.

    Args:
        url: Address of the page to load.
        content_type: Passed to `get_request_content` when `browser` is
            False. Ignored when `browser` is True, where the page source is
            always returned.
        browser: When True, load the page in a headless Chrome instance and
            return `driver.page_source`. When False, return the result of
            `get_request_content(url, content_type)`.

    Returns:
        The page source string in browser mode; otherwise whatever
        `get_request_content` returns.
    """
    if not browser:
        return get_request_content(url, content_type)

    options = webdriver.ChromeOptions()
    # Run the browser in headless mode (no UI).
    options.add_argument("--headless")
    # Disable GPU acceleration to avoid potential issues.
    options.add_argument("--disable-gpu")
    # Every argument has to be added before the driver is constructed;
    # this one used to be added afterwards, so it was never applied.
    options.add_argument("--disable-software-rasterizer")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # quit() ends the whole driver session, and the finally clause makes
        # sure that happens even when loading the page raises.
        driver.quit()

In [24]:
from urllib.parse import urlencode

def build_url_with_parameters(base_url, params):
    """Append URL-encoded query parameters to `base_url`.

    Args:
        base_url: URL to extend. It may already contain a query string.
        params: Mapping (or sequence of pairs) accepted by
            `urllib.parse.urlencode`.

    Returns:
        `base_url` followed by the encoded parameters. The parameters are
        joined with '?' when `base_url` has no query string yet and with '&'
        when it does. `base_url` is returned unchanged when `params` encodes
        to an empty string.
    """
    encoded_params = urlencode(params)
    if not encoded_params:
        # Nothing to add; avoid leaving a dangling '?' on the URL.
        return base_url

    if '?' not in base_url:
        separator = '?'
    elif base_url.endswith(('?', '&')):
        # The URL already ends with a separator; do not double it.
        separator = ''
    else:
        separator = '&'

    return f"{base_url}{separator}{encoded_params}"

In [25]:
import random, string, uuid

In [36]:
%%time

# Scrape every result page of one keyword search and collect one record per
# (search result, manifestation item) pair in `search_result_data`.

# Page size used by the page-count arithmetic below.
results_per_page = 10

base_url = 'https://ccclib.bibliocommons.com/v2/search'
params = {
    'searchType': 'keyword',
    #'f_FORMAT': 'BK|PICTURE_BOOK|BOARD_BK|GRAPHIC_NOVEL',
    }
search_query = 'pete the cat'
params['query'] = search_query
request_ts = datetime.now()
# Request id: epoch seconds of request_ts, then three random digits, then one
# more digit derived from a random UUID.
request_id = int(str(int(datetime.timestamp(request_ts))) + ''.join(random.choices(string.digits, k=3)) + str(int(uuid.uuid4()) % 10))

page_num = 1
# Upper bound on the number of pages until the real count is read from page 1.
default_total_page_cnt = 1000

search_result_data = []

print("*" * 120)
print("Search Query: {}".format(search_query))
print("Search Request Timestamp: {}".format(request_ts))
print("Search Request Id: {}".format(request_id))
print("*" * 120)

while page_num <= default_total_page_cnt:

    params['page'] = page_num
    search_result_url = build_url_with_parameters(base_url, params)

    print("Featching page number {}: {}".format(page_num, search_result_url))

    start_ts = datetime.today()
    html_content = get_page_content(search_result_url, browser=True)
    bs = BeautifulSoup(html_content, 'lxml')

    if page_num == 1:
        pagination_label = bs.select_one('span.cp-pagination-label')
        if pagination_label is None:
            # Without the label the total is unknown; stop instead of failing
            # with an AttributeError on None.
            print("No pagination label found on page 1; stopping.")
            total_search_result_cnt = 0
            total_page_cnt = 0
            break
        # The total is the first token after the last 'of' in the label text.
        # Commas are stripped in case the count carries thousands separators.
        total_search_result_cnt = int(
            pagination_label.text.split('of')[-1].split()[0].replace(',', '')
        )
        total_page_cnt = (total_search_result_cnt - 1) // results_per_page + 1
        default_total_page_cnt = total_page_cnt

    search_results = bs.select('div.cp-search-result-item-content')

    # page_rank: 1-based position of the result within the current page.
    for page_rank, search_result in enumerate(search_results, start=1):

        title = search_result.select_one('h2.cp-title > a > span.title-content').text
        subtitle_tag = search_result.select_one('h2.cp-title:has(span.cp-subtitle) > span.cp-subtitle')
        subtitle = subtitle_tag.text if subtitle_tag else None
        # The stored title is the main title followed by the subtitle, if any.
        title = title + ' ' + subtitle if subtitle else title

        authors_tag = search_result.select_one('span.cp-author-link')
        authors = authors_tag.text if authors_tag else None

        items = search_result.select("div.manifestation-item")

        # position: 1-based index of the manifestation item within the result.
        for position, item in enumerate(items, start=1):
            # Look the link up once; both item_url and book_id come from it.
            item_link = item.select_one('div.manifestation-item-format-info-wrap > a')

            # A missing publication date is expected for some items, so test
            # for it explicitly rather than hiding every error behind a bare
            # except.
            publication_date_tag = item.select_one('span.cp-publication-date')
            if publication_date_tag is not None:
                publication_date = publication_date_tag.text.replace('-', '').strip()
            else:
                publication_date = None

            complete_ts = datetime.today()

            # Key order is kept stable because it determines the column order
            # of any DataFrame built from these records.
            search_result_json = {
                'id': request_id,
                'query': search_query,
                'srp_url': search_result_url,
                'page_rank': page_rank,
                'position': position,
                'page': page_num,
                'total_pages': total_page_cnt,
                'total_search_results': total_search_result_cnt,
                'title': title,
                'subtitle': subtitle,
                'authors': authors,
                'item_url': item_link['href'],
                'book_id': item_link['data-test-id'].replace('item-link-', ''),
                'format': item.select_one('span.cp-format-indicator').get_text(),
                'publication_date': publication_date,
                'book_info': item.select_one('span.cp-screen-reader-message').get_text(),
                'availability_status': item.select_one('span.cp-availability-status').get_text(),
                'start_ts': str(start_ts),
                'complete_ts': str(complete_ts),
            }

            search_result_data.append(search_result_json)

    page_num += 1

print("*" * 120)

************************************************************************************************************************
Search Query: pete the cat
Search Request Timestamp: 2024-02-13 23:10:19.327170
Search Request Id: 17078946195594
************************************************************************************************************************
Featching page number 1: https://ccclib.bibliocommons.com/v2/search?searchType=keyword&query=pete+the+cat&page=1
Featching page number 2: https://ccclib.bibliocommons.com/v2/search?searchType=keyword&query=pete+the+cat&page=2
Featching page number 3: https://ccclib.bibliocommons.com/v2/search?searchType=keyword&query=pete+the+cat&page=3
Featching page number 4: https://ccclib.bibliocommons.com/v2/search?searchType=keyword&query=pete+the+cat&page=4
Featching page number 5: https://ccclib.bibliocommons.com/v2/search?searchType=keyword&query=pete+the+cat&page=5
Featching page number 6: https://ccclib.bibliocommons.com/v2/search?searchType=

In [37]:
search_result_data

[{'id': 17078946195594,
  'query': 'pete the cat',
  'srp_url': 'https://ccclib.bibliocommons.com/v2/search?searchType=keyword&query=pete+the+cat&page=1',
  'page_rank': 1,
  'position': 1,
  'page': 1,
  'total_pages': 11,
  'total_search_results': 110,
  'title': 'Pete the Cat Super Pete',
  'subtitle': 'Super Pete',
  'authors': 'Dean, Kim',
  'item_url': '/item/show/1961434154',
  'book_id': 'S154C1961434',
  'format': 'Book',
  'publication_date': None,
  'book_info': 'Book, 2020. First Edition.. Call number: JE DEAN, K.',
  'availability_status': 'Available ',
  'start_ts': '2024-02-13 23:10:19.329544',
  'complete_ts': '2024-02-13 23:10:23.142780'},
 {'id': 17078946195594,
  'query': 'pete the cat',
  'srp_url': 'https://ccclib.bibliocommons.com/v2/search?searchType=keyword&query=pete+the+cat&page=1',
  'page_rank': 2,
  'position': 1,
  'page': 1,
  'total_pages': 11,
  'total_search_results': 110,
  'title': 'Pete the Cat I Love My White Shoes',
  'subtitle': 'I Love My White 

In [38]:
search_results = pd.DataFrame(search_result_data)

In [40]:
show_table(search_results >> head(10))

             id | query        | srp_url                                                                                 |   page_rank |   position |   page |   total_pages |   total_search_results | title                              | subtitle              | authors      | item_url              | book_id      | format                 | publication_date   | book_info                                                              | availability_status   | start_ts                   | complete_ts
----------------+--------------+-----------------------------------------------------------------------------------------+-------------+------------+--------+---------------+------------------------+------------------------------------+-----------------------+--------------+-----------------------+--------------+------------------------+--------------------+------------------------------------------------------------------------+-----------------------+----------------------------+---------------