In [109]:
import pandas as pd
import requests
import time
import pickle
import ast


Two steps are necessary:
- For every isbn, retrieve the "work" ID from OpenLibrary
- For every work, retrieve the associated tags.

We can't directly access the tags from the ISBN.

# Get works

In [25]:
# Read the ISBN column as text so leading zeros are preserved and the values
# can later be joined into a query string. Malformed rows are skipped with a
# warning (on_bad_lines='warn') instead of aborting the load.
df = pd.read_csv('books.csv', on_bad_lines='warn', dtype={'isbn': str})

In [26]:
def get_book_info(isbns, timeout=30):
    """Fetch OpenLibrary "details" records for a batch of ISBNs.

    Parameters
    ----------
    isbns : iterable
        ISBNs to look up in a single request. Each item is converted with
        ``str`` before being joined into the ``bibkeys`` query value.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict or None
        The decoded JSON response on HTTP 200. An empty dict when ``isbns``
        is empty (no request is sent). ``None`` when the request fails or the
        server answers with a non-200 status; the problem is printed.
    """
    keys = [str(isbn) for isbn in isbns]
    if not keys:
        # Nothing to look up: avoid sending a request with an empty key list.
        return {}

    url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{','.join(keys)}&format=json&jscmd=details"
    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: print, return None.
        print(f"Error: {exc}")
        return None

    if response.status_code == 200:
        return response.json()
    print(f"Error: {response.status_code}")
    return None

# Query OpenLibrary in batches of 100 ISBNs.
# Missing ISBNs are dropped and values are cast to str so that each batch can
# be joined into a single query string.
isbns = df.isbn.dropna().astype(str).to_list()
book_info_list = []

for i in range(0, len(isbns), 100):
    if i > 0:
        # Pause between consecutive requests (never before the first one and
        # never after the last one).
        time.sleep(5)
    book_info = get_book_info(isbns[i:i+100])
    if book_info is None:
        # get_book_info already printed the failure; storing None would break
        # later code that iterates over the responses as dicts.
        continue
    book_info_list.append(book_info)


In [98]:
def extract_works(book_info_list):
    """Map each ISBN to the ID of its first OpenLibrary work.

    Parameters
    ----------
    book_info_list : iterable of (dict or None)
        Batches of responses keyed by bibkey (e.g. ``"ISBN:0123456789"``).
        ``None`` or empty batches are skipped.

    Returns
    -------
    dict
        ``{isbn: work_id}`` with the ``"ISBN:"`` prefix removed from the key
        and the ``"/works/"`` prefix removed from the value. Entries without a
        non-empty ``details -> works`` list are omitted.
    """
    works = {}
    for chunk in book_info_list:
        if not chunk:
            # A failed request may be stored as None; nothing to extract.
            continue
        for bibkey, info in chunk.items():
            work_refs = (info.get('details') or {}).get('works')
            if not work_refs:
                # Key missing or list empty: no work to link to this ISBN.
                continue
            isbn = bibkey.replace('ISBN:', '')
            works[isbn] = work_refs[0]['key'].replace('/works/', '')
    return works

# Build the ISBN -> work ID lookup from the batched API responses.
dict_isbn_works = extract_works(book_info_list)

In [102]:
# Two-column table: one row per ISBN together with its OpenLibrary work ID.
df_works = pd.DataFrame(
    {
        "isbn": list(dict_isbn_works),
        "work": list(dict_isbn_works.values()),
    }
)

In [103]:
# Persist the ISBN -> work table; with default options the DataFrame index is
# written as the first (unnamed) column of the CSV.
df_works.to_csv("works.csv")

In [105]:
# Distinct values per column; fewer works than ISBNs means several ISBNs map to the same work.
df_works.nunique()

isbn    11055
work     9900
dtype: int64

# Get tags

In [114]:
def get_tags(work, timeout=30):
    """Fetch the OpenLibrary JSON record for a single work.

    Parameters
    ----------
    work : str
        Work identifier as it appears in the OpenLibrary URL path.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict or None
        The decoded JSON on HTTP 200. ``None`` if the request fails or the
        server answers with another status; the problem is printed.
    """
    # Exactly one slash between the host and the "works" path segment.
    url = f"https://openlibrary.org/works/{work}.json"
    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: print, return None.
        print(f"Error: {exc}")
        return None

    if response.status_code == 200:
        return response.json()
    print(f"Error: {response.status_code}")
    return None

works_list = list(df_works["work"].unique())

# Reuse results already held in the kernel so an interrupted run can resume;
# on a fresh kernel start from an empty cache instead of raising NameError.
tags = globals().get("tags", {})

for i in works_list:
    if i in tags:
        continue
    response = get_tags(i)
    if response is None:
        # Request failed: leave the work out of the cache so a re-run retries it.
        continue
    tags[i] = {
        # OpenLibrary work records expose their subject list under "subjects".
        "tags": response.get("subjects"),
        "subject_people": response.get("subject_people"),
        "subject_times": response.get("subject_times"),
        "subject_places": response.get("subject_places"),
    }
    time.sleep(0.1)


In [None]:
# Pass every work's tag dictionary through flatten_dict, keyed by work ID.
# NOTE(review): flatten_dict is not defined in the visible cells — confirm it is in scope.
flattened_data = {work_id: flatten_dict(fields) for work_id, fields in tags.items()}

# One row per work; the work ID moves from the index into the first column.
df_tags = pd.DataFrame.from_dict(flattened_data, orient='index').reset_index()

# Assign short column names and write the table to disk.
df_tags.columns = ['work', 'subjects', 'people', 'times', 'places']
df_tags.to_csv("tags.csv")