## Notebook to obtain main subjects of each book (if possible) using a public API

In [2]:
import pandas as pd 
import numpy as np 
import requests

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
def get_summary(isbn):
    """Look up a book's title and subject names on Open Library by ISBN.

    Parameters
    ----------
    isbn : str or int
        Value interpolated into the ``bibkeys=ISBN:<isbn>`` query parameter.

    Returns
    -------
    dict
        ``{"title": str, "summary": list[str]}``. ``summary`` is always a list
        so callers can safely ``", ".join(...)`` it:

        * the subject names on success (``["No Summary Available"]`` when the
          record has no subjects),
        * ``["Not Found"]`` (title ``"Not Found"``) when the API has no record
          for this ISBN,
        * ``["API Error"]`` (title ``"Error"``) on a non-200 status, a failed
          request, or a response body that is not valid JSON.
    """
    url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json"
    api_error = {"title": "Error", "summary": ["API Error"]}

    try:
        # A timeout keeps one stalled connection from hanging a long batch run.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return api_error
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or undecodable body: report it like any other API
        # error instead of aborting the caller's loop.
        return api_error

    book_key = f"ISBN:{isbn}"
    if book_key not in data:
        return {"title": "Not Found", "summary": ["Not Found"]}

    book_data = data[book_key]
    title = book_data.get("title", "No Title Available")
    subjects = book_data.get("subjects", [])
    if subjects:
        summary = [subject.get("name", "No Subject Name") for subject in subjects]
    else:
        summary = ["No Summary Available"]
    return {"title": title, "summary": summary}

In [4]:
# Load the book metadata and the train/test tables from the Data/ folder.
books, train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/books.csv", "Data/train.csv", "Data/test.csv")
)

In [5]:
# Attach each training row's book metadata via an inner join on the shared `book_id` key.
newTrain = pd.merge(train, books, how="inner", on="book_id")
newTrain.head()

Unnamed: 0,book_id,user_id,rating,ISBN
0,7260,20145,3.5,316171638
1,243238,85182,4.0,316166685
2,9135,45973,1.0,446692298
3,18671,63554,3.0,440944597
4,243293,81002,5.0,439244196


In [6]:
# Attach each test row's book metadata via an inner join on the shared `book_id` key.
newTest = pd.merge(test, books, how="inner", on="book_id")
newTest.head()

Unnamed: 0,id,book_id,user_id,ISBN
0,0,3786,40484,375504397
1,1,1985,47039,449212602
2,2,2290,60111,385484518
3,3,118657,64447,380776162
4,4,1560,2953,452281784


In [7]:
# Distinct ISBN values per split (arrays in order of first appearance), so each book is looked up once.
train_isbn = pd.unique(newTrain["ISBN"])
test_isbn = pd.unique(newTest["ISBN"])

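Some ISBN values are invalid: they are either NaN or have lost their leading zeros, so they are not 10 characters long. The cells below find these values, replace NaN with the placeholder "Not Found", and left-pad the remaining short values with zeros to 10 characters.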

In [9]:
# Collect (position, value) pairs for every training ISBN whose text form is not exactly 10 characters.
not_valid_isbn_train = [
    (position, value)
    for position, value in enumerate(train_isbn)
    if len(str(value)) != 10
]
len(not_valid_isbn_train)

3612

In [10]:
# Collect (position, value) pairs for every test ISBN whose text form is not exactly 10 characters.
not_valid_isbn_test = [
    (position, value)
    for position, value in enumerate(test_isbn)
    if len(str(value)) != 10
]
len(not_valid_isbn_test)

563

In [11]:
# Repair the flagged training ISBNs in place: NaN becomes the "Not Found"
# placeholder, anything else is left-padded with zeros to 10 characters.
for position, raw_value in not_valid_isbn_train:
    as_text = str(raw_value)
    train_isbn[position] = "Not Found" if as_text == "nan" else as_text.zfill(10)
print(len(train_isbn))

15421


In [12]:
# Repair the flagged test ISBNs in place: NaN becomes the "Not Found"
# placeholder, anything else is left-padded with zeros to 10 characters.
for position, raw_value in not_valid_isbn_test:
    as_text = str(raw_value)
    test_isbn[position] = "Not Found" if as_text == "nan" else as_text.zfill(10)
print(len(test_isbn))

9348


In [30]:
# Sample lookup for a single ISBN, showing the {"title": ..., "summary": [...]} structure returned
get_summary(isbn=train_isbn[90])

{'title': "Bridget Jones's Diary",
 'summary': ['single women',
  'Diary fiction',
  'Humorous fiction',
  'Fiction',
  'Humorous stories',
  'human relationships',
  'Humour',
  'Dieting for women',
  'Publishing',
  'English Humous fiction',
  'Women',
  'English Diary novels',
  'Open Library Staff Picks',
  'Triangles (Interpersonal relations)',
  'Office romance',
  'Friendship',
  'Single people',
  'Dating (Social customs)',
  'Ficción',
  'Solteras',
  'Single women -- England -- Fiction',
  'Fiction, humorous',
  'Single women, fiction',
  'Fiction, women',
  'Jones, bridget (fictitious character), fiction',
  'London (england), fiction',
  'English literature',
  'Large type books',
  'Novela',
  'Junge Frau',
  'Partnerwahl',
  'New York Times reviewed',
  'Fiction, humorous, general',
  'England, fiction',
  'Femmes seules',
  'Romans, nouvelles',
  'Social life and customs',
  'Diaries',
  'Bridget Jones (Fictitious character)']}

Now fetch the title and subjects for each unique ISBN, where available. This sends one HTTP request per ISBN, so it takes some time.

In [None]:
# `test_isbn` holds cleaned ISBNs (NaN -> "Not Found", short values zero-padded),
# but the DataFrame column still holds the raw values. Build a key column with
# the same cleaning rule so the cleaned ISBNs actually match their rows.
test_isbn_key = newTest["ISBN"].map(
    lambda raw: "Not Found" if str(raw) == "nan" else str(raw).zfill(10)
)

# One API call per distinct cleaned ISBN; results are collected in dicts and
# written back in a single pass instead of rescanning the frame per ISBN.
test_titles, test_subjects = {}, {}
for isbn in test_isbn:
    key = str(isbn)
    if key in test_titles:
        continue
    summary = get_summary(key)
    subjects = summary["summary"]
    test_titles[key] = summary["title"]
    # A plain-string summary must not be joined: that would split it into characters.
    test_subjects[key] = subjects if isinstance(subjects, str) else ", ".join(subjects)

newTest["title"] = test_isbn_key.map(test_titles)
newTest["summary"] = test_isbn_key.map(test_subjects)

In [None]:
# `train_isbn` holds cleaned ISBNs (NaN -> "Not Found", short values zero-padded),
# but the DataFrame column still holds the raw values. Build a key column with
# the same cleaning rule so the cleaned ISBNs actually match their rows.
train_isbn_key = newTrain["ISBN"].map(
    lambda raw: "Not Found" if str(raw) == "nan" else str(raw).zfill(10)
)

# One API call per distinct cleaned ISBN; results are collected in dicts and
# written back in a single pass instead of rescanning the frame per ISBN.
train_titles, train_subjects = {}, {}
for isbn in train_isbn:
    key = str(isbn)
    if key in train_titles:
        continue
    summary = get_summary(key)
    subjects = summary["summary"]
    train_titles[key] = summary["title"]
    # A plain-string summary must not be joined: that would split it into characters.
    train_subjects[key] = subjects if isinstance(subjects, str) else ", ".join(subjects)

newTrain["title"] = train_isbn_key.map(train_titles)
newTrain["summary"] = train_isbn_key.map(train_subjects)


In [None]:
# Persist the enriched tables. The DataFrame writer is `to_csv` (there is no
# `as_csv` method); `index=False` keeps the row index out of the files.
newTrain.to_csv("Data/train_with_subjects.csv", index=False)
newTest.to_csv("Data/test_with_subjects.csv", index=False)