<a href="https://colab.research.google.com/github/siming-deng/cis3120/blob/main/cis3120_hw3_nyt_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# importing various libraries for web scraping, API calls, and data frames.
import json
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# web page selected to scrape
url = 'https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_2021'

# get the html from the webpage; the timeout keeps a stalled connection from
# hanging the notebook indefinitely
page = requests.get(url, timeout=30)

# fail here on an HTTP error (4xx/5xx) instead of silently parsing an error page
page.raise_for_status()

# use BeautifulSoup to parse the html
soup = BeautifulSoup(page.content, 'html.parser')

# select the tables wanted: every <table> carrying the "wikitable" class
table = soup.find_all('table', class_='wikitable')

# a later cell indexes table[0]; report a missing table here with a clear message
if not table:
  raise ValueError(f'No table with class "wikitable" was found at {url}')

# create lists to store the information
date = []
books = []
authors = []
publisher = []

In [3]:
# all <tr> rows of the first table that was selected
rows = table[0].find_all('tr')

# walk the rows and collect one entry per distinct book title
for row in rows:
  cells = row.find_all('td')

  # only rows with exactly four <td> cells are read; every other row is skipped
  if len(cells) != 4:
    continue

  # cells 1, 2 and 3 are read as book, author and publisher;
  # trailing newlines are removed from the text
  book_name, author_name, publisher_name = (
      cell.get_text().rstrip('\n') for cell in cells[1:4]
  )

  # a title that is already in the books list is not added again
  if book_name in books:
    continue

  books.append(book_name)
  authors.append(author_name)
  publisher.append(publisher_name)

# column name -> list of values, in the order the columns should appear
books_data = dict(Book=books, Author=authors, Publisher=publisher)

# creating the data frame for the books
books_df = pd.DataFrame(books_data)

# displaying the data frame
books_df

Unnamed: 0,Book,Author,Publisher
0,A Time for Mercy,John Grisham,Doubleday
1,The Duke and I,Julia Quinn,Avon
2,The Russian,James Patterson and James O. Born,"Little, Brown and Company"
3,The Four Winds,Kristin Hannah,St. Martin's Press
4,A Court of Silver Flames,Sarah J. Maas,Bloomsbury Publishing
5,Life After Death,Sister Souljah,Emily Bestler Books
6,Win,Harlen Coben,Grand Central Publishing
7,The Hill We Climb,Amanda Gorman,Viking Books
8,Ocean Prey,John Sandford,G. P. Putnam's Sons
9,A Gambling Man,David Baldacci,Grand Central Publishing


In [4]:
# key for the NYTimes Books API, read from the environment so the secret is not
# stored in the notebook (set NYT_API_KEY before running this cell).
# NOTE(review): a key that was previously committed in this cell should be rotated.
key = os.environ.get('NYT_API_KEY')
if not key:
  raise RuntimeError('Set the NYT_API_KEY environment variable to your NYTimes Books API key.')

# endpoint for the best-seller history lookup; kept in its own name so the
# Wikipedia `url` defined earlier is not overwritten
history_url = 'https://api.nytimes.com/svc/books/v3/lists/best-sellers/history.json'

# pause applied after EVERY request (successful or not) to stay under the API
# rate limit; the NYT developer FAQ recommends 12 seconds between calls
request_delay_seconds = 12

# created lists to store the information
description = []
bestsellers_date = []
rank = []
rank_last_week = []
weeks_on_list = []
nyt_books = []
display = []
isbn = []

# iterating two lists simultaneously
for (book, author) in zip(books, authors):

  # making the API call; requests builds the query string so characters such as
  # '&', '#' or '+' in a title or author name are escaped instead of corrupting
  # the URL, and the timeout keeps a stalled connection from hanging the loop
  r = requests.get(
      history_url,
      params={'title': book, 'author': author, 'api-key': key},
      timeout=30,
  )

  # if the call succeeds, print out the results and append the information to the lists created.
  if r.status_code == 200:
    data = r.json()
    results = data['results']

    # only the first result and the first entry of its ranks_history are used;
    # a result without any ranks_history entry is treated like "no results" so
    # the lists never end up with different lengths
    history = results[0]['ranks_history'] if results else []

    if not history:
      print("NYTimes Book API does not find any results")
      print(" ")
    else:
      result = results[0]
      first_rank = history[0]

      print(f"The book title is: {result['title']}")
      nyt_books.append(result['title'])
      print(f"Book description is: {result['description']}")
      description.append(result['description'])
      print(f"The rank is: {first_rank['rank']}")
      rank.append(first_rank['rank'])
      print(f"Display name is: {first_rank['display_name']}")
      display.append(first_rank['display_name'])
      print(f"Best sellers date is: {first_rank['bestsellers_date']}")
      bestsellers_date.append(first_rank['bestsellers_date'])
      print(f"Weeks on the list are: {first_rank['weeks_on_list']}")
      weeks_on_list.append(first_rank['weeks_on_list'])
      print(f"The last week rank is: {first_rank['rank_last_week']}")
      rank_last_week.append(first_rank['rank_last_week'])
      print(f"Primary ISBN13 is: {first_rank['primary_isbn13']}")
      isbn.append(first_rank['primary_isbn13'])
      print(" ")
  else:
    print("API request was unsuccessful.")

  # throttle regardless of the outcome: empty and failed responses count
  # against the rate limit just like successful ones
  time.sleep(request_delay_seconds)

# created series
nytimes_data = {
    'Book': nyt_books,
    'Description': description,
    'Rank': rank,
    'Display': display,
    'Best Sellers Date': bestsellers_date,
    'Weeks on List': weeks_on_list,
    'Rank Last Week': rank_last_week,
    'Primary ISBN13': isbn,
}

# created the nytimes books data frame
nytimes_df = pd.DataFrame(nytimes_data)

The book title is: A TIME FOR MERCY
Book description is: The third book in the Jake Brigance series. A 16-year-old is accused of killing a deputy in Clanton, Miss., in 1990.
The rank is: 14
Display name is: Mass Market
Best sellers date is: 2021-10-30
Weeks on the list are: 0
The last week rank is: 0
Primary ISBN13 is: 9780593157817
 
The book title is: THE DUKE AND I
Book description is: The first book in the Bridgerton series. Daphne Bridgerton’s reputation soars when she colludes with the Duke of Hastings. The basis of the Netflix series “Bridgerton.”
The rank is: 11
Display name is: Mass Market
Best sellers date is: 2021-05-01
Weeks on the list are: 0
The last week rank is: 0
Primary ISBN13 is: 9780063078901
 
The book title is: THE RUSSIAN
Book description is: The 13th book in the Michael Bennett series. An assassin killing a number of women might disrupt the detective’s wedding plans.
The rank is: 10
Display name is: Hardcover Fiction
Best sellers date is: 2021-02-27
Weeks on the

In [5]:
# display the data frame built from the NYTimes Books API responses
# (one row per book for which the API returned a result)
nytimes_df

Unnamed: 0,Book,Description,Rank,Display,Best Sellers Date,Weeks on List,Rank Last Week,Primary ISBN13
0,A TIME FOR MERCY,The third book in the Jake Brigance series. A ...,14,Mass Market,2021-10-30,0,0,9780593157817
1,THE DUKE AND I,The first book in the Bridgerton series. Daphn...,11,Mass Market,2021-05-01,0,0,9780063078901
2,THE RUSSIAN,The 13th book in the Michael Bennett series. A...,10,Hardcover Fiction,2021-02-27,5,8,9780316420389
3,THE FOUR WINDS,As dust storms roll during the Great Depressio...,15,Audio Fiction,2021-10-02,0,0,9781250317247
4,A COURT OF SILVER FLAMES,The fifth book in A Court of Thorns and Roses ...,15,Hardcover Fiction,2021-04-03,6,0,9781681196282
5,LIFE AFTER DEATH,"In a sequel to “The Coldest Winter Ever,” Wint...",12,Hardcover Fiction,2021-04-10,6,8,9781982139131
6,THE HILL WE CLIMB,The poem read on President Joe Biden's Inaugur...,12,Hardcover Fiction,2021-06-26,12,0,9780593465271
7,OCEAN PREY,The 31st book in the Prey series. When federal...,15,Hardcover Fiction,2021-05-08,4,7,9780593087022
8,A GAMBLING MAN,"Aloysius Archer, a World War II veteran, seeks...",13,Paperback Trade Fiction,2021-11-06,6,12,9781538719688
9,SOOLEY,Samuel Sooleymon receives a basketball scholar...,13,Hardcover Fiction,2021-07-31,12,0,9780385547680


In [6]:
# put the titles of both frames in lower case so they can be matched on 'Book'
# (this overwrites the 'Book' column of each frame in place)
for frame in (books_df, nytimes_df):
  frame['Book'] = frame['Book'].str.lower()

# left join: every scraped book is kept; books without an API match get NaN
# in the columns coming from nytimes_df
merged_df = books_df.merge(nytimes_df, how='left', on=['Book'])

# display the merged data frame
merged_df

Unnamed: 0,Book,Author,Publisher,Description,Rank,Display,Best Sellers Date,Weeks on List,Rank Last Week,Primary ISBN13
0,a time for mercy,John Grisham,Doubleday,The third book in the Jake Brigance series. A ...,14.0,Mass Market,2021-10-30,0.0,0.0,9780593157817.0
1,the duke and i,Julia Quinn,Avon,The first book in the Bridgerton series. Daphn...,11.0,Mass Market,2021-05-01,0.0,0.0,9780063078901.0
2,the russian,James Patterson and James O. Born,"Little, Brown and Company",The 13th book in the Michael Bennett series. A...,10.0,Hardcover Fiction,2021-02-27,5.0,8.0,9780316420389.0
3,the four winds,Kristin Hannah,St. Martin's Press,As dust storms roll during the Great Depressio...,15.0,Audio Fiction,2021-10-02,0.0,0.0,9781250317247.0
4,a court of silver flames,Sarah J. Maas,Bloomsbury Publishing,The fifth book in A Court of Thorns and Roses ...,15.0,Hardcover Fiction,2021-04-03,6.0,0.0,9781681196282.0
5,life after death,Sister Souljah,Emily Bestler Books,"In a sequel to “The Coldest Winter Ever,” Wint...",12.0,Hardcover Fiction,2021-04-10,6.0,8.0,9781982139131.0
6,win,Harlen Coben,Grand Central Publishing,,,,,,,
7,the hill we climb,Amanda Gorman,Viking Books,The poem read on President Joe Biden's Inaugur...,12.0,Hardcover Fiction,2021-06-26,12.0,0.0,9780593465271.0
8,ocean prey,John Sandford,G. P. Putnam's Sons,The 31st book in the Prey series. When federal...,15.0,Hardcover Fiction,2021-05-08,4.0,7.0,9780593087022.0
9,a gambling man,David Baldacci,Grand Central Publishing,"Aloysius Archer, a World War II veteran, seeks...",13.0,Paperback Trade Fiction,2021-11-06,6.0,12.0,9781538719688.0


In [7]:
# order the merged rows by 'Rank', smallest rank value first
rank_sorted_merged_df = merged_df.sort_values('Rank', ascending=True)

# show the first 5 rows of the rank-ordered frame
rank_sorted_merged_df.head(5)

Unnamed: 0,Book,Author,Publisher,Description,Rank,Display,Best Sellers Date,Weeks on List,Rank Last Week,Primary ISBN13
26,apples never fall,Liane Moriarty,Henry Holt and Company,The Delaney siblings suspect their father of c...,3.0,Audio Fiction,2022-01-01,0.0,0.0,9781250810700
28,the lincoln highway,Amor Towles,Viking Books,Two friends who escaped from a juvenile work f...,3.0,Hardcover Fiction,2022-01-22,16.0,5.0,9780735222359
13,the last thing he told me,Laura Dave,Simon & Schuster,Hannah Hall discovers truths about her missing...,6.0,Hardcover Fiction,2022-01-22,36.0,6.0,9781501171345
30,the judge's list,John Grisham,Doubleday,The second book in the Whistler series. Invest...,7.0,Hardcover Fiction,2022-01-22,14.0,8.0,9780385546027
32,game on,Janet Evanovich,Atria Books,The 28th book in the Stephanie Plum series. Di...,8.0,Audio Fiction,2021-11-27,0.0,0.0,9781797128450


In [8]:
# order the merged rows by 'Weeks on List', largest value first
weeks_on_list_sorted_merged_df = merged_df.sort_values('Weeks on List', ascending=False)

# show the 5 rows with the most weeks on the list
weeks_on_list_sorted_merged_df.head(5)

Unnamed: 0,Book,Author,Publisher,Description,Rank,Display,Best Sellers Date,Weeks on List,Rank Last Week,Primary ISBN13
13,the last thing he told me,Laura Dave,Simon & Schuster,Hannah Hall discovers truths about her missing...,6.0,Hardcover Fiction,2022-01-22,36.0,6.0,9781501171345
28,the lincoln highway,Amor Towles,Viking Books,Two friends who escaped from a juvenile work f...,3.0,Hardcover Fiction,2022-01-22,16.0,5.0,9780735222359
27,the wish,Nicholas Sparks,Grand Central Publishing,"Maggie Dawes, a renowned travel photographer, ...",14.0,Hardcover Fiction,2022-01-22,16.0,0.0,9781538728628
30,the judge's list,John Grisham,Doubleday,The second book in the Whistler series. Invest...,7.0,Hardcover Fiction,2022-01-22,14.0,8.0,9780385546027
16,the president's daughter,Bill Clinton and James Patterson,"Little, Brown and Company","Matthew Keating, a past president and former N...",11.0,Hardcover Fiction,2021-09-04,13.0,12.0,9780316540711


In [9]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the merged data frame
merged_df.describe()

Unnamed: 0,Rank,Weeks on List,Rank Last Week
count,34.0,34.0,34.0
mean,11.882353,7.029412,4.941176
std,3.291538,7.03879,5.256691
min,3.0,0.0,0.0
25%,11.0,2.5,0.0
50%,13.0,6.0,3.5
75%,14.0,9.75,10.0
max,15.0,36.0,15.0


In [10]:
# write the merged data frame to a CSV file, leaving out the index column
# NOTE(review): the file name ends in "_Sorted", but merged_df (not one of the
# sorted frames) is what gets written - confirm which frame is intended
output_csv = 'The_New_York_Times_Fiction_Best_Sellers_of_2021_Sorted.csv'
merged_df.to_csv(output_csv, index=False)