In [57]:
from bs4 import BeautifulSoup
import requests

import pandas as pd

Get subjects (genre and subgenre) for which to scrape and store data.

In [None]:
URL = 'https://openlibrary.org/subjects'
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')

In [128]:
genres = [header.text for header in soup.find_all('h3')][:-1]

In [135]:
subgenres = []

for genre in soup.find(id='subjectsPage').find_all('ul'):
    subgenres.append([element.text.strip() for element in genre.find_all('li')])

In [140]:
for genre, associated_subgenres in zip(genres, subgenres):
    for subgenre in associated_subgenres:
        print(f'{genre}: {subgenre}')

Arts: Architecture
Arts: Art Instruction
Arts: Art History
Arts: Dance
Arts: Design
Arts: Fashion
Arts: Film
Arts: Graphic Design
Arts: Music
Arts: Music Theory
Arts: Painting
Arts: Photography
Animals: Bears
Animals: Cats
Animals: Kittens
Animals: Dogs
Animals: Puppies
Fiction: Fantasy
Fiction: Historical Fiction
Fiction: Horror
Fiction: Humor
Fiction: Literature
Fiction: Magic
Fiction: Mystery and detective stories
Fiction: Plays
Fiction: Poetry
Fiction: Romance
Fiction: Science Fiction
Fiction: Short Stories
Fiction: Thriller
Fiction: Young Adult
Science & Mathematics: Biology
Science & Mathematics: Chemistry
Science & Mathematics: Mathematics
Science & Mathematics: Physics
Science & Mathematics: Programming
Business & Finance: Management
Business & Finance: Entrepreneurship
Business & Finance: Business Economics
Business & Finance: Business Success
Business & Finance: Finance
Children's: Kids Books
Children's: Stories in Rhyme
Children's: Baby Books
Children's: Bedtime Books
Childr

If we iterate over each genre, then over each associated subgenre, I can store the data in a temporary framework like below:

In [None]:
{'title': [], 'first_published': [], 'authors': [], 'cover_img_url': [], 'languages_available': [], 'subgenre': [],
'genre': []}

Then update a library dataframe as we go along.

In [None]:
library = pd.concat([library, pd.DataFrame(subgenre)], axis=1)

Before I do any additional work, I want to test that I can instantiate an empty dataframe and concatenate it successfully with a non-empty dataframe with the same columns.

In [126]:
# test out instantiating an empty dataframe and concatenating with a "full" df with same columns
pd.concat([pd.DataFrame({'title': [], 'first_published': [], 'authors': [], 'cover_img_url': [],
                        'languages_available': [], 'subgenre': [], 'genre': []}),
           pd.DataFrame(fantasy)], axis=0)

Unnamed: 0,title,first_published,authors,cover_img_url,languages_available,subgenre,genre
0,Sky Island: being the further exciting adventu...,1912.0,"[L. Frank Baum, Mint Editions, John R. (John R...",archive.org/services/img/skyisland00baum,1.0,,fantasy
1,The Well at the World's End,1896.0,[William Morris],archive.org/services/img/lasourceauboutdu0000morr,3.0,,fantasy
2,Phantastes: a faerie romance,1850.0,[George MacDonald],archive.org/services/img/phantastes00geor,1.0,,fantasy
3,Wet Magic (Books of Wonder),1937.0,[Edith Nesbit],archive.org/services/img/wetmagic0000nesb,1.0,,fantasy
4,The Magic City,1910.0,[Edith Nesbit],archive.org/services/img/magiccity0000nesb,1.0,,fantasy
...,...,...,...,...,...,...,...
995,The Emerald City of Oz,1910.0,"[L. Frank Baum, John R. Neill, Jenny Sánchez, ...",archive.org/services/img/emeraldcityoz00baum,1.0,,fantasy
996,The Silmarillion,1800.0,[J.R.R. Tolkien],archive.org/services/img/silmarilliontolk00tolk,11.0,,fantasy
997,Rinkitink in Oz,1916.0,"[L. Frank Baum, Andrew J. Heller, John Neill]",archive.org/services/img/rinkitinkoz00baum,2.0,,fantasy
998,Ozma of Oz,1907.0,"[L. Frank Baum, Erin Yuen, Taylor Anderson, Jo...",archive.org/services/img/ozmaofoz00baumrich,3.0,,fantasy


---

Below is my work to scrape the genres to include in the dataset from openlibrary.org directly. I ran into some issues because one of the pages I needed to find the actual search link for each genre was dynamically loaded, and I do not have experience (yet) with scraping dynamically loaded web pages.

Once the time spent trying to scrape the page reached a certain threshold, I decided to build a manual work-around (temporarily) and revisit updating this part of the code after I've learned more about scraping dynamic content using Selenium.

---

Working on getting multiple genres scraped:

In [87]:
URL = 'https://openlibrary.org/subjects'
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')

In [98]:
[header.text for header in soup.find_all('h3')][:-1]

['Arts',
 'Animals',
 'Fiction',
 'Science & Mathematics',
 'Business & Finance',
 "Children's",
 'History',
 'Health & Wellness',
 'Biography',
 'Social Sciences',
 'Places',
 'Textbooks',
 'Books by Language']

In [115]:
root_url = 'https://openlibrary.org'

for name, location_in_soup in zip([header.text for header in soup.find_all('h3')][:-1],
                                  soup.find(id='subjectsPage').find_all('ul')):
    print(name)
    list_of_urls = [root_url+a_tag.find('a')['href'] for a_tag in location_in_soup.find_all('li')]
    print(list_of_urls)
    
    test_url = list_of_urls[0]

Arts
['https://openlibrary.org/subjects/architecture', 'https://openlibrary.org/subjects/art__art_instruction', 'https://openlibrary.org/subjects/history_of_art__art__design_styles', 'https://openlibrary.org/subjects/dance', 'https://openlibrary.org/subjects/design', 'https://openlibrary.org/subjects/fashion', 'https://openlibrary.org/subjects/film', 'https://openlibrary.org/subjects/graphic_design', 'https://openlibrary.org/subjects/music', 'https://openlibrary.org/subjects/music_theory', 'https://openlibrary.org/subjects/painting__paintings', 'https://openlibrary.org/subjects/photography']
Animals
['https://openlibrary.org/subjects/bears', 'https://openlibrary.org/subjects/cats', 'https://openlibrary.org/subjects/kittens', 'https://openlibrary.org/subjects/dogs', 'https://openlibrary.org/subjects/puppies']
Fiction
['https://openlibrary.org/subjects/fantasy', 'https://openlibrary.org/subjects/historical_fiction', 'https://openlibrary.org/subjects/horror', 'https://openlibrary.org/subj

In [116]:
print(test_url)

https://openlibrary.org/search?q=language%3Aeng


In [113]:
intermediate_response = requests.get(test_url)
intermediate_soup = BeautifulSoup(intermediate_response.text, 'html.parser')
intermediate_soup.find('div', class_='page-heading-search-box')

In [114]:
intermediate_soup.find_all('div', id='test-body-mobile') # dynamically loaded content? try selenium?

[<div id="test-body-mobile">
 <div class="flash-messages">
 <div class="error"><span>/openlibrary/openlibrary/templates/work_search.html: error in processing template: ReadTimeout: HTTPConnectionPool(host='ol-solr0', port=8984): Read timed out. (read timeout=10) (falling back to default template)</span></div>
 </div>
     
     
     Unable to render this page.
   </div>]

In [118]:
intermediate_soup.find_all('a')

[<a class="iaLogo" href="https://archive.org"><img alt="Internet Archive logo" src="/static/images/ia-logo.svg" width="160"/></a>,
 <a class="ghost-btn" data-ol-link-track="IABar|DonateButton" href="https://archive.org/donate/?platform=ol&amp;origin=olwww-TopNavDonateButton">Donate <span aria-hidden="true" class="heart">♥</span></a>,
 <a data-lang-id="cs" href="#" lang="cs" title="Czech">Čeština (cs)</a>,
 <a data-lang-id="de" href="#" lang="de" title="German">Deutsch (de)</a>,
 <a data-lang-id="en" href="#" lang="en" title="English">English (en)</a>,
 <a data-lang-id="es" href="#" lang="es" title="Spanish">Español (es)</a>,
 <a data-lang-id="fr" href="#" lang="fr" title="French">Français (fr)</a>,
 <a data-lang-id="hr" href="#" lang="hr" title="Croatian">Hrvatski (hr)</a>,
 <a data-lang-id="pt" href="#" lang="pt" title="Portuguese">Português (pt)</a>,
 <a data-lang-id="te" href="#" lang="te" title="Telugu">తెలుగు (te)</a>,
 <a data-lang-id="uk" href="#" lang="uk" title="Ukrainian">Укр

---

In [None]:
%%time

# come back to this when iterating over genres
# # set initial URL
# URL = 

# reset fantasy dictionary to store information from scraping
fantasy = {'title': [], 'first_published': [], 'authors': [], 'cover_img_url': [], 'languages_available': [],
           'genre': []}

# set initial value for page number
page_no = 1

# set flag to end while loop
flag = 1

# check whether there are still books to be scraped
while flag == 1:
    # set URL of web page to scrape; get new response; make soup
    URL = f'https://openlibrary.org/search?subject=Fantasy&page{page_no}'
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # if there are no books on this page, exit loop
    books_in_soup = soup.find_all('li', class_='searchResultItem')
    
    if books_in_soup:    
        for book in books_in_soup:
        # append title, year of first publication, list of authors, cover image URL and number of languages available
            try:
                fantasy['title'].append(book.find('h3', class_='booktitle').find('a').text)
            except:
                print('Warning: title null')
                fantasy['title'].append(None)

            try:
                fantasy['first_published'].append(
                    int(book.find('span', class_='publishedYear').text.strip().split()[-1]))
            except:
                print('Warning: year of first publication null')
                fantasy['first_published'].append(None)

            try:
                fantasy['authors'].append(
                    [a_tag.text for a_tag in book.find('span', class_='bookauthor').find_all('a', class_='results')])
            except:
                print('Warning: authors null')
                fantasy['authors'].append(None)

            try:
                fantasy['cover_img_url'].append(book.find_all('img')[-1]['src'].strip('/'))
            except:
                print('Warning: cover image URL null')
                fantasy['cover_img_url'].append(None)

            try:
                fantasy['languages_available'].append(
                    int(book.find('span', class_='languages').find('a').text.split()[0]))
            except:
                print('Warning: number of languages available null')
                fantasy['languages_available'].append(None)
        
            # also append genre (in this case, fantasy)
            fantasy['genre'].append('fantasy')
            
        # when finished scraping books in soup, increment page number
        page_no += 1
        
        # set a limit on number of pages to scrape
        if page_no <= 50:
            flag = 1
        else:
            flag = 0
            break
        
    else:
        flag = 0
        break

In [84]:
%%time

# come back to this when iterating over genres
# # set initial URL
# URL = 

# reset fantasy dictionary to store information from scraping
fantasy = {'title': [], 'first_published': [], 'authors': [], 'cover_img_url': [], 'languages_available': [],
           'genre': []}

# set initial value for page number
page_no = 1

# set flag to end while loop
flag = 1

# check whether there are still books to be scraped
while flag == 1:
    # set URL of web page to scrape; get new response; make soup
    URL = f'https://openlibrary.org/search?subject=Fantasy&page{page_no}'
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # if there are no books on this page, exit loop
    books_in_soup = soup.find_all('li', class_='searchResultItem')
    
    if books_in_soup:    
        for book in books_in_soup:
        # append title, year of first publication, list of authors, cover image URL and number of languages available
            try:
                fantasy['title'].append(book.find('h3', class_='booktitle').find('a').text)
            except:
                print('Warning: title null')
                fantasy['title'].append(None)

            try:
                fantasy['first_published'].append(
                    int(book.find('span', class_='publishedYear').text.strip().split()[-1]))
            except:
                print('Warning: year of first publication null')
                fantasy['first_published'].append(None)

            try:
                fantasy['authors'].append(
                    [a_tag.text for a_tag in book.find('span', class_='bookauthor').find_all('a', class_='results')])
            except:
                print('Warning: authors null')
                fantasy['authors'].append(None)

            try:
                fantasy['cover_img_url'].append(book.find_all('img')[-1]['src'].strip('/'))
            except:
                print('Warning: cover image URL null')
                fantasy['cover_img_url'].append(None)

            try:
                fantasy['languages_available'].append(
                    int(book.find('span', class_='languages').find('a').text.split()[0]))
            except:
                print('Warning: number of languages available null')
                fantasy['languages_available'].append(None)
        
            # also append genre (in this case, fantasy)
            fantasy['genre'].append('fantasy')
            
        # when finished scraping books in soup, increment page number
        page_no += 1
        
        # set a limit on number of pages to scrape
        if page_no <= 50:
            flag = 1
        else:
            flag = 0
            break
        
    else:
        flag = 0
        break

CPU times: user 10.1 s, sys: 258 ms, total: 10.3 s
Wall time: 2min 39s


In [86]:
pd.DataFrame(fantasy)

Unnamed: 0,title,first_published,authors,cover_img_url,languages_available,genre
0,Sky Island: being the further exciting adventu...,1912,"[L. Frank Baum, Mint Editions, John R. (John R...",archive.org/services/img/skyisland00baum,1,fantasy
1,The Well at the World's End,1896,[William Morris],archive.org/services/img/lasourceauboutdu0000morr,3,fantasy
2,Phantastes: a faerie romance,1850,[George MacDonald],archive.org/services/img/phantastes00geor,1,fantasy
3,Wet Magic (Books of Wonder),1937,[Edith Nesbit],archive.org/services/img/wetmagic0000nesb,1,fantasy
4,The Magic City,1910,[Edith Nesbit],archive.org/services/img/magiccity0000nesb,1,fantasy
...,...,...,...,...,...,...
995,The Emerald City of Oz,1910,"[L. Frank Baum, John R. Neill, Jenny Sánchez, ...",archive.org/services/img/emeraldcityoz00baum,1,fantasy
996,The Silmarillion,1800,[J.R.R. Tolkien],archive.org/services/img/silmarilliontolk00tolk,11,fantasy
997,Rinkitink in Oz,1916,"[L. Frank Baum, Andrew J. Heller, John Neill]",archive.org/services/img/rinkitinkoz00baum,2,fantasy
998,Ozma of Oz,1907,"[L. Frank Baum, Erin Yuen, Taylor Anderson, Jo...",archive.org/services/img/ozmaofoz00baumrich,3,fantasy


Below this point is my process to create the above loop, piece by piece, with the information I want to extract from openlibrary.org.

In [75]:
URL = 'https://openlibrary.org/search?subject=Fantasy&page=1'

In [76]:
response = requests.get(URL)
print(response.status_code)

200


In [77]:
soup = BeautifulSoup(response.text, 'html.parser')

In [78]:
len(soup.find_all('li', class_='searchResultItem'))

20

Test out extracting information on an Example Book:

In [79]:
example_book = soup.find_all('li', class_='searchResultItem')[0]
print(example_book)

<li class="searchResultItem" itemscope="" itemtype="https://schema.org/Book">
<span class="bookcover">
<a href="/works/OL262385W?edition=ia%3Acihm_78964"><img alt="Cover of: Sky Island: being the further exciting adventures of Trot and Cap'n Bill after their visit to the sea fairies" itemprop="image" src="//covers.openlibrary.org/b/olid/OL19285157M-M.jpg" title="Cover of: Sky Island: being the further exciting adventures of Trot and Cap'n Bill after their visit to the sea fairies"/></a>
</span>
<div class="details">
<div class="resultTitle">
<h3 class="booktitle" itemprop="name">
<a class="results" href="/works/OL262385W?edition=ia%3Acihm_78964" itemprop="url">Sky Island: being the further exciting adventures of Trot and Cap'n Bill after their visit to the sea fairies</a>
</h3>
</div>
<span class="bookauthor" itemprop="author" itemscope="" itemtype="https://schema.org/Organization">
        
by <a class="results" href="/authors/OL9348793A/L._Frank_Baum">L. Frank Baum</a>, <a class="res

**Find the title of the book** // string

In [80]:
example_book.find('h3', class_='booktitle').find('a').text

"Sky Island: being the further exciting adventures of Trot and Cap'n Bill after their visit to the sea fairies"

**Find the year of (first) publication** // integer

In [54]:
int(example_book.find('span', class_='publishedYear').text.strip().split()[-1])

1912

**Find the link to cover image of the book** // string

In [40]:
example_book.find_all('img')[-1]['src'].strip('/')

'archive.org/services/img/skyisland00baum'

**Find the author name(s)** // list of strings

In [46]:
[a_tag.text for a_tag in example_book.find('span', class_='bookauthor').find_all('a', class_='results')]

['L. Frank Baum', 'Mint Editions', 'John R. (John Rea) Neill']

**Find the number of languages in which this book is available** // int

In [52]:
int(example_book.find('span', class_='languages').find('a').text.split()[0])

1

Test out the above on all books on the page (then expand to iterate over all pages!)

Store data in dictionaries, to transform into a Pandas DataFrame of fantasy books.

In [62]:
fantasy = {'title': [], 'first_published': [], 'authors': [], 'cover_img_url': [], 'languages_available': [],
           'genre': []}

for book in soup.find_all('li', class_='searchResultItem'):
    # append title, year of first publication, list of authors, cover image URL and number of languages available
    try:
        fantasy['title'].append(book.find('h3', class_='booktitle').find('a').text)
    except:
        print('Warning: title null')
        fantasy['title'].append(None)
        
    try:
        fantasy['first_published'].append(
            int(book.find('span', class_='publishedYear').text.strip().split()[-1]))
    except:
        print('Warning: year of first publication null')
        fantasy['first_published'].append(None)
        
    try:
        fantasy['authors'].append(
            [a_tag.text for a_tag in book.find('span', class_='bookauthor').find_all('a', class_='results')])
    except:
        print('Warning: authors null')
        fantasy['authors'].append(None)
        
    try:
        fantasy['cover_img_url'].append(book.find_all('img')[-1]['src'].strip('/'))
    except:
        print('Warning: cover image URL null')
        fantasy['cover_img_url'].append(None)
        
    try:
        fantasy['languages_available'].append(
            int(book.find('span', class_='languages').find('a').text.split()[0]))
    except:
        print('Warning: number of languages available null')
        fantasy['languages_available'].append(None)
        
    fantasy['genre'].append('fantasy')
        
library = pd.DataFrame(fantasy)

library.head()

Unnamed: 0,title,first_published,authors,cover_img_url,languages_available,genre
0,Sky Island: being the further exciting adventu...,1912,"[L. Frank Baum, Mint Editions, John R. (John R...",archive.org/services/img/skyisland00baum,1,fantasy
1,The Well at the World's End,1896,[William Morris],archive.org/services/img/lasourceauboutdu0000morr,3,fantasy
2,Phantastes: a faerie romance,1850,[George MacDonald],archive.org/services/img/phantastes00geor,1,fantasy
3,Wet Magic (Books of Wonder),1937,[Edith Nesbit],archive.org/services/img/wetmagic0000nesb,1,fantasy
4,The Magic City,1910,[Edith Nesbit],archive.org/services/img/magiccity0000nesb,1,fantasy
