In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

### Initialise downloads page and get links for the first 4 pages

In [3]:
url = '''https://www.gutenberg.org/ebooks/search/?sort-order=downloads&sort_order=downloads''' # Downloads page
start_indices = [str(index+1) for index in range(0,76,25)]
responses = [requests.get(url + '&start_index=' + index) for index in start_indices]

books_soups = [BeautifulSoup(response.text, 'html.parser') for response in responses] # Get soups for each page

### Get all \<a> tags with class of link in each of the soups

In [4]:
# Get all the <a> tag with class of 'link'
book_tags = []

for books_soup in books_soups:
  book_tags.extend(books_soup.find_all('a', attrs={'class': "link"}))

# Get the name of the link to each book download page (testing if the href starts with '/ebooks' and ends with an integer)
book_tags = [tag.attrs['href'] for tag in book_tags 
             if tag.attrs['href'].startswith('/ebooks') & 
            tag.attrs['href'].endswith(tuple([str(digit) for digit in range(0,10)]))]

# Display book tags
print("In total we have " + str(len(book_tags)) + " book titles")
print("Displaying 10 titles")
print(book_tags[:10]) 

In total we have 100 book titles
Displaying 10 titles
['/ebooks/1342', '/ebooks/84', '/ebooks/11', '/ebooks/1661', '/ebooks/2701', '/ebooks/1232', '/ebooks/16328', '/ebooks/174', '/ebooks/98', '/ebooks/64317']


In [5]:
for book_soup in books_soups:
  book_soup.find_all('')

### Combine href attribute with Gutenberg URL to get link for each e-book

In [6]:
base_url = "https://www.gutenberg.org"

# Get e-book links
book_links = [base_url + tag for tag in book_tags]
print("In total we have " + str(len(book_links)) + " books")
print("Displaying 10 book links")
print(book_links)

In total we have 100 books
Displaying 10 book links
['https://www.gutenberg.org/ebooks/1342', 'https://www.gutenberg.org/ebooks/84', 'https://www.gutenberg.org/ebooks/11', 'https://www.gutenberg.org/ebooks/1661', 'https://www.gutenberg.org/ebooks/2701', 'https://www.gutenberg.org/ebooks/1232', 'https://www.gutenberg.org/ebooks/16328', 'https://www.gutenberg.org/ebooks/174', 'https://www.gutenberg.org/ebooks/98', 'https://www.gutenberg.org/ebooks/64317', 'https://www.gutenberg.org/ebooks/25344', 'https://www.gutenberg.org/ebooks/5200', 'https://www.gutenberg.org/ebooks/1952', 'https://www.gutenberg.org/ebooks/345', 'https://www.gutenberg.org/ebooks/2591', 'https://www.gutenberg.org/ebooks/2542', 'https://www.gutenberg.org/ebooks/6130', 'https://www.gutenberg.org/ebooks/1260', 'https://www.gutenberg.org/ebooks/205', 'https://www.gutenberg.org/ebooks/1080', 'https://www.gutenberg.org/ebooks/43', 'https://www.gutenberg.org/ebooks/2600', 'https://www.gutenberg.org/ebooks/74', 'https://www.g

### Get the book names and author names

In [7]:
book_names = []
book_authors = []
for link in book_links:
  book_soup = BeautifulSoup(requests.get(link).text, 'html.parser') # Get the soup of each book link
  # Get the header of each book link which contains the name of book and the author separated by the word 'by'
  book_and_author = book_soup.find('h1').get_text().split(" by ")
  # If no author displayed add an empty string
  if len(book_and_author) == 1:
    book_and_author.append('')

  # Add book names and authors to list
  book_names.append(book_and_author[0])
  book_authors.append(book_and_author[1])

### Get the html or pdf links to all the books

In [8]:
book_links_soups = [BeautifulSoup(requests.get(book_link).text, 'html.parser') for book_link in book_links]

full_book_links = []
for soup in book_links_soups:
  # Find the html copy of the book in each page by searching for the correct link
  book_tag = soup.find('a', attrs={'type': re.compile(r'^text/html')})
  # If no html copy exists, get the pdf version
  if book_tag is None:
    book_tag = soup.find('a', attrs={'type': "application/pdf"})
  # Add to list
  full_book_links.append(book_tag)

full_book_links = [base_url + link['href'] for link in full_book_links]

In [9]:
len(full_book_links)

100

In [10]:
lig = 'sect'
tryma = 'Sect. 209. But if either these illegal acts have extended to the majority o'
lig in tryma.lower()

True

In [17]:
ex = re.compile(r'^[—"\'“][AI][^a-zA-Z]|^[AI][^a-zA-Z]|^[—"\'“][AI]$|^[AI]$|^[—"\'“][O]$|^O$')
re.match(ex, '"A')

<re.Match object; span=(0, 2), match='"A'>

### Define function to obtain and clean text from each book

In [34]:
from random import randint

char_replace = {'â\x80\x9c':'"', 'â\x80\x9d':'"','â\x80\x94':'-',
                'â\x80\x98':"'", 'â\x80\x99':"'", '“':'"', '”':'"', "’":"'"}
exclude_words = ['gutenberg', 'transcribe', 'ebook', '[', ']', 'publish',
                 'manuscript', 'text', 'writer', 'content', 'author', 'title', 
                 'illustrat', 'chapter', 'edit', 'reserve', 'copyright', 
                 'rights', 'epilogue', 'part', 'canto', 'page', 'footnote']

def get_excerpts(book_links):
  '''Extracts text excerpts from a list of book links and cleans the text'''

  # Initialise excerpts list
  excerpts = []

  for i in range(0, len(book_links)):
    text = []

    # Get book soups, ignoring 'pdf's and attempt to decode as utf-8
    if book_links[i][-3:] == "pdf":
      excerpts.append(' ')
      continue
    try:
      book_soup = BeautifulSoup(requests.get(book_links[i]).content.decode("utf-8"), 'html.parser')
    except UnicodeDecodeError:
      book_soup = BeautifulSoup(requests.get(book_links[i]).content, 'html.parser')

    # Exception for Dante's Inferno: link provides a contents page linking to other webpages which contain the text
    if book_links[i].endswith('8800.html.images'):
      child_link = book_soup.find('a', attrs = {'href': re.compile(r'link[0-9]+$')})['href']

      child_soup = BeautifulSoup(requests.get(child_link).content.decode("utf-8"), 'html.parser')

      # Find all paragraph tags
      paragraph_tags = child_soup.find_all('p')
    elif book_links[i].endswith('16328.html.images'):
      paragraph_tags = book_soup.find_all('div', attrs = {'class': 'l'})
    else:
      paragraph_tags = book_soup.find_all('p')

    # Exclude paragraph tags with certain key words
    paragraph_tags = [tag for tag in paragraph_tags 
                      if not any(substring in tag.get_text().lower() for substring in exclude_words)]

    # Skip certain number of tags to get paragraph deeper in text
    skip_num = randint(4, len(paragraph_tags)-10)

    # Iterate through each paragraph tag
    for num, tag in enumerate(paragraph_tags):
      
      # Remove paragraph tags which display the book chapters
      try:
        if 'toc' in tag.attrs['class']:
          continue
      except:
        pass

      # Skip over contents list in "Songs of Innocence, and Songs of Experience"
      if book_links[i].endswith("1934-h.htm") and tag.parent.name == "td":
        continue

      # Skip bibliography and footnotes in "The Art of War"
      if book_links[i].endswith("132-h.htm") and tag.parent.find('h3').get_text == "Bibliography":
        continue

      # Skip a certain number of paragraphs
      if num <= skip_num:
        continue
    
      # Obtain paragraph tag text
      paragraph_text = tag.get_text()

      # Remove new lines and join them with spaces
      try: 
        paragraph_text = ' '.join(paragraph_text.split())
      except AttributeError:
        continue

      # Add the text in the current <p> tag to the list
      text.append(paragraph_text)

      # Join sections from each <p> tag in the list
      text = [' '.join(text)]

      # Cut if the text length exceeds 100
      if len(text[0].split()) >= 100 or num == len(paragraph_tags) - 1:

        # Remove all caps words (some texts are plays and contain character names
        # in all caps), but make exception for variants of I, A and O (rare but 
        # does exist)
        temp = text[0].split()
        I_or_A = re.compile(r'^[—"\'“][AI][^a-zA-Z]|^[AI][^a-zA-Z]|^[—"\'“][AI]$|^[AI]$|^[—"\'“][O]$|^O$')

        # Page numbers are contained with <p> tags for "Common Sense" (should be removed)
        if book_links[i].endswith("147-h.htm"):
          numbers = re.compile(r'^[0-9]')
          temp = [word for word in text[0].split() if not re.match(numbers, word)] 

        # Rejoin word list
        text[0] = " ".join([word for word in temp if not word.isupper() or re.match(I_or_A, word)])
          
        split_text = text[0].split()
        for j in range(len(split_text)):
          # Checks if word ends with full stop to get full sentence, and if first 
          # letter is capital (avoid Mrs./Mr. etc...)
          if split_text[j][-1] == "." and not split_text[j][0].isupper() and j >= 100:
            text[0] = ' '.join(split_text[:j+1])
            break
        
        print(str(i), text[0])
        excerpts.append(text[0])
        break
      
  return excerpts

### Extract excerpts multiple times and convert them to csv

In [35]:
for i in range(1,11):
  print("\n***Batch " + str(i) + "***\n")

  excerpts = get_excerpts(full_book_links)

  # Add text and book identifiers to Pandas DataFrame
  output = pd.DataFrame({'book': book_names, 'author': book_authors, 'url': full_book_links, 'excerpt': excerpts})

  # Convert to csv
  output.to_csv('gutenberg-excerpts-' + str(i) + '.csv', index = False, encoding="utf-8-sig")


***Batch 1***

0 “You are a very strange creature by way of a friend!—always wanting me to play and sing before anybody and everybody! If my vanity had taken a musical turn, you would have been invaluable; but as it is, I would really rather not sit down before those who must be in the habit of hearing the very best performers.” On Miss Lucas’s persevering, however, she added, “Very well; if it must be so, it must.” And gravely glancing at Mr. Darcy, “There is a fine old saying, which everybody here is of course familiar with—‘Keep your breath to cool your porridge,’—and I shall keep mine to swell my song.”
1 In the mean time I worked on, and my labour was already considerably advanced. I looked towards its completion with a tremulous and eager hope, which I dared not trust myself to question but which was intermixed with obscure forebodings of evil that made my heart sicken in my bosom. I sat one evening in my laboratory; the sun had set, and the moon was just rising from the sea; I 