In [None]:
import requests
import re
from bs4 import BeautifulSoup
import string
from time import sleep
from collections import namedtuple

# Maps a story title to the Project Gutenberg HTML page that is downloaded and
# parsed for it further down in this notebook.
# NOTE(review): the first four URLs use http:// while the last uses https:// —
# consider using one scheme consistently.
CORPUS_URL = {
    'The Valley of Fear': "http://www.gutenberg.org/files/3289/3289-h/3289-h.htm",
    'A Study in Scarlet': "http://www.gutenberg.org/files/244/244-h/244-h.htm",
    'The Sign of the Four': "http://www.gutenberg.org/files/2097/2097-h/2097-h.htm",
    'The Hound of the Baskervilles': "http://www.gutenberg.org/files/2852/2852-h/2852-h.htm",
    # NOTE: This file is a compilation of adventures where "The Boscombe Valley Mystery" is Adventure 4,
    # so the page behind this URL holds more than the single story named by the key.
    'The Boscombe Valley Mystery': 'https://www.gutenberg.org/files/1661/1661-h/1661-h.htm',
}

In [None]:
# Causes output text to wrap in output cells
from IPython.display import HTML, display

def set_css(info=None):
  """Display a ``<style>`` element that makes ``<pre>`` text wrap.

  Args:
    info: Ignored. IPython's documented ``pre_run_cell`` callback signature
      passes an execution-info object to the callback; accepting it as an
      optional argument lets this function be used both as that callback
      and as a plain zero-argument call.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Register set_css as a 'pre_run_cell' callback so IPython calls it before each cell is run.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# One sentence that matched a search pattern: the chapter and sentence numbers
# locating it, the regex match objects found in it, and the sentence text.
Match = namedtuple('Match', 'chapter_number sentence_number matches text')


class Scraper:
    """Turns Project Gutenberg HTML pages into ``Book`` objects.

    Attributes:
        urls: Iterable of page URLs; only fetched by ``scrape`` when the
            caller does not supply already-downloaded responses.
    """

    # Substrings treated as the end of the book; the text is cut at the
    # earliest one that occurs.
    _END_MARKERS = ("***END", "*** END", "THE END", "End of")

    # Seconds passed to requests as the timeout, so a stalled connection
    # raises instead of blocking forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, urls):
        self.urls = urls

    def scrape(self, *, responses=None):
        """Parse one ``Book`` per response.

        Args:
            responses: Optional iterable of objects exposing a ``text``
                attribute (e.g. ``requests`` responses). When ``None``,
                every URL in ``self.urls`` is downloaded first.

        Returns:
            A list of ``Book`` objects, in the order of the responses.

        Raises:
            AttributeError: If a page lacks a ``Title:``, ``Author:`` or
                ``Release Date:`` line in the expected form (the regex
                search then returns ``None``).
        """
        if responses is None:
            responses = [
                requests.get(url, timeout=self._REQUEST_TIMEOUT)
                for url in self.urls
            ]

        books = []
        for response in responses:
            book_html = self._truncate_at_end_marker(response.text)

            def search(pattern):
                return re.search(pattern, book_html).group(1)

            # Each value runs to the end of its line. The previous character
            # class `[^\n|\r\n|\r]` also stopped at a literal '|'.
            title = search(r'Title: ([^\r\n]*)')
            author = search(r'Author: ([^\r\n]*)')
            release = search(r'Release Date: ([A-Za-z]*[\s0-9]*?, [0-9]*)')

            # NOTE(review): no parser is named here, so BeautifulSoup picks
            # one itself; consider pinning it for reproducible parsing.
            soup = BeautifulSoup(book_html)

            chapters = self.find_chapters(soup)

            books.append(Book(author, chapters, release, title))

        return books

    @classmethod
    def _truncate_at_end_marker(cls, text):
        """Return ``text`` cut before the earliest end marker, or unchanged if none occurs."""
        # `> 0` drops "not found" (-1) and, as before, a marker at index 0.
        positions = [pos for pos in map(text.find, cls._END_MARKERS) if pos > 0]
        return text[:min(positions)] if positions else text

    def find_chapters(self, soup):
        """Build a ``Chapter`` for every chapter heading found in ``soup``.

        A chapter heading is an element whose tag name matches ``h2``/``h3``
        and whose string matches "Chapter" (case-insensitive). For each
        heading, the sibling at index 1 supplies the title, and the ``<p>``
        siblings that follow supply the text, until the next ``h2`` or the
        next chapter heading is reached.

        NOTE(review): the sibling at index 1 is consumed as the title even
        when it is a ``<p>``; confirm that this matches the markup of the
        books being scraped.

        Args:
            soup: Parsed document exposing ``find_all``.

        Returns:
            A list of ``Chapter`` objects numbered from 1 in document order.
        """
        chapter_tags = soup.find_all(
            re.compile("(h2|h3)"),
            string=re.compile("Chapter", re.IGNORECASE)
        )
        # Identity lookup so that an <h3> chapter heading also ends the
        # previous chapter; stopping only at <h2> let h3-level chapters
        # absorb the text of every chapter after them.
        chapter_tag_ids = {id(tag) for tag in chapter_tags}

        chapters = []
        for chapter_number, chapter_tag in enumerate(chapter_tags, start=1):
            chapter_title = None
            chapter_text = []
            for i, sibling in enumerate(chapter_tag.next_siblings):
                if i == 1:
                    chapter_title = string.capwords(sibling.text)
                elif sibling.name == 'h2' or id(sibling) in chapter_tag_ids:
                    break
                elif sibling.name == 'p':
                    chapter_text.append(sibling.text)

            chapters.append(
                Chapter(chapter_title, chapter_number, chapter_text)
            )

        return chapters

class Chapter:
    """A single chapter: title, number and its text as one cleaned string.

    Attributes:
        title: Chapter title as supplied by the caller (may be ``None``).
        number: Chapter number as supplied by the caller.
        text: The paragraphs joined with spaces, with line breaks replaced
            by spaces (see ``clean``).
    """

    # Sentence boundary; each part is explained in the ``sentences`` docstring.
    # Raw strings keep `\w`, `\s` and `\.` from being read as (invalid)
    # string escapes.
    _SENTENCE_BOUNDARY = re.compile(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][a-z][a-z]\.)(?<=[.?!])\s'
        r'|(?<=[.?!]["])\s(?=["A-Z])'
    )

    def __init__(self, title, number, text):
        self.title = title
        self.number = number
        self.text = self.clean(text)

    def find(self, pattern):
        """Return a ``Match`` for every sentence containing ``pattern``.

        Args:
            pattern: Regular expression handed to ``re.finditer``.

        Returns:
            A list of ``Match`` records (chapter number, sentence number,
            list of match objects, sentence text), in sentence order.
        """
        all_matches = []
        for sentence_number, sentence in self.sentences:
            matches = list(re.finditer(pattern, sentence))
            if matches:
                all_matches.append(
                    Match(self.number, sentence_number, matches, sentence)
                )

        return all_matches

    @staticmethod
    def clean(chapter_text):
        """Join paragraph strings with spaces and un-wrap hard line breaks.

        Args:
            chapter_text: Iterable of paragraph strings.

        Returns:
            One string in which every ``\\r\\n``, lone ``\\r`` and lone
            ``\\n`` has been replaced by a single space.
        """
        joined = ' '.join(chapter_text)
        # `\r\n` comes first in the alternation so a Windows line ending
        # becomes one space, not two.
        return re.sub(r'\r\n|\r|\n', ' ', joined)

    @property
    def sentences(self):
        r'''
        Splits the chapter text into sentences. This is a hard task so some decisions need to be made.

        Returns an iterator of (sentence_number, sentence) pairs, numbered from 1.

        Case 1: (?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][a-z][a-z]\.)(?<=[.?!])\s
            (?<!\w\.\w.): Negative lookbehind to prevent splitting on i.e. and e.g.
        Python requires fixed width patterns for lookbehinds, so we have split these
            (?<![A-Z][a-z]\.): Negative lookbehind to prevent splitting on Mr.
            (?<![A-Z][a-z][a-z]\.): Negative lookbehind to prevent splitting on Mrs.
            (?<=[.?!]): Positive lookbehind to make sure we're only splitting after ., ?, or !
            \s: Any white-space character
        Case 2: (?<=[.?!]["])\s(?=["A-Z])
            (?<=[.?!]["]): Positive lookbehind to make sure we're only splitting after .", ?", or !"
            \s: Any white-space character
            (?=["A-Z]): Positive lookahead to make sure we're only splitting before " or a capital letter

        Examples:
            ...a fashion which had become a habit. Miss Penkridge...
                Sentence 1: ...a fashion which had become a habit.
                Sentence 2: Miss Penkridge...
            ...content. "So he did it! Now, I should never have thought it! The last person...
                Sentence 1: ...content.
                Sentence 2: "So he did it!
                Sentence 3: Now, I should never have thought it!
                Sentence 4: The last person...
            ...this sort of stuff?" "Stuff?" demanded Miss Penkridge, who had resumed her...
                Sentence 1: ...this sort of stuff?"
                Sentence 2: "Stuff?" demanded Miss Penkridge, who had resumed her...
        '''
        splits = self._SENTENCE_BOUNDARY.split(self.text)
        return enumerate(splits, start=1)

    def __len__(self):
        """Number of sentences produced by ``sentences``."""
        return len(list(self.sentences))

    def __str__(self):
        # __str__ must return a str; fall back to the number when the
        # chapter was built without a title.
        if self.title is None:
            return f'Chapter {self.number}'
        return self.title

    def __repr__(self):
        return (
            f'Chapter {self.number}: {self.title}\n'
            f'Content: {self.text[:20]}...{self.text[-20:]}'
        )

class Book:
    """A scraped book: its metadata strings plus its list of chapters.

    Attributes:
        author: Author name.
        chapters: Sequence of chapter objects, each exposing ``find(pattern)``.
        release: Release date text.
        title: Book title.
    """

    def __init__(self, author, chapters, release, title):
        self.title = title
        self.author = author
        self.release = release
        self.chapters = chapters

    def find(self, pattern):
        """Collect the matches for ``pattern`` from every chapter, in chapter order."""
        return [
            hit
            for chapter in self.chapters
            for hit in chapter.find(pattern)
        ]

    def __len__(self):
        """Number of chapters in the book."""
        chapter_count = len(self.chapters)
        return chapter_count

    def __str__(self):
        return self.title

    def __repr__(self):
        chapter_count = len(self)
        return (
            f'{self.title} by {self.author}, '
            f'released in {self.release} and '
            f'contains {chapter_count} chapters'
        )

In [None]:
# Download every page once and keep the raw responses, so the parsing cell can
# be re-run without fetching again. The timeout makes a stalled connection
# raise instead of hanging the notebook.
responses = [requests.get(url, timeout=30) for url in CORPUS_URL.values()]

In [None]:
# Parse the responses downloaded above into Book objects; passing `responses`
# means scrape() does not fetch the URLs a second time.
books = Scraper(list(CORPUS_URL.values())).scrape(responses=responses)

[<h2>
    </h2>, <h2>
      Chapter 2—Sherlock Holmes Discourses
    </h2>, <h2>
      Chapter 3—The Tragedy of Birlstone
    </h2>, <h2>
      Chapter 4—Darkness
    </h2>, <h2>
      Chapter 5—The People of the Drama
    </h2>, <h2>
      Chapter 6—A Dawning Light
    </h2>, <h2>
      Chapter 7—The Solution
    </h2>, <h2>
      Chapter 1—The Man
    </h2>, <h2>
      Chapter 2—The Bodymaster
    </h2>, <h2>
      Chapter 3—Lodge 341, Vermissa
    </h2>, <h2>
      Chapter 4—The Valley of Fear
    </h2>, <h2>
      Chapter 5—The Darkest Hour
    </h2>, <h2>
      Chapter 6—Danger
    </h2>, <h2>
      Chapter 7—The Trapping of Birdy Edwards
    </h2>]
[<h2>
      CHAPTER I. MR. SHERLOCK HOLMES.
    </h2>, <h2>
      CHAPTER II. THE SCIENCE OF DEDUCTION.
    </h2>, <h2>
      CHAPTER IV. WHAT JOHN RANCE HAD TO TELL.
    </h2>, <h2>
      CHAPTER V. OUR ADVERTISEMENT BRINGS A VISITOR.
    </h2>, <h2>
      CHAPTER VI. TOBIAS GREGSON SHOWS WHAT HE CA

In [None]:
# Print the list of books; each entry is rendered with Book.__repr__ (title,
# author, release date and chapter count).
# NOTE(review): the saved output reports 0 chapters for three of the five
# books, so the chapter-heading match presumably does not fit those pages'
# markup — verify against the HTML.
print(books)

[The Valley of Fear by Sir Arthur Conan Doyle, released in February 28, 2009 and contains 14 chapters, A Study In Scarlet by Arthur Conan Doyle, released in July 12, 2008 and contains 13 chapters, The Sign of the Four by Arthur Conan Doyle, released in March, 2000 and contains 0 chapters, The Hound of the Baskervilles by Arthur Conan Doyle, released in December 8, 2008 and contains 0 chapters, The Adventures of Sherlock Holmes by Arthur Conan Doyle, released in November 29, 2002 and contains 0 chapters]
