In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell runs, so edits to local .py files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import json
import os
import re
from typing import Optional
from urllib.request import urlopen

from bs4 import BeautifulSoup
from unidecode import unidecode

def load_text(book_id: int) -> str:
    """
    Download the plain-text edition of a Gutenberg book and return its body.

    Everything up to and including the first line that starts with a start
    marker is discarded, and reading stops at the first following line that
    starts with an end marker. Each line is decoded as UTF-8 (a leading BOM
    is dropped) and stripped of surrounding whitespace.

    :param book_id: The id of the book to load.
    :return: The lines between the markers joined with newlines, with
        leading and trailing whitespace removed. Empty if no start marker
        is found.
    """
    START_MARKERS = ['*** START OF']
    END_MARKERS = ['*** END OF']
    url = f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt'
    print(f'loading text from {url}')

    body = []
    with urlopen(url) as response:
        # one lazy stream of cleaned lines shared by both phases below
        cleaned = (raw.decode("utf-8-sig").strip() for raw in response)
        # phase 1: consume the header, including the start marker line
        for current in cleaned:
            if current.startswith(tuple(START_MARKERS)):
                break
        # phase 2: collect the body until the end marker shows up
        for current in cleaned:
            if current.startswith(tuple(END_MARKERS)):
                break
            body.append(current)
    return '\n'.join(body).strip()

def from_cache(book_id: int) -> Optional[dict]:
    """
    Load a book from the local cache.

    A cached book consists of two files in the ``books`` folder:
    ``{book_id}.json`` (metadata) and ``{book_id}.txt`` (full text). Both
    must be present; an incomplete entry is treated as a cache miss so the
    caller can fetch the book again instead of failing on the missing file.

    :param book_id: The id of the book to load.
    :return: The book metadata with the full text stored under the key
        ``text``, or None if the book is not completely cached.
    """
    meta_path = f'books/{book_id}.json'
    text_path = f'books/{book_id}.txt'
    if not (os.path.exists(meta_path) and os.path.exists(text_path)):
        return None
    # explicit UTF-8 (matching to_cache) so the cache does not depend on the
    # platform's default locale encoding
    with open(meta_path, 'r', encoding='utf-8') as f:
        book = json.load(f)
    with open(text_path, 'r', encoding='utf-8') as f:
        book['text'] = f.read()
    return book

def to_cache(book_id: int, book: dict) -> None:
    """
    Write a book to the local cache.

    The metadata (every key except ``text``) is stored as
    ``books/{book_id}.json`` and the text as ``books/{book_id}.txt``.
    The dictionary passed in is not modified.

    :param book_id: The id of the book, used for the file names.
    :param book: The book to store; must contain the key ``text``.
    :raises KeyError: If ``book`` has no ``text`` entry.
    """
    # ensure books folder exists
    os.makedirs('books', exist_ok=True)
    # separate text and metadata without touching the caller's dictionary
    text = book['text']
    metadata = {key: value for key, value in book.items() if key != 'text'}
    # write book to cache; explicit UTF-8 because book texts contain
    # characters that not every locale encoding can represent
    with open(f'books/{book_id}.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f)
    print(f'wrote book to cache: books/{book_id}.json')
    with open(f'books/{book_id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(f'wrote book to cache: books/{book_id}.txt')

def get_book_ids() -> list:
    """
    List the ids of all books in the local cache.

    A book counts as cached when the ``books`` folder holds a metadata file
    named ``<id>.json`` with a purely numeric ``<id>``. Other files in the
    folder (text files, unrelated JSON files) are ignored instead of raising
    a ValueError.

    :return: The ids of the cached books, sorted ascending.
    """
    # ensure books folder exists
    os.makedirs('books', exist_ok=True)

    # collect the numeric stems of the *.json metadata files
    book_ids = []
    for file in os.listdir('books'):
        stem, extension = os.path.splitext(file)
        if extension == '.json' and stem.isdecimal():
            book_ids.append(int(stem))
    # os.listdir order is arbitrary; sort for a reproducible result
    return sorted(book_ids)

def load_book(book_id: int) -> dict:
    """
    Load a book and its metadata from the Gutenberg project.

    The metadata is read from the table with class ``bibrec`` on the book's
    page at ``https://www.gutenberg.org/ebooks/{book_id}``; the text is
    fetched with ``load_text``.

    :param book_id: The id of the book to load.
    :return: Dictionary with the keys ``id``, ``yearspan``, ``author``,
        ``title``, ``language``, ``release_date`` and ``text``.
    :raises KeyError: If the page has no title, language or release date row.
    """
    # read metadata from page
    print(f'loading metadata from https://www.gutenberg.org/ebooks/{book_id}')
    with urlopen(f'https://www.gutenberg.org/ebooks/{book_id}') as response:
        page = BeautifulSoup(response, 'html.parser')
    book = { 'id': book_id }
    rows = page.find('table', {'class': 'bibrec'}).find_all('tr')

    # map lowercased row header -> row value; rows that lack either cell
    # carry no usable field and are skipped
    fields = {}
    for row in rows:
        header, value = row.find('th'), row.find('td')
        if header is None or value is None:
            continue
        fields[header.get_text().lower().strip()] = value.get_text().strip()

    # The author field is expected to end in a year span, e.g.
    # "Surname, Given names, 1859-1930". Only split the last piece off when
    # it actually contains a digit, so an author without dates (or a single
    # name such as "Anonymous") is not mistaken for a year span.
    author_parts = fields.get('author', '').split(',')
    if any(char.isdigit() for char in author_parts[-1]):
        book['yearspan'] = author_parts.pop().strip()
    else:
        book['yearspan'] = ''
    book['author'] = ','.join(author_parts)
    book['title'] = fields['title']
    book['language'] = fields['language']
    book['release_date'] = fields['release date']

    # load text from Gutenberg project
    book['text'] = load_text(book_id)

    return book

def get_book(book_id: int, reload: bool = False) -> dict:
    """
    Return a book, preferring the local cache over a download.

    :param book_id: The id of the book to get.
    :param reload: If true, skip the cache lookup and always fetch the book
        from the Gutenberg project.
    :return: The book as returned by the cache or by ``load_book``. A freshly
        fetched book is written to the cache before it is returned.
    """
    # serve from cache unless a reload was requested
    if not reload:
        cached = from_cache(book_id)
        if cached:
            return cached
    # cache miss or forced reload: fetch from Gutenberg and store the result
    fetched = load_book(book_id)
    to_cache(book_id, fetched)
    return fetched

# Get Project Gutenberg ebook 244; with reload=False it is served from the
# local books/ cache when present and only downloaded otherwise.
book = get_book(244, reload=False)

In [11]:
def clean_text(text: str) -> str:
    """
    Transliterate a string with unidecode and normalize its whitespace.

    :param text: The string to clean.
    :return: The transliterated string in which every run of whitespace
        (spaces, tabs, line breaks) is replaced by a single space.
    """
    transliterated = unidecode(text)
    # collapse each whitespace run into one space
    return re.sub(r'\s+', ' ', transliterated)

# Cleaned, single-line version of the book text; the tokenizer cells below
# operate on this string.
text = clean_text(book['text'])

In [13]:
import nltk
# Make sure the Punkt sentence-tokenizer data is available locally
# (reported as already up-to-date when it has been downloaded before).
nltk.download('punkt')
# Split the cleaned text into sentences; as the cell's last expression the
# complete list is displayed.
nltk.sent_tokenize(text)


[nltk_data] Downloading package punkt to /Users/drweb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['A STUDY IN SCARLET By A. Conan Doyle CONTENTS A STUDY IN SCARLET.',
 'PART I.',
 'CHAPTER I. MR. SHERLOCK HOLMES.',
 'CHAPTER II.',
 'THE SCIENCE OF DEDUCTION.',
 'CHAPTER III.',
 'THE LAURISTON GARDENS MYSTERY CHAPTER IV.',
 'WHAT JOHN RANCE HAD TO TELL.',
 'CHAPTER V. OUR ADVERTISEMENT BRINGS A VISITOR.',
 'CHAPTER VI.',
 'TOBIAS GREGSON SHOWS WHAT HE CAN DO.',
 'CHAPTER VII.',
 'LIGHT IN THE DARKNESS.',
 'PART II.',
 'THE COUNTRY OF THE SAINTS CHAPTER I.',
 'ON THE GREAT ALKALI PLAIN.',
 'CHAPTER II.',
 'THE FLOWER OF UTAH.',
 'CHAPTER III.',
 'JOHN FERRIER TALKS WITH THE PROPHET.',
 'CHAPTER IV.',
 'A FLIGHT FOR LIFE.',
 'CHAPTER V. THE AVENGING ANGELS.',
 'CHAPTER VI.',
 'A CONTINUATION OF THE REMINISCENCES OF JOHN WATSON, M.D.',
 'CHAPTER VII.',
 'THE CONCLUSION.',
 'A STUDY IN SCARLET.',
 'PART I.',
 '(_Being a reprint from the Reminiscences of_ JOHN H. WATSON, M.D., _Late of the Army Medical Department._) CHAPTER I. MR. SHERLOCK HOLMES.',
 'In the year 1878 I took my degree o

In [None]:
import nltk
import re

def tokenize_text(text: str) -> list:
    """
    Turn a text into a list of lowercase word tokens without stopwords.

    The text is lowercased, punctuation (including underscores) is replaced
    by spaces, the result is split with ``nltk.word_tokenize`` and English
    stopwords are dropped.

    :param text: The text to tokenize.
    :return: The remaining tokens in order of appearance.
    """
    # lowercase the text
    text = text.lower()

    # Replace punctuation with a space rather than deleting it, so words that
    # are only separated by punctuation ("friend--holmes") are not glued
    # together. "_" is listed explicitly because \w treats it as a word
    # character and would otherwise keep it attached to words ("_being").
    text = re.sub(r"[^\w\s]|_", " ", text)

    # tokenize the text
    tokens = nltk.word_tokenize(text)

    # load the English stopword list, fetching the corpus on first use when
    # it has not been downloaded yet
    try:
        stop_list = nltk.corpus.stopwords.words('english')
    except LookupError:
        nltk.download('stopwords', quiet=True)
        stop_list = nltk.corpus.stopwords.words('english')
    english_stop_words = set(stop_list)

    # keep only the tokens that are not stopwords
    return [word for word in tokens if word not in english_stop_words]

# Tokenize the cleaned book text from this notebook (the previously used name
# `great_gatsby` is not defined anywhere in it) and preview the first tokens
# instead of printing the whole list.
tokens = tokenize_text(text=text)
tokens[:50]

In [15]:
# Load the English Punkt sentence tokenizer explicitly from the NLTK data
# directory and apply it to the cleaned text; the visible part of the output
# is identical to that of the nltk.sent_tokenize(text) cell.
# NOTE(review): recent NLTK releases distribute Punkt as 'punkt_tab' instead
# of a pickle - confirm this resource path still loads on the installed version.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
sent_detector.tokenize(text.strip())

['A STUDY IN SCARLET By A. Conan Doyle CONTENTS A STUDY IN SCARLET.',
 'PART I.',
 'CHAPTER I. MR. SHERLOCK HOLMES.',
 'CHAPTER II.',
 'THE SCIENCE OF DEDUCTION.',
 'CHAPTER III.',
 'THE LAURISTON GARDENS MYSTERY CHAPTER IV.',
 'WHAT JOHN RANCE HAD TO TELL.',
 'CHAPTER V. OUR ADVERTISEMENT BRINGS A VISITOR.',
 'CHAPTER VI.',
 'TOBIAS GREGSON SHOWS WHAT HE CAN DO.',
 'CHAPTER VII.',
 'LIGHT IN THE DARKNESS.',
 'PART II.',
 'THE COUNTRY OF THE SAINTS CHAPTER I.',
 'ON THE GREAT ALKALI PLAIN.',
 'CHAPTER II.',
 'THE FLOWER OF UTAH.',
 'CHAPTER III.',
 'JOHN FERRIER TALKS WITH THE PROPHET.',
 'CHAPTER IV.',
 'A FLIGHT FOR LIFE.',
 'CHAPTER V. THE AVENGING ANGELS.',
 'CHAPTER VI.',
 'A CONTINUATION OF THE REMINISCENCES OF JOHN WATSON, M.D.',
 'CHAPTER VII.',
 'THE CONCLUSION.',
 'A STUDY IN SCARLET.',
 'PART I.',
 '(_Being a reprint from the Reminiscences of_ JOHN H. WATSON, M.D., _Late of the Army Medical Department._) CHAPTER I. MR. SHERLOCK HOLMES.',
 'In the year 1878 I took my degree o