In [1]:
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import sys
import spacy

In [2]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

In [3]:
def chap_as_list(items):
    #filter out and store all chapters in a list
    chapters = []
    for c in items:
        if 'chap' in c.get_name():
            chapters.append(c)
    return chapters

In [4]:
def word_count_func(words):
    return len(nlp(words))

In [5]:
def split_text_by_word_count(text, word_count):
    #checks for the length of text passed in, splitting if text is longer than word_count
    segments = []
    curr_segment = []
    curr_word_count = 0

    for token in text.split():
        curr_word_count += 1
        curr_segment.append(token)

        if curr_word_count >= word_count:
            segments.append(" ".join(curr_segment))
            curr_segment = []
            curr_word_count = 0

    # Append remaining chunk to the last segment if the chunk is longer than 50 words
    if curr_word_count > 50:
        segments.append(" ".join(curr_segment))

    #returns a list of segments, all under set word count
    return segments

In [6]:
#use BeautifulSoup to read out only the text in each chapter, saved as a list of paragraphs
def chapter_to_str(chapter):
    soup = BeautifulSoup(chapter.get_body_content(), 'html.parser')
    text = [para.get_text() for para in soup.find_all('p')]
    for paragraph in text:
        #infinite loop, not in use
        text += split_text_by_word_count(paragraph, 250)
        print('chunked')
    return text

In [7]:
def chunk(chapters):
    #save chunks into dictionary
    texts = {}
    for c in chapters:
        texts[c.get_name()] = chapter_to_str(c)
    return texts

In [8]:
'''
def main():
    #opens source file and returns pickled dictionary of text in chunks of length 50-200
    book = epub.read_epub(sys.argv[1])
    items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))
    #save all chapters in a list
    chapters = chap_as_list(items)
    #chunk texts, limiting length
    chunks = chunk(chapters)
    return chunks
    '''

'\ndef main():\n    #opens source file and returns pickled dictionary of text in chunks of length 50-200\n    book = epub.read_epub(sys.argv[1])\n    items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))\n    #save all chapters in a list\n    chapters = chap_as_list(items)\n    #chunk texts, limiting length\n    chunks = chunk(chapters)\n    return chunks\n    '

In [9]:
#to help iterate through files in a folder
import os

In [10]:
folder_path = '../wdf_epubs'

In [57]:
#master dictionary with all books chunked in their respective sublists
book_chunks = {}

In [15]:
os.listdir(folder_path)

['oblivion.epub', '.DS_Store']

In [58]:
for book_name in os.listdir(folder_path):
    if book_name != '.DS_Store':
        book = epub.read_epub('../wdf_epubs/' + book_name)
        items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))
        #save all chapters in a list
        chapters = chap_as_list(items)
        #chunk texts, limiting length
        chunks = []
        for c in chapters:
            soup = BeautifulSoup(c.get_body_content(), 'html.parser')
            text = [para.get_text() for para in soup.find_all('p')]
            for paragraph in text:
                chunks += split_text_by_word_count(paragraph, 250)
            #chunks += chapter_to_str(c)
        book_chunks[book_name] = chunks


In [59]:
book_chunks.keys()

dict_keys(['oblivion.epub', 'infinite_jest.epub', 'broom_of_the_system.epub', 'simba.epub', 'last_interview.epub', 'fun_thing.epub', 'both_flesh_and_not.epub', 'girl_with_curious_hair.epub', 'fate_time_language.epub', 'brief_interviews.epub', 'lobster.epub', 'pale_king.epub'])

In [60]:
book_chunks

{'oblivion.epub': ['The Focus Group was then reconvened in another of Reesemeyer Shannon Belt Advertising’s nineteenth-floor conference rooms. Each member returned his Individual Response Profile packets to the facilitator, who thanked each in turn. The long conference table was equipped with leather executive swivel chairs; there was no assigned seating. Bottled spring water and caffeinated beverages were made available to those who thought they might want them. The exterior wall of the conference room was a thick tinted window with a broad high-altitude view of points NE, creating a spacious, attractive, and more or less natural-lit environment that was welcome after the bland fluorescent enclosure of the testing cubicles. One or two members of the Targeted Focus Group unconsciously loosened their neckties as they settled into the comfortable chairs.',
  'This facilitator, just like the one who’d led the large Product Test and Initial Response assembly earlier that morning before all

In [61]:
book_chunks['fun_thing.epub']

['When I left my boxed township of Illinois farmland to attend my dad’s alma mater in the lurid jutting Berkshires of western Massachusetts, I all of a sudden developed a jones for mathematics. I’m starting to see why this was so. College math evokes and catharts a Midwesterner’s sickness for home. I’d grown up inside vectors, lines and lines athwart lines, grids—and, on the scale of horizons, broad curving lines of geographic force, the weird topographical drain-swirl of a whole lot of ice-ironed land that sits and spins atop plates. The area behind and below these broad curves at the seam of land and sky I could plot by eye way before I came to know infinitesimals as easements, an integral as schema. Math at a hilly Eastern school was like waking up; it dismantled memory and put it in light. Calculus was, quite literally, child’s play.',
 'In late childhood I learned how to play tennis on the blacktop courts of a small public park carved from farmland that had been nitrogenized too o

In [88]:
book_chunks['infinite_jest.epub']

['I am seated in an office, surrounded by heads and bodies. My posture is consciously congruent to the shape of my hard chair. This is a cold room in University Administration, wood-walled, Remington-hung, double-windowed against the November heat, insulated from Administrative sounds by the reception area outside, at which Uncle Charles, Mr. deLint and I were lately received.',
 'I have committed to crossing my legs I hope carefully, ankle on knee, hands together in the lap of my slacks. My fingers are mated into a mirrored series of what manifests, to me, as the letter X. The interview room’s other personnel include: the University’s Director of Composition, its varsity tennis coach, and Academy prorector Mr. A. deLint. C.T. is beside me; the others sit, stand and stand, respectively, at the periphery of my focus. The tennis coach jingles pocket-change. There is something vaguely digestive about the room’s odor. The high-traction sole of my complimentary Nike sneaker runs parallel to

In [63]:
book_chunks['simba.epub']

[]

In [65]:
book = epub.read_epub("../wdf_epubs/simba.epub")
items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))



In [75]:
for item in items:
    print(item.get_name())

cover.xml
00-front/00-cover.html
00-front/01-alsoby.html
00-front/02-title.html
00-front/03-copyright.html
00-front/04-contents.html
00-front/04-note.html
01-body-1/index.html
01-body-1/footnote-1.html
01-body-1/footnote-2.html
01-body-1/footnote-3.html
01-body-1/pressRelease.html
02-back/00-acknowledgments.html


In [78]:
items[7].get_body_content()

b'\n  <div class="body">&#13;\n<?page-break ??>&#13;\n<div class="Title-105-ya-f"><a id="Title-1"/><strong>INTRODUCTION TO THE ELECTRONIC EDITION, MANDATED AND OVERSEEN BY LITTLE, BROWN AND COMPANY OR IPUBLISH.COM OR WHATEVER THE ACTUAL VECTOR HERE MIGHT BE</strong></div>&#13;\n<div class="Body1-90-xe-l">Dear Person Reading This:</div>&#13;\n<div class="Body1-First-90-xe-fi" style="margin-top: 5%">This is the part where I\xe2\x80\x99m supposed to say what the following document is and where it came from.</div>&#13;\n<div class="Body1-90-xe-fi">From what I understand, this past fall the powers that be at <em>Rolling Stone</em> magazine decided they wanted to get four writers who were not political journalists to do articles on the four big presidential candidates and their day-to-day campaigns in the early primaries. Luckily my own resum\xc3\xa9\xe2\x80\x99s got \xe2\x80\x98NOT A POLITICAL JOURNALIST\xe2\x80\x99 right at the very top, and <em>Rolling Stone</em> magazine called, and pitc

In [84]:
soup = BeautifulSoup(items[9].get_body_content(), 'html.parser')
text = [para.get_text() for para in soup.find_all('p')]

In [85]:
text

[]

In [86]:
book.get_items_of_type(ebooklib.ITEM_SCRIPT)

<generator object EpubBook.get_items_of_type.<locals>.<genexpr> at 0x7faf4bf2a820>

In [66]:
chapters = chap_as_list(items)

In [67]:
chapters

[]

In [46]:
counts = [word_count_func(chunk) for chunk in brief]

In [47]:
counts

[]

In [39]:
import numpy as np

In [40]:
np.argmax(counts)

308

In [41]:
min(counts)

57

In [43]:
len(brief)

502

In [44]:
np.mean(counts)

205.33466135458167

In [87]:
chunks['infinite_jest.epub']

TypeError: list indices must be integers or slices, not str