# <span style="color:red"> Import ebooks to text files </span>
* select books based on format suffix
* epub read using ebooklib and BeautifulSoup
* docx read using docx package
* pdf read using textract
* each text format book saved as a pickle archive

In [1]:
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import pickle
import os
import docx
import textract

## Read in epub format files
Functions are defined to convert the epub file to html and then to text format. These functions are applied only to epub format files from the ebook folder. The resulting text files are saved as pickles.

In [3]:
def epub2thtml(epub_path):
    """Return the raw content of every document item in an epub file.

    Args:
        epub_path: Path of the epub file to read.

    Returns:
        A list holding the content of each item whose type is
        ``ebooklib.ITEM_DOCUMENT``, in the order the book yields them.
    """
    parsed_book = epub.read_epub(epub_path)
    return [
        entry.get_content()
        for entry in parsed_book.get_items()
        if entry.get_type() == ebooklib.ITEM_DOCUMENT
    ]

In [4]:
def chap2text(chap):
    """Extract the text of one chapter's HTML markup.

    Args:
        chap: HTML markup of a chapter, in any form accepted by
            ``BeautifulSoup(..., 'html.parser')``.

    Returns:
        One string made of every text node whose parent tag is not in the
        blacklist, each node followed by a single space.
    """
    # Text nodes sitting directly inside these tags are skipped.
    blacklist = {'[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script'}
    soup = BeautifulSoup(chap, 'html.parser')
    # `string=True` selects all text nodes; it is the current spelling of the
    # deprecated `text=True` keyword.
    text_nodes = soup.find_all(string=True)
    # Join once instead of growing the string with += inside a loop.
    return ''.join(
        '{} '.format(node)
        for node in text_nodes
        if node.parent.name not in blacklist
    )

In [5]:
def thtml2ttext(thtml):
    """Convert a sequence of chapter HTML documents to text.

    Args:
        thtml: Iterable of chapter HTML documents.

    Returns:
        A list with the ``chap2text`` result for each document, in order.
    """
    return [chap2text(markup) for markup in thtml]

In [6]:
def epub2text(epub_path):
    """Read an epub file and return its chapters as a list of text strings.

    Args:
        epub_path: Path of the epub file to read.

    Returns:
        The list produced by passing the output of ``epub2thtml`` through
        ``thtml2ttext``.
    """
    return thtml2ttext(epub2thtml(epub_path))

In [7]:
book_dir = '../../../../Documents/murakami/ebooks/'
# Keep the base name of every entry whose extension is exactly '.epub'.
# os.path.splitext splits on the last dot only, so entries without a dot
# (e.g. sub-folders) no longer raise IndexError, and names containing extra
# dots are neither skipped nor truncated.
epub_list = [
    stem
    for stem, ext in map(os.path.splitext, os.listdir(book_dir))
    if ext == '.epub'
]
epub_list

['HarukiMurakami_ColorlessTsukuruTazaki',
 'HarukiMurakami_TheWindUpBirdChronicle',
 'HarukiMurakami_AWildSheepChase',
 'HarukiMurakami_NorwegianWood',
 'HarukiMurakami_KafkaOnTheShore',
 'HarukiMurakami_DanceDanceDance']

In [8]:
out_dir = '../../../../Documents/murakami/pkl_raw_books/'
# Convert every selected epub to its list of chapter texts and pickle the
# result in out_dir under the same base name.
for book in epub_list:
    epub_full_path = ''.join([book_dir, book, '.epub'])
    out_full_path = ''.join([out_dir, book, '.pkl'])
    output = epub2text(epub_full_path)
    with open(out_full_path, 'wb') as pkl_file:
        pickle.dump(output, pkl_file)

## Read in docx format files
A function is defined to read in text from docx files  and applied to docx format files from the ebook folder. The resulting files are again stored as pickles.

In [9]:
# Keep the base name of every entry whose extension is exactly '.docx'.
# os.path.splitext splits on the last dot only, so entries without a dot
# (e.g. sub-folders) no longer raise IndexError, and names containing extra
# dots are neither skipped nor truncated.
docx_list = [
    stem
    for stem, ext in map(os.path.splitext, os.listdir(book_dir))
    if ext == '.docx'
]
docx_list

['HarukiMurakami_TheElephantVanishes']

In [10]:
def getText(filename):
    """Return the text of a docx file with one paragraph per line.

    Args:
        filename: Path of the docx file to open.

    Returns:
        The text of every paragraph in the document, joined with newlines.
    """
    document = docx.Document(filename)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

In [11]:
# Extract the text of every selected docx file and pickle it in out_dir
# under the same base name.
for book in docx_list:
    docx_full_path = ''.join([book_dir, book, '.docx'])
    out_full_path = ''.join([out_dir, book, '.pkl'])
    output = getText(docx_full_path)
    with open(out_full_path, 'wb') as pkl_file:
        pickle.dump(output, pkl_file)

## Read in pdf format files
Selecting only pdf format files from the ebook folder.
They all relate to a single book in this case and so are grouped together.

In [12]:
# Keep the base name of every entry whose extension is exactly '.pdf', in
# sorted order. os.path.splitext splits on the last dot only, so entries
# without a dot (e.g. sub-folders) no longer raise IndexError, and names
# containing extra dots are neither skipped nor truncated.
pdf_list = sorted(
    stem
    for stem, ext in map(os.path.splitext, os.listdir(book_dir))
    if ext == '.pdf'
)
pdf_list

['nw_b_split_004-021_1page',
 'nw_b_split_022-049_1page',
 'nw_b_split_050-066_1page',
 'nw_b_split_067-080_1page',
 'nw_b_split_081-109_1page',
 'nw_b_split_110-129_1page']

In [13]:
# Read each pdf chunk and assemble the whole book into a single string.
kept_pages = []
for pdf_ in pdf_list:
    pdf_full_path = book_dir + pdf_ + '.pdf'
    # textract returns bytes, so decode("utf-8") is needed to get a string.
    # Splitting on the form feed '\x0c' separates the pages; only alternate
    # pages are kept because each one is read twice.
    text = textract.process(pdf_full_path, language='eng', method='pdfminer').decode("utf-8").split('\x0c')[::2]
    kept_pages.extend(text)
# Join once at the end rather than growing the string with += in nested loops.
texts = ''.join(kept_pages)

In [14]:
# Pickle the combined text of all pdf chunks as a single book.
out_full_path = ''.join([out_dir, 'HarukiMurakami_NorwegianWoodB', '.pkl'])
with open(out_full_path, 'wb') as pkl_file:
    pickle.dump(texts, pkl_file)