In [1]:
import glob, re
import os

import nltk
from bs4 import BeautifulSoup

In [2]:
# Paths of every file in the dl-cache directory whose name ends in "html".
# glob returns matches in arbitrary order, so do not rely on list positions.
htmllist = glob.glob('/Users/rok/python_src/gutenberg/dl-cache/*html')

In [12]:
# Parse the cached page 101.html with the built-in 'html.parser' backend.
with open('/Users/rok/python_src/gutenberg/dl-cache/101.html') as f : 
    soup = BeautifulSoup(f, 'html.parser')
# soup('body') is shorthand for soup.find_all('body'); keep the lower-cased
# text of the first <body> element (raises IndexError if there is none).
text = soup('body')[0].text.lower()

In [131]:
#
# from https://github.com/kiwix/gutenberg
#

def get_formatted_number(num):
    """
    Tag a negative year string with a trailing "BC".

    num: a string that may or may not hold a number, or None.

    returns: None when num is empty or None; num followed by " BC" when
             num contains a '-' and is made up only of digits once every
             '-' and the surrounding whitespace are removed (the '-' is
             kept in the result, e.g. '-384' -> '-384 BC'); otherwise
             num unchanged.
    """
    if not num:
        return None

    # Both checks are always evaluated, in this order.
    has_minus = '-' in num
    digits_remain = num.replace('-', '').strip().isdigit()

    if has_minus and digits_remain:
        return num + ' BC'
    return num

class RdfParser():
    """
    Extracts book metadata from the RDF record of one Project Gutenberg book.

    Construct it with the raw RDF markup and the book id, then call
    parse(). parse() stores its results as attributes on the instance:
    title, subtitle, author, author_id, author_name, first_name,
    last_name, birth_year, death_year, language, downloads, license and
    file_types.
    """

    def __init__(self, rdf_data, gid):
        """
        rdf_data: the RDF markup handed to BeautifulSoup by parse().
        gid: the id of the book; stored as-is and not used by parse().
        """
        self.rdf_data = rdf_data
        self.gid = gid

        self.author_id = None
        self.author_name = None
        self.first_name = None
        self.last_name = None

    def parse(self):
        """
        Parse self.rdf_data and populate the metadata attributes.

        returns: self, so the call can be chained.
        raises: AttributeError if the record has no <pgterms:downloads>
                or no <dcterms:rights> element, or if a <pgterms:file>
                element has no <rdf:value> child.
        """
        soup = BeautifulSoup(self.rdf_data, 'lxml', from_encoding='utf-8')

        # The title of the book: this may or may not be divided
        # into a new-line-separated title and subtitle.
        # Split the full text once and derive both parts from it; taking
        # the subtitle from the already truncated title always gave ''.
        title_node = soup.find('dcterms:title')
        full_title = title_node.text if title_node is not None else '- No Title -'
        title_lines = full_title.split('\n')
        self.title = title_lines[0]
        self.subtitle = ' '.join(title_lines[1:])
        self.author_id = None

        # Parsing the name of the author. Sometimes it's the name of
        # an organization or the name is not known and therefore
        # the <dcterms:creator> or <marcrel:com> node only returns
        # "anonymous" or "unknown". When the name holds no comma,
        # `self.first_name` stays None and `self.last_name` is the
        # whole name.
        self.author = soup.find('dcterms:creator') or soup.find('marcrel:com')
        if self.author:
            agent = self.author.find('pgterms:agent')
            # `agent` may be None, hence the getattr with an empty default.
            agent_attrs = getattr(agent, 'attrs', {})
            if 'rdf:about' in agent_attrs:
                # The id is the last path segment of the agent URI.
                self.author_id = agent_attrs['rdf:about'].split('/')[-1]
            else:
                self.author_id = None

            name_node = self.author.find('pgterms:name')
            if name_node is not None:
                # Note: author_name ends up as the list of comma-separated parts.
                self.author_name = name_node.text.split(',')

                if len(self.author_name) > 1:
                    # NOTE(review): [::-2] takes every second part counting
                    # back from the last one. For "Last, First" that is just
                    # "First", but for three parts "A, B, C" it yields "C A"
                    # (last name included). The original comment says the
                    # intent was "in reverse, starting with the second
                    # item" -- confirm before changing.
                    self.first_name = ' '.join(self.author_name[::-2]).strip()
                self.last_name = self.author_name[0]

        # Parsing the birth and (death, if the case) year of the author.
        # These values are likely to be None.
        birth_node = soup.find('pgterms:birthdate')
        self.birth_year = get_formatted_number(
            birth_node.text if birth_node is not None else None)

        death_node = soup.find('pgterms:deathdate')
        self.death_year = get_formatted_number(
            death_node.text if death_node is not None else None)

        # Language code of the book; None when the element (or its
        # <rdf:value> child) is missing.
        try:
            self.language = soup.find('dcterms:language').find('rdf:value').text
        except AttributeError:
            self.language = None

        # The download count of the book on www.gutenberg.org, kept as text.
        # This will be used to determine the popularity of the book.
        self.downloads = soup.find('pgterms:downloads').text

        # The book might be licensed under GPL, public domain
        # or might be copyrighted.
        self.license = soup.find('dcterms:rights').text

        # All the file types this book is available in, zip archives
        # excluded: {last path segment of the file URI: its rdf:value text}.
        self.file_types = {}
        for file_node in soup.find_all('pgterms:file'):
            file_format = file_node.find('rdf:value').text
            if not file_format.endswith('application/zip'):
                file_name = file_node.attrs['rdf:about'].split('/')[-1]
                self.file_types[file_name] = file_format

        return self

In [132]:
def clean_html(source):
    """
    Reduce raw HTML to lower-case, space-separated ASCII words.

    Steps, applied in this order:
      1. lower-case the input;
      2. delete markup tags (<...>), keeping the text between them;
      3. turn line breaks into spaces;
      4. turn each run of non-ASCII characters into a space;
      5. delete every character that is not an ASCII letter, digit
         or whitespace;
      6. delete digits;
      7. collapse whitespace runs to one space and strip both ends.

    Punctuation and digits are deleted, not replaced by a space, so
    "don't" becomes "dont". HTML entities are not decoded: "&amp;"
    ends up as "amp".

    source: the HTML markup as a string.
    returns: the cleaned text as a string.
    """
    cleaned = source.lower()

    # Remove tags. Match only up to the nearest '>': the previous greedy
    # '<.*>' swallowed everything between the first '<' and the last '>'
    # on a line, deleting the text of e.g. '<p>hello</p>'. The negated
    # class also matches tags that are wrapped across lines.
    cleaned = re.sub(r'<[^>]*>', '', cleaned)

    # Replace line breaks (\r\n, \n or \r) by a space.
    cleaned = re.sub(r'\r?\n|\r', ' ', cleaned)

    # Replace runs of non-ASCII characters by a space.
    cleaned = re.sub(r'[^\x00-\x7F]+', ' ', cleaned)

    # Remove punctuation: anything but letters, digits and whitespace.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned)

    # Remove numbers.
    cleaned = re.sub(r'[0-9]+', '', cleaned)

    # Collapse runs of whitespace into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # When returning, also remove the left and right space padding.
    return cleaned.strip()

In [133]:
# Sample input for the cells below: the cached HTML file 101.html.
html_path = '/Users/rok/python_src/gutenberg/dl-cache/101.html'

def get_metadata(html_path, rdf_path='/Users/rok/python_src/gutenberg/rdf-files/'):
    """
    Load and parse the RDF metadata that belongs to a cached HTML file.

    The book id is the base name of html_path without its extension
    (".../101.html" -> "101"); the RDF record is looked up as a file
    inside the directory <rdf_path>/<id>/.

    html_path: path of the HTML file; only its base name is used.
    rdf_path: directory holding one sub-directory per book id. A
              trailing slash is optional.

    returns: the RdfParser instance for the record, already parsed.
    raises: IndexError when <rdf_path>/<id>/ contains no file.
    """
    obj_id = os.path.splitext(os.path.basename(html_path))[0]

    # os.path.join works with or without a trailing slash on rdf_path;
    # sorting makes the pick deterministic if the directory holds
    # several files (glob order is arbitrary).
    rdf_files = sorted(glob.glob(os.path.join(rdf_path, obj_id, '*')))
    if not rdf_files:
        raise IndexError('no RDF file found for id %r under %r' % (obj_id, rdf_path))

    # Read raw bytes so that the parser's from_encoding='utf-8' decides
    # the decoding instead of the platform's default text encoding.
    with open(rdf_files[0], 'rb') as f:
        rdf_data = f.read()
    return RdfParser(rdf_data, obj_id).parse()

In [134]:
# Parse the RDF record matching html_path (default rdf_path is used).
meta = get_metadata(html_path)

In [135]:
# Display the author's birth year set by RdfParser.parse (a string, or None).
meta.birth_year

u'1954'

In [136]:
# Display the text of the record's <pgterms:downloads> element.
meta.downloads

u'352'