In [1]:
from bs4 import BeautifulSoup
import re
import json
import numpy as np



In [2]:
# NavigableString is BeautifulSoup's text-node type: string-like, but not a plain str.
def concatenate_NavigableString(NS_list):
    """Return the str() of every item in NS_list, joined with no separator.

    NS_list: iterable of NavigableString/Tag objects (anything str() accepts).
    Returns a plain str; an empty iterable yields ''.
    """
    return ''.join(str(node) for node in NS_list)
        

In [3]:

# Location of the book's HTML source, relative to the working directory.
# NOTE(review): the following cell loads the same file again; one of the two is redundant.
path = 'book/'
book_file = 'abacaxi.html'
full_name = path + book_file

# The context manager guarantees the handle is closed, and the explicit
# encoding keeps decoding independent of the platform default.
with open(full_name, 'r', encoding='utf-8') as doc:
    html_doc = doc.read()

In [4]:
from pathlib import Path

# Directory (with trailing slash) and file name of the book to process.
path = 'book/'
book_file = 'abacaxi.html'
full_name = f"{path}{book_file}"

# Read the whole document as UTF-8 text; the file is closed when the block exits.
with open(full_name, mode='r', encoding='utf-8') as doc:
    html_doc = doc.read()

print("Arquivo carregado com sucesso:", full_name)


Arquivo carregado com sucesso: book/abacaxi.html


In [5]:
# Parse the book's HTML and build tuple_list, one tuple per question:
#   (identifier, book, nro_pergunta, capitulo, pergunta, resposta)
soup = BeautifulSoup(html_doc, 'html.parser')

tuple_list = []
book = soup.title.string

# --- Head attributes ---
list_meta = soup.find_all("meta")
list_link = soup.find_all("link")

# Start from None so that a missing <meta>/<link> is visible instead of
# silently reusing a value left in the kernel by a previous run.
epub = pdf = identifier = year = license = None

for meta_data in list_meta:
    meta_name = meta_data.get("name", None)
    meta_content = meta_data.get("content", None)
    if meta_name == "epub":
        epub = meta_content
    elif meta_name == "pdf":
        pdf = meta_content
    elif meta_name == "identifier":
        identifier = meta_content
    elif meta_name == "year":
        year = meta_content

for link_data in list_link:
    # "rel" is a list of strings; a <link> without rel yields None, which
    # would make the "in" test raise TypeError, so fall back to an empty list.
    if "license" in (link_data.get("rel", None) or []):
        license = link_data.get("href", None)

# The identifier is stored in every tuple below, so fail early and clearly.
if identifier is None:
    raise ValueError('The document has no <meta name="identifier" content="..."> tag')

pre_body = soup.body  # It can contain empty lines

# Keep only the children of <body> that are not blank (' ', \t or \n only).
body = [element for element in pre_body if len(str(element).strip()) > 0]

# Question paragraphs look like "  12 ) question text":
#   '^ *'       leading blanks at the start of the string
#   '([0-9]+)'  group(1): the question number (one or more digits)
#   ' *\) *'    a literal ')' (escaped) surrounded by optional blanks
#   '(.*)'      group(2): the question text
# A raw string is used so that '\)' is not an invalid string escape.
question_pattern = re.compile(r'^ *([0-9]+) *\) *(.*)')

resposta = ""
capitulo = ""
nro_pergunta = None  # no question seen yet
pergunta = None

for element in body:

    ## --- Tag h1: starts a new chapter ---
    if element.name == "h1":
        capitulo = str(element.string)
        continue

    ## --- Any tag other than <p> (and bare text): part of the current answer ---
    if element.name != "p":
        resposta = resposta + str(element)
        continue

    ## --- Tag p ---
    tag_class = element.get("class", [])   # [] when <p> has no class attribute

    ## --- <p class="separador">: closes the current question ---
    if "separador" in tag_class:
        # A separator seen before any question has nothing to store; the text
        # accumulated so far is discarded in that case.
        if pergunta is not None:
            tuple_list.append((identifier, book, nro_pergunta, capitulo, pergunta, resposta))
        resposta = ""

    ## --- <p class="pergunta">: question number and text ---
    elif "pergunta" in tag_class:
        pergunta_inteira = concatenate_NavigableString(element.contents)

        # '.' does not match '\n', so newlines are replaced before matching;
        # otherwise group(2) would stop at the first line break.
        pergunta_inteira = pergunta_inteira.replace('\n', ' ')

        q = question_pattern.search(pergunta_inteira)
        if q is None:
            raise ValueError(
                "Question paragraph does not start with '<number>)': "
                + repr(pergunta_inteira[:80])
            )

        nro_pergunta = int(q.group(1))  # ([0-9]+)
        pergunta = q.group(2)           # (.*)

    ## --- Any other <p>, with or without class: part of the answer ---
    else:
        content = '<p>' + concatenate_NavigableString(element.contents) + '</p>'
        resposta = resposta + content

# A question is stored only when its closing separator is reached.
# NOTE(review): a final question without a trailing <p class="separador">
# is not added to tuple_list -- confirm every book ends with a separator.
    

In [8]:
def process_text(text):
    """Return text escaped for embedding between double quotes in a JSON string.

    Newlines are removed (the bulk file is newline-delimited, one JSON object
    per line). The remaining text is escaped with json.dumps, so double quotes,
    backslashes and control characters such as tabs all produce valid JSON.
    Non-ASCII characters are kept as-is (ensure_ascii=False).

    text: a str (or str subclass such as NavigableString).
    Returns a plain str without the surrounding quotes.
    """
    # json.dumps wraps the result in double quotes; strip them because the
    # caller supplies its own quotes around the value.
    return json.dumps(text.replace('\n', ''), ensure_ascii=False)[1:-1]

# Upper bound on question numbers; questions_checking[n-1] is set to 1 once
# question n has been written to the bulk file.
MAX_QUESTIONS = 500
questions_checking = np.zeros(MAX_QUESTIONS)

# format of the output file name
outFileName = path + 'bulk-' + identifier + ".txt"

index_name = "my-index-000001"

# The context manager closes the file even if a record fails; UTF-8 is the
# same encoding the source HTML is read with.
with open(outFileName, 'w', encoding='utf-8') as outFile:
    for record in tuple_list:
        identifier, book, question_number, chapter, question, answer = record

        # Reject numbers outside 1..MAX_QUESTIONS: 0 would wrap around to the
        # last slot (index -1) and wrongly mark question 500 as present.
        if not 1 <= question_number <= MAX_QUESTIONS:
            raise ValueError(
                "Question number out of range 1.." + str(MAX_QUESTIONS) + ": " + str(question_number)
            )

        ## The string number of the question has three chars: 001, 002, ..., 500
        key = identifier + '_' + '{:03d}'.format(question_number)
        questions_checking[question_number - 1] = 1

        ## writing in output file: one action line, then one document line
        print('{"index":{"_id": "' + key + '"}}', file=outFile)
        print('{"question_number": ' + str(question_number)
              + ', "question":"' + process_text(question)
              + '", "answer":"' + process_text(answer)
              + '", "chapter": "' + process_text(chapter)
              + '", "book": "' + process_text(book)
              + '", "book_id": "' + identifier
              + '", "epub": "' + epub
              + '", "pdf": "' + pdf
              + '", "year": ' + year   # written unquoted, i.e. as a JSON number
              + '}', file=outFile)


In [9]:
# Report every question number from 1 to 500 whose slot in
# questions_checking is still 0, i.e. was never marked as written.
missing_numbers = [slot + 1 for slot in range(500) if questions_checking[slot] == 0]
for number in missing_numbers:
    print("Not added question: ", number)

The generated bulk file can be used to index the content in Elasticsearch with a curl command such as:

curl -H "Content-Type:application/x-ndjson" -XPOST "http://localhost:9200/my-index/_bulk?pretty" --data-binary @"bulk-algodao.txt"

The Content-Type is application/x-ndjson: JSON objects separated by newlines.