# Content Design for RAG
This notebook is part of a collection of material related to content design principles for retrieval-augmented generation (RAG).

You can explore the complete collection here: [Content Design for RAG on GitHub](https://github.com/spackows/RAG-CD/blob/main/README.md)

## HTML to text
The sample code below demonstrates a simple method for converting HTML to an approximately plain-text format.

The text format has some Markdown elements:
- Headings (e.g. #, ##, ###)
- Ordered lists
- Unordered lists
- Tables converted to lists of lists

**Contents**
1. Download sample HTML files
2. Convert HTML to text

### 1. Download sample HTML files

The sample HTML files below are from the [Natural Questions](https://ai.google.com/research/NaturalQuestions) dataset and benchmark.

In [2]:
# Names of the sample HTML pages used throughout this notebook.
# The order matters: later cells select files from this list by index.
file_names_arr = [
    "Abundance-of-elements-in-Earths-crust.html",
    "Atmosphere-of-Earth.html",
    "Axial-precession.html",
    "Axial-tilt.html",
    "Carbon-cycle.html",
    "Carbon-dioxide-in-Earths-atmosphere.html",
    "Continent.html",
    "Crust-geology.html",
    "Earth.html",
    "Earths-energy-budget.html",
    "Earths-internal-heat-budget.html",
    "Earths-magnetic-field.html",
    "Earths-orbit.html",
    "Earths-rotation.html",
    "Inner-core.html",
    "Mantle-geology.html",
    "Mantle-convection.html",
    "Plate-tectonics.html",
    "Structure-of-the-Earth.html",
]

In [3]:
# Base URL that each entry of file_names_arr is appended to when downloading.
url_base = "https://raw.githubusercontent.com/spackows/RAG-CD/main/Natural-Questions/html/"

In [None]:
!pip install wget

In [5]:
import os
import wget

for file_name in file_names_arr:
    url = url_base + file_name
    if not os.path.isfile( file_name ):
        wget.download( url, out = file_name )
        
!ls

Abundance-of-elements-in-Earths-crust.html  Earths-internal-heat-budget.html
Atmosphere-of-Earth.html		    Earths-magnetic-field.html
Axial-precession.html			    Earths-orbit.html
Axial-tilt.html				    Earths-rotation.html
Carbon-cycle.html			    Inner-core.html
Carbon-dioxide-in-Earths-atmosphere.html    Mantle-convection.html
Continent.html				    Mantle-geology.html
Crust-geology.html			    Plate-tectonics.html
Earth.html				    Structure-of-the-Earth.html
Earths-energy-budget.html


In [6]:
from bs4 import BeautifulSoup
import re
import math


def removeCitation( txt ):
    """Remove numeric citation markers and tidy whitespace.

    Every bracketed run of digits (for example "[12]") is replaced by a
    space, then all runs of whitespace are collapsed to a single space and
    the result is stripped at both ends.

    txt: the string to clean.
    Returns the cleaned string.
    """
    without_markers = re.sub( r"\[\d+\]", " ", txt )
    return re.sub( r"\s+", " ", without_markers ).strip()

def removeUnwantedDivs( soup ):
    """Delete page-chrome elements from the parsed document, in place.

    For each (tag name, attribute filter) pair below, every element returned
    by soup.find_all is removed with decompose(). Despite the function name,
    both <div> and <span> elements are targeted.

    soup: the parsed document (anything offering find_all( name, attrs )).
    Returns None; the document itself is modified.
    """
    unwanted_selectors = (
        ( "div",  { "role"  : "navigation" } ),
        ( "div",  { "class" : "navbar" } ),
        ( "div",  { "class" : "toc" } ),
        ( "span", { "class" : "mw-editsection" } ),
        ( "span", { "class" : "collapseButton" } ),
        ( "span", { "class" : "cite-accessibility-label" } ),
        ( "span", { "class" : "sortkey" } ),
    )
    for tag_name, attr_filter in unwanted_selectors:
        for element in soup.find_all( tag_name, attr_filter ):
            element.decompose()

def processH( obj_txt, obj_name ):
    """Format heading text as a Markdown-style heading.

    obj_txt:  the heading text.
    obj_name: the tag name (e.g. "h2"); the digits in it give the number
              of "#" characters to prefix.

    Returns "" when obj_txt does not begin with a non-whitespace character.
    Otherwise returns a blank-line separator followed by the heading; if
    obj_name contains no digits the text is emitted without a "#" prefix.
    """
    if not re.match( r"\S", obj_txt ):
        return ""
    level_digits = re.sub( r"[^\d]", "", obj_name )
    if re.match( r"\S", level_digits ):
        level = int( level_digits )
        return "\n\n" + "#" * level + " " + obj_txt
    return "\n\n" + obj_txt

def processP( obj_txt ):
    """Format paragraph text as its own block.

    obj_txt: the paragraph text.

    Returns the text preceded by a blank-line separator, or "" when the
    text does not begin with a non-whitespace character.
    """
    return "\n\n" + obj_txt if re.match( r"\S", obj_txt ) else ""

def processOL( obj ):
    """Render the children of an ordered list as numbered text lines.

    obj: the list element; each entry of obj.children is read with get_text().

    Children whose cleaned text is empty are skipped and do not consume a
    number. Returns the concatenated "\\n<n>. <text>" lines ("" if none).
    """
    numbered_lines = []
    for child in obj.children:
        item_txt = removeCitation( re.sub( r"\s+", " ", child.get_text() ).strip() )
        if re.match( r"\S", item_txt ):
            # Numbering is 1-based and counts only the items actually emitted.
            position = len( numbered_lines ) + 1
            numbered_lines.append( "\n" + str( position ) + ". " + item_txt )
    return "".join( numbered_lines )

def processUL( obj ):
    """Render the children of an unordered list as "- " bullet lines.

    obj: the list element; each entry of obj.children is read with get_text().

    Children whose cleaned text is empty are skipped.
    Returns the concatenated "\\n- <text>" lines ("" if none).
    """
    bullet_lines = []
    for child in obj.children:
        item_txt = removeCitation( re.sub( r"\s+", " ", child.get_text() ).strip() )
        if re.match( r"\S", item_txt ):
            bullet_lines.append( "\n- " + item_txt )
    return "".join( bullet_lines )

def processDL( obj ):
    """Render the children of a definition list as plain text lines.

    obj: the list element; each entry of obj.children is read with get_text().

    A child named "dt" is emitted on its own line followed by ":"; any other
    child with visible text is emitted on its own line as-is. Numeric
    citation markers such as "[3]" are stripped, matching the handling of
    ordered lists, unordered lists and tables in this notebook.

    Returns the concatenated lines, each preceded by a newline ("" if none).
    """
    txt = ""
    for d_item in obj.children:
        # removeCitation also collapses whitespace and strips the ends.
        item_txt = removeCitation( d_item.get_text() )
        if not re.match( r"\S", item_txt ):
            continue
        if( "dt" == d_item.name ):
            txt += "\n" + item_txt + ":"
        else:
            txt += "\n" + item_txt
    return txt
    
def _colSpan( cell ):
    """Return how many columns a table cell spans (always at least 1).

    A missing, non-numeric, non-finite, zero or negative "colspan" attribute
    is treated as 1 so that one malformed cell cannot abort the conversion
    or make a heading disappear.
    """
    if not cell.has_attr( "colspan" ):
        return 1
    try:
        span = math.floor( float( cell["colspan"] ) )
    except ( TypeError, ValueError, OverflowError ):
        return 1
    return span if span > 0 else 1

def convertTableToLists( table ):
    """Convert an HTML table element into text made of bullet lists.

    table: the table element (searched with find / find_all).

    Column headings are collected from the leading header rows; a heading
    that spans several columns is repeated for each of them, and headings
    from successive header rows are joined with ", ". Every row containing
    at least one <td> then becomes a group of "- <heading>: <value>" lines,
    one per non-empty cell. Citation markers are removed from all text.

    Returns a string: the caption (if any) followed by ":", then the row
    groups separated by blank lines. Returns "" for a table with no caption
    and no non-empty data rows.
    """
    # Caption
    caption_obj = table.find( "caption" )
    caption_txt = re.sub( r"\s+", " ", caption_obj.get_text() ).strip() if caption_obj else ""
    caption_txt = removeCitation( caption_txt )
    rows = table.find_all( "tr" )
    # Headers
    col_headings = []
    for row in rows:
        th_arr = row.find_all( "th" )
        td_arr = row.find_all( "td" )
        # Once some headings exist, stop at the first row that is not a pure header row.
        if( ( len( col_headings ) > 0 ) and \
            ( ( len( th_arr ) < 1 ) or ( len( td_arr ) > 0 ) ) ):
            break
        this_row_headings = []
        for th in th_arr:
            th_txt = re.sub( r"\s+", " ", th.get_text() ).strip() if th.get_text() else ""
            th_txt = removeCitation( th_txt )
            # Repeat a spanning heading once per column it covers.
            this_row_headings += [ th_txt ] * _colSpan( th )
        for i in range ( len( this_row_headings ) ):
            if( len( col_headings ) <= i ):
                col_headings.append( this_row_headings[i] )
                continue
            # Sub-headings from later header rows are appended to the column's heading.
            if( re.match( r"\S", this_row_headings[i] ) ):
                col_headings[i] = col_headings[i] + ", " + this_row_headings[i]
    # Make lists
    row_lists = []
    for row in rows:
        td_arr = row.find_all( "td" )
        if( len( td_arr ) < 1 ):
            continue
        list_txt = ""
        col_num = 0
        cols_arr = row.find_all( [ "th", "td" ] )
        for col in cols_arr:
            header_txt = col_headings[ col_num ] if ( col_num < len( col_headings ) ) else ""
            # Advance by the cell's own span so the following cells stay
            # aligned with their headings.
            col_num += _colSpan( col )
            col_txt = re.sub( r"\s+", " ", col.get_text() ).strip() if col.get_text() else ""
            col_txt = removeCitation( col_txt )
            if( col_txt ):
                list_txt += "\n- " + header_txt + ": " + col_txt
        if( list_txt ):
            row_lists.append( list_txt )
    txt = ""
    if( caption_txt ):
        txt += "\n\n" + caption_txt + ":"
    if( len( row_lists ) > 0 ):
        txt += "\n" + "\n".join( row_lists ) + "\n"
    return txt
    
def HTMLToText( file_names_arr, b_debug=False ):
    """Convert each HTML file to a text file with light Markdown markup.

    file_names_arr: iterable of paths to HTML files to convert.
    b_debug:        when True, print the raw HTML and the generated text.

    For every input file the HTML is parsed, unwanted navigation/edit
    elements are removed, and headings, paragraphs, lists, definition lists
    and tables are converted in document order. Conversion of a file stops
    at the first element whose text is exactly "references" or "see also"
    (case-insensitive). The result is written next to the input with the
    ".html" suffix replaced by ".org.txt".

    Returns None. Files are read and written as UTF-8.
    """

    # NOTE(review): find_all appears to return nested matches as well (e.g. a
    # list inside a list, or a paragraph inside a table cell), so such content
    # may be emitted more than once -- confirm against the bs4 documentation.
    for file_name in file_names_arr:

        print( file_name + "..." )

        # Explicit encoding so the result does not depend on the platform
        # default; the context manager closes the file even if reading fails.
        with open( file_name, "r", encoding="utf-8" ) as f:
            html = f.read()

        if b_debug:
            print( "\nHTML:\n" + html )

        soup = BeautifulSoup( html, "html.parser" )
        removeUnwantedDivs( soup )
        txt = ""
        for obj in soup.find_all( [ "h1", "h2", "h3", "h4", "p", "ol", "ul", "dl", "table"] ):
            obj_name = re.sub( r"\s+", " ", obj.name ).strip().lower() if obj.name else ""
            class_names_arr = obj["class"] if obj.has_attr("class") else []
            obj_txt = re.sub( r"\s+", " ", obj.get_text() ).strip() if obj.get_text() else ""
            obj_txt = removeCitation( obj_txt )
            # Everything from the "References" / "See also" section onward is dropped.
            if( "references" == obj_txt.lower() ) or \
              ( "see also" == obj_txt.lower() ):
                break
            # Skip elements with the "ambox" class and the table-of-contents title.
            if( "ambox" in class_names_arr ) or \
              ( obj.parent and obj.parent.has_attr("class") and ( "toctitle" in obj.parent["class"] ) ):
                continue
            if( re.match( r"^h\d$", obj_name ) ):
                txt += processH( obj_txt, obj_name )
                continue
            if( "p" == obj_name ):
                txt += processP( obj_txt )
                continue
            if( "ol" == obj_name ):
                txt += processOL( obj )
                continue
            if( "ul" == obj_name ):
                txt += processUL( obj )
                continue
            if( "dl" == obj_name ):
                txt += processDL( obj )
                continue
            if( "table" == obj_name ):
                txt += convertTableToLists( obj )
                continue
            # Safety net for tag names added to the find_all list without a handler.
            if( obj.name and obj.get_text() ):
                print( "\n\nUNKNOWN: '" + obj_name + "'\t (" + str( class_names_arr ) + ") '" + obj_txt[0:100] + "'" )

        if b_debug:
            print( "\nText:\n" + txt )

        file_name_out = re.sub( r"\.html$", ".org.txt", file_name )
        if file_name_out == file_name:
            # The input did not end in ".html"; append the suffix instead so
            # the source file is never overwritten by its own conversion.
            file_name_out = file_name + ".org.txt"
        with open( file_name_out, "w", encoding="utf-8" ) as f:
            f.write( txt )

### 2. Convert HTML to text

In [None]:
# Convert a single sample file with debug printing turned on.
sample_file_name = file_names_arr[12]
HTMLToText( [ sample_file_name ], b_debug=True )