In [1]:
# Standard path wrangling: make the project root (the parent of the current
# working directory) importable so config, data, and sources can be imported.
import os
import sys
root = os.path.dirname(os.getcwd())
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if root not in sys.path:
    sys.path.append(root)

In [2]:
# Requires BeautifulSoup (imported below as `bs4`); install it once with:
# %pip install beautifulsoup4

In [3]:
# built-in
import csv
import html
from os import listdir
from os.path import join
from typing import Type, List, Tuple, Callable
import re

# third-party
import pandas as pd
from bs4.element import Tag
from bs4 import BeautifulSoup

# custom

# HELPER FUNCTIONS

In [4]:
def process_files(
        dirpath: str, fname_start: str, fname_end: str,
        extraction_fn: Callable[[Type[Tag]], List[Tuple[str, List[str]]]]
) -> List[Tuple[str, List[str]]]:
    """
    Run `extraction_fn` on every XML file in `dirpath` within a file-name range.

    Files are visited in sorted name order. A file is processed when its name
    ends with '.xml' and lies between `fname_start` and `fname_end` (both
    inclusive) under plain string comparison.

    :param dirpath: directory that contains the XML files
    :param fname_start: first file name to process (inclusive)
    :param fname_end: last file name to process (inclusive)
    :param extraction_fn: called with the parsed tree of each selected file;
        must return a list
    :return: the lists returned by `extraction_fn`, concatenated in file order
    """
    terms: List[Tuple[str, List[str]]] = []

    for fname in sorted(listdir(dirpath)):

        # skip any non-xml files
        if not fname.endswith('.xml'):
            continue

        # skip files that sort before the start or after the end of the range
        if not fname_start <= fname <= fname_end:
            continue

        # Open in binary mode so the bytes reach the parser undecoded: the
        # parser can then honour the encoding the document declares instead of
        # depending on this machine's default text encoding.
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed - pin one once the intended parser is confirmed.
        with open(join(dirpath, fname), 'rb') as f_in:
            tree = BeautifulSoup(f_in)

        # extract terms from file
        terms.extend(extraction_fn(tree))

    print(f'Found {len(terms)} terms')
    return terms

def save_output(output_fp: str, data: List[Tuple[str, List[str]]]) -> None:
    """
    Save `data` (terms) as a one-column CSV file at `output_fp`.

    All primary terms (first item of each entry) are written first, followed by
    all alternative terms (second item, skipped when it is None). Any term
    shorter than 3 characters is left out.

    :param output_fp: path of the CSV file to create or overwrite
    :param data: entries of the form (term, alternative terms or None)
    """
    # newline='' is what the csv module requires for files handed to a writer
    # (otherwise rows end in '\r\r\n' on Windows); the explicit encoding keeps
    # non-ASCII terms from depending on the platform default.
    with open(output_fp, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)

        # first pass: primary terms
        writer.writerows(
            (entry[0], ) for entry in data if len(entry[0]) >= 3
        )

        # second pass: alternative terms
        writer.writerows(
            (altterm, )
            for entry in data
            if entry[1] is not None
            for altterm in entry[1]
            if len(altterm) >= 3
        )

In [5]:
# Module-level list; none of the visible cells read from or append to it.
# NOTE(review): looks like a leftover - confirm it is unused before removing.
all_terms = []

In [6]:
# ============================================================= #
# Tables                                                     #
# ============================================================= #

def extract_terms_Table_1(xml_tree: Type[Tag]) -> List[Tuple[str, List[str]]]:
    """
    Extract rows from the first "bold key / list of examples" table of a file.

    Only the first <ce:textbox>, its first <ce:textbox-body> and the first
    <tgroup> in that body are read.
    NOTE(review): this restriction is kept from the earlier version of this
    function - confirm whether it is intended or debugging residue.

    Example structure:

    ...
    <ce:textbox-head>
        <ce:title id="tit0015">Root causes of diagnostic error in studies</ce:title>
    </ce:textbox-head>
    <ce:textbox-body>
        <ce:sections>
            <ce:para id="p0130">
                <ce:display>
                    <ce:table frame="topbot" id="t0010">
                        <ce:alt-text id="atte0065" role="short">Unlabelled table</ce:alt-text>
                        <tgroup cols="2">
                            <colspec colname="col1" colnum="1"/>
                            <colspec colname="col2" colnum="2"/>
                            <thead>
                                <row rowsep="1">
                                    <entry align="left">Error category</entry>
                                    <entry align="left">Examples</entry>
                                </row>
                            </thead>
                            <tbody>
                                <row rowsep="1">
                                    <entry align="left">
                                        <ce:bold>No fault</ce:bold>
                                    </entry>
                                    <entry align="left">
                                        <ce:list id="ulist0030">
                                            <ce:list-item id="u0120">
                                                <ce:para id="p0135">Unusual presentation of a disease</ce:para>
                                            </ce:list-item>
                                            <ce:list-item id="u0125">
                                                <ce:para id="p0140">Missing information</ce:para>
                                            </ce:list-item>
                                        </ce:list>
                                    </entry>
                                </row>

    :param xml_tree: parsed document (or any tag) to search
    :return: one 8-item list per extracted value:
        [table_id, title, value, header, total_columns, row, col, group].
        A <ce:bold> child yields column 1 / header 1 / group 0; every
        <ce:list-item> of a <ce:list> child yields column 2 / header 2 with
        `group` counting the items from 0. `row` starts at 1 and advances after
        each <ce:list>. `header` is None when the table head has no entry at
        that position.
        NOTE: the return annotation is kept as-is for compatibility with
        `process_files`; the items are 8-element lists, not (str, list) pairs.
    """
    records = []
    title = None
    table_id = None

    for textbox in xml_tree.find_all('ce:textbox')[:1]:
        # the last <ce:title> found in the textbox heads wins
        for textbox_head in textbox.find_all('ce:textbox-head'):
            for title_tag in textbox_head.find_all('ce:title'):
                title = html.unescape(title_tag.getText())
                table_id = title_tag.get('id')

        for textbox_body in textbox.find_all('ce:textbox-body')[:1]:
            for tgroup in textbox_body.find_all('tgroup')[:1]:
                total_columns = tgroup.get('cols')

                # Headers live in a dict local to this table (1-based position
                # -> text) so nothing leaks into module globals and a stale
                # header from another table can never be picked up.
                headers = {}
                for thead in tgroup.find_all('thead'):
                    for position, header_entry in enumerate(thead.find_all('entry'), start=1):
                        headers[position] = html.unescape(header_entry.getText())

                row = 1
                for tbody in tgroup.find_all('tbody'):
                    for table_row in tbody.find_all('row'):
                        for entry in table_row.find_all('entry'):
                            for child in entry.children:
                                if child.name == 'ce:bold':
                                    records.append([
                                        table_id, title,
                                        html.unescape(child.getText()),
                                        headers.get(1), total_columns,
                                        row, 1, 0,
                                    ])
                                elif child.name == 'ce:list':
                                    for group, item in enumerate(child.find_all('ce:list-item')):
                                        records.append([
                                            table_id, title,
                                            html.unescape(item.getText().strip('\n')),
                                            headers.get(2), total_columns,
                                            row, 2, group,
                                        ])
                                    # a list closes the current logical row
                                    row += 1

    return records



In [7]:
# ============================================================= #
# Tables                                                     #
# ============================================================= #

def extract_terms_Table_2(xml_tree: Type[Tag]) -> List[Tuple[str, List[str]]]:
    """
    Extract every cell value from all tables found inside <ce:textbox> elements.

    For each <ce:textbox> the title text and id are taken from the last
    <ce:title> inside its <ce:textbox-head> (both are None when there is none).
    Each <tgroup> inside a <ce:textbox-body> is then read:

    * column headers are the <entry> elements of <thead>, numbered from 1;
    * a body <row> with fewer <entry> elements than the tgroup's `cols`
      attribute is skipped entirely and does not advance the row counter
      (when `cols` is absent, no row is skipped);
    * the column number of a value is the 1-based position of its <entry>
      within the row, so cells that follow a list cell keep their own column;
    * within an <entry>, each <ce:bold> child yields one value (group 0) and
      each <ce:list> child yields one value per <ce:list-item>, with `group`
      counting the items from 0; an entry with neither kind of child yields
      its whole text (group 0).

    :param xml_tree: parsed document (or any tag) to search
    :return: one 8-item list per extracted value:
        [table_id, title, value, header, total_columns, row, col, group].
        `header` is None when <thead> has no entry at that column position;
        `total_columns` is the raw `cols` attribute value.
        NOTE: the return annotation is kept as-is for compatibility with
        `process_files`; the items are 8-element lists, not (str, list) pairs.
    :raises ValueError: if a tgroup's `cols` attribute is not an integer
    """
    records = []

    for textbox in xml_tree.find_all('ce:textbox'):
        # Reset per textbox so that a box without a title neither raises on an
        # unbound name nor inherits the title/id of the previous box.
        title = None
        table_id = None
        for textbox_head in textbox.find_all('ce:textbox-head'):
            for title_tag in textbox_head.find_all('ce:title'):
                title = html.unescape(title_tag.getText())
                table_id = title_tag.get('id')

        for textbox_body in textbox.find_all('ce:textbox-body'):
            for tgroup in textbox_body.find_all('tgroup'):
                total_columns = tgroup.get('cols')
                # 0 means "unknown width": no row can then count as short
                expected_entries = int(total_columns) if total_columns is not None else 0

                # Headers live in a dict local to this table (1-based position
                # -> text) so nothing leaks into module globals and a stale
                # header from another table can never be picked up.
                headers = {}
                for thead in tgroup.find_all('thead'):
                    for position, header_entry in enumerate(thead.find_all('entry'), start=1):
                        headers[position] = html.unescape(header_entry.getText())

                row = 0
                for tbody in tgroup.find_all('tbody'):
                    for table_row in tbody.find_all('row'):
                        entries = table_row.find_all('entry')

                        # short rows are ignored and do not consume a row number
                        if len(entries) < expected_entries:
                            continue
                        row += 1

                        for col, entry in enumerate(entries, start=1):
                            header = headers.get(col)
                            has_structured_child = False

                            for child in entry.children:
                                if child.name == 'ce:bold':
                                    has_structured_child = True
                                    records.append([
                                        table_id, title,
                                        html.unescape(child.getText()),
                                        header, total_columns, row, col, 0,
                                    ])
                                elif child.name == 'ce:list':
                                    has_structured_child = True
                                    for group, item in enumerate(child.find_all('ce:list-item')):
                                        records.append([
                                            table_id, title,
                                            html.unescape(item.getText().strip('\n')),
                                            header, total_columns, row, col, group,
                                        ])

                            # plain cell: no bold or list child, take all its text
                            if not has_structured_child:
                                records.append([
                                    table_id, title,
                                    html.unescape(entry.getText().strip('\n')),
                                    header, total_columns, row, col, 0,
                                ])

    return records



In [8]:
# Location of the chapter XML files (replace the base path with your own if needed).
dirname = 'XML'
dirpath = join('/home/jupyter-sindhwah/Table_Extraction/', dirname)
print(f'Loading from {dirpath}')

# Chapters: inclusive range of file names to process.
fname_start = 'B978-0-7020-7028-0.00001-9.xml'
fname_end = 'B978-0-7020-7028-0.00035-4.xml'

# NOTE: despite its name, `title` receives everything the extraction function
# returned for the selected files, not a single title.
title = process_files(
    dirpath=dirpath,
    fname_start=fname_start,
    fname_end=fname_end,
    extraction_fn=extract_terms_Table_2,
)

Loading from /home/jupyter-sindhwah/Table_Extraction/XML




Found 15370 terms


In [9]:
# One DataFrame row per extracted value; the column names follow the order of
# the 8 fields in each extracted record.
df = pd.DataFrame(
    data=title,
    columns=[
        'table_id',
        'Table_name',
        'Value',
        'Table_Column',
        'Total_columns',
        'Row',
        'Column',
        'Group',
    ],
)

In [10]:
# Show the extracted rows as the cell output (the rendering below is truncated
# in the middle, as the `...` row shows).
df

Unnamed: 0,table_id,Table_name,Value,Table_Column,Total_columns,Row,Column,Group
0,tit0015,Root causes of diagnostic error in studies,No fault,Error category,2,1,1,0
1,tit0015,Root causes of diagnostic error in studies,Unusual presentation of a disease,Examples,2,1,2,0
2,tit0015,Root causes of diagnostic error in studies,Missing information,Examples,2,1,2,1
3,tit0015,Root causes of diagnostic error in studies,System error,Error category,2,2,1,0
4,tit0015,Root causes of diagnostic error in studies,Inadequate diagnostic support,Examples,2,2,2,0
...,...,...,...,...,...,...,...,...
15365,tit0055,Analytes that may be significantly affected by...,0.38–4.04 mIU/L,Second trimester,4,8,4,0
15366,tit0055,Analytes that may be significantly affected by...,"Thyroxine (free), (free T4)",Analyte,4,9,1,0
15367,tit0055,Analytes that may be significantly affected by...,10–18 pmol/L0.77–1.40 ng/dL,Reference range,4,9,2,0
15368,tit0055,Analytes that may be significantly affected by...,9–16 pmol/L0.70–1.24 ng/dL,First trimester,4,9,3,0


In [11]:
# Write every extracted row to an Excel workbook in the current working
# directory, with a header row and without the index column.
# `encoding` is deliberately not passed: DataFrame.to_excel deprecated that
# argument in pandas 1.5 and removed it in 2.0, where passing it raises TypeError.
df.to_excel('All_Chapters_9780702070280_Table_Extraction.xlsx', header=True, index=False)

In [None]:
### END ###