In [1]:
# standard path wrangling to be able to import config, data, and sources from project root
import os
import sys
# Parent of the current working directory, used as the project root for imports.
root = os.path.dirname(os.getcwd())
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if root not in sys.path:
    sys.path.append(root)

In [2]:
# pip install bs4

In [3]:
# built-in
import csv
import html
from os import listdir
from os.path import join
from typing import Type, List, Tuple, Callable
import re

# third-party
import pandas as pd
from bs4.element import Tag
from bs4 import BeautifulSoup

# custom

# HELPER FUNCTIONS

In [4]:
def process_files(
        dirpath: str, fname_start: str, fname_end: str,
        extraction_fn: Callable[[Type[Tag]], List[Tuple[str, List[str]]]]
) -> List[Tuple[str, List[str]]]:
    """
    Run `extraction_fn` on every `.xml` file of `dirpath` whose name lies in
    the inclusive range `fname_start` .. `fname_end`, and collect the results.

    File names are visited in sorted order and compared to the bounds as plain
    strings, so the range is lexicographic. Each selected file is parsed with
    BeautifulSoup and the resulting tree is passed to `extraction_fn`; whatever
    it returns is appended (via `extend`) to one flat list.

    Prints the number of collected items and returns the list.
    """

    terms: List[Tuple[str, List[str]]] = []

    for fname in sorted(listdir(dirpath)):

        # skip any non-xml files
        if not fname.endswith('.xml'):
            continue

        # skip files outside the requested (inclusive, lexicographic) range
        if not fname_start <= fname <= fname_end:
            continue

        # Open in binary mode: a text-mode open decodes with the platform's
        # default encoding, which may not match the file. Given bytes,
        # BeautifulSoup works out the encoding itself.
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; left as-is to keep the parse results unchanged.
        fpath = join(dirpath, fname)
        with open(fpath, 'rb') as f_in:
            tree = BeautifulSoup(f_in)

        # extract terms from file
        terms.extend(extraction_fn(tree))

    print(f'Found {len(terms)} terms')
    return terms

def save_output(output_fp: str, data: List[Tuple[str, List[str]]]) -> None:
    """
    Save `data` (terms) in a CSV file located in `output_fp`.

    Each item of `data` is indexed as `(term, alternative_terms)`. The file
    gets one single-column row per primary term, followed by one row per
    alternative term. Strings shorter than three characters are left out, and
    a `None` primary term or `None` alternatives list is skipped.
    """
    min_length = 3

    # newline='' is what the csv module expects of its file object (otherwise
    # line endings are doubled on platforms that translate '\n'); an explicit
    # encoding keeps non-ASCII terms independent of the platform default.
    with open(output_fp, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)

        # first pass: primary terms
        for term in data:
            if term[0] is not None and len(term[0]) >= min_length:
                writer.writerow((term[0], ))

        # second pass: alternative terms, written after all primary terms
        for term in data:
            if term[1] is not None:
                for altterm in term[1]:
                    if len(altterm) >= min_length:
                        writer.writerow((altterm, ))

In [5]:
# Start from an empty accumulator list.
all_terms = list()

In [6]:
# ============================================================= #
# Tables                                                     #
# ============================================================= #

def extract_terms_Table_1(xml_tree: Type[Tag]) -> List[Tuple[str, List[str]]]:
    """
    Flatten the tables found in the `ce:textbox` elements of one parsed file.

    Chapter metadata is read first (`ce:pii` and `ce:isbn` under `info`,
    `ce:title` directly under `chapter`), stored in the module-level globals
    `Chapter_pii`, `ISBN` and `Chapter_title`, and printed.

    Every `tgroup` inside a `ce:textbox-body` is then walked row by row and
    entry by entry. Each value that is kept becomes one 12-item list:

        [Chapter_title, ISBN, Chapter_pii, table_id, title, value, bold_flag,
         column header, total_columns, row, col, group]

    Rows holding fewer `entry` elements than the `cols` attribute of their
    `tgroup` are not numbered, and their bold, list and plain entries are
    not emitted.

    NOTE(review): the return annotation does not describe the 12-item lists
    that are actually returned.
    NOTE(review): column headers are kept in `globals()` as `header1`,
    `header2`, ... and never cleared, so a table without a `thead` (or with
    fewer header entries) reads headers left behind by an earlier table, and
    a missing key raises KeyError.
    NOTE(review): `table_id` is only assigned inside the textbox-head title
    loop; a textbox without such a title reuses the previous value, or raises
    UnboundLocalError if none has been seen yet in this call.

    Returns the accumulated list of records.
    """

    # Working state. NOTE(review): `terms`, `relations`, `current_term`,
    # `relation` and `cols` are never read below; `lists` is only reused as a
    # loop variable.
    terms = []
    relations=[]
    title=None
    current_term = None
    relation = None
    cols=None
    total_columns=None
    lists=[]
    final=[]
    # Chapter metadata lives in module-level globals and is reset on every call.
    global Chapter_title
    Chapter_title=None
    global ISBN
    ISBN=None
    global Chapter_pii
    Chapter_pii=None


    # Read chapter metadata; when several matching elements exist the last one wins.
    for chapter in xml_tree.find_all('chapter'):
        for info in chapter.find_all('info'):
            for children in info.children:
                if children.name== 'ce:pii':
                    Chapter_pii=html.unescape(children.getText())
                if children.name== 'ce:isbn':
                    ISBN = html.unescape(children.getText())
        # only direct children of <chapter> are considered for the chapter title
        for children in chapter.children:
            if children.name==('ce:title'):
                Chapter_title= html.unescape(children.getText())

    # progress output: one line per processed file
    print(Chapter_title,ISBN,Chapter_pii)

    for maintextbox in xml_tree.find_all('ce:textbox'):
        # table caption and its id attribute (last title found in the head wins)
        for head in maintextbox.find_all('ce:textbox-head'):
            for titles in head.find_all('ce:title'):
                title=html.unescape(titles.getText())
                table_id=titles.get('id')
        for groups in maintextbox.find_all('ce:textbox-body'):
            for group in groups.find_all('tgroup'):
                # declared column count, kept as the attribute string (converted with int() below)
                total_columns=group.get('cols')
                # store the header cell texts as globals header1, header2, ... (numbering restarts per thead)
                for head in group.find_all('thead'):
                    i=0
                    for rows in head.find_all('entry'):
                        i=i+1
                        globals()[f"header{i}"] = html.unescape(rows.getText())
                # running number of complete rows within this tgroup
                row=0
                for body in group.find_all('tbody'):
                    for rows in body.find_all('row'):
                        i=0
                        # 1-based column position within the current row
                        col=1
                        # NOTE(review): this rebinds `group` (the tgroup loop variable above) to a
                        # counter; the tbody list was already obtained from it before this point.
                        group=0
                        # count the entries of this row
                        num_of_nodes_row=0
                        for entries in rows.find_all('entry'):
                            num_of_nodes_row = num_of_nodes_row + 1
                        # flag rows that have fewer entries than the declared column count
                        less_row_element=0
                        if num_of_nodes_row < int(total_columns):
                            less_row_element=1
                        # only complete rows advance the row number
                        if less_row_element!=1:
                            row=row+1
                        for entries in rows.find_all('entry'):
                            # Per-entry flags, kept as module-level globals and reset for every
                            # entry (`prev_col` is declared global but not reset here).
                            global bold_flag
                            bold_flag=0
                            global list_item_flag
                            list_item_flag=0
                            global bold_break_flag
                            bold_break_flag=0
                            global prev_col
                            global italic_flag
                            italic_flag=0



                            # Inspect the direct children of the entry; each child falls into at
                            # most one of the branches below.
                            for children in entries.children:
                                # column position before this child is handled
                                prev_col=col
                                # first bold child of an entry in a complete row: emitted with
                                # bold_flag 1, then the column advances
                                if children.name == 'ce:bold' and less_row_element==0 and bold_break_flag==0:
                                    bold_flag=1
                                    bold_break_flag=1
                                    final.append([Chapter_title,ISBN,Chapter_pii,table_id,title,html.unescape(children.getText()),bold_flag,globals()[f"header{col}"],total_columns,row,col,group])
                                    col=col+1


                                # italic child: emitted with flag 0 and group+1; the column is
                                # advanced once per entry after the children loop
                                elif children.name == 'ce:italic':
                                    italic_flag=1
                                    final.append([Chapter_title,ISBN,Chapter_pii,table_id,title,html.unescape(children.getText()),0,globals()[f"header{col}"],total_columns,row,prev_col,group+1])


                                # any non-bold, non-list child once a bold child has been seen in
                                # this entry; it is attributed to the already advanced column
                                elif children.name !='ce:bold' and children.name != 'ce:list' and bold_break_flag==1:
                                    final.append([Chapter_title,ISBN,Chapter_pii,table_id,title,html.unescape(children.getText()),0,globals()[f"header{prev_col}"],total_columns,row,prev_col,group+1])


                                # any non-italic, non-list child after an italic child, when no
                                # bold child has been seen in this entry
                                elif children.name !='ce:italic' and children.name != 'ce:list' and italic_flag==1 and bold_break_flag==0:
                                    final.append([Chapter_title,ISBN,Chapter_pii,table_id,title,html.unescape(children.getText()),0,globals()[f"header{prev_col}"],total_columns,row,prev_col,group+1])




                                # list child in a complete row: one record per list item, with
                                # `group` counting the items
                                elif children.name == 'ce:list'and less_row_element==0:
                                    list_item_flag=1
                                    bold_flag=0
                                    for lists in children.find_all('ce:list-item'):
                                        final.append([Chapter_title,ISBN,Chapter_pii,table_id,title,html.unescape(lists.getText().strip('\n')),bold_flag,globals()[f"header{col}"],total_columns,row,col,group])
                                        group=group+1
                                    # NOTE(review): the increment is immediately overwritten, so the
                                    # column restarts at 1 after a list
                                    col=col+1
                                    col=1
                                    group=0



                            # plain entry of a complete row (no bold, italic or list child was
                            # handled): emit the entry's whole text and advance the column
                            if bold_flag==0 and list_item_flag==0 and less_row_element==0 and italic_flag==0:
                                    bold_flag=0
                                    final.append([Chapter_title,ISBN,Chapter_pii,table_id,title,html.unescape(entries.getText().strip('\n')),bold_flag,globals()[f"header{col}"],total_columns,row,col,group])
                                    group=group+1
                                    col=col+1
                                    # the increment above is discarded: group restarts at 0
                                    group=0
                            # entries that contained italic content advance the column here
                            if italic_flag==1:
                                col=col+1






    return final



In [7]:
dirname = 'XML'
# Base directory that contains the XML folder. It defaults to the original
# author's location; set the TABLE_EXTRACTION_DIR environment variable to use
# your own copy without editing this cell.
base_dir = os.environ.get('TABLE_EXTRACTION_DIR', '/home/jupyter-sindhwah/Table_Extraction/')
dirpath = join(base_dir, dirname)
print(f'Loading from {dirpath}')

# Chapters: inclusive file-name range handed to process_files.
# To process a single chapter, set both bounds to the same file name,
# e.g. 'B978-0-7020-7028-0.00002-0.xml'.
fname_start = 'B978-0-7020-7028-0.00001-9.xml'
fname_end = 'B978-0-7020-7028-0.00035-4.xml'

# NOTE: despite its name, `title` receives the full list of extracted table
# records; the name is kept because a later cell builds the DataFrame from it.
title = process_files(dirpath, fname_start, fname_end, extract_terms_Table_1)

Loading from /home/jupyter-sindhwah/Table_Extraction/XML
Clinical decision-making 978-0-7020-7028-0 B978-0-7020-7028-0.00001-9
Clinical therapeutics and good prescribing 978-0-7020-7028-0 B978-0-7020-7028-0.00002-0




Clinical genetics 978-0-7020-7028-0 B978-0-7020-7028-0.00003-2
Clinical immunology 978-0-7020-7028-0 B978-0-7020-7028-0.00004-4
Population health and epidemiology 978-0-7020-7028-0 B978-0-7020-7028-0.00005-6
Principles of infectious disease 978-0-7020-7028-0 B978-0-7020-7028-0.00006-8
Poisoning 978-0-7020-7028-0 B978-0-7020-7028-0.00007-X
Envenomation 978-0-7020-7028-0 B978-0-7020-7028-0.00008-1
Environmental medicine 978-0-7020-7028-0 B978-0-7020-7028-0.00009-3
Acute medicine and critical illness 978-0-7020-7028-0 B978-0-7020-7028-0.00010-X
Infectious disease 978-0-7020-7028-0 B978-0-7020-7028-0.00011-1
HIV infection and AIDS 978-0-7020-7028-0 B978-0-7020-7028-0.00012-3
Sexually transmitted infections 978-0-7020-7028-0 B978-0-7020-7028-0.00013-5
Clinical biochemistry and metabolic medicine 978-0-7020-7028-0 B978-0-7020-7028-0.00014-7
Nephrology and urology 978-0-7020-7028-0 B978-0-7020-7028-0.00015-9
Cardiology 978-0-7020-7028-0 B978-0-7020-7028-0.00016-0
Respiratory medicine 978-0-70

In [8]:
# One DataFrame row per extracted record; the labels follow the field order
# of the 12-item lists collected in `title`.
df = pd.DataFrame(
    data=title,
    columns=[
        'Chapter_Title',
        'ISBN',
        'Chapter_PII',
        'table_id',
        'Table_name',
        'Value',
        'bold_flag',
        'Table_Column',
        'Total_columns',
        'Row',
        'Column',
        'Group',
    ],
)

In [9]:
# Show the extracted records; for a long frame the displayed output elides the middle rows.
df

Unnamed: 0,Chapter_Title,ISBN,Chapter_PII,table_id,Table_name,Value,bold_flag,Table_Column,Total_columns,Row,Column,Group
0,Clinical decision-making,978-0-7020-7028-0,B978-0-7020-7028-0.00001-9,tit0015,Root causes of diagnostic error in studies,No fault,1,Error category,2,1,1,0
1,Clinical decision-making,978-0-7020-7028-0,B978-0-7020-7028-0.00001-9,tit0015,Root causes of diagnostic error in studies,\n,0,Examples,2,1,2,1
2,Clinical decision-making,978-0-7020-7028-0,B978-0-7020-7028-0.00001-9,tit0015,Root causes of diagnostic error in studies,Unusual presentation of a disease,0,Examples,2,1,2,0
3,Clinical decision-making,978-0-7020-7028-0,B978-0-7020-7028-0.00001-9,tit0015,Root causes of diagnostic error in studies,Missing information,0,Examples,2,1,2,1
4,Clinical decision-making,978-0-7020-7028-0,B978-0-7020-7028-0.00001-9,tit0015,Root causes of diagnostic error in studies,System error,1,Error category,2,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
16458,Laboratory reference ranges,978-0-7020-7028-0,B978-0-7020-7028-0.00035-4,tit0055,Analytes that may be significantly affected by...,0.38–4.04 mIU/L,0,Second trimester,4,8,4,0
16459,Laboratory reference ranges,978-0-7020-7028-0,B978-0-7020-7028-0.00035-4,tit0055,Analytes that may be significantly affected by...,"Thyroxine (free), (free T4)",1,Analyte,4,9,1,0
16460,Laboratory reference ranges,978-0-7020-7028-0,B978-0-7020-7028-0.00035-4,tit0055,Analytes that may be significantly affected by...,10–18 pmol/L0.77–1.40 ng/dL,0,Reference range,4,9,2,0
16461,Laboratory reference ranges,978-0-7020-7028-0,B978-0-7020-7028-0.00035-4,tit0055,Analytes that may be significantly affected by...,9–16 pmol/L0.70–1.24 ng/dL,0,First trimester,4,9,3,0


In [10]:
# Write all extracted records to an Excel workbook, without the index column.
# The former `encoding='utf-8'` argument is dropped: `DataFrame.to_excel` no
# longer accepts it in pandas 2.x (it raised a deprecation warning before its
# removal), and .xlsx output does not need it.
df.to_excel('All_Chapters_9780702070280_Table_Extraction_2nd_Dec.xlsx', header=True, index=False)

In [None]:
### END ###