In [1]:
import pandas as pd
import xml.etree.ElementTree as ET

from retrieve_columns_loghi import retrieve_columns

In [2]:
# Parse the XML file
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
# Presumably a PAGE-XML ground-truth page (the path contains "GT_data") — confirm.
tree = ET.parse('/home/roderickmajoor/Desktop/Master/Thesis/GT_data/55/page/WBMA00007000010.xml')
# Root element of the parsed document; it is what gets passed to extract_table_regions.
root = tree.getroot()

In [3]:
# Define a function to extract information from table cells
def extract_table_cell(cell):
    """Collect the attributes, outline and text of one PAGE-XML ``TableCell``.

    Parameters
    ----------
    cell : xml.etree.ElementTree.Element
        A ``TableCell`` element in the PAGE 2013-07-15 namespace.

    Returns
    -------
    dict
        Keys, in this order:

        * ``id``      - the ``id`` attribute, or ``None`` when missing.
        * ``row`` / ``col`` - integer grid position, 0 when the attribute is missing.
        * ``rowSpan`` / ``colSpan`` - integer extent, 1 when the attribute is missing.
        * ``coords``  - the ``points`` attribute of the cell's direct ``Coords``
          child, or ``None`` when the cell has no ``Coords`` child.
        * ``text``    - text of the first ``TextEquiv/Unicode`` found anywhere
          below the cell, ``''`` when there is none or it is empty.

    Raises
    ------
    ValueError
        If a present ``row``/``col``/``rowSpan``/``colSpan`` attribute cannot be
        converted with ``int``.
    """
    ns = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}'

    coords = cell.find(f'{ns}Coords')
    text_equiv = cell.findtext(f'.//{ns}TextEquiv/{ns}Unicode')

    return {
        'id': cell.get('id'),
        'row': int(cell.get('row', 0)),
        'col': int(cell.get('col', 0)),
        'rowSpan': int(cell.get('rowSpan', 1)),
        'colSpan': int(cell.get('colSpan', 1)),
        # Explicit "is not None": an Element without children is falsy, so a
        # plain truth test would wrongly treat a present <Coords/> as missing.
        'coords': coords.get('points') if coords is not None else None,
        # findtext gives None when nothing matched; normalise that to ''.
        'text': text_equiv if text_equiv else '',
    }

# Define a function to extract information from table regions and create DataFrames for each region
def extract_table_regions(root):
    """Turn every PAGE-XML ``TableRegion`` below ``root`` into a grid of cell texts.

    Each region's ``TableCell`` descendants are read with ``extract_table_cell``
    and written into a rectangular grid sized to the largest ``row + rowSpan``
    and ``col + colSpan``. A cell spanning several rows/columns has its text
    repeated in every slot it covers; slots no cell covers stay ``''``. When
    cells overlap, the one that comes later in document order wins.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element to search (any depth) for ``TableRegion`` elements in the PAGE
        2013-07-15 namespace.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per region, in document order, with default integer
        row/column labels. A region without any cells yields an empty
        DataFrame so list positions keep matching region order.
    """
    ns = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}'

    dfs = []
    for region in root.findall(f'.//{ns}TableRegion'):
        cell_data = [extract_table_cell(cell) for cell in region.findall(f'.//{ns}TableCell')]

        # default=0 keeps a cell-less region from crashing max() with
        # "ValueError: max() arg is an empty sequence"; it becomes a 0x0 grid.
        n_rows = max((c['row'] + c['rowSpan'] for c in cell_data), default=0)
        n_cols = max((c['col'] + c['colSpan'] for c in cell_data), default=0)

        # Build independent row lists ([[...]] * n would alias a single row).
        table = [[''] * n_cols for _ in range(n_rows)]
        for c in cell_data:
            for i in range(c['row'], c['row'] + c['rowSpan']):
                for j in range(c['col'], c['col'] + c['colSpan']):
                    table[i][j] = c['text']

        dfs.append(pd.DataFrame(table))
    return dfs

In [4]:
# Build one DataFrame per TableRegion of the parsed page
dataframes = extract_table_regions(root)

# Plain-text dump: 1-based heading, the table itself, then two blank lines
for i in range(len(dataframes)):
    df = dataframes[i]
    print(f"Table Region {i+1}:", df, "\n", sep="\n")

Table Region 1:
         0 1     2      3   4  5
0                               
1    feb 5      35   2448       
2        6     346   3366       
3       14      67   1000       
4       16     124   1800       
5    4 mrt     120   1800       
6              905   1134       
7       15     540    668   4  8
8       16      69   1225       
9       26     155    700       
10      29     344    366       
11      30     308    861       
12    Ap 6     343    600       
13             593   1799  15   
14       9     709    652  10   
15      10     952   1029  17   
16      16     948    895  10   
17      18     954   1407   5   
18      19     330   1075       
19             949   1075       
20      20     131    300  14  8
21             982   1044  15  8
22      29     173    666  13   
23   may 2     658   1065       
24       4     515   2451       
25             420   1802   5   
26       9     426   1246  17  8
27      17     690    597       
28      27     553    150  

In [5]:
# Show the first table region's DataFrame as the cell's output (bare last expression).
dataframes[0]

Unnamed: 0,0,1,2,3,4,5
0,,,,,,
1,feb 5,,35.0,2448.0,,
2,6,,346.0,3366.0,,
3,14,,67.0,1000.0,,
4,16,,124.0,1800.0,,
5,4 mrt,,120.0,1800.0,,
6,,,905.0,1134.0,,
7,15,,540.0,668.0,4.0,8.0
8,16,,69.0,1225.0,,
9,26,,155.0,700.0,,


In [6]:
# Show the second table region's DataFrame; raises IndexError if the page has fewer than two regions.
dataframes[1]

Unnamed: 0,0,1,2,3,4,5
0,,,,4381,16.0,
1,,,92.0,1227,,
2,,,52.0,759,7.0,8.0
3,8,,79.0,1216,10.0,
4,9,,32.0,1220,19.0,
5,12,,394.0,600,,
6,14,,32.0,2060,8.0,
7,20,,67.0,1000,,
8,8 maart,,320.0,913,10.0,
9,13,,394.0,300,,


In [7]:
# Inputs for retrieve_columns (imported from retrieve_columns_loghi): two XML files for the
# same page id, one under GT_data (suffix "_columns_found") and one under loghi/data.
# NOTE(review): both are absolute, machine-specific paths.
xml_columns = '/home/roderickmajoor/Desktop/Master/Thesis/GT_data/55/page/WBMA00007000010_columns_found.xml'
xml_loghi = '/home/roderickmajoor/Desktop/Master/Thesis/loghi/data/55/page/WBMA00007000010.xml'

# Presumably places the Loghi-recognised text into the columns described by xml_columns — TODO
# confirm against retrieve_columns_loghi. Note this rebinds `df`, which the printing loop above
# also used as its loop variable.
df = retrieve_columns(xml_columns, xml_loghi)
# Display at most the first 50 rows of the result.
df.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,130,Jan,35.0,2448,3366.0,8.0,th,Moet,bal,122019,10.0,8.0
1,feb,Eijghels,346.0,1000,861.0,4.0,14,hebb,32,394,7.0,28.0
2,6,Deben,67.0,1800,15.0,148.0,20,395,321,600,10.0,18.0
3,14,5,124.0,1800,6520.0,13.0,meert,Feb,67,20608,18.0,8.0
4,meert,aen,1120.0,1134,10291.0,8.0,13,10,P20,1000,9.0,
5,15,Paqe,905.0,668,7010.0,68.0,18,van,394,9,9.0,
6,16,Martijn,55.0,540,5.0,8.0,20,Jan,266,13,4.0,
7,26,aen,844.0,1225,10441158.0,128.0,21,Elise,21,300,10.0,
8,29,Franc,308.0,69,2451.0,38.0,26,ginderdeurg,173,300,15.0,
9,30,dvicq,342.0,100,5.0,58.0,27,8117,47,1049,14.0,
