In [1]:
import json
import re

# Location of the JSON export for the scanned account statement; the cells
# below read word values, confidences and geometries from it.
file_path = 'sample/scanned/mandiri-account_statement.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the file.
with open(file_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Known table-header layouts: each entry is the set of column-title words of
# one layout. find_table_header() normalizes these words with normalize_text()
# and returns the first layout for which at least 80% of its words are found
# on the first page, so the order of the entries matters.
expected_headers = [
    {"Posting", "Date", "Effective", "Branch", "Journal", "Transaction", "Description", "Amount", "DB/CR", "Balance"},
    {"Tgl", "Transaksi", "No.", "Dokumen", "Uraian", "Tipe", "Mutasi", "Saldo"},
    {"TIME", "REMARK", "DEBET", "CREDIT", "TELLER", "ID"},
    {"Tanggal", "Keterangan", "Debit", "Kredit", "Saldo", "UserID", "SEQ"},
    {"Posting", "Date", "Remark", "Reference", "No", "Debit", "Credit", "Balance"},
    {"Date", "Val.Date", "Description", "Reference", "No.", "Debet", "Credit", "Balance"},
    {"Date", "Time", "Value", "Description", "Reference", "No.", "Debit", "Credit", "Saldo"},
    {"Tran", "Date", "Value", "Description", "Debit", "Credit", "Balance"}
]

def normalize_text(text):
    """Return ``text`` lower-cased with every character outside a-z / 0-9 removed.

    Used to compare words regardless of case, punctuation and whitespace,
    e.g. ``"Val.Date"`` becomes ``"valdate"``.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^a-z0-9]', '', lowered)
    return stripped

def find_table_header(json_data):
    """Find which known header layout appears on the first page.

    Every word on ``json_data["pages"][0]`` is normalized with
    ``normalize_text`` and indexed by that normalized form. The layouts in the
    module-level ``expected_headers`` are then tried in order; the first one
    for which at least 80% of its normalized words occur on the page wins.

    Args:
        json_data: Parsed document with the nesting
            pages -> blocks -> lines -> words, where each word carries
            ``"value"`` and ``"geometry"`` entries.

    Returns:
        dict mapping each matched normalized header word to the list of
        geometries of all its occurrences on the page, or ``None`` when no
        layout reaches the threshold.
    """
    geometries_by_token = {}

    # Index every occurrence of every word on the first page.
    for block in json_data["pages"][0]["blocks"]:
        for line in block.get("lines", []):
            for word in line.get("words", []):
                token = normalize_text(word["value"])
                geometries_by_token.setdefault(token, []).append(word["geometry"])

    for header in expected_headers:
        wanted = set(normalize_text(title) for title in header)
        hits = wanted.intersection(geometries_by_token.keys())
        if len(hits) < len(wanted) * 0.8:
            # Too few of this layout's words are present; try the next one.
            continue
        return {
            token: geometries_by_token[token]
            for token in wanted
            if token in geometries_by_token
        }

    return None

# Locate the header words on the first page. The result is None when none of
# the layouts in expected_headers reaches the match threshold.
matched_geometries = find_table_header(json_data)

In [2]:
def find_header_row(matched_geometries, y_tolerance=0.01):
    """Pick the row that contains the most matched header words.

    All word occurrences are grouped into rows by the y value of their first
    geometry point (``geometry[0][1]``; presumably the top-left corner -
    confirm against the producer of the JSON). An occurrence joins an
    existing row when its y differs from that row's anchor y by less than
    ``y_tolerance``; otherwise it starts a new row anchored at its own y.

    Occurrences are processed in ascending (y, x, word) order, so the grouping
    does not depend on the iteration order of ``matched_geometries``. When
    several rows hold the same number of words, the one with the smallest
    anchor y is chosen.

    Args:
        matched_geometries: dict mapping a word to a list of geometries,
            each geometry being indexable as ``geometry[0][0]`` (x) and
            ``geometry[0][1]`` (y).
        y_tolerance: maximum y distance (exclusive) to a row's anchor for an
            occurrence to be considered part of that row.

    Returns:
        dict with
            'highest_geometry': largest y among the chosen row's words,
            'lowest_geometry': smallest y among the chosen row's words,
            'words': the row's words ordered by x (for debugging),
            'geometries': the matching y values in the same order (for debugging).

    Raises:
        ValueError: if ``matched_geometries`` is None, empty, or contains no
            geometries at all.
    """
    if not matched_geometries:
        raise ValueError(
            "matched_geometries is empty or None; no header words to group into rows"
        )

    # Flatten and sort so that row anchors are created top-down and the
    # clustering result is the same for any dict ordering of the input.
    occurrences = sorted(
        (
            {'word': word, 'geometry': geometry}
            for word, geometries in matched_geometries.items()
            for geometry in geometries
        ),
        key=lambda item: (item['geometry'][0][1], item['geometry'][0][0], item['word']),
    )
    if not occurrences:
        raise ValueError("matched_geometries contains no geometries to group into rows")

    rows = {}
    for item in occurrences:
        y_coord = item['geometry'][0][1]
        # Reuse the anchor of a row that is close enough, else open a new row.
        row_key = next(
            (existing_y for existing_y in rows if abs(existing_y - y_coord) < y_tolerance),
            y_coord,
        )
        rows.setdefault(row_key, []).append(item)

    # max() keeps the first maximum; rows were inserted in ascending y, so a
    # tie resolves to the row with the smallest anchor y.
    _, densest_row = max(rows.items(), key=lambda entry: len(entry[1]))
    left_to_right = sorted(densest_row, key=lambda item: item['geometry'][0][0])
    y_values = [item['geometry'][0][1] for item in left_to_right]

    return {
        'highest_geometry': max(y_values),
        'lowest_geometry': min(y_values),
        'words': [item['word'] for item in left_to_right],  # show words for debugging
        'geometries': y_values,  # show geometries for debugging
    }


# Reduce the matched header words to the single row holding the most of them,
# then display the result (y bounds plus the words/y values for debugging).
header_row = find_header_row(matched_geometries)
header_row

{'highest_geometry': 0.3173828125,
 'lowest_geometry': 0.314453125,
 'words': ['date',
  'description',
  'reference',
  'no',
  'debet',
  'credit',
  'balance'],
 'geometries': [0.314453125,
  0.3154296875,
  0.3154296875,
  0.31640625,
  0.31640625,
  0.31640625,
  0.3173828125]}

In [3]:
def split_json_data(json_data, lowest_geometry, highest_geometry):
    """Partition the first page's words into those above and below the header.

    Each word is classified by the y value of its first geometry point
    (``word['geometry'][0][1]``): strictly smaller than ``lowest_geometry``
    means above the header, strictly greater than ``highest_geometry`` means
    below it. Words satisfying neither comparison are dropped.

    Args:
        json_data: Parsed document with the nesting
            pages -> blocks -> lines -> words; only ``pages[0]`` is read.
        lowest_geometry: y bound for the "above" partition (exclusive).
        highest_geometry: y bound for the "below" partition (exclusive).

    Returns:
        Tuple ``(above_header, below_header)`` of lists in document order.
        Entries above the header hold 'value', 'y_coord' and 'x_coord';
        entries below additionally hold 'confidence' and 'geometry'.
    """
    page_words = (
        word
        for block in json_data["pages"][0]["blocks"]
        for line in block.get("lines", [])
        for word in line.get("words", [])
    )

    above_header, below_header = [], []
    for word in page_words:
        first_point = word['geometry'][0]
        y, x = first_point[1], first_point[0]

        if y < lowest_geometry:
            above_header.append({'value': word['value'], 'y_coord': y, 'x_coord': x})
        elif y > highest_geometry:
            # Only entries below the header keep confidence and full geometry.
            below_header.append({
                'value': word['value'],
                'confidence': word['confidence'],
                'geometry': word['geometry'],
                'y_coord': y,
                'x_coord': x,
            })

    return above_header, below_header

# Partition the first page's words around the y band of the detected header row.
above_header, below_header = split_json_data(
    json_data,
    header_row['lowest_geometry'],
    header_row['highest_geometry'],
)
# Show only a short preview; displaying the full list prints every word below
# the header and floods the notebook output.
below_header[:5]

[{'value': 'Saldo',
  'confidence': 0.984931230545044,
  'geometry': [[0.12272518382352943, 0.3388671875],
   [0.1614200367647059, 0.3525390625]],
  'y_coord': 0.3388671875,
  'x_coord': 0.12272518382352943},
 {'value': 'Permndahan',
  'confidence': 0.6011824607849121,
  'geometry': [[0.1600380777310924, 0.33984375],
   [0.2291360294117647, 0.3525390625]],
  'y_coord': 0.33984375,
  'x_coord': 0.1600380777310924},
 {'value': '274,849,767.00',
  'confidence': 0.9974940419197083,
  'geometry': [[0.7114397321428572, 0.3427734375],
   [0.8026490283613446, 0.3564453125]],
  'y_coord': 0.3427734375,
  'x_coord': 0.7114397321428572},
 {'value': '01/03',
  'confidence': 0.9992011189460754,
  'geometry': [[0.03842568277310926, 0.3505859375],
   [0.07573857668067224, 0.3642578125]],
  'y_coord': 0.3505859375,
  'x_coord': 0.03842568277310926},
 {'value': '28/02',
  'confidence': 0.9541639089584351,
  'geometry': [[0.08126641281512603, 0.3505859375],
   [0.11857930672268907, 0.365234375]],
  'y_c