In [1]:
import json
from copy import deepcopy
import re
import os
from datetime import datetime

In [2]:
# file_path = 'sample/normalized/bni-digics.json'

# with open(file_path, 'r') as file:
#     data = json.load(file)

In [3]:
def get_date_balance(normalized_data, target_page, balance_align, tolerance=0.01):
    """Collect the words of the left-most and right-most columns of one page.

    Args:
        normalized_data: Path to a JSON file containing a list of page dicts.
        target_page: ``page_id`` of the page to scan.
        balance_align: Index into ``word['geometry']`` whose x value locates
            the right-most column (the callers in this notebook pass 0 or 1).
        tolerance: Maximum x distance from the column edge for a word to be
            counted as part of that column.

    Returns:
        ``(dates, balances, page_data)`` where ``dates`` and ``balances`` are
        lists of ``(value, confidence, y)`` tuples and ``page_data`` is the
        page's ``words`` list. All three are empty when no page with
        ``target_page`` (and a ``words`` key) exists.
    """
    with open(normalized_data, 'r') as file:
        data = json.load(file)

    x_date = float('inf')
    x_balance = float('-inf')
    dates = []
    balances = []
    # Default value: without it an unknown target_page raised UnboundLocalError
    # at the return statement.
    page_data = []

    for page in data:
        if page['page_id'] != target_page or 'words' not in page:
            continue

        page_data = page['words']

        # Pass 1: locate the column edges (smallest left x, largest aligned x).
        for word in page_data:
            x_date = min(x_date, word['geometry'][0][0])
            x_balance = max(x_balance, word['geometry'][balance_align][0])

        # Pass 2: collect every word sitting on one of those edges.
        for word in page_data:
            y_coordinate = word['geometry'][0][1]

            if abs(word['geometry'][0][0] - x_date) <= tolerance:
                dates.append((word['value'], word['confidence'], y_coordinate))

            if abs(word['geometry'][balance_align][0] - x_balance) <= tolerance:
                balances.append((word['value'], word['confidence'], y_coordinate))

    # One surplus balance candidate is dropped: the one with the smallest y
    # (presumably a header / opening-balance line - TODO confirm).
    if len(balances) > len(dates):
        balances.remove(min(balances, key=lambda entry: entry[2]))

    return dates, balances, page_data

In [4]:
def group_rows(date, balance, page_data):
    """Group the words of a page into rows and drop the anchor words.

    Each ``(date, balance)`` pair defines one row. A word belongs to row ``i``
    when its y value (``geometry[0][1]``) lies between that row's anchors and
    the start of the next row (small fixed margins are applied).

    Args:
        date: List of ``(value, confidence, y)`` tuples, one per row.
        balance: List of ``(value, confidence, y)`` tuples, one per row.
        page_data: List of word dicts with ``value``, ``confidence`` and
            ``geometry`` keys.

    Returns:
        List of ``{'row_data': [...]}`` dicts. Words whose
        ``(value, confidence, y)`` equals a date or balance anchor are removed,
        and rows left empty by that removal are omitted.
    """
    # zip() pairs only as many anchors as both lists provide; indexing
    # balance[i] over range(len(date)) raised IndexError when there were
    # fewer balances than dates.
    bands = []
    for date_entry, balance_entry in zip(date, balance):
        date_y, balance_y = date_entry[2], balance_entry[2]
        bands.append((abs(date_y - balance_y), max(date_y, balance_y), min(date_y, balance_y)))

    rows = []
    for index, (spread, max_y, min_y) in enumerate(bands):
        if index + 1 < len(bands):
            next_y_coord = bands[index + 1][2]
        else:
            # Last row: there is no following anchor, so use a fixed row height.
            next_y_coord = min_y + 0.02

        row_data = [
            word for word in page_data
            if word['geometry'][0][1] < (next_y_coord - 0.001)
            and (word['geometry'][0][1] + spread + 0.001) >= max_y
        ]
        rows.append({'row_data': row_data})

    anchor_keys = set(date) | set(balance)

    filtered_date_balance_rows = []
    for row in rows:
        remaining = [
            item for item in row['row_data']
            if (item['value'], item['confidence'], item['geometry'][0][1]) not in anchor_keys
        ]
        if remaining:
            filtered_date_balance_rows.append({'row_data': remaining})

    return filtered_date_balance_rows


The code below is for BNI - Digics.

In [5]:
def get_debit_credit(filtered_date_balance_rows):
    """Split each row into debit/credit amounts and the remaining words.

    Per row, the word with the largest right x (``geometry[1][0]``) is taken
    as the amount, then the next such word as the type marker, and finally the
    word with the smallest left x (``geometry[0][0]``) is discarded as the
    document number. A marker of ``'D'`` books the amount as a debit, ``'K'``
    as a credit; any other marker yields ``(0.0, 0)`` for both.

    Args:
        filtered_date_balance_rows: List of ``{'row_data': [word, ...]}`` dicts.

    Returns:
        ``(debits, credits, rows)`` where ``debits`` and ``credits`` are lists
        of ``(value, confidence)`` tuples and ``rows`` holds the words left
        over after the three columns were removed (empty rows are dropped).

    Raises:
        ValueError: If the number of markers differs from the number of amounts.
    """
    def word_key(word):
        # Hashable identity of a word: its text plus its geometry.
        return (word['value'], tuple(map(tuple, word['geometry'])))

    def right_edge(word):
        return word['geometry'][1][0]

    def left_edge(word):
        return word['geometry'][0][0]

    def peel(rows, pick):
        # Pick one word per row, then strip every picked word from all rows.
        picked = [pick(row['row_data']) for row in rows]
        taken = {word_key(word) for word in picked}
        leftover = []
        for row in rows:
            kept = [word for word in row['row_data'] if word_key(word) not in taken]
            if kept:
                leftover.append({'row_data': kept})
        return picked, leftover

    def rightmost(words):
        return max(words, key=right_edge)

    def leftmost(words):
        return min(words, key=left_edge)

    amount_words, rows_without_amount = peel(filtered_date_balance_rows, rightmost)
    marker_words, rows_without_marker = peel(rows_without_amount, rightmost)

    if len(marker_words) != len(amount_words):
        raise ValueError("The lengths of type_data and mutation_data must be the same.")

    debits = []
    credits = []
    for marker, amount in zip(marker_words, amount_words):
        confidence = max(marker['confidence'], amount['confidence'])
        if marker['value'] == 'D':
            debits.append((amount['value'], confidence))
            credits.append((0.0, 1))
        elif marker['value'] == 'K':
            debits.append((0.0, 1))
            credits.append((amount['value'], confidence))
        else:
            debits.append((0.0, 0))
            credits.append((0.0, 0))

    _, filtered_no_doc = peel(rows_without_marker, leftmost)

    return debits, credits, filtered_no_doc

The code below is for BRI - Statement Transaction.

In [6]:
def get_datetime_userId(filtered_data, dates):
    """Combine dates with the time column and strip time / user-id words.

    Per row, the word with the smallest left x (``geometry[0][0]``) is taken
    as the time and the word with the largest left x as a user-id candidate.
    Only candidates within a fixed tolerance of the overall largest left x are
    treated as user ids.

    Args:
        filtered_data: List of ``{'row_data': [word, ...]}`` dicts.
        dates: List of ``(value, confidence, y)`` tuples, paired with the rows
            by position.

    Returns:
        ``(date_times, rows)`` where ``date_times`` is a list of
        ``("<date> <time>", min_confidence)`` tuples and ``rows`` holds the
        words that are neither time nor user id (empty rows are dropped).
        Both are empty when ``filtered_data`` is empty.
    """
    # Nothing to do for a page without rows; max() below would otherwise
    # raise ValueError on an empty sequence.
    if not filtered_data:
        return [], []

    tolerance = 0.02

    def left_edge(word):
        return word['geometry'][0][0]

    def word_key(word):
        return (word['value'], tuple(map(tuple, word['geometry'])))

    user_id_candidates = []
    time_words = []
    for row in filtered_data:
        user_id_candidates.append(max(row['row_data'], key=left_edge))
        time_words.append(min(row['row_data'], key=left_edge))

    times = [(word['value'], word['confidence']) for word in time_words]

    highest_user_id_x = max(left_edge(word) for word in user_id_candidates)
    user_ids = [
        word for word in user_id_candidates
        if abs(left_edge(word) - highest_user_id_x) <= tolerance
    ]

    date_times = [
        (f"{date} {time}", min(date_conf, time_conf))
        for (date, date_conf, _), (time, time_conf) in zip(dates, times)
    ]

    excluded = {word_key(word) for word in user_ids} | {word_key(word) for word in time_words}

    filtered_time_userId = []
    for entry in filtered_data:
        kept = [item for item in entry['row_data'] if word_key(item) not in excluded]
        if kept:
            filtered_time_userId.append({'row_data': kept})

    return date_times, filtered_time_userId

In [7]:
def get_balance_credit_debit(filtered_time_userId):
    """Peel the three right-most words of each row: balance, credit, debit.

    The word with the largest right x (``geometry[1][0]``) of every row is
    taken as the balance and removed; the same step is repeated on what is
    left for the credit and then for the debit. Rows that become empty are
    dropped before the next step.

    Args:
        filtered_time_userId: List of ``{'row_data': [word, ...]}`` dicts.

    Returns:
        ``(balances, credits, debits, rows)`` where the first three are lists
        of ``(value, confidence)`` tuples and ``rows`` holds the words left
        after all three columns were removed.
    """
    def word_key(word):
        return (word['value'], tuple(map(tuple, word['geometry'])))

    def peel_rightmost(rows):
        # Take the right-most word of every row, then strip those words.
        picked = [max(row['row_data'], key=lambda w: w['geometry'][1][0]) for row in rows]
        values = [(word['value'], word['confidence']) for word in picked]
        taken = {word_key(word) for word in picked}

        leftover = []
        for row in rows:
            kept = [word for word in row['row_data'] if word_key(word) not in taken]
            if kept:
                leftover.append({'row_data': kept})
        return values, leftover

    balances, filtered_balance = peel_rightmost(filtered_time_userId)
    credits, filtered_credit = peel_rightmost(filtered_balance)
    debits, filtered_debit = peel_rightmost(filtered_credit)

    return balances, credits, debits, filtered_debit

In [8]:
def get_datetime_credit(filtered_data, dates, tolerance=0.01):
    """Extract the time and credit columns and remove them from the rows.

    Words whose left x (``geometry[0][0]``) is within ``tolerance`` of the
    smallest left x seen so far are collected as times; the first word per row
    whose right x (``geometry[1][0]``) is within ``tolerance`` of the largest
    right x seen so far is collected as the credit. All matching words are
    removed from ``row['row_data']`` in place.

    NOTE(review): the column edges are running extremes updated row by row,
    not computed over the whole page first - confirm this is intended.

    Args:
        filtered_data: List of ``{'row_data': [word, ...]}`` dicts (mutated).
        dates: List of ``(value, confidence, y)`` tuples paired with the
            collected times by position.
        tolerance: Maximum x distance from a column edge.

    Returns:
        ``(date_times, credits, filtered_data)`` where ``date_times`` is a list
        of ``("<date> <time>", min_confidence)`` tuples, ``credits`` a list of
        ``(value, confidence)`` tuples and ``filtered_data`` the same list
        object that was passed in.
    """
    x_start = float('inf')
    x_end = float('-inf')
    times = []
    credits = []

    for row in filtered_data:
        credit_added = False
        row_data_to_remove = []

        for item in row['row_data']:
            x_start = min(x_start, item['geometry'][0][0])
            x_end = max(x_end, item['geometry'][1][0])

        for item in row['row_data']:
            is_time = abs(item['geometry'][0][0] - x_start) <= tolerance
            is_credit = abs(item['geometry'][1][0] - x_end) <= tolerance

            if is_time:
                times.append((item['value'], item['confidence']))

            if is_credit and not credit_added:
                credits.append((item['value'], item['confidence']))
                credit_added = True

            # Queue each word once. A word touching both edges used to be
            # queued twice, and the second list.remove() raised ValueError.
            if is_time or is_credit:
                row_data_to_remove.append(item)

        for data in row_data_to_remove:
            row['row_data'].remove(data)

    date_times = [
        (f"{date} {time}", min(date_conf, time_conf))
        for (date, date_conf, _), (time, time_conf) in zip(dates, times)
    ]

    return date_times, credits, filtered_data

In [9]:
def get_valuedate_debit(filtered_data, tolerance=0.01):
    """Extract the value-date and debit columns and remove them from the rows.

    Both columns are located by the right x of each word (``geometry[1][0]``):
    words within ``tolerance`` of the smallest right x seen so far are value
    dates, words within ``tolerance`` of the largest right x seen so far are
    debits. Matching words are removed from ``row['row_data']`` in place.

    NOTE(review): the column edges are running extremes updated row by row,
    not computed over the whole page first - confirm this is intended.

    Args:
        filtered_data: List of ``{'row_data': [word, ...]}`` dicts (mutated).
        tolerance: Maximum x distance from a column edge.

    Returns:
        ``(valuedates, debits, filtered_data)`` where the first two are lists
        of ``(value, confidence)`` tuples and ``filtered_data`` is the same
        list object that was passed in.
    """
    x_start = float('inf')
    x_end = float('-inf')
    valuedates = []
    debits = []

    for row in filtered_data:
        row_data_to_remove = []

        for item in row['row_data']:
            x_coordinate = item['geometry'][1][0]
            x_start = min(x_start, x_coordinate)
            x_end = max(x_end, x_coordinate)

        for item in row['row_data']:
            x_coordinate = item['geometry'][1][0]
            is_valuedate = abs(x_coordinate - x_start) <= tolerance
            is_debit = abs(x_coordinate - x_end) <= tolerance

            if is_valuedate:
                valuedates.append((item['value'], item['confidence']))

            if is_debit:
                debits.append((item['value'], item['confidence']))

            # Queue each word once. A word matching both edges (e.g. the only
            # word of the first row) used to be queued twice, and the second
            # list.remove() raised ValueError.
            if is_valuedate or is_debit:
                row_data_to_remove.append(item)

        for data in row_data_to_remove:
            row['row_data'].remove(data)

    return valuedates, debits, filtered_data

In [10]:
def get_description(filtered_data):
    """Join the remaining words of every row into one description.

    Args:
        filtered_data: List of ``{'row_data': [word, ...]}`` dicts.

    Returns:
        One ``(text, confidence)`` tuple per row, in row order. ``text`` is the
        space-joined word values and ``confidence`` the lowest word confidence.
        A row without words yields ``('', 0.0)`` so the result stays aligned
        with the rows by position.
    """
    descriptions = []
    for row in filtered_data:
        words = row['row_data']
        if not words:
            # min() raises ValueError on an empty sequence; emit a placeholder
            # rather than skipping so positions keep matching the other columns.
            descriptions.append(('', 0.0))
            continue
        descriptions.append((
            ' '.join(entry['value'] for entry in words),
            min(entry['confidence'] for entry in words),
        ))
    return descriptions

The code below is for Mandiri account statements.

In [11]:
def valid_date_format(date_string):
    """Return True when ``date_string`` is a valid ``day/month`` date.

    The string is parsed against the leap year 2000. Parsing with ``'%d/%m'``
    alone makes ``strptime`` default to the year 1900, which is not a leap
    year, so ``'29/02'`` was rejected.

    Args:
        date_string: Text to check, e.g. ``'05/11'``.

    Returns:
        True if the text parses as day/month, False otherwise.
    """
    try:
        datetime.strptime(date_string + '/2000', '%d/%m/%Y')
        return True
    except ValueError:
        return False
    
def number_format(input_str):
    """Return True when every character is a digit, a comma or a dot.

    An empty string contains no offending character and therefore yields True.
    """
    for symbol in input_str:
        if not (symbol.isdigit() or symbol in ",."):
            return False
    return True

def get_valdate_credit(filtered_data, tolerance=0.02):
    """Extract value dates and debit/credit amounts and strip them from rows.

    The left edge is the smallest left x (``geometry[0][0]``) and the right
    edge the largest right x (``geometry[1][0]``) over all rows. A word on the
    left edge that passes ``valid_date_format`` is a value date. A word on the
    right edge that passes ``number_format`` or is at most one character long
    is an amount token. Matching words are removed from ``row['row_data']`` in
    place.

    A row with more than one amount token, or with a token equal to ``"D"``,
    is booked as a debit using its first token; otherwise its first token is
    booked as a credit. The other side gets ``(0, 1)``.

    Args:
        filtered_data: List of ``{'row_data': [word, ...]}`` dicts (mutated).
        tolerance: Maximum x distance from a column edge.

    Returns:
        ``(valdate, debits, credits, filtered_data)``; the first three are
        lists of ``(value, confidence)`` tuples and ``filtered_data`` is the
        same list object that was passed in.
    """
    x_start = float('inf')
    x_end = float('-inf')
    valdate = []
    debit_credit = []
    credits = []
    debits = []

    # Pass 1: column edges over the whole page.
    for row in filtered_data:
        for item in row['row_data']:
            x_start = min(x_start, item['geometry'][0][0])
            x_end = max(x_end, item['geometry'][1][0])

    # Pass 2: collect and strip the value-date and amount words per row.
    for row in filtered_data:
        row_credits = []
        row_data_to_remove = []

        for item in row['row_data']:
            left_x_coordinate = item['geometry'][0][0]
            right_x_coordinate = item['geometry'][1][0]

            if abs(left_x_coordinate - x_start) <= tolerance:
                if valid_date_format(item['value']):
                    valdate.append((item['value'], item['confidence']))
                    row_data_to_remove.append(item)

            if abs(right_x_coordinate - x_end) <= tolerance:
                if number_format(item['value']) or len(item['value']) <= 1:
                    row_credits.append((item['value'], item['confidence']))
                    row_data_to_remove.append(item)

        for data in row_data_to_remove:
            row['row_data'].remove(data)

        debit_credit.append(row_credits)

    # The single-empty-row case keeps returning empty debit/credit lists.
    if debit_credit != [[]]:
        for row in debit_credit:
            if not row:
                # No amount found in this row. row[0] used to raise IndexError;
                # emit placeholders so both lists stay aligned with the rows.
                debits.append((0, 1))
                credits.append((0, 1))
            elif len(row) > 1 or any(item[0] == "D" for item in row):
                debits.append(row[0])
                credits.append((0, 1))
            else:
                credits.append(row[0])
                debits.append((0, 1))

    return valdate, debits, credits, filtered_data

Build transactions for all statements

In [12]:
def clean_float_string(value):
    """Convert a formatted amount string to its integer part.

    Works for both ``1,000.00`` and ``1.000,00`` styles: a trailing group of
    one or two digits after the last separator is treated as the fractional
    part and dropped (thousands groups always have three digits). All other
    separators are removed.

    Args:
        value: Amount text such as ``'1,234.50'``.

    Returns:
        The integer part of the amount.

    Raises:
        ValueError: If the remaining text is not an integer literal.
    """
    value = value.strip()

    # Previously only an exactly-two-digit fraction was recognised, so
    # "1,234.5" became 12345.
    separator_index = max(value.rfind('.'), value.rfind(','))
    if separator_index != -1 and 1 <= len(value) - separator_index - 1 <= 2:
        value = value[:separator_index]

    value = value.replace(',', '').replace('.', '')

    return int(value)


def make_transaction(date_times, valuedates, descriptions, debits, credits, balances):
    """Assemble one transaction dict per extracted date/time entry.

    Every argument is a dict keyed by page id whose values are lists aligned
    by row position. ``date_times`` drives the iteration; a missing page or
    row in any other dict falls back to an empty / zero default. Each
    transaction is a deep copy of the template loaded from
    ``database/template.json``.

    Args:
        date_times: ``{page_id: [(text, confidence), ...]}``.
        valuedates: ``{page_id: [(text, confidence), ...]}``.
        descriptions: ``{page_id: [(text, confidence), ...]}``.
        debits: ``{page_id: [(value, confidence), ...]}``.
        credits: ``{page_id: [(value, confidence), ...]}``.
        balances: ``{page_id: [entry, ...]}``; only the first two fields of
            each entry are used.

    Returns:
        List of transaction dicts with consecutive ``id`` values from 0.
    """
    def entry_or(default, table, page_key, position):
        # Row `position` of page `page_key`, or `default` when it is absent.
        if page_key in table and position < len(table[page_key]):
            return table[page_key][position]
        return default

    with open("database/template.json") as template_file:
        template = json.load(template_file)

    transactions = []

    for page_id, dt_list in date_times.items():
        for i, (dt, dt_conf) in enumerate(dt_list):
            try:
                credit_val, credit_conf = entry_or((0.0, 0.0), credits, page_id, i)
                balance_val, balance_conf = entry_or((0.0, 0.0), balances, page_id, i)[:2]
                valuedate_val, valuedate_conf = entry_or(("", 0.0), valuedates, page_id, i)
                debit_val, debit_conf = entry_or((0.0, 0.0), debits, page_id, i)
                description_val, description_conf = entry_or(("", 0.0), descriptions, page_id, i)
            except IndexError:
                continue

            transaction = deepcopy(template)
            # Ids are consecutive, so the next id equals the count so far.
            transaction["id"] = len(transactions)

            transaction["datetime"]["value"] = dt
            transaction["datetime"]["confidence"] = dt_conf

            transaction["valuedate"]["value"] = valuedate_val
            transaction["valuedate"]["confidence"] = valuedate_conf or 1

            transaction["description"]["value"] = description_val
            transaction["description"]["confidence"] = description_conf

            # Falsy amounts (0, 0.0, '') are stored as 0.0 without parsing.
            transaction["debit"]["value"] = clean_float_string(debit_val) if debit_val else 0.0
            transaction["debit"]["confidence"] = debit_conf or 1

            transaction["credit"]["value"] = clean_float_string(credit_val) if credit_val else 0.0
            transaction["credit"]["confidence"] = credit_conf or 1

            transaction["balance"]["value"] = clean_float_string(balance_val) if balance_val else 0.0
            transaction["balance"]["confidence"] = balance_conf or 0.0

            transactions.append(transaction)

    return transactions


In [13]:
# Driver cell: read the normalized OCR output, run the layout-specific
# extraction for every page and write the resulting transactions to JSON.
file_path = 'sample/normalized/bni-digics.json'

with open(file_path, 'r') as file:
    data = json.load(file)

# Per-page results, keyed by page_id.
date_times, valuedates, descriptions, debits, credits, balances, seqs = {}, {}, {}, {}, {}, {}, {}
dates = {}
pages_data, filtered_date_balance_rows, filtered_datetime_credit_rows, filtered_valuedate_debit_row, filtered_valdate_credit_rows = {}, {}, {}, {}, {}
filtered_date_seqs, filtered_times_userIds, filtered_debits, filtered_no_docs = {}, {}, {}, {}

for page in data:
    page_id = page.get("page_id")
    statement_id = page.get("statement_id")

    if statement_id == 1:
        date, balance, page_data = get_date_balance(file_path, page_id, 1)
        dates[page_id], balances[page_id], pages_data[page_id] = date, balance, page_data

        filtered_date_balance_row = group_rows(dates[page_id], balances[page_id], pages_data[page_id])
        filtered_date_balance_rows[page_id] = filtered_date_balance_row

        # This layout has no time column: keep (value, confidence) of each date.
        # Only this page's entry is written. Resetting the whole dict here
        # discarded the date+time entries already built for other pages.
        date_times[page_id] = [(item[0], item[1]) for item in dates[page_id]]

        debit, credit, filtered_doc = get_debit_credit(filtered_date_balance_rows[page_id])
        debits[page_id], credits[page_id], filtered_no_docs[page_id] = debit, credit, filtered_doc

        descriptions[page_id] = get_description(filtered_no_docs[page_id])

    elif statement_id == 3:
        date, seq, page_data = get_date_balance(file_path, page_id, 0)
        dates[page_id], seqs[page_id], pages_data[page_id] = date, seq, page_data

        filtered_date_seq = group_rows(dates[page_id], seqs[page_id], pages_data[page_id])
        filtered_date_seqs[page_id] = filtered_date_seq

        date_time, filtered_time_userId = get_datetime_userId(filtered_date_seqs[page_id], dates[page_id])
        date_times[page_id], filtered_times_userIds[page_id] = date_time, filtered_time_userId

        balance, credit, debit, filtered_debit = get_balance_credit_debit(filtered_times_userIds[page_id])
        balances[page_id], credits[page_id], debits[page_id], filtered_debits[page_id] = balance, credit, debit, filtered_debit

        descriptions[page_id] = get_description(filtered_debits[page_id])

    elif statement_id == 5:
        # Get date, balance, and filtered group rows
        date, balance, page_data = get_date_balance(file_path, page_id, 1)
        dates[page_id], balances[page_id], pages_data[page_id] = date, balance, page_data

        # Date-only layout; see the note in the statement_id == 1 branch.
        date_times[page_id] = [(item[0], item[1]) for item in dates[page_id]]

        filtered_date_balance_row = group_rows(dates[page_id], balances[page_id], pages_data[page_id])
        filtered_date_balance_rows[page_id] = filtered_date_balance_row

        # Extract valuedates, debits, credits, and descriptions
        valdate, debit, credit, filtered_valdate_credit_row = get_valdate_credit(filtered_date_balance_rows[page_id])
        valuedates[page_id], debits[page_id], credits[page_id], filtered_valdate_credit_rows[page_id] = valdate, debit, credit, filtered_valdate_credit_row

        descriptions[page_id] = get_description(filtered_valdate_credit_rows[page_id])

    elif statement_id == 6 and len(page.get("words", [])) >= 6:
        # Get date, balance, and filtered group rows
        date, balance, page_data = get_date_balance(file_path, page_id, 0)
        dates[page_id], balances[page_id], pages_data[page_id] = date, balance, page_data

        filtered_date_balance_row = group_rows(dates[page_id], balances[page_id], pages_data[page_id])
        filtered_date_balance_rows[page_id] = filtered_date_balance_row

        # Named date_time, not datetime: rebinding `datetime` shadowed the
        # imported class and broke valid_date_format() for every later page
        # and for any re-run of this cell.
        date_time, credit, filtered_datetime_credit_row = get_datetime_credit(filtered_date_balance_rows[page_id], date)
        date_times[page_id], credits[page_id], filtered_datetime_credit_rows[page_id] = date_time, credit, filtered_datetime_credit_row

        # Extract valuedates, debits, and descriptions
        valuedate, debit, filtered_row = get_valuedate_debit(filtered_datetime_credit_rows[page_id])
        valuedates[page_id], debits[page_id], filtered_valuedate_debit_row[page_id] = valuedate, debit, filtered_row

        descriptions[page_id] = get_description(filtered_valuedate_debit_row[page_id])

# Save the transactions in JSON format
transactions = make_transaction(date_times, valuedates, descriptions, debits, credits, balances)

if transactions != []:
    filename = os.path.basename(file_path)
    foldername = "sample/extracted"
    # Make sure the target directory exists so the write cannot fail on a
    # fresh checkout.
    os.makedirs(foldername, exist_ok=True)
    ouput = os.path.join(foldername, filename)
    result = {"transactions": transactions}

    with open(ouput, "w") as file:
        json.dump(result, file, indent=4)
    print(f'Data has been written to {filename}')
else:
    print("Input is invalid")

Data has been written to bni-digics.json
