In [1]:
import json

def get_date_balance(normalized_data, target_page, tolerance=0.01):
    """Pick out the words on the left-most and right-most x positions of a page.

    The JSON file at ``normalized_data`` is loaded and every entry whose
    ``page_id`` equals ``target_page`` and that carries a ``words`` key is
    scanned. The smallest and largest ``geometry[0][0]`` values found define
    the two reference positions; words whose ``geometry[0][0]`` lies within
    ``tolerance`` of the smallest are collected as ``date`` and those within
    ``tolerance`` of the largest as ``balance``.

    Args:
        normalized_data: Path of the JSON file to read.
        target_page: Value compared against each entry's ``page_id``.
        tolerance: Maximum absolute x difference for a word to count as
            sitting on a reference position.

    Returns:
        A ``(date, balance, page_data)`` tuple. ``date`` and ``balance`` are
        lists of ``(value, y, x)`` tuples; ``page_data`` is the ``words`` list
        of the last matching entry, or an empty list when nothing matched.
    """
    with open(normalized_data, 'r') as handle:
        pages = json.load(handle)

    leftmost = float('inf')
    rightmost = float('-inf')
    date, balance, page_data = [], [], []

    for page in pages:
        if page['page_id'] != target_page or 'words' not in page:
            continue

        words = page['words']
        page_data = words

        # First pass: widen the running x-range with this page's words.
        for entry in words:
            x_pos = entry['geometry'][0][0]
            leftmost = min(leftmost, x_pos)
            rightmost = max(rightmost, x_pos)

        # Second pass: keep the words that sit on either end of that range.
        for entry in words:
            corner = entry['geometry'][0]
            x_pos = corner[0]
            y_pos = corner[1]

            if abs(x_pos - leftmost) <= tolerance:
                date.append((entry['value'], y_pos, x_pos))

            if abs(x_pos - rightmost) <= tolerance:
                balance.append((entry['value'], y_pos, x_pos))

    return date, balance, page_data


In [2]:
def calculate_distance(date, balance):
    """Pair date and balance entries by position and measure their y offset.

    Entries are matched index by index; any surplus entries in the longer
    list are ignored.

    Args:
        date: Sequence of tuples whose index 0 is a value and index 1 a y
            coordinate.
        balance: Sequence of tuples whose index 1 is a y coordinate.

    Returns:
        A list of ``(date_value, date_y, abs(date_y - balance_y))`` tuples,
        one per matched pair.
    """
    return [
        (left[0], left[1], abs(left[1] - right[1]))
        for left, right in zip(date, balance)
    ]


In [3]:
def group_rows(distances, data, last_row_height=0.1, y_margin=0.001):
    """Bucket words into rows delimited by consecutive anchor y coordinates.

    Each entry of ``distances`` opens one row. A word belongs to row ``i``
    when both conditions hold:

    * its y coordinate is strictly below the next row's anchor y (for the
      last row, the anchor y plus ``last_row_height``), and
    * its y coordinate plus the row's skew distance, minus ``y_margin``, is
      strictly greater than the row's anchor y.

    With the default margin and a skew of exactly 0, a word whose y equals
    the anchor y therefore does not qualify; pass a smaller or negative
    ``y_margin`` to loosen the lower bound.

    Args:
        distances: Sequence of tuples ``(value, anchor_y, skew)`` as produced
            by ``calculate_distance``; only indices 1 and 2 are read.
        data: Iterable of word dicts exposing ``word['geometry'][0][1]`` as
            the y coordinate.
        last_row_height: Offset added to the last anchor's y to close the
            final row, which has no following anchor. Defaults to the
            previously hard-coded 0.1.
        y_margin: Amount subtracted on the lower-bound comparison. Defaults
            to the previously hard-coded 0.001.

    Returns:
        A list with one ``{'row_data': [...]}`` dict per entry of
        ``distances`` (possibly with an empty list), in the same order.
    """
    rows = []
    row_count = len(distances)

    for i in range(row_count):
        row_y = distances[i][1]
        skew = distances[i][2]

        # The row ends where the next anchor starts; the last row has no
        # successor, so it is closed by a configurable fallback height.
        if i + 1 < row_count:
            next_y_coord = distances[i + 1][1]
        else:
            next_y_coord = row_y + last_row_height

        row_data = []
        for word in data:
            item_y_coord = word['geometry'][0][1]

            if item_y_coord < next_y_coord and (item_y_coord + skew - y_margin) > row_y:
                row_data.append(word)

        rows.append({'row_data': row_data})

    return rows


In [4]:
def date_balance_filter(rows, date, balance):
    """Strip the date and balance words out of every row.

    A word is removed when its ``(value, y, x)`` triple, read from
    ``item['value']``, ``item['geometry'][0][1]`` and
    ``item['geometry'][0][0]``, appears in ``date`` or in ``balance``.
    Rows left with no words are dropped from the result.

    Args:
        rows: List of ``{'row_data': [...]}`` dicts.
        date: Iterable of hashable ``(value, y, x)`` tuples to exclude.
        balance: Iterable of hashable ``(value, y, x)`` tuples to exclude.

    Returns:
        A new list of ``{'row_data': [...]}`` dicts holding only the
        surviving words, in their original order.
    """
    excluded = set(date) | set(balance)

    kept_rows = []
    for row in rows:
        survivors = [
            item
            for item in row['row_data']
            if (item['value'], item['geometry'][0][1], item['geometry'][0][0]) not in excluded
        ]
        if survivors:
            kept_rows.append({'row_data': survivors})

    return kept_rows

In [25]:
def get_time_credit(filtered_date_balance_rows, tolerance=0.01):
    """Extract the left-edge ("time") and right-edge ("credit") word of each row.

    The left edge is the smallest ``geometry[0][0]`` and the right edge the
    largest ``geometry[1][0]`` over *all* rows. Both edges are computed up
    front so that every row is classified against the same reference and the
    result does not depend on the order of the rows. (Previously the edges
    were widened row by row while classifying, so the first row's left-most
    and right-most words were always reported, whatever their position.)

    Args:
        filtered_date_balance_rows: List of ``{'row_data': [...]}`` dicts
            whose items expose ``value``, ``confidence`` and ``geometry``
            (``geometry[0][0]`` = left x, ``geometry[1][0]`` = right x).
        tolerance: Maximum absolute x difference for a word to count as
            touching an edge.

    Returns:
        A ``(time, credits)`` tuple. ``time`` lists
        ``(row_index, value, confidence, left_x)`` for every word touching
        the left edge; ``credits`` lists
        ``(row_index, value, confidence, right_x)`` for at most one word per
        row (the first one encountered) touching the right edge.
    """
    x_start = float('inf')
    x_end = float('-inf')

    # Pass 1: establish the table-wide left and right edges.
    for row in filtered_date_balance_rows:
        for item in row['row_data']:
            x_start = min(x_start, item['geometry'][0][0])
            x_end = max(x_end, item['geometry'][1][0])

    time = []
    credits = []

    # Pass 2: classify every row against those fixed edges.
    for idx, row in enumerate(filtered_date_balance_rows):
        credit_added = False  # at most one credit is reported per row
        for item in row['row_data']:
            left_x_coordinate = item['geometry'][0][0]
            right_x_coordinate = item['geometry'][1][0]

            if abs(left_x_coordinate - x_start) <= tolerance:
                time.append((idx, item['value'], item['confidence'], left_x_coordinate))

            if not credit_added and abs(right_x_coordinate - x_end) <= tolerance:
                credits.append((idx, item['value'], item['confidence'], right_x_coordinate))
                credit_added = True

    return time, credits


In [26]:
# Driver cell: run the extraction pipeline on one page of the sample file.
normalized_data = 'sample/normalized/mandiri-transaction_inquery.json'
target_page=0

# Get the dates and balances used to initialize the row detection.
date, balance, data = get_date_balance(normalized_data, target_page)
# Compute the table's skew distance.
distances = calculate_distance(date, balance)
# Group the data by row.
rows = group_rows(distances, data)
# Filter the dates and balances out of the rows.
filtered_date_balance_rows = date_balance_filter(rows, date, balance)
# NOTE(review): bare expression in the middle of the cell; by default only a
# cell's last expression is displayed, so this line likely shows nothing.
filtered_date_balance_rows


# Extract the time and credit entries from the filtered rows and display them.
times, credits = get_time_credit(filtered_date_balance_rows)
times, credits

([(0, '11.38.01', 0.9918650388717651, 0.12245646956699346),
  (1, '09.16.15', 0.9869914054870605, 0.12245646956699346),
  (2, '15.44.24', 0.9718276262283325, 0.12245646956699346),
  (3, '13.04.11', 0.9896488785743713, 0.12245646956699346),
  (4, '16.33.30', 0.7440197467803955, 0.12245646956699346),
  (5, '12.49.33', 0.9114729762077332, 0.12245646956699346),
  (6, '14.14.24', 0.9794628024101257, 0.12245646956699346),
  (7, '09.08.01', 0.9836367964744568, 0.12111289828431371),
  (8, '09.43.17', 0.9830178618431091, 0.12245646956699346),
  (9, '06.10.15', 0.9821978807449341, 0.12111289828431371),
  (10, '09.04.14', 0.9990867972373962, 0.11976932700163401),
  (11, '11.52.02', 0.9979954957962036, 0.12111289828431371),
  (12, '12.27.13', 0.9557540416717529, 0.12111289828431371),
  (13, '08.56.10', 0.5516970753669739, 0.12111289828431371),
  (14, '12.14.02', 0.972770631313324, 0.12111289828431371),
  (15, '12.15.23', 0.5703465938568115, 0.12111289828431371),
  (16, '14.13.31', 0.99134826660156