In [2]:
import re
from rapidfuzz import fuzz
from helper import load_json_file, save_json_file

# Load the sample inputs used by the cells below: `data` is the document later
# passed to normalize_text, and `settings` is the parameter file whose
# 'statement_types' entries are read by get_statement_type.
data = load_json_file("sample/mandiri-account_statement_report.json")
settings = load_json_file("sample/setting_parameter.json")

In [3]:
def normalize_string(s):
    """Return ``s`` lowercased with every run of non-word characters replaced by one space.

    The result is not stripped: a leading or trailing run of non-word
    characters is also turned into a single space.
    """
    spaced = re.sub(r'\W+', ' ', s)
    return spaced.lower()

In [49]:
def get_statement_type(data, min_score=80):
    """Pick the statement-type setting whose keywords best match ``data``.

    Every entry of the module-level ``settings['statement_types']`` is scored:
    each of its keywords is normalized and fuzzy-matched against the normalized
    text with ``fuzz.partial_ratio``; the scores strictly above ``min_score``
    are summed, and that sum is multiplied by the total number of
    whitespace-separated words across all of the entry's keywords.

    Args:
        data: Text to classify. It is passed through ``normalize_string``.
        min_score: Exclusive lower bound a keyword's score must exceed to
            count toward the entry's total. Defaults to 80.

    Returns:
        A shallow copy of the best-scoring setting dict with an added
        ``"final_score"`` key, or ``None`` when no keyword of any entry scores
        above ``min_score``. When several entries tie, the one listed first in
        the settings wins.
    """
    normalized_data = normalize_string(data)
    best_setting = None
    best_score = None

    for setting in settings['statement_types']:
        keywords = setting['keywords']
        total_word_count = sum(len(keyword.split()) for keyword in keywords)

        keyword_scores = (
            fuzz.partial_ratio(normalize_string(keyword), normalized_data)
            for keyword in keywords
        )
        matched_scores = [score for score in keyword_scores if score > min_score]
        if not matched_scores:
            continue

        final_score = sum(matched_scores) * total_word_count
        # Strict '>' keeps the earliest entry on ties.
        if best_score is None or final_score > best_score:
            best_setting = setting
            best_score = final_score

    if best_setting is None:
        return None

    # Return a copy rather than writing "final_score" into the shared settings
    # dict, so repeated calls do not leak state into the configuration.
    return {**best_setting, "final_score": best_score}

In [144]:
def normalize_text(data, max_combined_words=12):
    """Flatten each page of ``data`` into a word list plus a short text preview.

    Args:
        data: Mapping with a ``'pages'`` list. Each page provides ``'page_idx'``
            and ``'blocks'``; blocks contain ``'lines'``, lines contain
            ``'words'``, and each word provides ``'value'``, ``'confidence'``
            and ``'geometry'``.
        max_combined_words: How many leading words of each page go into the
            ``"combined_words"`` preview. Defaults to 12.

    Returns:
        ``{"pages": [...]}`` with one dict per input page holding:
        ``"page_id"`` (the page's ``'page_idx'``), ``"combined_words"`` (the
        preview string) and ``"words"`` (every word of the page, in reading
        order of blocks/lines/words, reduced to value/confidence/geometry).

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    result = {"pages": []}

    for page in data['pages']:
        page_id = page['page_idx']
        word_entries = []
        preview_tokens = []

        for block in page['blocks']:
            for line in block['lines']:
                for word in line['words']:
                    if len(preview_tokens) < max_combined_words:
                        # normalize_string turns punctuation into spaces; dropping
                        # all whitespace afterwards glues the pieces of one word
                        # back together (e.g. "02/08/2022" -> "02082022").
                        # A punctuation-only word becomes "" and still uses a slot.
                        normalized = normalize_string(word['value'])
                        preview_tokens.append(''.join(normalized.split()))
                    word_entries.append({
                        "value": word['value'],
                        "confidence": word['confidence'],
                        "geometry": word['geometry']
                    })

        result["pages"].append({
            "page_id": page_id,
            "combined_words": ' '.join(preview_tokens),
            "words": word_entries
        })

    return result


In [145]:
# Build the per-page normalized structure from the loaded document, then hand
# it to save_json_file together with the output path.
results = normalize_text(data)
save_json_file("sample/mandiri-account_statement_report_normalized.json", results)

In [146]:
# Print each page's "combined_words" preview string, one page per line.
for result in results['pages']:
    print(result['combined_words'])

laporan rekening koran a a account statement report mandiri account no 16500
02082022 15 1852 inhousetrf ke yossy 6200000 000 13648336138 02082022 1852 18
kaskecil 12082022 0050 372009736106008076 9422transfer 89040000 03 1120510254656010 000 1203346038 20050 fensteres
po410 23082022 1952 mcm inhousetrf key yossy 45 inccatantgnpno linggatransfer 6300000 000
2 po420 1821 mcm inhousetif kejohnny 3002021 202208301609609409010 budiantotransfer fee 81000000 000
laporan rekening koran e a  account statement report mandiri account no
01092022 1532 mcmoutw cn indriani 15 citradewiciearing fee99102 suryaagung 227900000 000 6382200016
202000070075465069102 gapokjul122 02002020 0909 onlinetr 46059 le 15000 000 626613916 02092022 0909
20200616370621899102 po433 06092022 1637 mcm inhousetif ke sartika  28 iriawantransfer fee
tgl31 agt 09092022 1120 ke 02 lonetn fee 120000000 000 138710257116 09092022
po279 14092022 0b40 mcminhousetif ke 351350000 17 saifatransfer fee 000 32754828518 202091319228985