In [1]:
import os
import re

## Read file functions

In [2]:
def is_there_a_file(path: str) -> bool:
    """Tell whether ``path`` points to an existing regular file.

    ``os.path.isfile`` is used instead of ``os.path.exists`` so that an
    existing directory is not reported as a file.

    Args:
        path: File system path to check.

    Returns:
        True when ``path`` exists and is a regular file, False otherwise.
    """
    return os.path.isfile(path)

def check_file_extension(path: str, extension: str) -> bool:
    """Compare the extension of ``path`` with ``extension``.

    The comparison is exact and case-sensitive, and ``extension`` must
    include the leading dot (for example ``'.txt'``).

    Args:
        path: Path whose extension is inspected.
        extension: Expected extension, dot included.

    Returns:
        True when the last extension of ``path`` equals ``extension``.
    """
    _, found_extension = os.path.splitext(path)
    return found_extension == extension

def valid_path_file(path: str)-> bool:
    """Tell whether ``path`` exists and carries the ``.txt`` extension.

    The extension is only inspected when the existence check succeeds.

    Args:
        path: Path of the candidate text file.

    Returns:
        True when both checks pass, False otherwise.
    """
    path_exists = is_there_a_file(path)
    return path_exists and check_file_extension(path, '.txt')

def get_all_files_by_extension(path: str, extension: str)-> list:
    """List the entries of directory ``path`` whose name ends with ``extension``.

    The name must *end* with ``extension``; a plain substring test would
    also accept names such as ``'notes.txt.bak'`` when looking for ``'.txt'``.

    Args:
        path: Directory to scan (not recursive).
        extension: Suffix to look for, e.g. ``'.txt'``.

    Returns:
        The matching entry names (not full paths), in the order returned
        by ``os.listdir``.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``path`` cannot be listed.
    """
    return [file_name for file_name in os.listdir(path) if file_name.endswith(extension)]

def read_file(path: str) -> tuple:
    """Parse a comma-separated text file into numeric fields and text.

    Each non-blank line is expected to hold eight integer fields followed
    by free text; the text itself may contain commas.
    (The integers are presumably x/y coordinates of an OCR bounding box,
    judging by the original variable name -- confirm against the data set.)

    Blank lines are skipped instead of crashing on ``int('')``, and a line
    without a text field yields an empty string.

    Args:
        path: Path of the file to read.

    Returns:
        A tuple ``(all_data, text)``. ``all_data`` holds one list per line
        made of the integer fields followed by the line's text; ``text``
        holds only the text of each line, in file order.

    Raises:
        ValueError: When one of the first eight fields is not an integer.
    """
    all_data = []
    text = []
    with open(path, 'r') as file:
        for line in file:
            line = line.rstrip('\n')
            if not line.strip():
                # Blank line (often the last one in a file): nothing to parse.
                continue

            # maxsplit=8 keeps any commas inside the text field untouched,
            # so the text never has to be re-joined afterwards.
            fields = line.split(',', 8)

            xn_and_yn = turn_all_elements_into_int(fields[:8])
            text_data = fields[8] if len(fields) > 8 else ''

            xn_and_yn.append(text_data)

            text.append(text_data)
            all_data.append(xn_and_yn)

    return all_data, text

def turn_all_elements_into_int(list_str: list)-> list:
    """Convert every element of ``list_str`` to ``int``.

    Args:
        list_str: Values accepted by ``int()``, typically numeric strings.

    Returns:
        A new list with the converted values, in the same order.

    Raises:
        ValueError: When an element cannot be parsed as an integer.
    """
    return [int(number) for number in list_str]

def process_text_data(list_text: list, join_string: str)-> str:
    """Join the strings in ``list_text`` with ``join_string``.

    ``str.join`` already returns the lone element for a single-item list
    and ``''`` for an empty one, so no length check is needed and an
    empty list no longer raises ``IndexError``.

    Args:
        list_text: Text fragments to merge.
        join_string: Separator inserted between fragments.

    Returns:
        The merged text; an empty string when ``list_text`` is empty.
    """
    return join_string.join(list_text)


In [14]:
file_path = './test_files/OCR2.txt'
directory_path = './test_files'

# Quick checks of the helpers; their return values are not stored.
valid_path_file(file_path)
get_all_files_by_extension(directory_path,'.txt')

# Named `all_data` rather than `all` so the built-in all() is not shadowed
# for the rest of the kernel session. `text` is the list of text lines that
# the later cells work on.
all_data, text = read_file(file_path)

# Process functions

In [4]:
def find_indexes_equal_character(block: list, element: str)-> list:
    """Find the positions in ``block`` whose entry equals ``element``.

    Args:
        block: Sequence of text entries to scan.
        element: Value an entry has to be equal to.

    Returns:
        The matching indexes in ascending order; empty when nothing matches.
    """
    matches = []
    for position, candidate in enumerate(block):
        if element == candidate:
            matches.append(position)
    return matches

def find_indexes_that_contain_character(block: list, element: str)-> list:
    """Find the positions in ``block`` whose entry contains ``element``.

    Args:
        block: Sequence of text entries to scan.
        element: Substring to look for inside each entry.

    Returns:
        The matching indexes in ascending order; empty when nothing matches.
    """
    matches = []
    for position, candidate in enumerate(block):
        if element in candidate:
            matches.append(position)
    return matches

def index_before_2_and_after_2(index:int)->list:
    """Return the two indexes before and the two indexes after ``index``.

    ``index`` itself is left out. No bounds checking is done, so the result
    may contain negative values.

    Args:
        index: Centre position.

    Returns:
        ``[index - 2, index - 1, index + 1, index + 2]``.
    """
    neighbour_offsets = (-2, -1, 1, 2)
    return [index + offset for offset in neighbour_offsets]

def get_line_item(text: list, index_x: int)-> list:
    """Collect the two entries before and the two entries after ``index_x``.

    The positions come from ``index_before_2_and_after_2``. Nothing is
    bounds-checked here: a position past the end raises ``IndexError`` and
    a negative position reads from the end of ``text``.

    Args:
        text: Sequence of text entries.
        index_x: Position of the centre entry, which is itself left out.

    Returns:
        The four neighbouring entries, in order.
    """
    neighbours = []
    for position in index_before_2_and_after_2(index_x):
        neighbours.append(text[position])
    return neighbours

def split_block_by_index(block: list, start: int = None, end: int = None)-> list:
    """Return the slice ``block[start:end]``.

    Args:
        block: Sequence to slice.
        start: First index included; ``None`` means from the beginning.
        end: First index excluded; ``None`` means up to the end.

    Returns:
        The selected portion of ``block``.
    """
    selection = slice(start, end)
    return block[selection]

def get_indexes_to_break_down_block(block: list)-> tuple:
    """Locate the two marker lines used to split a receipt into sections.

    The search runs on the ``block`` argument. (It previously read the
    notebook-level variable ``text``, so the argument was ignored.)

    Args:
        block: Sequence of text lines to search.

    Returns:
        ``(index_invoice, index_item)``: the index of the first line that
        contains ``'-INVOICE-'`` and of the first line that contains ``'ITEM'``.

    Raises:
        IndexError: When either marker is missing from ``block``.
    """
    index_invoice = find_indexes_that_contain_character(block, '-INVOICE-')[0]
    index_item = find_indexes_that_contain_character(block, 'ITEM')[0]

    return index_invoice, index_item

def get_main_blocks(reference_block: list, target_block: list )-> list:
    """Cut ``target_block`` into three consecutive sections.

    The cut positions are obtained by passing ``reference_block`` to
    ``get_indexes_to_break_down_block``.

    Args:
        reference_block: Block handed to the marker lookup.
        target_block: Block that is actually sliced.

    Returns:
        Three slices of ``target_block``: before the invoice index, from
        the invoice index up to the item index, and from the item index
        to the end.
    """
    index_invoice, index_item = get_indexes_to_break_down_block(reference_block)

    # Consecutive pairs of boundaries delimit each section; None = open end.
    boundaries = [None, index_invoice, index_item, None]

    return [
        split_block_by_index(target_block, start=lower, end=upper)
        for lower, upper in zip(boundaries, boundaries[1:])
    ]

def get_all_line_items(line_items_block: list)-> list:
    """Extract one line item per ``'X'`` entry found in ``line_items_block``.

    A line item is made of the two entries before and the two entries
    after an entry equal to ``'X'``. An ``'X'`` that does not have two
    entries on each side is skipped: close to the start the negative
    indexes would silently read from the end of the block, and close to
    the end the lookup would raise ``IndexError``.

    Args:
        line_items_block: Sequence of text entries holding the line items.

    Returns:
        A list of four-element lists, one per complete line item, in
        order of appearance.
    """
    # Highest position that still leaves two entries after it.
    last_usable_index = len(line_items_block) - 3
    result = []

    for index in find_indexes_equal_character(line_items_block, 'X'):
        if 2 <= index <= last_usable_index:
            result.append(get_line_item(line_items_block, index))

    return result


In [5]:
def get_company_info(company_info_block: list)-> tuple:
    """Extract the company name and address from the header block.

    The first entry containing ``"CO."`` is the separator: entries from
    index 1 up to it form the name, entries after it form the address.
    Entry 0 and the separator entry itself are not used.

    Args:
        company_info_block: Text entries of the receipt header.

    Returns:
        ``(company_name, address_lines)``, each a space-joined string.

    Raises:
        IndexError: When no entry contains ``"CO."``.
    """
    registration_index = find_indexes_that_contain_character(block=company_info_block, element="CO.")[0]

    name_parts = split_block_by_index(company_info_block, 1, registration_index)
    address_parts = split_block_by_index(company_info_block, registration_index + 1)

    return process_text_data(name_parts, ' '), process_text_data(address_parts, ' ')

In [6]:
def get_total(totals_block: list)-> str:
    """Read the total from the entry that follows the ``'ROUNDED'`` entry.

    That entry is split on single spaces and its second token is returned.

    Args:
        totals_block: Text entries of the totals section.

    Returns:
        The second space-separated token of the entry right after the
        first entry containing ``'ROUNDED'``.

    Raises:
        IndexError: When ``'ROUNDED'`` is not found, when nothing follows
            it, or when the following entry has no second token.
    """
    rounded_positions = find_indexes_that_contain_character(totals_block, 'ROUNDED')
    entry_after_rounded = totals_block[rounded_positions[0] + 1]
    tokens = entry_after_rounded.split(' ')
    return tokens[1]

In [7]:
def get_date(date_block: list)-> str:
    """Return the first ``NN-NN-NN`` digit group found in ``date_block``.

    Entries are scanned in order and the search stops at the first match.

    Args:
        date_block: Text entries to scan.

    Returns:
        The matched text (e.g. ``'18-11-18'``), or ``None`` when no entry
        contains such a pattern.
    """
    date_regex = re.compile("[0-9][0-9]-[0-9][0-9]-[0-9][0-9]")

    for entry in date_block:
        match = date_regex.search(entry)
        if match is not None:
            return match.group()

    return None
            

In [32]:
# Split the text lines into three sections; `text` is passed both as the
# reference for locating the markers and as the block to slice.
blocks = get_main_blocks(text, text)

In [9]:
# blocks[1] is the middle section: from the '-INVOICE-' line up to the 'ITEM' line.
get_all_line_items(blocks[1])


[['9555916500133', '1', '3.11', '3.11'],
 ['9555916500126', '1', '4.62', '4.62'],
 ['079567600084', '1', '11.23', '11.23'],
 ['9555651400385', '1', '7.45', '7.45'],
 ['9090822', '1', '4.50', '4.50']]

In [10]:
# blocks[0] is the header section: everything before the '-INVOICE-' line.
name, address = get_company_info(blocks[0])
[name, address]

['MR D.I.Y. (M) SDN BHD',
 'LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERINDUSTRIAN BALAKONG, 43300 SERI KEMBANGAN, SELANGOR (TESCO PUTRA NILAI)']

In [11]:
# blocks[2] is the last section: from the 'ITEM' line to the end of the text.
get_total(blocks[2])

'30.90'

In [12]:
# The date is searched in the last section (from the 'ITEM' line onwards).
get_date(blocks[2])

'18-11-18'