In [4]:
import pytesseract
from pytesseract import Output
import cv2

def ResizeWithAspectRatio(image, width=None, height=None, inter=cv2.INTER_AREA):
    dim = None
    (h, w) = image.shape[:2]

    if width is None and height is None:
        return image
    if width is None:
        r = height / float(h)
        dim = (int(w * r), height)
    else:
        r = width / float(w)
        dim = (width, int(h * r))

    return cv2.resize(image, dim, interpolation=inter)

In [77]:
class Wordbox:
    def __init__(self, level, page_num, block_num, par_num, line_num, word_num, left, top, width, height, conf, text):
        self.level = level
        self.page_num = page_num
        self.block_num = block_num
        self.par_num = par_num
        self.line_num = line_num
        self.word_num = word_num
        self.left = left
        self.top = top
        self.width = width
        self.height = height
        self.conf = conf
        self.text = text
        self.neighs = {}
        
    def set_neighs(self, left=None, right=None, up=None, down=None):
        if left is not None:
            self.neighs['left'] = left
        if right is not None:
            self.neighs['right'] = right
        if up is not None:
            self.neighs['up'] = up
        if down is not None:
            self.neighs['down'] = down
        pass
        
    def __str__(self):
        def get_neigh(key):
            if key in self.neighs:
                return self.neighs[key].text
            return 'None'
        return 'text: ' + self.text + ' left=' + get_neigh('left') + ' right=' + get_neigh('right')
        #return 'line_num: ' + str(self.line_num) + ', left: ' + str(self.left) + ', top: ' + str(self.top)
        

In [78]:
#from pdf2image import convert_from_path
#pages = convert_from_path('example.pdf', 500)

#for i, page in enumerate(pages):
#    page.save('example' + str(i) + '.png', 'PNG')



In [145]:
import re
def isValid(text):
    if len(text.strip()) == 0:
        return False
    if re.match(r'[:;,.!?\\-]', text.strip()) is not None:
        return False
    return True

In [150]:
kWidthThreshold = 2.0

def task1(path, output_file):
    print('Process: ' + path)
    img = cv2.imread(path)
    d = pytesseract.image_to_data(img, output_type=Output.DICT)
    n_boxes = len(d['level'])

    wordboxes = []
    for i in range(n_boxes):
        if d['level'][i] == 5:
            wordboxes.append(Wordbox(d['level'][i], d['page_num'][i], d['block_num'][i], d['par_num'][i], d['line_num'][i], d['word_num'][i], d['left'][i], d['top'][i], d['width'][i], d['height'][i], d['conf'][i], d['text'][i]))

    for i in range(n_boxes):
        if d['level'][i] == 5:
            (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])    
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 2)

    wordboxes = list(filter(lambda x: isValid(x.text), wordboxes))
    wordboxes = sorted(wordboxes, key=lambda x:(x.page_num, x.block_num, x.par_num, x.line_num, x.left))        

    curr_left, curr_top, curr_right, curr_bottom, curr_pos, curr_word = None, None, None, None, None, None

    ret = ''
    args = [-1] * 11 + [""]
    wordboxes.append(Wordbox(*args))
    new_wordboxes = []
    state = None
    curr_line_num = None
    acc_num_letters = 0
    acc_width = 0
    for wordbox in wordboxes:
        new_pos = (wordbox.page_num, wordbox.block_num, wordbox.par_num, wordbox.line_num)
        # replace 30 with 3*avg letter size in that line
        #print("wordbox.line_num=" + str(wordbox.line_num) + "wordbox.text=" + wordbox.text + " word_num=" + str(wordbox.word_num))
        # Update the current line number and its state.
        if new_pos != curr_pos:
            acc_num_letters = 0
            acc_width = 0
            
        acc_num_letters += len(wordbox.text.strip())
        acc_width += wordbox.width
            
        hasMerged = False
        if new_pos == curr_pos:
            # Computer the letter weight.
            letter_weight = acc_width / acc_num_letters
            if (wordbox.left - curr_right) / letter_weight <= kWidthThreshold:
                curr_right = wordbox.left + wordbox.width
                curr_word = curr_word + ' ' + wordbox.text
                hasMerged = True
        if not hasMerged:
            if curr_pos is not None:
                assert state is not None
                if isValid(curr_word):
                    new_wordboxes.append(Wordbox(
                        6,
                        page_num=state['page_num'],
                        block_num=state['block_num'],
                        par_num=state['par_num'],
                        line_num=state['line_num'],
                        word_num=-1,
                        left=curr_left,
                        top=curr_top,
                        width=curr_right - curr_left,
                        height=curr_bottom - curr_top,
                        conf=-1,
                        text=curr_word
                    ))
                #print(new_wordboxes[-1])
                #print(','.join([str(elem) for elem in [curr_left, curr_top, curr_right, curr_top, curr_right, curr_bottom, curr_left, curr_bottom, curr_word]]))
                #ret += str(curr_left) + ',' + str(curr_top) + ',' + str(curr_right) + ',' + str(curr_top) + ',' + str(curr_right) + ',' + str(curr_bottom) + ','
                #+ str(curr_left) + ',' + str(curr_bottom) + ',' 
                #+ curr_word
            curr_word = wordbox.text
            curr_pos = new_pos
            curr_left = wordbox.left
            curr_top = wordbox.top
            curr_right = wordbox.left + wordbox.width
            curr_bottom = wordbox.top + wordbox.height
            
            # Update the state.
            state = {
                'page_num' : wordbox.page_num,
                'block_num' : wordbox.block_num,
                'par_num' : wordbox.par_num,
                'line_num' : wordbox.line_num
            }
    
    return new_wordboxes
    #return ret
    #with open(output_file, 'w') as f:
    #    f.write(ret)
    #pass

In [127]:
#def solve_task1():
#  input_dir = main_dir + 'task1&2_test(361p)'
#  output_dir = '/content/drive/MyDrive/MySROIE2019/task1-test/'
#  for file in get_files(input_dir):
#    print("File: " + file['path'])
#    output_file = output_dir + file['path'].split('/')[-1].replace('.jpg', '') + '.txt'
#    task1(file['path'], output_file)
#    #with open(output_dir + file['path'].split('/')[0].replace('.jpg', '') + '.txt', 'w') as f:
#    #  f.write(ret)

In [82]:
#task1('/content/drive/MyDrive/SROIE2019/task1&2_test(361p)/X00016469670.jpg')
#solve_task1()

In [151]:
print('\n'.join([str(elem) for elem in task1('img/test.jpg', 'output/test.txt')]))

Process: img/test.jpg
text: tan woon yann left=None right=None
text: BOOK TAK (TAMAN DAYA) SDN BHD left=None right=None
text: B97 left=None right=None
text: NO.5? 55,57 & 59, JALAN SAGU 18, left=None right=None
text: TAMAN DAYA left=None right=None
text: 81100 JOHOR BAHRU, left=None right=None
text: JOHOR. left=None right=None
text: WAM MICA A left=None right=None
text: Document Ho left=None right=None
text: TDO1167104 left=None right=None
text: Date left=None right=None
text: 25/12/2018 8:13:39 PM left=None right=None
text: Cashier left=None right=None
text: MANIS left=None right=None
text: Member left=None right=None
text: CASH BILL left=None right=None
text: CODE/DESC left=None right=None
text: PRICE left=None right=None
text: Disc left=None right=None
text: AMOUITT left=None right=None
text: Quy left=None right=None
text: RM left=None right=None
text: RM left=None right=None
text: 9556939040118 left=None right=None
text: KF MODELLING CLAY KIDDY left=None right=None
text: FISH left=

In [171]:
boxes = task1('img/test.jpg', 'output/test.txt')

def debug(index):
    box = boxes[index]
    return (box.page_num, box.block_num, box.par_num, box.line_num)

def isChange(index):
    box1, box2 = boxes[index], boxes[index - 1]
    x1 = (box1.page_num, box1.block_num, box1.par_num, box1.line_num)
    x2 = (box2.page_num, box2.block_num, box2.par_num, box2.line_num)
    print(x1)
    print(x2)
    return x1 != x2

def info(box):
    return str((box.text, box.left, box.top, box.width, box.height))

def gravityCenter(box):
    return (box.left + box.width / 2, box.top + box.height / 2)

def isHorizontal(this, other):
    if this.top >= other.top + other.height:
        return False
    if other.top >= this.top + this.height:
        return False
    return True

def isVertical(this, other):
    if this.left >= other.left + other.width:
        return False
    if other.left >= this.left + this.width:
        return False
    return True

def coordDiff(this, other, axis):
    return abs(gravityCenter(this)[axis] - gravityCenter(other)[axis])

def isHClose(this, other, that):
    return coordDiff(this, other, 0) <= coordDiff(this, that, 0) 

def isVCloser(this, other, that):
    return coordDiff(this, other, 1) <= coordDiff(this, that, 1) 

def isLeft(this, other):
    return isHorizontal(this, other) and gravityCenter(this)[0] >= gravityCenter(other)[0]

def isRight(this, other):
    return isHorizontal(this, other) and gravityCenter(this)[0] <= gravityCenter(other)[0]

def isTop(this, other):
    return isVertical(this, other) and gravityCenter(this)[1] >= gravityCenter(other)[1]

def isBottom(this, other):
    return isVertical(this, other) and gravityCenter(this)[1] <= gravityCenter(other)[1]

mapper = {'left': {'direction_fn' : isLeft, 'closer_fn' : isHCloser},
          'right' : {'direction_fn' : isRight, 'closer_fn' : isHorizontalCloser},
          'top' : {'direction_fn' : isTop, 'closer_fn' : isVCloser},
          'bottom' : {'direction_fn' : isBottom, 'closer_fn' : isVCloser}
         }

def update(coord, this, other):
    for key in coord.keys():
        if mapper[key]['direction_fn'](this, other):
            if (coord[key] is None) or (mapper[key]['closer_fn'](this, other, coord[key])):
                coord[key] = other

graph = {}
for i, box1 in enumerate(boxes):
    coord = {'left' : None, 'right' : None, 'top' : None, 'bottom' : None}
    for j, box2 in enumerate(boxes):
        update(coord, box1, box2)
    graph[i] = coord

print(graph)
# print('\n'.join([str(elem) for elem in boxes]))

Process: img/test.jpg


NameError: name 'isHCloser' is not defined