In [2]:
import boto3
import io
import re
import numpy as np
import os
import xml.etree.ElementTree as ET 
from joblib import Parallel, delayed
import multiprocessing
from utilities.score_computors import are_sentences_alike

In [3]:
def extract_clean_str(original_str):
    """Normalise text so two strings can be compared character by character.

    Every run of characters outside ``a-z``, ``A-Z`` and ``0-9`` is dropped and
    the remainder is lower-cased, so only ASCII letters and digits survive.

    Args:
        original_str: The raw text to normalise.

    Returns:
        The lower-cased string containing only ASCII letters and digits.
    """
    # Collapse each run of non-alphanumeric characters into a single space.
    spaced = re.sub('[^a-zA-Z0-9]+', ' ', original_str, flags=re.UNICODE)
    trimmed = spaced.lower().strip()
    # Remove the remaining separators so word boundaries do not matter.
    return trimmed.replace(' ', '')

def get_mapped(path):
    """Index every JPEG image under ``path`` and return a random visiting order.

    Args:
        path: Root directory that is walked recursively.

    Returns:
        A tuple ``(key_map, shuffled_idxs)``. ``key_map`` maps consecutive
        integers starting at 0 to image paths relative to ``path`` (a bare file
        name for images located directly in ``path``). ``shuffled_idxs`` is a
        NumPy array holding a random permutation of the keys of ``key_map``;
        it is empty when no image is found.
    """
    key_map = {}
    # Walk the directory that was passed in, not a module-level global.
    for root, _, filenames in os.walk(path):
        for filename in filenames:
            # Match on the real extension (case-insensitively) rather than on
            # a substring, so names such as "x.jpg.bak" are not picked up.
            if filename.lower().endswith('.jpg'):
                # Keep the sub-directory part so that joining the value with
                # ``path`` again locates images found in nested folders.
                relative = os.path.relpath(os.path.join(root, filename), path)
                key_map[len(key_map)] = relative
    # A permutation visits every key exactly once; randint would sample with
    # replacement (duplicates and omissions) and fails on an empty range.
    shuffled_idxs = np.random.permutation(len(key_map))
    return key_map, shuffled_idxs

def fetch_from_xml(path, file):
    """Read the annotated text of one XML file and return it normalised.

    The ``char`` attribute of every ``character`` element is concatenated in
    document order and passed through ``extract_clean_str``.

    Args:
        path: Directory containing the XML file.
        file: Name of the XML file inside ``path``.

    Returns:
        The normalised concatenation of all ``char`` attributes.
    """
    root = ET.parse(os.path.join(path, file)).getroot()
    chars = []
    for node in root.iter(tag='character'):
        chars.append(node.attrib['char'])
    return extract_clean_str(''.join(chars))

In [12]:
def compute_text_recognition(file, cutoff=0.6):
    """Tell whether Rekognition reads the text annotated for one image.

    The ground truth comes from the XML file that shares the image's base name
    inside the module-level ``dir_path``. The image bytes are sent to the
    Rekognition ``detect_text`` operation, the WORD detections are concatenated
    and normalised with ``extract_clean_str``, and the result is compared to
    the ground truth with ``are_sentences_alike``.

    Args:
        file: Image path relative to ``dir_path``.
        cutoff: Third argument forwarded to ``are_sentences_alike``. Defaults
            to 0.6, the value that used to be hard-coded.

    Returns:
        True when some text was detected and ``are_sentences_alike`` accepts
        it; False otherwise. False is also returned when the annotation file
        is missing or empty, or when any step raises an exception.
    """
    try:
        # splitext copes with any extension length, unlike slicing 4 chars.
        xml_file = os.path.splitext(file)[0] + '.xml'
        if not os.path.exists(os.path.join(dir_path, xml_file)):
            return False
        ground_truth_str = fetch_from_xml(dir_path, xml_file)
        if len(ground_truth_str) == 0:
            return False

        with io.open(os.path.join(dir_path, file), 'rb') as image_file:
            content = image_file.read()
        client = boto3.client('rekognition', region_name='us-east-1')
        response = client.detect_text(Image={'Bytes': content})
        detected_str = extract_clean_str(''.join(
            txt['DetectedText'] for txt in response['TextDetections']
            if txt['Type'] == 'WORD'))
        # Check for an empty detection first so the comparison is skipped.
        return bool(detected_str) and bool(
            are_sentences_alike(detected_str, ground_truth_str, cutoff))
    except Exception:
        # Best effort: a failure on one image (I/O, parsing, service error)
        # is counted as a miss. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate so a run can be stopped.
        return False

In [13]:
# Root folder of the images and their XML annotations.
dir_path = 'datasets/KAIST/English/'
dict_files, files_idx = get_mapped(dir_path)
# Evaluate every image exactly once, in random order. Drawing indices with
# randint samples with replacement, which repeats some images and skips
# others and therefore biases the reported accuracy.
shuffled_idx = np.random.permutation(len(files_idx))
outcomes = [False] * len(shuffled_idx)
# Cast to a plain int so the dictionary lookup does not depend on NumPy scalars.
inputs = [dict_files[int(idx)] for idx in shuffled_idx]

In [14]:
# Run one recognition task per image, using one worker per CPU core.
results = Parallel(n_jobs=multiprocessing.cpu_count())(
    delayed(compute_text_recognition)(image_name) for image_name in inputs
)
# Each result is a boolean, so the sum counts the recognised images.
acc = np.sum(results) / len(results)
print("Accuracy = {}".format(round(acc, 4)))

Accuracy = 0.739
