In [1]:
import boto3
import io
import re
import numpy as np
import os
import xml.etree.ElementTree as ET 
from joblib import Parallel, delayed
import multiprocessing
from utilities.score_computors import jaccard_similarity
from time import time
import pandas as pd

In [2]:
def extract_clean_str(original_str):
    return re.sub('[^a-zA-Z0-9]+', ' ', original_str, flags=re.UNICODE).lower().strip().replace(' ', '')

def get_mapped(path):
    key_map, dict_idx = {}, 0
    for r, _, f in os.walk(dir_path):
        for file in f:
            if '.JPG' in file or '.jpg' in file:
                key_map[dict_idx] = file
                dict_idx += 1
    idxs = list(key_map.keys())          
    shuffled_idxs = np.random.randint(0, len(idxs), len(idxs))
    return key_map, shuffled_idxs

def fetch_from_xml(path, file):
    tree = ET.parse(os.path.join(path, file)).getroot()  
    return extract_clean_str(''.join([e.attrib['char'] for e in tree.iter(tag='character')]))

In [3]:
def compute_text_recognition(file, cutoff=.25):
    print ("Iterating for FileName = {}".format(file))
    status, score = -1, 0
    xml_file = file[:len(file)-4] + '.xml' 
    if not os.path.exists(os.path.join(dir_path, xml_file)):
        return False, status, score
    try:
        ground_truth_str = fetch_from_xml(dir_path, xml_file)
    except: 
        return False, -2, score
    if len(ground_truth_str) == 0:
        return False, status, score
    try:
        with io.open(os.path.join(dir_path, file), 'rb') as image_file:
            content = (image_file.read())
        imgobj = {'Bytes': content}
        client=boto3.client('rekognition', region_name='us-east-1')
        response=client.detect_text(Image=imgobj)
        detected_str = extract_clean_str(''.join([txt['DetectedText'] for txt in response['TextDetections'] \
                                               if txt['Type']=='WORD']))
    except:
        return False, 100, score
    score = jaccard_similarity(detected_str, ground_truth_str)
    if score > cutoff:
        return True, 0, score
    else:
        return False, 0, score

In [4]:
dir_path = 'datasets/KAIST/English/'
dict_files, files_idx = get_mapped(dir_path)
shuffled_idx = np.random.randint(0, len(files_idx), len(files_idx))

In [5]:
df = pd.DataFrame(columns=['score', 'status', 'matched', 'time'])
for idx in shuffled_idx:
    start_time = time()
    matched, status, score = compute_text_recognition(dict_files.get(idx))
    end_time = time()
    df = df.append({'score': score, 'status': status, 'matched': matched, 'time': (end_time-start_time)}, ignore_index=True)

Iterating for FileName = DSC03829.JPG
Iterating for FileName = DSC02619.JPG
Iterating for FileName = DSC02949.JPG
Iterating for FileName = DSC02610.JPG
Iterating for FileName = DSC02936.JPG
Iterating for FileName = P090903003.jpg
Iterating for FileName = DSC03203.JPG
Iterating for FileName = DSC03435.JPG
Iterating for FileName = DSC03294.JPG
Iterating for FileName = DSC04031.JPG
Iterating for FileName = P1010074.JPG
Iterating for FileName = P090831126.jpg
Iterating for FileName = P1010103.JPG
Iterating for FileName = P090903004.jpg
Iterating for FileName = DSC03539.JPG
Iterating for FileName = DSC04246.JPG
Iterating for FileName = P090911083.jpg
Iterating for FileName = DSC03314.JPG
Iterating for FileName = DSC04434.JPG
Iterating for FileName = 4.jpg
Iterating for FileName = DSC02410.JPG
Iterating for FileName = DSC03820.JPG
Iterating for FileName = DSC02400.JPG
Iterating for FileName = P1010052.JPG
Iterating for FileName = DSC03581.JPG
Iterating for FileName = P1010156.JPG
Iterating f

Iterating for FileName = DSC03569.JPG
Iterating for FileName = DSC02613.JPG
Iterating for FileName = 080116-0084.jpg
Iterating for FileName = DSC04433.JPG
Iterating for FileName = P090831011.jpg
Iterating for FileName = P1010111.JPG
Iterating for FileName = DSC03052.JPG
Iterating for FileName = DSC03745.JPG
Iterating for FileName = DSC02963.JPG
Iterating for FileName = 31.JPG
Iterating for FileName = DSC02926.JPG
Iterating for FileName = DSC03556.JPG
Iterating for FileName = DSC03561.JPG
Iterating for FileName = DSC02477.JPG
Iterating for FileName = DSC03052.JPG
Iterating for FileName = P1010096.JPG
Iterating for FileName = DSC03549.JPG
Iterating for FileName = DSC02623.JPG
Iterating for FileName = 8.jpg
Iterating for FileName = DSC02978.JPG
Iterating for FileName = DSC04108.JPG
Iterating for FileName = DSC03689.JPG
Iterating for FileName = DSC04005.JPG
Iterating for FileName = DSC03676.JPG
Iterating for FileName = DSC03402.JPG
Iterating for FileName = 2007-12-09-day-exterior-032.jpg
I

Iterating for FileName = DSC03245.JPG
Iterating for FileName = DSC02500.JPG
Iterating for FileName = DSC02240.JPG
Iterating for FileName = P090831005.jpg
Iterating for FileName = DSC02788.JPG
Iterating for FileName = DSC03340.JPG
Iterating for FileName = DSC02937.JPG
Iterating for FileName = 2007-12-08-day-130.jpg
Iterating for FileName = 31.JPG
Iterating for FileName = DSC03446.JPG
Iterating for FileName = P090831113.jpg
Iterating for FileName = P1010120.JPG
Iterating for FileName = P090912036.jpg
Iterating for FileName = DSC02998.JPG
Iterating for FileName = DSC04318.JPG
Iterating for FileName = DSC03257.JPG
Iterating for FileName = DSC03017.JPG
Iterating for FileName = 080119-0003.jpg
Iterating for FileName = DSC03149.JPG
Iterating for FileName = DSC04015.JPG
Iterating for FileName = P090831013.jpg
Iterating for FileName = DSC02777.JPG
Iterating for FileName = DSC03581.JPG
Iterating for FileName = s.jpg
Iterating for FileName = DSC04173.JPG
Iterating for FileName = DSC03716.JPG
Iter

Iterating for FileName = DSC02587.JPG
Iterating for FileName = DSC03429.JPG
Iterating for FileName = DSC03084.JPG
Iterating for FileName = 065.jpg
Iterating for FileName = DSC02897.JPG
Iterating for FileName = DSC02924.JPG
Iterating for FileName = DSC04064.JPG
Iterating for FileName = DSC03310.JPG
Iterating for FileName = DSC03561.JPG
Iterating for FileName = 17.JPG
Iterating for FileName = DSC02711.JPG
Iterating for FileName = DSC03047.JPG
Iterating for FileName = DSC02842.JPG
Iterating for FileName = DSC04160.JPG
Iterating for FileName = DSC03644.JPG
Iterating for FileName = DSC04231.JPG
Iterating for FileName = DSC04125.JPG
Iterating for FileName = DSC04104.JPG
Iterating for FileName = DSC03642.JPG
Iterating for FileName = DSC03225.JPG
Iterating for FileName = P1010064.JPG
Iterating for FileName = DSC02624.JPG
Iterating for FileName = DSC02763.JPG
Iterating for FileName = DSC04155.JPG
Iterating for FileName = DSC04326.JPG
Iterating for FileName = P1010094.JPG
Iterating for FileName 

Iterating for FileName = P090831023.jpg
Iterating for FileName = DSC04097.JPG
Iterating for FileName = DSC03620.JPG
Iterating for FileName = DSC03657.JPG
Iterating for FileName = 080116-0053.jpg
Iterating for FileName = DSC02936.JPG
Iterating for FileName = DSC03432.JPG
Iterating for FileName = 15.JPG
Iterating for FileName = DSC03309.JPG
Iterating for FileName = DSC04449.JPG
Iterating for FileName = 080116-0064.jpg
Iterating for FileName = DSC02938.JPG
Iterating for FileName = DSC03896.JPG
Iterating for FileName = s.jpg
Iterating for FileName = DSC02912.JPG
Iterating for FileName = DSC04173.JPG
Iterating for FileName = P1010114.JPG
Iterating for FileName = DSC02613.JPG
Iterating for FileName = DSC02604.JPG
Iterating for FileName = 014.JPG
Iterating for FileName = DSC03640.JPG
Iterating for FileName = P090903008.jpg
Iterating for FileName = P1010030.JPG
Iterating for FileName = DSC04197.JPG
Iterating for FileName = DSC02450.JPG
Iterating for FileName = DSC03310.JPG
Iterating for FileNa

Iterating for FileName = DSC03132.JPG
Iterating for FileName = 2007-12-09-day-exterior-078.jpg
Iterating for FileName = DSC03146.JPG
Iterating for FileName = 055.JPG
Iterating for FileName = 080119-0015.jpg
Iterating for FileName = P090831080.jpg
Iterating for FileName = DSC04409.JPG
Iterating for FileName = DSC02491.JPG
Iterating for FileName = 2007-12-09-day-exterior-086.jpg
Iterating for FileName = DSC02793.JPG
Iterating for FileName = DSC03628.JPG
Iterating for FileName = P1010027.JPG
Iterating for FileName = m.jpg
Iterating for FileName = P1010127.JPG
Iterating for FileName = DSC04249.JPG
Iterating for FileName = P090912068.jpg
Iterating for FileName = P090831037.jpg
Iterating for FileName = P090912059.jpg
Iterating for FileName = DSC04200.JPG
Iterating for FileName = DSC04312.JPG
Iterating for FileName = DSC04015.JPG
Iterating for FileName = DSC03095.JPG
Iterating for FileName = DSC03260.JPG
Iterating for FileName = 25.JPG
Iterating for FileName = DSC03613.JPG
Iterating for FileN

In [6]:
display(df)

Unnamed: 0,score,status,matched,time
0,0.800000,0,True,2.592687
1,0.666667,0,True,1.559088
2,0.666667,0,True,0.953767
3,0.500000,0,True,1.499217
4,0.566667,0,True,4.038721
...,...,...,...,...
1175,0.800000,0,True,1.335379
1176,0.714286,0,True,3.159529
1177,0.619048,0,True,2.324886
1178,0.533333,0,True,2.044551
