In [1]:
import io
import re
import numpy as np
import os
import xml.etree.ElementTree as ET 
from joblib import Parallel, delayed
import multiprocessing
from utilities.score_computors import jaccard_similarity
from time import time
import pandas as pd
from google.oauth2 import service_account
from google.cloud import vision

In [2]:
def extract_clean_str(original_str):
    return re.sub('[^a-zA-Z0-9]+', ' ', original_str, flags=re.UNICODE).lower().strip().replace(' ', '')

def get_mapped(path):
    key_map, dict_idx = {}, 0
    for r, _, f in os.walk(dir_path):
        for file in f:
            if '.JPG' in file or '.jpg' in file:
                key_map[dict_idx] = file
                dict_idx += 1
    idxs = list(key_map.keys())          
    shuffled_idxs = np.random.randint(0, len(idxs), len(idxs))
    return key_map, shuffled_idxs

def fetch_from_xml(path, file):
    tree = ET.parse(os.path.join(path, file)).getroot()  
    return extract_clean_str(''.join([e.attrib['char'] for e in tree.iter(tag='character')]))

In [3]:
def compute_text_recognition(file, cutoff=.25):
    print ("Iterating for FileName = {}".format(file))
    status, score = -1, 0
    xml_file = file[:len(file)-4] + '.xml' 
    if not os.path.exists(os.path.join(dir_path, xml_file)):
        return False, status, score
    try:
        ground_truth_str = fetch_from_xml(dir_path, xml_file)
    except: 
        return False, -2, score
    if len(ground_truth_str) == 0:
        return False, status, score
    try:
        with io.open(os.path.join(dir_path, file), 'rb') as image_file:
            content = (image_file.read())
        vision_client = vision.ImageAnnotatorClient()
        image = vision.types.Image(content=content)
        text_detection_response = vision_client.text_detection(image=image)
        detected_str = extract_clean_str(text_detection_response.full_text_annotation.text)
    except:
        return False, 100, score
    score = jaccard_similarity(detected_str, ground_truth_str)
    if score > cutoff:
        return True, 0, score
    else:
        return False, 0, score

In [4]:
dir_path = 'datasets/KAIST/English/'
dict_files, files_idx = get_mapped(dir_path)
shuffled_idx = np.random.randint(0, len(files_idx), len(files_idx))

In [5]:
df = pd.DataFrame(columns=['score', 'status', 'matched', 'time'])
for idx in shuffled_idx:
    start_time = time()
    matched, status, score = compute_text_recognition(dict_files.get(idx))
    end_time = time()
    df = df.append({'score': score, 'status': status, 'matched': matched, 'time': (end_time-start_time)}, ignore_index=True)

Iterating for FileName = DSC02609.JPG
Iterating for FileName = DSC03126.JPG
Iterating for FileName = P090905002.jpg
Iterating for FileName = DSC02575.JPG
Iterating for FileName = DSC03328.JPG
Iterating for FileName = P090831119.jpg
Iterating for FileName = DSC03657.JPG
Iterating for FileName = P090831018.jpg
Iterating for FileName = DSC02788.JPG
Iterating for FileName = DSC03912.JPG
Iterating for FileName = DSC03411.JPG
Iterating for FileName = DSC03821.JPG
Iterating for FileName = DSC03431.JPG
Iterating for FileName = DSC03141.JPG
Iterating for FileName = DSC02243.JPG
Iterating for FileName = 20.JPG
Iterating for FileName = 135.JPG
Iterating for FileName = 2007-12-08-day-113.jpg
Iterating for FileName = DSC02881.JPG
Iterating for FileName = DSC03255.JPG
Iterating for FileName = 308.jpg
Iterating for FileName = DSC03020.JPG
Iterating for FileName = DSC02794.JPG
Iterating for FileName = DSC03275.JPG
Iterating for FileName = DSC03162.JPG
Iterating for FileName = 2007-12-08-day-113.jpg
It

Iterating for FileName = DSC02980.JPG
Iterating for FileName = DSC04087.JPG
Iterating for FileName = DSC03654.JPG
Iterating for FileName = DSC02897.JPG
Iterating for FileName = 2007-12-09-day-exterior-056.jpg
Iterating for FileName = P1010129.JPG
Iterating for FileName = 068.jpg
Iterating for FileName = DSC02969.JPG
Iterating for FileName = DSC03533.JPG
Iterating for FileName = 056.JPG
Iterating for FileName = DSC02556.JPG
Iterating for FileName = DSC02477.JPG
Iterating for FileName = DSC02361.JPG
Iterating for FileName = DSC03428.JPG
Iterating for FileName = 055.JPG
Iterating for FileName = DSC02675.JPG
Iterating for FileName = 085.jpg
Iterating for FileName = P090831010.jpg
Iterating for FileName = 55.JPG
Iterating for FileName = DSC03305.JPG
Iterating for FileName = DSC02903.JPG
Iterating for FileName = DSC03303.JPG
Iterating for FileName = DSC03936.JPG
Iterating for FileName = 080116-0089.jpg
Iterating for FileName = DSC04326.JPG
Iterating for FileName = DSC03161.JPG
Iterating for 

Iterating for FileName = DSC02936.JPG
Iterating for FileName = 047.JPG
Iterating for FileName = DSC04364.JPG
Iterating for FileName = DSC03006.JPG
Iterating for FileName = 089.jpg
Iterating for FileName = DSC04052.JPG
Iterating for FileName = IMG_2618.JPG
Iterating for FileName = 050.JPG
Iterating for FileName = DSC03719.JPG
Iterating for FileName = DSC04251.JPG
Iterating for FileName = DSC02559.JPG
Iterating for FileName = DSC02664.JPG
Iterating for FileName = DSC02724.JPG
Iterating for FileName = DSC02755.JPG
Iterating for FileName = DSC02732.JPG
Iterating for FileName = 2007-12-09-day-exterior-063.jpg
Iterating for FileName = DSC02394.JPG
Iterating for FileName = DSC02840.JPG
Iterating for FileName = DSC03514.JPG
Iterating for FileName = 004.jpg
Iterating for FileName = DSC03745.JPG
Iterating for FileName = P090911081.jpg
Iterating for FileName = DSC02926.JPG
Iterating for FileName = DSC03376.JPG
Iterating for FileName = P090831062.jpg
Iterating for FileName = DSC02320.JPG
Iterating

Iterating for FileName = DSC02954.JPG
Iterating for FileName = DSC04150.JPG
Iterating for FileName = DSC02623.JPG
Iterating for FileName = 25.JPG
Iterating for FileName = DSC02466.JPG
Iterating for FileName = P1010164.JPG
Iterating for FileName = DSC02802.JPG
Iterating for FileName = 034.JPG
Iterating for FileName = DSC03516.JPG
Iterating for FileName = P090831124.jpg
Iterating for FileName = DSC03122.JPG
Iterating for FileName = DSC03911.JPG
Iterating for FileName = DSC03082.JPG
Iterating for FileName = DSC03637.JPG
Iterating for FileName = DSC03415.JPG
Iterating for FileName = DSC03150.JPG
Iterating for FileName = DSC02714.JPG
Iterating for FileName = DSC02345.JPG
Iterating for FileName = DSC03663.JPG
Iterating for FileName = 032.jpg
Iterating for FileName = 080119-0009.jpg
Iterating for FileName = DSC04434.JPG
Iterating for FileName = DSC02725.JPG
Iterating for FileName = DSC03745.JPG
Iterating for FileName = DSC02859.JPG
Iterating for FileName = P1010044.JPG
Iterating for FileName 

Iterating for FileName = P090903071.jpg
Iterating for FileName = DSC03413.JPG
Iterating for FileName = DSC02511.JPG
Iterating for FileName = DSC02374.JPG
Iterating for FileName = DSC03871.JPG
Iterating for FileName = 18.JPG
Iterating for FileName = DSC02624.JPG
Iterating for FileName = DSC03439.JPG
Iterating for FileName = 43.JPG
Iterating for FileName = DSC03122.JPG
Iterating for FileName = P1010093.JPG
Iterating for FileName = DSC03241.JPG
Iterating for FileName = 2007-12-08-day-015.jpg
Iterating for FileName = DSC02930.JPG
Iterating for FileName = DSC03953.JPG
Iterating for FileName = DSC04444.JPG
Iterating for FileName = 002.jpg
Iterating for FileName = DSC04139.JPG
Iterating for FileName = DSC03305.JPG
Iterating for FileName = DSC02586.JPG
Iterating for FileName = DSC04146.JPG
Iterating for FileName = DSC04131.JPG
Iterating for FileName = DSC02359.JPG
Iterating for FileName = P090831125.jpg
Iterating for FileName = DSC03980.JPG
Iterating for FileName = DSC02850.JPG
Iterating for F

Iterating for FileName = IMG_2618.JPG
Iterating for FileName = P090911050.jpg
Iterating for FileName = DSC02397.JPG
Iterating for FileName = P090831023.jpg
Iterating for FileName = P090903009.jpg
Iterating for FileName = P090903041.jpg
Iterating for FileName = DSC03396.JPG
Iterating for FileName = DSC03135.JPG
Iterating for FileName = DSC03652.JPG
Iterating for FileName = P090831045.jpg
Iterating for FileName = DSC03125.JPG
Iterating for FileName = DSC02505.JPG
Iterating for FileName = P1010019.JPG
Iterating for FileName = DSC02755.JPG
Iterating for FileName = DSC03169.JPG
Iterating for FileName = P090831075.jpg
Iterating for FileName = DSC02571.JPG
Iterating for FileName = DSC03759.JPG
Iterating for FileName = DSC03317.JPG
Iterating for FileName = DSC03329.JPG
Iterating for FileName = DSC03179.JPG
Iterating for FileName = v.jpg
Iterating for FileName = 45.JPG
Iterating for FileName = P090912071.jpg
Iterating for FileName = DSC03413.JPG
Iterating for FileName = P1010088.JPG
Iterating f

In [6]:
display(df)

Unnamed: 0,score,status,matched,time
0,0.600000,0,True,0.963695
1,1.000000,0,True,1.655070
2,1.000000,0,True,1.362367
3,0.818182,0,True,1.315121
4,0.607143,0,True,1.341273
...,...,...,...,...
1175,0.600000,0,True,1.302501
1176,0.631579,0,True,0.972082
1177,0.800000,0,True,1.268586
1178,0.400000,0,True,1.302634
