In [1]:
import io
import re
import numpy as np
import os
import xml.etree.ElementTree as ET 
from joblib import Parallel, delayed
import multiprocessing
from utilities.score_computors import jaccard_similarity
from time import time
import pandas as pd
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import TextOperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import TextRecognitionMode
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
import requests

In [2]:
def extract_clean_str(original_str):
    return re.sub('[^a-zA-Z0-9]+', ' ', original_str, flags=re.UNICODE).lower().strip().replace(' ', '')

def get_mapped(path):
    key_map, dict_idx = {}, 0
    for r, _, f in os.walk(dir_path):
        for file in f:
            if '.JPG' in file or '.jpg' in file:
                key_map[dict_idx] = file
                dict_idx += 1
    idxs = list(key_map.keys())          
    shuffled_idxs = np.random.randint(0, len(idxs), len(idxs))
    return key_map, shuffled_idxs

def fetch_from_xml(path, file):
    tree = ET.parse(os.path.join(path, file)).getroot()  
    return extract_clean_str(''.join([e.attrib['char'] for e in tree.iter(tag='character')]))

In [3]:
def compute_text_recognition(file, cutoff=.25):
    print ("Iterating for FileName = {}".format(file))
    status, score = -1, 0
    xml_file = file[:len(file)-4] + '.xml' 
    if not os.path.exists(os.path.join(dir_path, xml_file)):
        return False, status, score
    try:
        ground_truth_str = fetch_from_xml(dir_path, xml_file)
    except: 
        return False, -2, score
    if len(ground_truth_str) == 0:
        return False, status, score
    try:
        with io.open(os.path.join(dir_path, file), 'rb') as image_file:
            content = (image_file.read())
        subscription_key = os.environ['COMPUTER_VISION_SUBSCRIPTION_KEY']
        endpoint = os.environ['COMPUTER_VISION_ENDPOINT']
        ocr_url = endpoint + "vision/v2.1/ocr"
        headers = {'Ocp-Apim-Subscription-Key': subscription_key, 'Content-Type': 'application/octet-stream'}
        params = {'detectOrientation': 'true'} #'language': 'en',
        response = requests.post(ocr_url, headers=headers, params=params, data=content)
        response.raise_for_status()
        
        lines_info = [region["lines"] for region in response.json()["regions"]]
        detected_str = extract_clean_str(''.join([word_info['text'] for line in lines_info \
                                  for word_metadata in line for word_info in word_metadata["words"]]))
    except:
        return False, 100, score
    score = jaccard_similarity(detected_str, ground_truth_str)
    if score > cutoff:
        return True, 0, score
    else:
        return False, 0, score

In [4]:
dir_path = 'datasets/KAIST/English/'
dict_files, files_idx = get_mapped(dir_path)
shuffled_idx = np.random.randint(0, len(files_idx), len(files_idx))

In [5]:
df = pd.DataFrame(columns=['score', 'status', 'matched', 'time'])
for idx in shuffled_idx:
    start_time = time()
    matched, status, score = compute_text_recognition(dict_files.get(idx))
    end_time = time()
    df = df.append({'score': score, 'status': status, 'matched': matched, 'time': (end_time-start_time)}, ignore_index=True)

Iterating for FileName = DSC02673.JPG
Iterating for FileName = P090902019.jpg
Iterating for FileName = DSC03196.JPG
Iterating for FileName = 2007-12-09-day-exterior-051.jpg
Iterating for FileName = P1010065.JPG
Iterating for FileName = DSC03308.JPG
Iterating for FileName = DSC02376.JPG
Iterating for FileName = DSC03670.JPG
Iterating for FileName = DSC04451.JPG
Iterating for FileName = DSC03440.JPG
Iterating for FileName = DSC04087.JPG
Iterating for FileName = P090911060.jpg
Iterating for FileName = DSC04458.JPG
Iterating for FileName = 20.JPG
Iterating for FileName = DSC04197.JPG
Iterating for FileName = DSC02897.JPG
Iterating for FileName = DSC03822.JPG
Iterating for FileName = DSC03091.JPG
Iterating for FileName = 080119-0038.jpg
Iterating for FileName = DSC03918.JPG
Iterating for FileName = DSC02382.JPG
Iterating for FileName = DSC02590.JPG
Iterating for FileName = DSC04364.JPG
Iterating for FileName = DSC03316.JPG
Iterating for FileName = IMG_2618.JPG
Iterating for FileName = DSC02

Iterating for FileName = DSC03501.JPG
Iterating for FileName = DSC02969.JPG
Iterating for FileName = P090831121.jpg
Iterating for FileName = P090903024.jpg
Iterating for FileName = DSC03911.JPG
Iterating for FileName = P090902016.jpg
Iterating for FileName = 7.jpg
Iterating for FileName = P1010025.JPG
Iterating for FileName = P090902019.jpg
Iterating for FileName = 032.jpg
Iterating for FileName = P090911083.jpg
Iterating for FileName = P090831011.jpg
Iterating for FileName = DSC03150.JPG
Iterating for FileName = DSC04063.JPG
Iterating for FileName = DSC04314.JPG
Iterating for FileName = DSC03003.JPG
Iterating for FileName = DSC03532.JPG
Iterating for FileName = DSC03722.JPG
Iterating for FileName = DSC04401.JPG
Iterating for FileName = P1010021.JPG
Iterating for FileName = 139.JPG
Iterating for FileName = P090903081.jpg
Iterating for FileName = DSC03786.JPG
Iterating for FileName = DSC03609.JPG
Iterating for FileName = P090912078.jpg
Iterating for FileName = P090911071.jpg
Iterating f

Iterating for FileName = P090903022.jpg
Iterating for FileName = 047.jpg
Iterating for FileName = DSC04070.JPG
Iterating for FileName = 45.JPG
Iterating for FileName = P090831037.jpg
Iterating for FileName = 080119-0033.jpg
Iterating for FileName = DSC04030.JPG
Iterating for FileName = DSC03199.JPG
Iterating for FileName = 080116-0059.jpg
Iterating for FileName = DSC02418.JPG
Iterating for FileName = DSC02389.JPG
Iterating for FileName = DSC04160.JPG
Iterating for FileName = DSC04451.JPG
Iterating for FileName = 142.JPG
Iterating for FileName = 080119-0038.jpg
Iterating for FileName = DSC03871.JPG
Iterating for FileName = DSC03176.JPG
Iterating for FileName = DSC03083.JPG
Iterating for FileName = DSC02917.JPG
Iterating for FileName = DSC02382.JPG
Iterating for FileName = DSC04338.JPG
Iterating for FileName = DSC03165.JPG
Iterating for FileName = P1010020.JPG
Iterating for FileName = DSC03652.JPG
Iterating for FileName = DSC02711.JPG
Iterating for FileName = DSC03440.JPG
Iterating for F

Iterating for FileName = DSC03047.JPG
Iterating for FileName = DSC03046.JPG
Iterating for FileName = DSC02859.JPG
Iterating for FileName = DSC02436.JPG
Iterating for FileName = DSC03820.JPG
Iterating for FileName = DSC02746.JPG
Iterating for FileName = DSC02379.JPG
Iterating for FileName = DSC03556.JPG
Iterating for FileName = DSC03398.JPG
Iterating for FileName = P090831042.jpg
Iterating for FileName = DSC04256.JPG
Iterating for FileName = DSC03129.JPG
Iterating for FileName = DSC02394.JPG
Iterating for FileName = DSC03723.JPG
Iterating for FileName = DSC04029.JPG
Iterating for FileName = DSC03285.JPG
Iterating for FileName = P1010021.JPG
Iterating for FileName = DSC03255.JPG
Iterating for FileName = P1010052.JPG
Iterating for FileName = DSC03711.JPG
Iterating for FileName = DSC02427.JPG
Iterating for FileName = P090911016.jpg
Iterating for FileName = DSC03664.JPG
Iterating for FileName = DSC04233.JPG
Iterating for FileName = DSC04120.JPG
Iterating for FileName = DSC04401.JPG
Iteratin

Iterating for FileName = DSC03867.JPG
Iterating for FileName = DSC04200.JPG
Iterating for FileName = DSC04465.JPG
Iterating for FileName = P1010155.JPG
Iterating for FileName = DSC03719.JPG
Iterating for FileName = DSC03118.JPG
Iterating for FileName = 055.JPG
Iterating for FileName = DSC03909.JPG
Iterating for FileName = 097.jpg
Iterating for FileName = 7.jpg
Iterating for FileName = DSC03737.JPG
Iterating for FileName = DSC03478.JPG
Iterating for FileName = DSC02673.JPG
Iterating for FileName = DSC02676.JPG
Iterating for FileName = DSC03947.JPG
Iterating for FileName = DSC02954.JPG
Iterating for FileName = 20.JPG
Iterating for FileName = DSC03015.JPG
Iterating for FileName = DSC03651.JPG
Iterating for FileName = DSC03483.JPG
Iterating for FileName = P090831012.jpg
Iterating for FileName = P090831122.jpg
Iterating for FileName = DSC03245.JPG
Iterating for FileName = DSC03483.JPG
Iterating for FileName = DSC03212.JPG
Iterating for FileName = P090904004.jpg
Iterating for FileName = DSC0

Iterating for FileName = DSC02724.JPG
Iterating for FileName = P090912059.jpg
Iterating for FileName = 054.JPG
Iterating for FileName = 108.JPG
Iterating for FileName = 080116-0057.jpg
Iterating for FileName = DSC02859.JPG
Iterating for FileName = DSC02639.JPG
Iterating for FileName = DSC04098.JPG
Iterating for FileName = DSC03662.JPG
Iterating for FileName = DSC03031.JPG
Iterating for FileName = DSC03038.JPG
Iterating for FileName = DSC02610.JPG
Iterating for FileName = 104.jpg
Iterating for FileName = DSC03998.JPG
Iterating for FileName = P1010094.JPG
Iterating for FileName = P1010090.JPG
Iterating for FileName = DSC03842.JPG
Iterating for FileName = P090912047.jpg
Iterating for FileName = P1010138.JPG
Iterating for FileName = DSC02928.JPG
Iterating for FileName = DSC03732.JPG
Iterating for FileName = DSC04330.JPG
Iterating for FileName = DSC03637.JPG
Iterating for FileName = 110.jpg
Iterating for FileName = DSC03199.JPG
Iterating for FileName = DSC04048.JPG
Iterating for FileName = 

In [6]:
display(df)

Unnamed: 0,score,status,matched,time
0,0,0,False,0.193809
1,0.631579,0,True,0.124048
2,0.8,0,True,0.175187
3,0.354839,0,True,0.237938
4,0.461538,0,True,0.145799
...,...,...,...,...
1175,0.785714,0,True,0.264928
1176,0.692308,0,True,0.129097
1177,0,-1,False,0.000843
1178,0,0,False,0.166199
