In [1]:
import boto3
import io
import re
import numpy as np
import os
import xml.etree.ElementTree as ET 
from google.cloud import vision
import requests
from time import time
from utilities.score_computors import jaccard_similarity
from utilities.pythonDB import writeToDB, recordsExists

In [3]:
class RegonizeText():
    def __init__(self, base, file): 
        with io.open(os.path.join(base, file), 'rb') as image_file:
            content = (image_file.read())
        self.content = content
        self.base, self.file, self.actual_str, self.detected_str = base, file, '', ''
        self.label = ''
        self.start = time()
        
    def return_function(self, name):
        dataset = dir_path.split('/')[1] 
        if recordsExists(self.file, dataset, name + '-OCR'):
            return
        
        self.label = name + '-OCR'
        status, score =  getattr(self, 'if_' + name)()
        compute_time = time() - self.start
        bag = (self.file, dataset, self.label, status, score, compute_time, self.actual_str, self.detected_str)
        writeToDB(bag)
    def compute_ground_truth(self):
        ''' 
            Status = {-1: 'XML File Missing', -2: 'Error in XML', -3: 'Ground Truth Empty', -4: 'API Error', 0: 'All Correct'} 
            Return = Ground Truth, Status
        
        '''
        xml_file = self.file[:len(self.file)-4] + '.xml' 
        
        if not os.path.exists(os.path.join(self.base, self.file)):
            return '', -1
        try:
            ground_truth_str = fetch_from_xml_natocr(dir_path, xml_file)
        except: 
            return '', -2
        
        if len(ground_truth_str) == 0:
            return '', -3
        else:
            self.actual_str = ground_truth_str
            return ground_truth_str, 0

    def if_aws(self):
        ground_truth, xml_status = self.compute_ground_truth()
        if xml_status != 0: return xml_status, 0
        
        try:
            imgobj = {'Bytes': self.content}
            client=boto3.client('rekognition', region_name='us-east-1')
            response=client.detect_text(Image=imgobj)
            detected_str = extract_clean_str(''.join([txt['DetectedText'] for txt in response['TextDetections'] \
                                                   if txt['Type']=='WORD']))
            self.detected_str = detected_str
            score = jaccard_similarity(detected_str, ground_truth)
            return 0, score
        except:
            return -4, 0

    def if_gc(self):
        ground_truth, xml_status = self.compute_ground_truth()
        if xml_status != 0: return xml_status, 0
        try:
            vision_client = vision.ImageAnnotatorClient()
            image = vision.types.Image(content=self.content)
            text_detection_response = vision_client.text_detection(image=image)
            detected_str = extract_clean_str(text_detection_response.full_text_annotation.text)
            self.detected_str = detected_str
            score = jaccard_similarity(detected_str, ground_truth)
            return 0, score
        except:
            return -4, 0
    
    def if_azure(self):
        ground_truth, xml_status = self.compute_ground_truth()
        if xml_status != 0: return xml_status, 0
        try:
            subscription_key, endpoint = os.environ['COMPUTER_VISION_SUBSCRIPTION_KEY'], os.environ['COMPUTER_VISION_ENDPOINT']
            ocr_url = endpoint + "vision/v2.1/ocr"
            headers = {'Ocp-Apim-Subscription-Key': subscription_key, 'Content-Type': 'application/octet-stream'}
            params = {'detectOrientation': 'true'}
            response = requests.post(ocr_url, headers=headers, params=params, data=self.content)
            response.raise_for_status()

            lines_info = [region["lines"] for region in response.json()["regions"]]
            detected_str = extract_clean_str(''.join([word_info['text'] for line in lines_info \
                                      for word_metadata in line for word_info in word_metadata["words"]]))
            self.detected_str = detected_str
            score = jaccard_similarity(detected_str, ground_truth)
            return 0, score
        except:
            return -4, 0

In [4]:
dir_path = 'datasets/neocr_dataset/OCR/'
dict_files, files_idx = get_mapped(dir_path)
shuffled_idx = np.random.randint(0, len(files_idx), len(files_idx))

In [5]:
for idx in shuffled_idx:
    file_name = dict_files.get(idx)
    print ("Iterating for File Name = {}".format(file_name))
    RegonizeText(dir_path, file_name).return_function('azure')

Iterating for File Name = img_431882674.jpg
Iterating for File Name = img_368342066.jpg
Iterating for File Name = img_836067259.jpg
Iterating for File Name = img_238034465.jpg
Iterating for File Name = img_383306977.jpg
Iterating for File Name = img_535474250.jpg
Iterating for File Name = img_720201586.jpg
Iterating for File Name = img_830799757.jpg
Iterating for File Name = img_575687816.jpg
Iterating for File Name = img_730544651.jpg
Iterating for File Name = img_124983963.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_448918739.jpg
Iterating for File Name = img_198896356.jpg
Iterating for File Name = img_697503541.jpg
Iterating for File Name = img_238034465.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_567666779.jpg
Iterating for File Name = img_388759127.jpg
Iterating for File Name = img_906238596.jpg
Iterating for File Name = img_66151835.jpg
Iterating for File Name = img_639884818.jpg
Iterating for File Name = img_741

Iterating for File Name = img_219187814.jpg
Iterating for File Name = img_286093498.jpg
Iterating for File Name = img_466999428.jpg
Iterating for File Name = img_861711709.jpg
Iterating for File Name = img_140521666.jpg
Iterating for File Name = img_603325961.jpg
Iterating for File Name = img_125727917.jpg
Iterating for File Name = img_669343765.jpg
Iterating for File Name = img_939464754.jpg
Iterating for File Name = img_368272591.jpg
Iterating for File Name = img_465847322.jpg
Iterating for File Name = img_535474250.jpg
Iterating for File Name = img_592574387.jpg
Iterating for File Name = img_596597607.jpg
Iterating for File Name = img_896861793.jpg
Iterating for File Name = img_271353050.jpg
Iterating for File Name = img_438919089.jpg
Iterating for File Name = img_227100705.jpg
Iterating for File Name = img_254077972.jpg
Iterating for File Name = img_62798522.jpg
Iterating for File Name = img_265545352.jpg
Iterating for File Name = img_54991.jpg
Iterating for File Name = img_1391304

Iterating for File Name = img_578831261.jpg
Iterating for File Name = img_797153501.jpg
Iterating for File Name = img_377854318.jpg
Iterating for File Name = img_524532062.jpg
Iterating for File Name = img_625968571.jpg
Iterating for File Name = img_781379848.jpg
Iterating for File Name = img_816734020.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_175166697.jpg
Iterating for File Name = img_926016157.jpg
Iterating for File Name = img_25740525.jpg
Iterating for File Name = img_778299687.jpg
Iterating for File Name = img_70249305.jpg
Iterating for File Name = img_674039513.jpg
Iterating for File Name = img_416780208.jpg
Iterating for File Name = img_657467816.jpg
Iterating for File Name = img_966193527.jpg
Iterating for File Name = img_475356249.jpg
Iterating for File Name = img_475356249.jpg
Iterating for File Name = img_291308882.jpg
Iterating for File Name = img_670952987.jpg
Iterating for File Name = img_320207031.jpg
Iterating for File Name = img_2798

Iterating for File Name = img_42324802.jpg
Iterating for File Name = img_109844195.jpg
Iterating for File Name = img_577482077.jpg
Iterating for File Name = img_541742847.jpg
Iterating for File Name = img_828399006.jpg
Iterating for File Name = img_281540538.jpg
Iterating for File Name = img_365483720.jpg
Iterating for File Name = img_787284418.jpg
Iterating for File Name = img_148261129.jpg
Iterating for File Name = img_792313480.jpg
Iterating for File Name = img_547819252.jpg
Iterating for File Name = img_810996216.jpg
Iterating for File Name = img_953272397.jpg
Iterating for File Name = img_896592785.jpg
Iterating for File Name = img_686460599.jpg
Iterating for File Name = img_662544171.jpg
Iterating for File Name = img_295496652.jpg
Iterating for File Name = img_778143575.jpg
Iterating for File Name = img_325299426.jpg
Iterating for File Name = img_31235891.jpg
Iterating for File Name = img_572177224.jpg
Iterating for File Name = img_890618205.jpg
Iterating for File Name = img_3887

In [6]:
for idx in shuffled_idx:
    file_name = dict_files.get(idx)
    print ("Iterating for File Name = {}".format(file_name))
    RegonizeText(dir_path, file_name).return_function('aws')

Iterating for File Name = img_431882674.jpg
Iterating for File Name = img_368342066.jpg
Iterating for File Name = img_836067259.jpg
Iterating for File Name = img_238034465.jpg
Iterating for File Name = img_383306977.jpg
Iterating for File Name = img_535474250.jpg
Iterating for File Name = img_720201586.jpg
Iterating for File Name = img_830799757.jpg
Iterating for File Name = img_575687816.jpg
Iterating for File Name = img_730544651.jpg
Iterating for File Name = img_124983963.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_448918739.jpg
Iterating for File Name = img_198896356.jpg
Iterating for File Name = img_697503541.jpg
Iterating for File Name = img_238034465.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_567666779.jpg
Iterating for File Name = img_388759127.jpg
Iterating for File Name = img_906238596.jpg
Iterating for File Name = img_66151835.jpg
Iterating for File Name = img_639884818.jpg
Iterating for File Name = img_741

Iterating for File Name = img_219187814.jpg
Iterating for File Name = img_286093498.jpg
Iterating for File Name = img_466999428.jpg
Iterating for File Name = img_861711709.jpg
Iterating for File Name = img_140521666.jpg
Iterating for File Name = img_603325961.jpg
Iterating for File Name = img_125727917.jpg
Iterating for File Name = img_669343765.jpg
Iterating for File Name = img_939464754.jpg
Iterating for File Name = img_368272591.jpg
Iterating for File Name = img_465847322.jpg
Iterating for File Name = img_535474250.jpg
Iterating for File Name = img_592574387.jpg
Iterating for File Name = img_596597607.jpg
Iterating for File Name = img_896861793.jpg
Iterating for File Name = img_271353050.jpg
Iterating for File Name = img_438919089.jpg
Iterating for File Name = img_227100705.jpg
Iterating for File Name = img_254077972.jpg
Iterating for File Name = img_62798522.jpg
Iterating for File Name = img_265545352.jpg
Iterating for File Name = img_54991.jpg
Iterating for File Name = img_1391304

Iterating for File Name = img_578831261.jpg
Iterating for File Name = img_797153501.jpg
Iterating for File Name = img_377854318.jpg
Iterating for File Name = img_524532062.jpg
Iterating for File Name = img_625968571.jpg
Iterating for File Name = img_781379848.jpg
Iterating for File Name = img_816734020.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_175166697.jpg
Iterating for File Name = img_926016157.jpg
Iterating for File Name = img_25740525.jpg
Iterating for File Name = img_778299687.jpg
Iterating for File Name = img_70249305.jpg
Iterating for File Name = img_674039513.jpg
Iterating for File Name = img_416780208.jpg
Iterating for File Name = img_657467816.jpg
Iterating for File Name = img_966193527.jpg
Iterating for File Name = img_475356249.jpg
Iterating for File Name = img_475356249.jpg
Iterating for File Name = img_291308882.jpg
Iterating for File Name = img_670952987.jpg
Iterating for File Name = img_320207031.jpg
Iterating for File Name = img_2798

Iterating for File Name = img_42324802.jpg
Iterating for File Name = img_109844195.jpg
Iterating for File Name = img_577482077.jpg
Iterating for File Name = img_541742847.jpg
Iterating for File Name = img_828399006.jpg
Iterating for File Name = img_281540538.jpg
Iterating for File Name = img_365483720.jpg
Iterating for File Name = img_787284418.jpg
Iterating for File Name = img_148261129.jpg
Iterating for File Name = img_792313480.jpg
Iterating for File Name = img_547819252.jpg
Iterating for File Name = img_810996216.jpg
Iterating for File Name = img_953272397.jpg
Iterating for File Name = img_896592785.jpg
Iterating for File Name = img_686460599.jpg
Iterating for File Name = img_662544171.jpg
Iterating for File Name = img_295496652.jpg
Iterating for File Name = img_778143575.jpg
Iterating for File Name = img_325299426.jpg
Iterating for File Name = img_31235891.jpg
Iterating for File Name = img_572177224.jpg
Iterating for File Name = img_890618205.jpg
Iterating for File Name = img_3887

In [7]:
for idx in shuffled_idx:
    file_name = dict_files.get(idx)
    print ("Iterating for File Name = {}".format(file_name))
    RegonizeText(dir_path, file_name).return_function('gc')

Iterating for File Name = img_431882674.jpg
Iterating for File Name = img_368342066.jpg
Iterating for File Name = img_836067259.jpg
Iterating for File Name = img_238034465.jpg
Iterating for File Name = img_383306977.jpg
Iterating for File Name = img_535474250.jpg
Iterating for File Name = img_720201586.jpg
Iterating for File Name = img_830799757.jpg
Iterating for File Name = img_575687816.jpg
Iterating for File Name = img_730544651.jpg
Iterating for File Name = img_124983963.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_448918739.jpg
Iterating for File Name = img_198896356.jpg
Iterating for File Name = img_697503541.jpg
Iterating for File Name = img_238034465.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_567666779.jpg
Iterating for File Name = img_388759127.jpg
Iterating for File Name = img_906238596.jpg
Iterating for File Name = img_66151835.jpg
Iterating for File Name = img_639884818.jpg
Iterating for File Name = img_741

Iterating for File Name = img_219187814.jpg
Iterating for File Name = img_286093498.jpg
Iterating for File Name = img_466999428.jpg
Iterating for File Name = img_861711709.jpg
Iterating for File Name = img_140521666.jpg
Iterating for File Name = img_603325961.jpg
Iterating for File Name = img_125727917.jpg
Iterating for File Name = img_669343765.jpg
Iterating for File Name = img_939464754.jpg
Iterating for File Name = img_368272591.jpg
Iterating for File Name = img_465847322.jpg
Iterating for File Name = img_535474250.jpg
Iterating for File Name = img_592574387.jpg
Iterating for File Name = img_596597607.jpg
Iterating for File Name = img_896861793.jpg
Iterating for File Name = img_271353050.jpg
Iterating for File Name = img_438919089.jpg
Iterating for File Name = img_227100705.jpg
Iterating for File Name = img_254077972.jpg
Iterating for File Name = img_62798522.jpg
Iterating for File Name = img_265545352.jpg
Iterating for File Name = img_54991.jpg
Iterating for File Name = img_1391304

Iterating for File Name = img_578831261.jpg
Iterating for File Name = img_797153501.jpg
Iterating for File Name = img_377854318.jpg
Iterating for File Name = img_524532062.jpg
Iterating for File Name = img_625968571.jpg
Iterating for File Name = img_781379848.jpg
Iterating for File Name = img_816734020.jpg
Iterating for File Name = img_442936783.jpg
Iterating for File Name = img_175166697.jpg
Iterating for File Name = img_926016157.jpg
Iterating for File Name = img_25740525.jpg
Iterating for File Name = img_778299687.jpg
Iterating for File Name = img_70249305.jpg
Iterating for File Name = img_674039513.jpg
Iterating for File Name = img_416780208.jpg
Iterating for File Name = img_657467816.jpg
Iterating for File Name = img_966193527.jpg
Iterating for File Name = img_475356249.jpg
Iterating for File Name = img_475356249.jpg
Iterating for File Name = img_291308882.jpg
Iterating for File Name = img_670952987.jpg
Iterating for File Name = img_320207031.jpg
Iterating for File Name = img_2798

Iterating for File Name = img_42324802.jpg
Iterating for File Name = img_109844195.jpg
Iterating for File Name = img_577482077.jpg
Iterating for File Name = img_541742847.jpg
Iterating for File Name = img_828399006.jpg
Iterating for File Name = img_281540538.jpg
Iterating for File Name = img_365483720.jpg
Iterating for File Name = img_787284418.jpg
Iterating for File Name = img_148261129.jpg
Iterating for File Name = img_792313480.jpg
Iterating for File Name = img_547819252.jpg
Iterating for File Name = img_810996216.jpg
Iterating for File Name = img_953272397.jpg
Iterating for File Name = img_896592785.jpg
Iterating for File Name = img_686460599.jpg
Iterating for File Name = img_662544171.jpg
Iterating for File Name = img_295496652.jpg
Iterating for File Name = img_778143575.jpg
Iterating for File Name = img_325299426.jpg
Iterating for File Name = img_31235891.jpg
Iterating for File Name = img_572177224.jpg
Iterating for File Name = img_890618205.jpg
Iterating for File Name = img_3887