In [1]:
# Run configuration for this notebook, gathered in one dict.
args = {
    # Sheet name looked up inside the companies workbook.
    'country': 'Croatia',
    # Service-account JSON used for the Google Vision client.
    'vision_credential': '/home/connect/prashant/Finland/mysecondvisionproject-4e1c5d1beb85.json',
    # Workbook listing the allowed stage/status values (read without a header row).
    'stage_options_file': "statuses.xlsx",
    # Minimum similarity used when fuzzy-matching OCR text against the known lists.
    'similarity_threshold': 0.7,
    # Workbook with an 'industry' column of allowed industries.
    'industry_options_file': 'industries.xlsx',
    # Workbook with one sheet per country and a 'Company Name' column.
    'companies_file': 'companies.xlsx',
}

In [2]:
'''
How to use it:

python3 finland_ocr.py --images_path images/ --vision_credential vision_api.json --companies_excel_file Finland_companies.xlsx

Assumes all images have the PNG extension.
'''




import cv2
import numpy as np
import pandas as pd
import math
import os
import shutil
import io
from google.cloud import vision
import re
from collections import defaultdict
import string
from fuzzywuzzy import process
import glob
import pickle
import simplediff

def simplediff_score(raw, known):
    '''
    Scores how closely `raw` matches `known` using a simplediff diff.

    The score is (2*same - (additions + deletions)/2) / (len(known) + len(raw)),
    where same/additions/deletions are the total lengths of the "=", "+" and "-"
    chunks returned by simplediff.diff(known, raw). Identical inputs score 1.0;
    inputs with nothing in common score -0.5.

    Two empty inputs are treated as identical and score 1.0 (the original
    expression divided by zero in that case).
    '''
    total_len = len(known) + len(raw)
    if total_len == 0:
        # Nothing to diff: avoid 0/0 and report a perfect match.
        return 1.0

    diff_result = simplediff.diff(known, raw)
    same = sum(len(chunk[1]) for chunk in diff_result if chunk[0] == "=")
    deletions = sum(len(chunk[1]) for chunk in diff_result if chunk[0] == "-")
    additions = sum(len(chunk[1]) for chunk in diff_result if chunk[0] == "+")

    # Matches count double; every inserted/removed element costs half a point.
    weighted = 2*same - (additions + deletions)/2
    return weighted/total_len

def partial_match(word , possibilities, cut_off=0.5):
    '''
    Finds the entry of `possibilities` that scores highest against `word`
    according to simplediff_score.

    cut_off is the minimum score required to accept the match. Passing 'auto'
    derives it from the word length: 1 - 3/len for words longer than 6,
    1 - 2/len for words longer than 2, otherwise 0.5.

    Returns a (match, score) tuple when the best score reaches cut_off, else
    None. If no candidate scores above 0 the "match" is `word` itself with
    score 0, which is only returned when cut_off <= 0.
    '''
    if cut_off == 'auto':
        n_chars = len(word)
        if n_chars > 6:
            cut_off = 1 - (3/n_chars)
        elif n_chars > 2:
            cut_off = 1 - (2/n_chars)
        else:
            cut_off = 0.5

    best_candidate, best_score = word, 0
    for candidate in possibilities:
        candidate_score = simplediff_score(word, candidate)
        # Strict comparison: on ties the earliest candidate wins.
        if candidate_score > best_score:
            best_candidate, best_score = candidate, candidate_score

    if best_score >= cut_off:
        return best_candidate, best_score
    return None

def get_bin_image(image,th=127, inv=False):
    '''
    Takes an image and returns its binary image.

    image: image array; it is converted with cv2.COLOR_BGR2GRAY when OpenCV
           accepts the conversion, otherwise it is thresholded as given.
    th:    threshold passed to cv2.threshold; the max value is 255.
    inv:   when True use cv2.THRESH_BINARY_INV instead of cv2.THRESH_BINARY.
    '''
    try:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    except cv2.error:
        # OpenCV rejected the colour conversion (e.g. the input is already
        # single-channel); threshold the input unchanged. Only OpenCV errors
        # are tolerated here so unrelated failures are no longer hidden.
        pass

    threshold_type = cv2.THRESH_BINARY_INV if inv else cv2.THRESH_BINARY
    _, bin_image = cv2.threshold(image, th, 255, threshold_type)
    return bin_image


def white_percentage(bin_image, mode):
    '''
    Takes a binary image i.e. {0,255} pixel values and returns the percentage
    of white pixels per row or per column.

    bin_image: 2-D array (its shape is unpacked as (h, w)).
    mode:      'horizontal' -> one percentage per row (length h);
               'vertical'   -> one percentage per column (length w).

    For any other mode an error message is printed and -1 is returned.
    '''
    # Scale {0,255} to {0,1} so that sums count white pixels.
    bin_image = bin_image.astype('int64')/255
    h,w = bin_image.shape

    if mode == 'horizontal':
        white_per_row = bin_image.sum(axis=1)
        return white_per_row/w*100

    elif mode == 'vertical':
        white_per_col = bin_image.sum(axis=0)
        return (white_per_col/h)*100

    else:
        print('Error: mode argument is missing')
        return -1

def _pick_band_line(band, ignore_rate, ignore_rate_step, which):
    '''Returns (line_number, threshold) for one band, or None if no line of the band is at or below ignore_rate.'''
    for th in np.arange(0, ignore_rate+ignore_rate_step, ignore_rate_step):
        th = round(th,1)
        line_list = [entry[0] for entry in band if entry[1] <= th]
        if line_list:
            if which == 'first':
                res = line_list[0]
            elif which == 'last':
                res = line_list[-1]
            else:
                res = line_list[math.floor(len(line_list)/2)]
            return (res, th)
    return None


def get_band_lines(img_lines, min_band_sep=1, ignore_rate=0, ignore_rate_step=0.1, which='mid'):
    '''
    Collapses runs of nearby lines ("bands") into a single representative line.

    img_lines:        list of (line_number, percentage_value) tuples, ordered by
                      line_number.
    min_band_sep:     a line belongs to the current band when its distance to
                      the band's last line is <= min_band_sep.
    ignore_rate,
    ignore_rate_step: thresholds 0, step, 2*step, ... up to ignore_rate (rounded
                      to one decimal) are tried in order; the first threshold at
                      which the band has lines with percentage <= threshold is
                      used.
    which:            'first', 'last' or anything else for the middle line among
                      the lines that met the threshold.

    Returns a list of (line_number, threshold_category) tuples, one per band
    that has a qualifying line.

    A band without any qualifying line is skipped. Previously such a band was
    never cleared, so every line after it was silently discarded.
    '''
    band = []
    res_list = []

    for line in img_lines:
        if band and line[0]-band[-1][0] > min_band_sep:
            # Gap found: close the current band and start a new one.
            picked = _pick_band_line(band, ignore_rate, ignore_rate_step, which)
            if picked is not None:
                res_list.append(picked)
            band = []
        band.append(line)

    # Close the trailing band (no-op for empty input).
    picked = _pick_band_line(band, ignore_rate, ignore_rate_step, which)
    if picked is not None:
        res_list.append(picked)

    return res_list

def google_hit(img_file, auth_key, api_type):
    '''
    Sends an image file to the Google Cloud Vision API and returns the
    full_text_annotation of the response.

    img_file: path of the image to read.
    auth_key: credentials path, exported as GOOGLE_APPLICATION_CREDENTIALS
              before the client is created.
    api_type: 'document_text_detection' or 'text_detection'; for any other
              value an explanatory string is returned instead of a document.
    '''
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = auth_key
    client = vision.ImageAnnotatorClient()

    with io.open(img_file, 'rb') as fh:
        image = vision.types.Image(content=fh.read())

    if api_type == 'document_text_detection':
        response = client.document_text_detection(
            image=image,
            image_context={"language_hints": ["en"]},
        )
    elif api_type == 'text_detection':
        response = client.text_detection(
            image=image,
            image_context={"language_hints": ["en"]},
        )
    else:
        return 'Please mention the valid api_type'

    return response.full_text_annotation

  
def find_word_location(document,word_to_find):
    '''
    Returns the bounding-box vertices of every word in `document` whose
    assembled text equals `word_to_find`, ignoring case.

    The document is walked as pages -> blocks -> paragraphs -> words.
    '''
    return [
        word.bounding_box.vertices
        for page in document.pages
        for block in page.blocks
        for paragraph in block.paragraphs
        for word in paragraph.words
        if assemble_word(word).lower() == word_to_find.lower()
    ]

def assemble_word(word):
    '''Concatenates the text of every symbol in `word.symbols`, in order.'''
    return "".join(symbol.text for symbol in word.symbols)


def text_within(document,bbox):
    '''
    Finds the text that lies completely inside a rectangle.

    document: OCR result walked as pages -> blocks -> paragraphs -> words ->
              symbols; each symbol provides bounding_box.vertices, text and
              property.detected_break.type.
    bbox:     (x1, y1, x2, y2); a symbol is kept only if its whole bounding box
              is inside this rectangle (edges inclusive).

    Symbols are grouped into rows by their top y coordinate: a row starts at
    the first unused y and takes every symbol whose top y is within `tol`
    pixels below it. Rows are emitted top to bottom, symbols left to right.

    Returns the concatenated text, stripped; '' when nothing is inside.

    The first row is now always emitted. Previously the row tracker started at
    -y_list[0], which dropped the first row whenever its top y was <= 5.
    '''
    x1,y1,x2,y2 = bbox

    symbols = []
    for page in document.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    for symbol in word.symbols:
                        xs = [vertex.x for vertex in symbol.bounding_box.vertices]
                        ys = [vertex.y for vertex in symbol.bounding_box.vertices]
                        min_x, max_x = min(xs), max(xs)
                        min_y, max_y = min(ys), max(ys)
                        if min_x >= x1 and max_x <= x2 and min_y >= y1 and max_y <= y2:
                            symbols.append([symbol,min_x,min_y,max_x,max_y])

    if not symbols:
        return ''

    # Columns: 0=symbol, 1=min_x, 2=min_y, 3=max_x, 4=max_y.
    df = pd.DataFrame(symbols)
    df[2] = df[2].astype(int)
    df = df.sort_values([2,1])

    tol = 10  # vertical tolerance (pixels) for symbols to share a row
    lines = ''
    row_start = None  # top y of the most recently emitted row
    for y in sorted(df[2].unique()):
        if row_start is not None and y <= row_start+tol:
            # Already covered by the previous row's tolerance window.
            continue
        row_start = y
        row = df[(df[2]>=y)&(df[2]<=y+tol)].sort_values(1)

        text = ""
        for symbol in row[0]:
            text += symbol.text
            # Break codes follow the Vision API BreakType enum:
            # 1=SPACE, 2=SURE_SPACE, 3=EOL_SURE_SPACE, 5=LINE_BREAK.
            break_type = symbol.property.detected_break.type
            if break_type == 1 or break_type == 3:
                text += ' '
            if break_type == 2:
                text += '\t'
            if break_type == 5:
                text += '\n'
        lines = lines+text
    return lines.strip()


# Characters that OCR commonly confuses with digits, mapped to the digit meant.
_OCR_DIGIT_LOOKALIKES = str.maketrans({
    'D': '0', 'o': '0', 'O': '0',
    'i': '1', 'I': '1', 'J': '1', 'l': '1', 'L': '1', '!': '1', 'r': '1',
    's': '5', 'S': '5',
    'G': '6',
    'Y': '7', 'y': '7',
    'g': '9',
})


def convert_to_int(s):
    '''
    Extracts the digits of an OCR'd number as a string.

    Look-alike characters (e.g. 'O' -> '0', 'l' -> '1', 'S' -> '5') are first
    replaced by the digit they resemble; every remaining non-digit character
    (punctuation, separators, other letters) is then dropped.

    Returns a str of digits (possibly empty) -- not an int, despite the name --
    or np.nan when `s` is not a string.

    '!' is now really mapped to '1'. Previously it was stripped as punctuation
    before the look-alike table was applied, so that mapping never took effect.
    '''
    if not isinstance(s, str):
        return np.nan
    return ''.join(ch for ch in s.translate(_OCR_DIGIT_LOOKALIKES) if ch.isdigit())

def _fuzzy_resolve(text, choices, similarity_threshold):
    '''Returns (text replaced by its best fuzzy match if one reaches the threshold, problem flag as 'True'/'False').'''
    best = process.extractOne(text, choices, score_cutoff=similarity_threshold)
    similarity = 0
    if best is not None and len(best) > 0:
        text = best[0]
        similarity = best[1]
    # The flag is 'True' when no choice reached the threshold.
    return text, str(similarity < similarity_threshold)


def get_df(hr_img_lines,vr_img_lines,document,all_companies, all_stages, all_industries, similarity_threshold):
    '''
    Builds a DataFrame from the table cells delimited by the detected lines.

    hr_img_lines / vr_img_lines: sequences whose items carry the line position
        at index 0; consecutive pairs delimit the table rows / columns.
    document: OCR result passed to text_within for each cell.
    all_companies, all_stages, all_industries: known values that the text of
        table columns 0, 4 and 1 respectively is fuzzy-matched against.
    similarity_threshold: score_cutoff for the fuzzy match; a cell is flagged
        as a problem when no known value reaches it.

    Per table column x:
      0       -> 'Index' (digits of the first token), 'problem_company',
                 'company_name' (remaining tokens without 'x'/'X' and without a
                 leading '0'/'O'); all three are NaN for an empty cell
      1       -> 'problem_industry', 'industry'
      2       -> digits only
      4       -> 'problem_stage', 'stage'
      5, 6, 7 -> '.' treated as ',', each comma-separated part reduced to digits
      others  -> raw cell text
    '''
    rows = []
    for y in range(len(hr_img_lines)-1):
        cols = []
        for x in range(len(vr_img_lines)-1):
            cell_bbox = [vr_img_lines[x][0], hr_img_lines[y][0], vr_img_lines[x+1][0], hr_img_lines[y+1][0]]
            text = text_within(document, cell_bbox)

            if x == 0:
                text = text.strip()
                if len(text) > 0:
                    tokens = text.split()
                    cols.append(convert_to_int(tokens[0]))
                    text = ' '.join(tok for tok in tokens[1:] if tok not in ('x', 'X'))
                    if len(text) > 0 and text[0] in ('0', 'O'):
                        text = text[1:]
                    text, problem = _fuzzy_resolve(text, all_companies, similarity_threshold)
                    cols.append(problem)
                else:
                    cols.append(np.nan)
                    cols.append(np.nan)
                    text = np.nan
            elif x == 1:
                text, problem = _fuzzy_resolve(text, all_industries, similarity_threshold)
                cols.append(problem)
            elif x == 2:
                text = convert_to_int(text)
            elif x == 4:
                text, problem = _fuzzy_resolve(text, all_stages, similarity_threshold)
                cols.append(problem)
            elif x in (5, 6, 7):
                parts = text.replace('.', ',').split(',')
                text = ','.join(convert_to_int(part) for part in parts)

            cols.append(text)
        rows.append(cols)

    df = pd.DataFrame(rows)
    df = df.rename(columns={0:'Index',1:'problem_company',2:'company_name',3:'problem_industry',4:'industry',7:'problem_stage',8:'stage'})
    return df



In [13]:
def _load_or_fetch_annotation(image_path, cache_path, api_type):
    '''Returns the cached OCR annotation at cache_path, calling the Vision API (and writing the cache) only when it is missing.'''
    if os.path.exists(cache_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(cache_path,'rb') as f:
            return pickle.load(f)
    document = google_hit(image_path, args['vision_credential'], api_type=api_type)
    print('api hit')
    with open(cache_path,'wb') as f:
        pickle.dump(document,f)
    return document


# Known values that OCR text is fuzzy-matched against.
all_companies = list(pd.read_excel(args['companies_file'],sheet_name=args['country'])['Company Name'])
all_stages = list(pd.read_excel(args['stage_options_file'],header=None)[0])
all_industries = list(pd.read_excel(args['industry_options_file'])['industry'])

# fuzzywuzzy scores (and score_cutoff) are on a 0-100 scale, so a fractional
# threshold such as 0.7 is scaled to 70. The previous int() cast truncated 0.7
# to 0, which accepted every match and never raised a problem flag.
similarity_threshold = float(args['similarity_threshold'])
if similarity_threshold <= 1:
    similarity_threshold = similarity_threshold*100
similarity_threshold = int(round(similarity_threshold))

img_path = '/home/connect/prashant/Finland/Croatia/Croatia 1-21.PNG'
print('Processing '+img_path)

# Cache files live next to the image; splitext only strips the extension, so
# dots elsewhere in the path no longer truncate the cache name.
img_stem = os.path.splitext(img_path)[0]

document1_path = img_stem+'_document_td.pickle'
document1 = _load_or_fetch_annotation(img_path, document1_path, 'document_text_detection')

document2_path = img_stem+'_td.pickle'
document2 = _load_or_fetch_annotation(img_path, document2_path, 'text_detection')

Processing /home/connect/prashant/Finland/Croatia/Croatia 1-21.PNG


In [None]:
def _detect_grid_lines(bin_img, mode, extent):
    '''
    Returns (histogram, lines) for one direction of the table grid.

    Positions whose white percentage is <= 5 are candidates; get_band_lines
    reduces each run of candidates to one line. The first and last detected
    lines are then replaced by the image borders 0 and extent-1.
    '''
    hist = white_percentage(bin_img, mode=mode)
    candidates = [(pos, hist[pos]) for pos in range(extent) if hist[pos] <= 5]
    lines = get_band_lines(candidates, min_band_sep=5, ignore_rate=5, ignore_rate_step=0.1, which='mid')
    if not lines:
        raise ValueError('No '+mode+' grid lines detected in the image')
    lines[0] = [0,0]
    lines[-1] = [extent-1,0]
    return hist, lines


img = cv2.imread(img_path)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError('Could not read image: '+img_path)
H,W = img.shape[:2]

im = get_bin_image(img.copy(), th=220)
hr_hist, hr_img_lines = _detect_grid_lines(im, 'horizontal', H)
vr_hist, vr_img_lines = _detect_grid_lines(im, 'vertical', W)
# print(len(vr_img_lines),len(hr_img_lines))

# Build the table from both OCR results; cells that are empty in the
# document_text_detection table are filled from the text_detection table.
df1 = get_df(hr_img_lines, vr_img_lines, document1, all_companies, all_stages, all_industries, similarity_threshold)
df2 = get_df(hr_img_lines, vr_img_lines, document2, all_companies, all_stages, all_industries, similarity_threshold)

df1 = df1.replace('',np.nan).fillna(df2)



In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.figure(figsize=(30,30))
plt.title('original image',size=40)
plt.imshow(img)
for i in hr_img_lines:
    cv2.line(img,(0,i[0]),(W,i[0]),(0,255,0),2)
    
for i in vr_img_lines:
    cv2.line(img,(i[0],0),(i[0],H),(0,0,255),2)
plt.figure(figsize=(30,30))
plt.title('image after tabular cells detection',size=40)
plt.imshow(img)

In [None]:
# Display the table built from document1, with its empty cells filled from the document2 table.
df1

### archives