In [1]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import string
import re
import spacy

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Clean Text Function

In [3]:
def cleanText(txt):
    
    whitespace = string.whitespace
    punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    tableWhitespace = str.maketrans('', '', whitespace)
    tablePunctuation = str.maketrans('', '', punctuation)
    
    text = str(txt)
    text = text.lower()
    removeWhiteSpace = text.translate(tableWhitespace)
    removePunctuation = removeWhiteSpace.translate(tablePunctuation)
    
    return str(removePunctuation)

## Load NER Model

In [4]:
model_ner = spacy.load('./output/model-best')

## Load Image

In [5]:
image = cv2.imread('./data/6.jpg')

cv2.imshow('Business Card', image)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Extract Data using pytesseract

In [6]:
tessData = pytesseract.image_to_data(image)
tessData

'level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n1\t1\t0\t0\t0\t0\t0\t0\t1065\t616\t-1\t\n2\t1\t1\t0\t0\t0\t3\t0\t1062\t5\t-1\t\n3\t1\t1\t1\t0\t0\t3\t0\t1062\t5\t-1\t\n4\t1\t1\t1\t1\t0\t3\t0\t1062\t5\t-1\t\n5\t1\t1\t1\t1\t1\t3\t0\t1062\t5\t95\t \n2\t1\t2\t0\t0\t0\t47\t44\t8\t119\t-1\t\n3\t1\t2\t1\t0\t0\t47\t44\t8\t119\t-1\t\n4\t1\t2\t1\t1\t0\t47\t44\t8\t119\t-1\t\n5\t1\t2\t1\t1\t1\t47\t44\t8\t119\t95\t \n2\t1\t3\t0\t0\t0\t46\t53\t979\t156\t-1\t\n3\t1\t3\t1\t0\t0\t722\t53\t303\t28\t-1\t\n4\t1\t3\t1\t1\t0\t722\t53\t303\t28\t-1\t\n5\t1\t3\t1\t1\t1\t722\t53\t64\t28\t90\tCell\n5\t1\t3\t1\t1\t2\t802\t60\t5\t20\t90\t:\n5\t1\t3\t1\t1\t3\t822\t53\t203\t28\t96\t8099948528\n3\t1\t3\t2\t0\t0\t55\t55\t970\t112\t-1\t\n4\t1\t3\t2\t1\t0\t55\t55\t970\t89\t-1\t\n5\t1\t3\t2\t1\t1\t55\t55\t85\t89\t30\tte)\n5\t1\t3\t2\t1\t2\t822\t71\t203\t65\t96\t8466045457\n4\t1\t3\t2\t2\t0\t593\t136\t432\t31\t-1\t\n5\t1\t3\t2\t2\t1\t593\t136\t93\t25\t89\temail\n5\t1\t3\t2\t2

In [7]:
tessList = list(map(lambda x : x.split('\t'), tessData.split('\n')))
tessList

[['level',
  'page_num',
  'block_num',
  'par_num',
  'line_num',
  'word_num',
  'left',
  'top',
  'width',
  'height',
  'conf',
  'text'],
 ['1', '1', '0', '0', '0', '0', '0', '0', '1065', '616', '-1', ''],
 ['2', '1', '1', '0', '0', '0', '3', '0', '1062', '5', '-1', ''],
 ['3', '1', '1', '1', '0', '0', '3', '0', '1062', '5', '-1', ''],
 ['4', '1', '1', '1', '1', '0', '3', '0', '1062', '5', '-1', ''],
 ['5', '1', '1', '1', '1', '1', '3', '0', '1062', '5', '95', ' '],
 ['2', '1', '2', '0', '0', '0', '47', '44', '8', '119', '-1', ''],
 ['3', '1', '2', '1', '0', '0', '47', '44', '8', '119', '-1', ''],
 ['4', '1', '2', '1', '1', '0', '47', '44', '8', '119', '-1', ''],
 ['5', '1', '2', '1', '1', '1', '47', '44', '8', '119', '95', ' '],
 ['2', '1', '3', '0', '0', '0', '46', '53', '979', '156', '-1', ''],
 ['3', '1', '3', '1', '0', '0', '722', '53', '303', '28', '-1', ''],
 ['4', '1', '3', '1', '1', '0', '722', '53', '303', '28', '-1', ''],
 ['5', '1', '3', '1', '1', '1', '722', '53', '6

In [8]:
df = pd.DataFrame(tessList[1:], columns=tessList[0])
df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
0,1,1,0,0,0,0,0,0,1065,616,-1,
1,2,1,1,0,0,0,3,0,1062,5,-1,
2,3,1,1,1,0,0,3,0,1062,5,-1,
3,4,1,1,1,1,0,3,0,1062,5,-1,
4,5,1,1,1,1,1,3,0,1062,5,95,


In [9]:
# Removing Null values
df.dropna(inplace=True)

In [10]:
# Performing cleantext Function
df['text'] = df['text'].apply(cleanText)

## Convert Data into Context

In [11]:
df_clean = df.query('text != "" ')
content = ' '.join([w for w in df_clean['text']])
print(content)

cell 8099948528 te 8466045457 email lictsrikant@gmail.com life insurance corporation of india seosrika ntht@gmail .com thathineni srikanth insurance advisor agent code no. 0316164y life insurance corporation of india br. off. lic office, trimulgherry, sec’bad - 500 016. add. borabanda, hyderabad - 500 018. lictsrikant8099948528.blogspot.in, interviewsinhyderabad.blogspot.in facebook.com/lictsrikant8099948528, facebook.com/thathineni.srikanth.9 promote your business online pybo


## Get Predictions from NER Model

In [12]:
doc = model_ner(content)

In [13]:
from spacy import displacy

In [14]:
# Serve method to display the output

displacy.serve(doc, style='ent')


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [15]:
# render method to display the output

displacy.render(doc, style='ent')

## Tagging

In [16]:
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [17]:
docjson['text']

'cell 8099948528 te 8466045457 email lictsrikant@gmail.com life insurance corporation of india seosrika ntht@gmail .com thathineni srikanth insurance advisor agent code no. 0316164y life insurance corporation of india br. off. lic office, trimulgherry, sec’bad - 500 016. add. borabanda, hyderabad - 500 018. lictsrikant8099948528.blogspot.in, interviewsinhyderabad.blogspot.in facebook.com/lictsrikant8099948528, facebook.com/thathineni.srikanth.9 promote your business online pybo'

In [18]:
docjson['ents'][0]

{'start': 5, 'end': 15, 'label': 'B-PHONE'}

In [19]:
docjson['tokens'][0]

{'id': 0, 'start': 0, 'end': 4}

In [20]:
# Create Data Frame with TOKENS

datafram_tokens = pd.DataFrame(docjson['tokens'])
datafram_tokens.head()

Unnamed: 0,id,start,end
0,0,0,4
1,1,5,15
2,2,16,18
3,3,19,29
4,4,30,35


In [21]:
doc_text = docjson['text']

In [22]:
# Add TEXT to our Data Frame

datafram_tokens['token'] = datafram_tokens[['start', 'end']].apply(lambda x:doc_text[x[0]: x[1]], axis =1)
datafram_tokens.head()

Unnamed: 0,id,start,end,token
0,0,0,4,cell
1,1,5,15,8099948528
2,2,16,18,te
3,3,19,29,8466045457
4,4,30,35,email


In [23]:
# Add ENTITIES(Labels) to our Data Frame

right_table = pd.DataFrame(docjson['ents'])[['start', 'label']]
right_table.head()

Unnamed: 0,start,label
0,5,B-PHONE
1,19,B-PHONE
2,36,B-EMAIL
3,58,B-ORG
4,63,I-ORG


In [24]:
datafram_tokens = pd.merge(datafram_tokens, right_table, how = 'left', on = 'start')
datafram_tokens.head()

Unnamed: 0,id,start,end,token,label
0,0,0,4,cell,
1,1,5,15,8099948528,B-PHONE
2,2,16,18,te,
3,3,19,29,8466045457,B-PHONE
4,4,30,35,email,


In [25]:
# Fill NAN values with 'O'

datafram_tokens.fillna('O', inplace=True)
datafram_tokens.head()

Unnamed: 0,id,start,end,token,label
0,0,0,4,cell,O
1,1,5,15,8099948528,B-PHONE
2,2,16,18,te,O
3,3,19,29,8466045457,B-PHONE
4,4,30,35,email,O


## Join Token DataFrame with Pytesseract Data

In [26]:
# Pytesseract Data

df_clean.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
12,5,1,3,1,1,1,722,53,64,28,90,cell
14,5,1,3,1,1,3,822,53,203,28,96,8099948528
17,5,1,3,2,1,1,55,55,85,89,30,te
18,5,1,3,2,1,2,822,71,203,65,96,8466045457
20,5,1,3,2,2,1,593,136,93,25,89,email


In [27]:
# Create end and start point from the df_clean Data Frame

df_clean['end'] = df_clean['text'].apply(lambda x : len(x)+1).cumsum()-1
df_clean.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end
12,5,1,3,1,1,1,722,53,64,28,90,cell,4
14,5,1,3,1,1,3,822,53,203,28,96,8099948528,15
17,5,1,3,2,1,1,55,55,85,89,30,te,18
18,5,1,3,2,1,2,822,71,203,65,96,8466045457,29
20,5,1,3,2,2,1,593,136,93,25,89,email,35


In [28]:
df_clean['start'] = df_clean[['text', 'end']].apply(lambda x : x[1]-len(x[0]), axis = 1)
df_clean.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start
12,5,1,3,1,1,1,722,53,64,28,90,cell,4,0
14,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5
17,5,1,3,2,1,1,55,55,85,89,30,te,18,16
18,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19
20,5,1,3,2,2,1,593,136,93,25,89,email,35,30


In [29]:
## Inner Join with Start position

dataframe_info = pd.merge(df_clean, datafram_tokens[['start', 'token', 'label']], how = 'inner', on ='start')
dataframe_info.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,3,1,1,1,722,53,64,28,90,cell,4,0,cell,O
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,8099948528,B-PHONE
2,5,1,3,2,1,1,55,55,85,89,30,te,18,16,te,O
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,8466045457,B-PHONE
4,5,1,3,2,2,1,593,136,93,25,89,email,35,30,email,O


In [30]:
dataframe_info.tail()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
46,5,1,6,3,4,1,46,571,106,21,96,promote,455,448,promote,O
47,5,1,6,3,4,2,161,576,56,22,96,your,460,456,your,O
48,5,1,6,3,4,3,226,571,111,21,96,business,469,461,business,O
49,5,1,6,3,4,4,347,571,74,21,92,online,476,470,online,O
50,5,1,6,3,4,5,432,571,96,27,92,pybo,481,477,pybo,O


## Bounding Box

In [31]:
# Create a Specific Data Frame for bouning box

bb_df = dataframe_info.query("label != 'O' ")
img = image.copy()

for x,y,w,h,label in bb_df[['left', 'top', 'width', 'height', 'label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    
    cv2.rectangle(img, pt1 =(x,y), pt2=(x+w, y+h), color= (0,255,0), thickness = 2)
    cv2.putText(img, str(label), (x,y), cv2.FONT_HERSHEY_COMPLEX, 1, (255,0,255),2)
    

cv2.imshow('Predictions', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Combine the Bio Information

In [32]:
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,8099948528,B-PHONE
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,8466045457,B-PHONE
5,5,1,3,2,2,3,709,136,316,31,88,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,B-EMAIL
6,5,1,3,3,1,1,46,170,33,14,96,life,62,58,life,B-ORG
7,5,1,3,3,1,2,85,151,91,42,94,insurance,72,63,insurance,I-ORG


In [33]:
# We need to remove the first 2 letter from labels

bb_df['label'] = bb_df['label'].apply(lambda x : x[2:])
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,8099948528,PHONE
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,8466045457,PHONE
5,5,1,3,2,2,3,709,136,316,31,88,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL
6,5,1,3,3,1,1,46,170,33,14,96,life,62,58,life,ORG
7,5,1,3,3,1,2,85,151,91,42,94,insurance,72,63,insurance,ORG


## Group the Label

In [34]:
class groupgen():
    
    def __init__(self):
        self.id = 0
        self.text = ''
        
    def getgroup(self, text):
        
        if self.text ==text:
            return self.id
        else:
            self.id += 1
            self.text = text
            return self.id
        

grp_gen = groupgen()

In [35]:
bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

In [36]:
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label,group
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,8099948528,PHONE,1
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,8466045457,PHONE,1
5,5,1,3,2,2,3,709,136,316,31,88,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL,2
6,5,1,3,3,1,1,46,170,33,14,96,life,62,58,life,ORG,3
7,5,1,3,3,1,2,85,151,91,42,94,insurance,72,63,insurance,ORG,3


### Right and Bottom of Bounding Box

In [37]:
bb_df[['left', 'top', 'width', 'height']] = bb_df[['left', 'top', 'width', 'height']].astype(int)

bb_df['right'] = bb_df['left']+ bb_df['width']
bb_df['bottom'] = bb_df['top']+ bb_df['height']

bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label,group,right,bottom
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,8099948528,PHONE,1,1025,81
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,8466045457,PHONE,1,1025,136
5,5,1,3,2,2,3,709,136,316,31,88,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,EMAIL,2,1025,167
6,5,1,3,3,1,1,46,170,33,14,96,life,62,58,life,ORG,3,79,184
7,5,1,3,3,1,2,85,151,91,42,94,insurance,72,63,insurance,ORG,3,176,193


### Tagging : GroupBy Group

In [38]:
col_group = ['left','top', 'right', 'bottom', 'token', 'label', 'group']
group_tag_img = bb_df[col_group].groupby(by = 'group')
group_tag_img

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001A74698CCA0>

In [39]:
img_tagging = group_tag_img.agg({
    
    'left': min,
    'right': max,
    'top' : min,
    'bottom': max,
    'label' : np.unique ,
    'token' : lambda x: " ".join(x)
    
})

In [40]:
img_tagging.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 1 to 8
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   left    8 non-null      int32 
 1   right   8 non-null      int32 
 2   top     8 non-null      int32 
 3   bottom  8 non-null      int32 
 4   label   8 non-null      object
 5   token   8 non-null      object
dtypes: int32(4), object(2)
memory usage: 320.0+ bytes


In [41]:
img_tagging['label'] = img_tagging['label'].apply(lambda x: str(x))

In [42]:
img_bb = image.copy()

for l, r, t, b, label, token in img_tagging.values:
    
    cv2.rectangle(img_bb, (l,t), (r,b), (0,255,0), 2)
    cv2.putText(img_bb, label,(l,t), cv2.FONT_HERSHEY_PLAIN, 1, (255,0,0), 2)
    
cv2.imshow('Bounding Box for Business Card', img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

## Parser

In [43]:
def parser(text, label):
    
    if label == 'PHONE':
        text = text.lower()
        text = re.sub(r'\D', '', text)
        
    elif label == 'EMAIL':
        text = text.lower()
        allow_special_char = '\@-_.'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char), '', text)
        
        
    elif label == 'WEB':
        text = text.lower()
        allow_special_char = ':\/.%#-'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char), '', text)
        
        
    elif label in ('NAME', 'DES'):
        text = text.lower()
        text = re.sub(r'[^a-z]', '', text)
        text = text.title()
                   
        
    elif label == 'ORG':
        text = text.lower()
        text = re.sub(r'[^a-z0-9]', '', text)
        text = text.title()
        
    return text

In [44]:
# Verification

parser('deedhd2232323g323b@#$%^&*(2', 'EMAIL')

'deedhd2232323g323b@^2'

## Entities 

In [45]:
dataframe_info.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,3,1,1,1,722,53,64,28,90,cell,4,0,cell,O
1,5,1,3,1,1,3,822,53,203,28,96,8099948528,15,5,8099948528,B-PHONE
2,5,1,3,2,1,1,55,55,85,89,30,te,18,16,te,O
3,5,1,3,2,1,2,822,71,203,65,96,8466045457,29,19,8466045457,B-PHONE
4,5,1,3,2,2,1,593,136,93,25,89,email,35,30,email,O


In [46]:
info_array = dataframe_info[['token','label']].values
entities = dict(NAME=[],ORG=[],DES=[],PHONE=[],EMAIL=[],WEB=[])
previous = 'O'

for token, label in info_array:
    bio_tag = label[0]
    label_tag = label[2:]

    # step -1 parse the token
    text = parser(token,label_tag)

    if bio_tag in ('B','I'):
        if previous != label_tag:
            entities[label_tag].append(text)

        else:
            if bio_tag == "B":
                entities[label_tag].append(text)
            else:
                if label_tag in ("NAME",'ORG','DES'):
                    entities[label_tag][-1] = entities[label_tag][-1] + " " + text
                else:
                    entities[label_tag][-1] = entities[label_tag][-1] + text


    previous = label_tag
        

In [47]:
entities

{'NAME': ['Thathineni Srikanth'],
 'ORG': ['Life Insurance Corporation Of India',
  'Life Insurance Corporation Of India'],
 'DES': ['Insurance Advisor'],
 'PHONE': ['8099948528', '8466045457'],
 'EMAIL': ['lictsrikant@gmail.com', 'seosrikantht@gmail.com'],
 'WEB': ['lictsrikant8099948528.blogspot.in',
  'interviewsinhyderabad.blogspot.in']}