In [1]:
import numpy as np
import pandas as pd
import string
import re

In [2]:
with open('businessCard.txt', mode='r', encoding='utf8', errors='ignore') as f:
    text = f.read()

In [3]:
print(text)

id	text	tag
000.jpeg	 	O
000.jpeg	.	O
000.jpeg	040-4852	B-PHONE
000.jpeg	"8881,"	I-PHONE
000.jpeg	90309	B-PHONE
000.jpeg	52549	I-PHONE
000.jpeg	Fi	O
000.jpeg	/laurelsoverseaseducation	O
000.jpeg	@:	O
000.jpeg	LAURELS	B-ORG
000.jpeg	OVERSEAS	I-ORG
000.jpeg	EDUCATIONAL	I-ORG
000.jpeg	CONSULTANCY	I-ORG
000.jpeg	PVT.	I-ORG
000.jpeg	LTD.	I-ORG
000.jpeg	Sea	O
000.jpeg	|	O
000.jpeg	U.K	O
000.jpeg	AUSTRALIA	O
000.jpeg	CANADA	O
000.jpeg	IRELAND	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	 	O
000.jpeg	www.laurelseducation.com	B-WEB
000.jpeg	)%info@laurelseducation.com	B-EMAIL
000.jpeg	 	O
001.jpeg	john	B-NAME
001.jpeg	smith	I-NAME
001.jpeg	marketing	B-DES
001.jpeg	manager	I-DES
001.jpeg	web:	O
001.jpeg	www.psdgraphics.com	B-WEB
001.jpeg	phone:	O
001.jpeg	123-456-7890	B-PHONE
001.jpeg	mail:	O
001.jpeg	email@psdgraphics.com	B-EMAIL
002.jpeg	    	O
002.jpeg	   	O
002.jpeg	Sau	O
002.jpeg	0	O
002.jpeg	98489	B-PHONE
002.jpeg	24441	I-PHONE
002.jpeg	dy	O
002.jpeg	"08672,"

In [4]:
data = list(map(lambda x: x.split('\t'), text.split('\n')))

In [5]:
df = pd.DataFrame(data=data[1:], columns=data[0])
df.head(10)

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
6,000.jpeg,Fi,O
7,000.jpeg,/laurelsoverseaseducation,O
8,000.jpeg,@:,O
9,000.jpeg,LAURELS,B-ORG


## Cleaning Text
- Remove white space
- Remove unwanted special characters

In [6]:
whitespace = string.whitespace
punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
table_whitespace = str.maketrans('', '', whitespace)
table_punctuation = str.maketrans('', '', punctuation)

def clean_text(txt):
    text = str(txt)
    # text = text.lower()
    remove_whitespace = text.translate(table_whitespace)
    remove_punctuation = remove_whitespace.translate(table_punctuation)
    return str(remove_punctuation)

In [7]:
df['text'] = df['text'].apply(clean_text)

In [8]:
data_clean = df.query("text != '' ")
data_clean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.dropna(inplace=True)


In [9]:
data_clean.head(10)

Unnamed: 0,id,text,tag
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
6,000.jpeg,Fi,O
7,000.jpeg,/laurelsoverseaseducation,O
8,000.jpeg,@,O
9,000.jpeg,LAURELS,B-ORG
10,000.jpeg,OVERSEAS,I-ORG


## Convert Data into Spacy Format

In [10]:
group = data_clean.groupby(by='id')

In [11]:
group_array = group.get_group('000.jpeg')[['text', 'tag']].values
group_array


array([['.', 'O'],
       ['040-4852', 'B-PHONE'],
       ['"8881,"', 'I-PHONE'],
       ['90309', 'B-PHONE'],
       ['52549', 'I-PHONE'],
       ['Fi', 'O'],
       ['/laurelsoverseaseducation', 'O'],
       ['@', 'O'],
       ['LAURELS', 'B-ORG'],
       ['OVERSEAS', 'I-ORG'],
       ['EDUCATIONAL', 'I-ORG'],
       ['CONSULTANCY', 'I-ORG'],
       ['PVT.', 'I-ORG'],
       ['LTD.', 'I-ORG'],
       ['Sea', 'O'],
       ['U.K', 'O'],
       ['AUSTRALIA', 'O'],
       ['CANADA', 'O'],
       ['IRELAND', 'O'],
       ['www.laurelseducation.com', 'B-WEB'],
       ['info@laurelseducation.com', 'B-EMAIL']], dtype=object)

In [12]:
content = ''
annotations = { 'entities': [] }
start = 0
end = 0

for text, label in group_array:
    text = str(text)
    string_length = len(text) + 1
    start = end
    end = start + string_length
    
    if label != 'O':
        annot = (start, end-1, label)
        annotations['entities'].append(annot)
    
    content = content + text + ' '

In [13]:
content

'. 040-4852 "8881," 90309 52549 Fi /laurelsoverseaseducation @ LAURELS OVERSEAS EDUCATIONAL CONSULTANCY PVT. LTD. Sea U.K AUSTRALIA CANADA IRELAND www.laurelseducation.com info@laurelseducation.com '

In [14]:
annotations

{'entities': [(2, 10, 'B-PHONE'),
  (11, 18, 'I-PHONE'),
  (19, 24, 'B-PHONE'),
  (25, 30, 'I-PHONE'),
  (62, 69, 'B-ORG'),
  (70, 78, 'I-ORG'),
  (79, 90, 'I-ORG'),
  (91, 102, 'I-ORG'),
  (103, 107, 'I-ORG'),
  (108, 112, 'I-ORG'),
  (146, 170, 'B-WEB'),
  (171, 196, 'B-EMAIL')]}

In [15]:
content.find('040-4852') + len('040-4852')

10

In [16]:
cards = group.groups.keys()
cards

dict_keys(['000.jpeg', '001.jpeg', '002.jpeg', '003.jpeg', '004.jpeg', '007.jpeg', '008.jpeg', '009.jpeg', '010.jpeg', '011.jpeg', '012.jpeg', '013.jpeg', '014.jpeg', '015.jpeg', '016.jpeg', '017.jpeg', '018.jpeg', '020.jpeg', '021.jpeg', '022.jpeg', '023.jpeg', '024.jpeg', '025.jpeg', '027.jpeg', '028.jpeg', '030.jpeg', '031.jpeg', '032.jpeg', '033.jpeg', '034.jpeg', '035.jpeg', '036.jpeg', '037.jpeg', '038.jpeg', '039.jpeg', '040.jpeg', '041.jpeg', '042.jpeg', '043.jpeg', '044.jpeg', '045.jpeg', '047.jpeg', '048.jpeg', '049.jpeg', '050.jpeg', '051.jpeg', '052.jpeg', '053.jpeg', '054.jpeg', '055.jpeg', '056.jpeg', '057.jpeg', '058.jpeg', '059.jpeg', '060.jpeg', '061.jpeg', '062.jpeg', '063.jpeg', '064.jpeg', '065.jpeg', '066.jpeg', '067.jpeg', '068.jpeg', '069.jpeg', '070.jpeg', '071.jpeg', '072.jpeg', '073.jpeg', '074.jpeg', '075.jpeg', '076.jpeg', '078.jpeg', '079.jpeg', '080.jpeg', '081.jpeg', '082.jpeg', '083.jpeg', '084.jpeg', '085.jpeg', '086.jpeg', '087.jpeg', '088.jpeg', '089.

In [17]:
all_cards_data = []
for card in cards:
    group_array = group.get_group(card)[['text', 'tag']].values
    card_data = []
    content = ''
    annotations = { 'entities': [] }
    start = 0
    end = 0
    for text, label in group_array:
        text = str(text)
        string_length = len(text) + 1
        
        start = end
        end = start + string_length
        
        if label != 'O':
            annot = (start, end-1, label)
            annotations['entities'].append(annot)
        content = content + text + ' '
        
    card_data = (content, annotations)
    all_cards_data.append(card_data)

In [18]:
all_cards_data

[('. 040-4852 "8881," 90309 52549 Fi /laurelsoverseaseducation @ LAURELS OVERSEAS EDUCATIONAL CONSULTANCY PVT. LTD. Sea U.K AUSTRALIA CANADA IRELAND www.laurelseducation.com info@laurelseducation.com ',
  {'entities': [(2, 10, 'B-PHONE'),
    (11, 18, 'I-PHONE'),
    (19, 24, 'B-PHONE'),
    (25, 30, 'I-PHONE'),
    (62, 69, 'B-ORG'),
    (70, 78, 'I-ORG'),
    (79, 90, 'I-ORG'),
    (91, 102, 'I-ORG'),
    (103, 107, 'I-ORG'),
    (108, 112, 'I-ORG'),
    (146, 170, 'B-WEB'),
    (171, 196, 'B-EMAIL')]}),
 ('john smith marketing manager web www.psdgraphics.com phone 123-456-7890 mail email@psdgraphics.com ',
  {'entities': [(0, 4, 'B-NAME'),
    (5, 10, 'I-NAME'),
    (11, 20, 'B-DES'),
    (21, 28, 'I-DES'),
    (33, 52, 'B-WEB'),
    (59, 71, 'B-PHONE'),
    (77, 98, 'B-EMAIL')]}),
 ('Sau 0 98489 24441 dy "08672," 224441 /ENKATESWAPA wie ',
  {'entities': [(6, 11, 'B-PHONE'), (12, 17, 'I-PHONE'), (37, 49, 'B-ORG')]}),
 ('Prasad @ "9,96,31,73,53,59,49,04,00,000" i Flex Design Album D

## Split Data into Training and Split

In [19]:
import random

In [20]:
random.shuffle(all_cards_data)

In [21]:
len(all_cards_data)

267

In [22]:
train_data = all_cards_data[:240]
test_data = all_cards_data[240:]

In [23]:
import pickle

pickle.dump(train_data, open('data/train_data.pickle', 'wb'))
pickle.dump(test_data, open('data/test_data.pickle', 'wb'))