## Presteps : 
    
1. Extract Text from all Image
2. Clean and Prepare text
3. Labelling(BIO Tagging) - Manual Process 


In [1]:
import numpy as np
import pandas as pd
import string 
import re

## Load the Labelled Data

In [2]:
with open('businessCard.txt', mode = 'r', encoding='utf8', errors='ignore') as f:
    text = f.read()

### Split the Data

In [3]:
data = list(map(lambda x : x.split('\t'), text.split('\n')))

In [4]:
data

[['id', 'text', 'tag'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', '.', 'O'],
 ['000.jpeg', '040-4852', 'B-PHONE'],
 ['000.jpeg', '"8881,"', 'I-PHONE'],
 ['000.jpeg', '90309', 'B-PHONE'],
 ['000.jpeg', '52549', 'I-PHONE'],
 ['000.jpeg', 'Fi', 'O'],
 ['000.jpeg', '/laurelsoverseaseducation', 'O'],
 ['000.jpeg', '@:', 'O'],
 ['000.jpeg', 'LAURELS', 'B-ORG'],
 ['000.jpeg', 'OVERSEAS', 'I-ORG'],
 ['000.jpeg', 'EDUCATIONAL', 'I-ORG'],
 ['000.jpeg', 'CONSULTANCY', 'I-ORG'],
 ['000.jpeg', 'PVT.', 'I-ORG'],
 ['000.jpeg', 'LTD.', 'I-ORG'],
 ['000.jpeg', 'Sea', 'O'],
 ['000.jpeg', '|', 'O'],
 ['000.jpeg', 'U.K', 'O'],
 ['000.jpeg', 'AUSTRALIA', 'O'],
 ['000.jpeg', 'CANADA', 'O'],
 ['000.jpeg', 'IRELAND', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', ' ', 'O'],
 ['000.jpeg', 'www.laurelseducation.com', 'B-WEB'],
 ['000.jpeg', ')%info@laurelseducation.com', 'B-EMAIL'],
 ['000.jpeg', ' ', 'O'],
 ['001.jpe

### Prepare Data Frame

In [5]:
df = pd.DataFrame(data[1:], columns=data[0])

In [6]:
df.head()

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE


## Cleaning Text

. Remove White Spaces

. Remove Unwanted Punctuations/Charecters

In [7]:
# White Spaces available in String

whitespace = string.whitespace
whitespace

' \t\n\r\x0b\x0c'

In [8]:
# Punctuations available in String

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
# ", @, -, _ e.t.c play some important role, hence we are not removing this 
punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"

In [10]:
# Table for white space and Punctuations

tableWhitespace = str.maketrans('', '', whitespace)
tablePunctuation = str.maketrans('', '', punctuation)

In [11]:
# Function for text cleaning

def cleanText(txt):
    text = str(txt)
    # Below is the changes in Version_2 (Considering Capital Letters )
    # text = text.lower()
    removeWhiteSpace = text.translate(tableWhitespace)
    removePunctuation = removeWhiteSpace.translate(tablePunctuation)
    
    return str(removePunctuation)

In [12]:
df['text'] = df['text'].apply(cleanText)

In [13]:
# Remove Spaces

dataClean = df.query("text != '' ")
dataClean

Unnamed: 0,id,text,tag
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
...,...,...,...
10439,290.jpeg,SrmrriSurres,B-ORG
10441,290.jpeg,Richard,B-NAME
10442,290.jpeg,Pretorius,I-NAME
10444,290.jpeg,Director,B-DES


In [14]:
# Remove Null Values

dataClean.dropna(inplace = True)
dataClean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataClean.dropna(inplace = True)


Unnamed: 0,id,text,tag
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
...,...,...,...
10437,290.jpeg,p,O
10439,290.jpeg,SrmrriSurres,B-ORG
10441,290.jpeg,Richard,B-NAME
10442,290.jpeg,Pretorius,I-NAME


## Convert Data into Spacy Format

In [15]:
group = dataClean.groupby('id')
cards = group.groups.keys()

### Perform the activity for 1 card (Sample Check)

In [16]:
# Fetch the value for only 1 group

grouparray = group.get_group('000.jpeg')[['text', 'tag']].values
content =  ''
annotations = {'entities' :[]}
start = 0
end = 0

for text, label in grouparray:
    text = str(text)
    stringlegth = len(text) + 1
    
    start = end
    end = start + stringlegth
    
    # We are not interested in O
    if label != 'O':
        annot = (start, end-1, label)
        annotations['entities'].append(annot)
        
    content = content+text+' '

In [17]:
content

'. 040-4852 "8881," 90309 52549 Fi /laurelsoverseaseducation @ LAURELS OVERSEAS EDUCATIONAL CONSULTANCY PVT. LTD. Sea U.K AUSTRALIA CANADA IRELAND www.laurelseducation.com info@laurelseducation.com '

In [18]:
annotations

{'entities': [(2, 10, 'B-PHONE'),
  (11, 18, 'I-PHONE'),
  (19, 24, 'B-PHONE'),
  (25, 30, 'I-PHONE'),
  (62, 69, 'B-ORG'),
  (70, 78, 'I-ORG'),
  (79, 90, 'I-ORG'),
  (91, 102, 'I-ORG'),
  (103, 107, 'I-ORG'),
  (108, 112, 'I-ORG'),
  (146, 170, 'B-WEB'),
  (171, 196, 'B-EMAIL')]}

### Verifying

In [19]:
content.find('040-4852')

2

In [20]:
content.find('040-4852')+len('040-4852')

10

In [21]:
content.find('www.laurelseducation.com')

146

In [22]:
content.find('www.laurelseducation.com')+len('www.laurelseducation.com')

170

### Perform the activity for all cards

In [23]:
# Fetch the value for only 1 group

allCardsData = []

for card in cards:
    
    cardData = []
    
    grouparray = group.get_group(card)[['text', 'tag']].values
    content =  ''
    annotations = {'entities' :[]}
    start = 0
    end = 0

    for text, label in grouparray:
        text = str(text)
        stringlegth = len(text) + 1

        start = end
        end = start + stringlegth

        # We are not interested in O
        if label != 'O':
            annot = (start, end-1, label)
            annotations['entities'].append(annot)

        content = content+text+' '
        
    cardData =(content, annotations)
    allCardsData.append(cardData)

In [24]:
allCardsData

[('. 040-4852 "8881," 90309 52549 Fi /laurelsoverseaseducation @ LAURELS OVERSEAS EDUCATIONAL CONSULTANCY PVT. LTD. Sea U.K AUSTRALIA CANADA IRELAND www.laurelseducation.com info@laurelseducation.com ',
  {'entities': [(2, 10, 'B-PHONE'),
    (11, 18, 'I-PHONE'),
    (19, 24, 'B-PHONE'),
    (25, 30, 'I-PHONE'),
    (62, 69, 'B-ORG'),
    (70, 78, 'I-ORG'),
    (79, 90, 'I-ORG'),
    (91, 102, 'I-ORG'),
    (103, 107, 'I-ORG'),
    (108, 112, 'I-ORG'),
    (146, 170, 'B-WEB'),
    (171, 196, 'B-EMAIL')]}),
 ('john smith marketing manager web www.psdgraphics.com phone 123-456-7890 mail email@psdgraphics.com ',
  {'entities': [(0, 4, 'B-NAME'),
    (5, 10, 'I-NAME'),
    (11, 20, 'B-DES'),
    (21, 28, 'I-DES'),
    (33, 52, 'B-WEB'),
    (59, 71, 'B-PHONE'),
    (77, 98, 'B-EMAIL')]}),
 ('Sau 0 98489 24441 dy "08672," 224441 /ENKATESWAPA wie ',
  {'entities': [(6, 11, 'B-PHONE'), (12, 17, 'I-PHONE'), (37, 49, 'B-ORG')]}),
 ('Prasad @ "9,96,31,73,53,59,49,04,00,000" i Flex Design Album D

## Split Data into Training & Testing 

In [25]:
import random
random.shuffle(allCardsData)

TrainData = allCardsData[:240]
TestData = allCardsData[240:]

## Save the Train and Test Data to pickle

In [26]:
import pickle

pickle.dump(TrainData, open('./data/TrainData.pickle', mode = 'wb'))
pickle.dump(TestData,  open('./data/TestData.pickle' , mode = 'wb'))