In [1]:
import numpy as np
import pandas as pd
import string
import re

In [2]:
with open('businessCard.txt', mode='r', encoding='utf8', errors='ignore') as f:
    text = f.read()

In [3]:
data = list(map(lambda x: x.split('\t'), text.split('\n')))

In [4]:
df = pd.DataFrame(data[1:], columns=data[0])

In [5]:
df.head()

Unnamed: 0,id,text,tag
0,000.jpeg,,O
1,000.jpeg,.,O
2,000.jpeg,040-4852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE


In [6]:
### Cleaning Text
# Remove white spaces
# Remove unwanted special characters

In [7]:
whitespace = string.whitespace
punctuation = '!#$%&\'()*+-:;<=>?[\\]^`{|}~'
tableWhitespace = str.maketrans('','', whitespace)
tablePunctuation = str.maketrans('', '', punctuation)
def cleanText(txt):
    text = str(txt)
    # text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)

    return str(removepunctuation)

In [8]:
df['text'] = df['text'].apply(cleanText)

In [9]:
dataClean = df.query("text != '' ")
dataClean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataClean.dropna(inplace=True)


In [10]:
dataClean.head(10)

Unnamed: 0,id,text,tag
1,000.jpeg,.,O
2,000.jpeg,0404852,B-PHONE
3,000.jpeg,"""8881,""",I-PHONE
4,000.jpeg,90309,B-PHONE
5,000.jpeg,52549,I-PHONE
6,000.jpeg,Fi,O
7,000.jpeg,/laurelsoverseaseducation,O
8,000.jpeg,@,O
9,000.jpeg,LAURELS,B-ORG
10,000.jpeg,OVERSEAS,I-ORG


In [11]:
### Convert Data into spacy format

In [12]:
group = dataClean.groupby(by='id')

In [13]:
cards = group.groups.keys()

In [14]:
allCardsData = []
for card in cards:
    cardData = []
    grouparray = group.get_group(card)[['text', 'tag']].values
    content = ''
    annotations = {'entities':[]}
    start = 0
    end = 0
    for text, label in grouparray:
        text = str(text)
        stringLength = len(text) + 1
    
        start = end
        end = start + stringLength
    
        if label != 'O':
            annot = (start, end-1, label)
            annotations['entities'].append(annot)
    
        content = content + text + ' '

    cardData = (content, annotations)
    allCardsData.append(cardData)

In [15]:
allCardsData

[('. 0404852 "8881," 90309 52549 Fi /laurelsoverseaseducation @ LAURELS OVERSEAS EDUCATIONAL CONSULTANCY PVT. LTD. Sea U.K AUSTRALIA CANADA IRELAND www.laurelseducation.com info@laurelseducation.com ',
  {'entities': [(2, 9, 'B-PHONE'),
    (10, 17, 'I-PHONE'),
    (18, 23, 'B-PHONE'),
    (24, 29, 'I-PHONE'),
    (61, 68, 'B-ORG'),
    (69, 77, 'I-ORG'),
    (78, 89, 'I-ORG'),
    (90, 101, 'I-ORG'),
    (102, 106, 'I-ORG'),
    (107, 111, 'I-ORG'),
    (145, 169, 'B-WEB'),
    (170, 195, 'B-EMAIL')]}),
 ('john smith marketing manager web www.psdgraphics.com phone 1234567890 mail email@psdgraphics.com ',
  {'entities': [(0, 4, 'B-NAME'),
    (5, 10, 'I-NAME'),
    (11, 20, 'B-DES'),
    (21, 28, 'I-DES'),
    (33, 52, 'B-WEB'),
    (59, 69, 'B-PHONE'),
    (75, 96, 'B-EMAIL')]}),
 ('Sau 0 98489 24441 dy "08672," 224441 /ENKATESWAPA wie ',
  {'entities': [(6, 11, 'B-PHONE'), (12, 17, 'I-PHONE'), (37, 49, 'B-ORG')]}),
 ('Prasad @ "9,96,31,73,53,59,49,04,00,000" i Flex Design Album Desig

In [16]:
### Split the Data into training and testing set

In [17]:
import random

In [18]:
random.shuffle(allCardsData)

In [19]:
len(allCardsData)

267

In [20]:
TrainData = allCardsData[:240]
TestData = allCardsData[240:]

In [21]:
import pickle

In [22]:
pickle.dump(TrainData, open('data/TrainData.pickle', mode='wb'))
pickle.dump(TestData, open('data/TestData.pickle', mode='wb'))