## PII (Personal Identifiable Information) - MultiLabel Classification 

Tensorflow, LSTM, NLTK, RE, Sklearn

In [1]:
#Required libraries 
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from nltk.corpus import wordnet
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout


In [2]:
#Load the excel file pandas read the file using ExcelFile
xls = pd.ExcelFile('PII_Train_Large_Data_Test_Data.xlsx')
xls.sheet_names #See if the data is in one sheet or available in mutiple 

['Export Summary',
 'PII Train Large Data - PII Trai',
 'PII Test Data - PII Test Data']

'PII Train Large Data - PII Trai' --AND-- 'PII Test Data - PII Test Data'

In [3]:
#Extarct the train_data from  'PII Train Large Data - PII Trai' sheet
train_data = pd.read_excel(xls, 'PII Train Large Data - PII Trai')
train_data.columns

Index(['PII Train Large Data', 'Unnamed: 1', 'Unnamed: 2'], dtype='object')

In [4]:
train_data.head() 

Unnamed: 0,PII Train Large Data,Unnamed: 1,Unnamed: 2
0,Text,Labels,PII
1,Candidate economic character present money dau...,Address,Apt. 026
2,Film range sound. People age Apt. 476 that.,Address,Apt. 476
3,Back want myself class certain. Tree pretty ca...,Address,Suite 492
4,Bring guy 81627 Kimberly Squares Washingtonber...,Address,"81627 Kimberly Squares Washingtonberg, RI 13540"


In [5]:
#Extarct the train_data from  'PII Train Large Data - PII Trai' sheet
test_data = pd.read_excel(xls, 'PII Test Data - PII Test Data')
test_data.columns

Index(['PII Test Data', 'Unnamed: 1', 'Unnamed: 2'], dtype='object')

In [6]:
test_data.head() 

Unnamed: 0,PII Test Data,Unnamed: 1,Unnamed: 2
0,Text,Label,PII
1,Term although process suddenly parent. Poor go...,,
2,"356 Collins Highway New Kathleen, NM 10160 Rem...",,
3,Appear job opportunity job. Piece 405 Callahan...,,
4,During half leave simple west lose piece 859 D...,,


### Data Cleaning

In [7]:
#set the dataframe 
train_data = pd.read_excel(xls, 'PII Train Large Data - PII Trai', skiprows = 1)
train_data.columns

Index(['Text', 'Labels', 'PII'], dtype='object')

In [8]:
train_data.sample(10)

Unnamed: 0,Text,Labels,PII
658,Nature American should reduce two. Store goal ...,SSN,693 88 4776
194,Store effort campaign girl worker technology. ...,CreditCardNumber,676293890015
708,Cup item remain environment not. Foot nation k...,,
73,Again 970 Bethany Fords Suite 491 West Lisasid...,Address,"970 Bethany Fords Suite 491 West Lisaside, NC ..."
600,Candidate economic character present 829 49 91...,SSN,829 49 9139
334,Room up generation free discover grow. Pattern...,Name,Mary Miller
94,Store effort campaign girl worker technology. ...,Address,"309 Bryan Islands Suite 301 Lake Valerie, CO 9..."
448,Party woman past grow edge. Pretty eat Democra...,Phone_number,800-428-6217
385,White Ready off score foot market protect.,Name,White
82,Course college still loss Apt. 745 scene. Seri...,Address,Apt. 745


In [9]:
#set the dataframe 
test_data = pd.read_excel(xls, 'PII Test Data - PII Test Data', skiprows = 1)
test_data.columns

Index(['Text', 'Label', 'PII'], dtype='object')

In [10]:
test_data.sample(5)

Unnamed: 0,Text,Label,PII
8429,Southern measure cultural teacher. (490)446-52...,,
10182,Recent air government simply it yourself. Sist...,,
4847,Specific owner wburns@gmail.com reveal travel....,,
1260,Minute floor mean 90696 Mckenzie Corners Apt. ...,,
4972,Certainly free natural both. Receive approach ...,,


In [11]:
#keep a copy of original text and add one more lower_text column
train_data['lower_text']= train_data['Text'].str.lower()
train_data.sample(10)


Unnamed: 0,Text,Labels,PII,lower_text
395,Coach he west magazine against beat. By someti...,Name,Christina Hill,coach he west magazine against beat. by someti...
250,Old to health ask martinmichelle@wilkinson.com...,Email,martinmichelle@wilkinson.com,old to health ask martinmichelle@wilkinson.com...
354,Civil notice travel commercial. Consumer itsel...,Name,Joseph Lewis,civil notice travel commercial. consumer itsel...
752,Exist establish we reason job record accept. W...,,,exist establish we reason job record accept. w...
298,Test artist person billion. sherri68@gmail.com...,Email,sherri68@gmail.com,test artist person billion. sherri68@gmail.com...
63,"52013 Jason Vista Lake Kathleen, PA 89168 Fede...",Address,"52013 Jason Vista Lake Kathleen, PA 89168","52013 jason vista lake kathleen, pa 89168 fede..."
342,Goal agent resource office material game. Lot ...,Name,Nelson,goal agent resource office material game. lot ...
228,Everyone road skill act condition heart frobin...,Email,frobinson@romero.info,everyone road skill act condition heart frobin...
737,Ok dream campaign task six only Congress. Scor...,,,ok dream campaign task six only congress. scor...
76,Any information garden to citizen movie hear a...,Address,"92546 Clark Vista Stephenton, IL 39677",any information garden to citizen movie hear a...


In [12]:
test_data['lower_text']= test_data['Text'].str.lower()
test_data.sample(5)

Unnamed: 0,Text,Label,PII,lower_text
7656,Prepare financial your wear watch manage. Ande...,,,prepare financial your wear watch manage. ande...
8259,001-341-907-5046x17440 High future east receiv...,,,001-341-907-5046x17440 high future east receiv...
10416,Do relate single board. Along lose former chan...,,,do relate single board. along lose former chan...
6966,Drug operation bag employee south increase acc...,,,drug operation bag employee south increase acc...
14161,Weight western raise science might. Reduce yea...,,,weight western raise science might. reduce yea...


In [13]:
#clean any double spaces if present in the text
def rmExtraSpace(text):
    r = re.sub("\s+", " " , text)
    return r

train_data.lower_text.apply(lambda x: rmExtraSpace(x))
train_data['lower_text'].sample(5)

452    exist establish we reason job record accept. w...
469    character film 001-753-147-3410 whole above op...
714    wear language word kitchen might figure miss j...
710    sound improve parent dream send language. hund...
228    everyone road skill act condition heart frobin...
Name: lower_text, dtype: object

In [14]:
test_data.lower_text.apply(lambda x: rmExtraSpace(x))
test_data['lower_text'].sample(5)

2752     magazine common town apply. relate model there...
4653     road candidate behavior important girl develop...
9040     indeed sometimes analysis place get federal na...
7247               film cold especially john likely media.
10560    reach although nice analysis amount opportunit...
Name: lower_text, dtype: object

### Pre-process the data 

In [15]:
#Since we observe different number formats in the text
#Processing the text to fit into the model later

In [16]:
#Identify the Numbers and replace with the len(number)D
def identifyNumbers(text):
    index = 0
    while index < (len(text)):
        ch = text[index]
        if ch.isdigit():
            count = count_digits(text, index)
            # Replace the text
            replacementstring= '{}D'.format(count)
            replacementstring_length = len(replacementstring)
            text = text[:index]+replacementstring+text[index+count:]
            index = index + replacementstring_length
        else:
            index +=1
    return text


def count_digits(text, index):
    count = 0
    while index<len(text) and text[index].isdigit():
        count +=1
        index +=1
    return count


In [17]:
#Imply the count_digits and replace 
train_data['lower_text'] = train_data.lower_text.apply(lambda x: identifyNumbers(x))
train_data['lower_text'].sample(5)

49     series seem sister health. identify company au...
562    around well meet whose five 3D 1Ddi before. so...
614    wear language word kitchen might figure miss j...
685        ready off score foot market protect. 3D 2D 4D
698    test artist person billion. trouble staff indu...
Name: lower_text, dtype: object

In [18]:
#Imply the count_digits and replace 
test_data['lower_text'] = test_data.lower_text.apply(lambda x: identifyNumbers(x))
test_data['lower_text'].sample()

14986    modern appear everyone. culture reality wife a...
Name: lower_text, dtype: object

In [19]:
test_data['lower_text'].sample(5)

13673    democrat plan pretty everybody. forget 3D-2D-4...
2438     car child while space community human 16D your...
192      whatever technology rather artist early throug...
8559     force financial and. tonight discover ago fear...
2973     relationship according heart what. film daught...
Name: lower_text, dtype: object

In [20]:
#Adding space when len(numbers) to make it easy way to tokenize the data.

def addSpaceBeforeAfterDigit(text):
    index = 0
    while index < len(text):
        ch = text[index]
        if ch != ' ' and not ch.isalnum():
            if index >=2:
                if text[index-2].isdigit():
                    text = text[:index] + ' ' + text[index:]
                    index +=1
            if index < len(text) - 1:
                if text[index+1].isdigit():
                    text = text[:index+1] + ' ' + text[index+1:]
                    index +=1
        index+=1
        
    return text


train_data['lower_text'] = train_data.lower_text.apply(lambda x: addSpaceBeforeAfterDigit(x))
train_data.sample(5)

Unnamed: 0,Text,Labels,PII,lower_text
376,Any information garden to citizen movie hear a...,Name,Henry,any information garden to citizen movie hear a...
795,Coach he west magazine against beat. By someti...,,,coach he west magazine against beat. by someti...
158,Nature American should reduce two. Store goal ...,CreditCardNumber,30483588661316,nature american should reduce two. store goal ...
259,Defense region trade hold bed time audience ch...,Email,lowens@hotmail.com,defense region trade hold bed time audience ch...
289,Describe question cover suggest actually with....,Email,johnsontracey@gmail.com,describe question cover suggest actually with....


In [21]:
test_data['lower_text'] = test_data.lower_text.apply(lambda x: addSpaceBeforeAfterDigit(x))
test_data.sample(5)

Unnamed: 0,Text,Label,PII,lower_text
8652,Say especially large interview. 876.088.2835 H...,,,say especially large interview. 3D . 3D . 4D h...
7083,Return truth hand either Tyler leader to outsi...,,,return truth hand either tyler leader to outsi...
4908,Form pull describe resource character full ste...,,,form pull describe resource character full ste...
4552,Table collection shake church cover pay stuff....,,,table collection shake church cover pay stuff....
6496,Your foreign answer thus turn. Person fall man...,,,your foreign answer thus turn. person fall man...


In [22]:
def wTokenize(words):
    filtered_sentence = []
    words = word_tokenize(words)
    for w in words:
        if w not in stop:
            filtered_sentence.append(w)
    return filtered_sentence

train_data['lower_text'] = train_data['lower_text'].apply(lambda x: wTokenize(x))


In [23]:
train_data['lower_text'].sample(5)

283    [movement, near, heathergibson, @, miller.com,...
363    [federal, nice, idea, sell, ., somebody, edwin...
492    [report, author, increase, 3D, -, 3D, -, 4Dx4D...
778    [break, light, base, sea, ., magazine, decide,...
756    [program, indicate, whose, attorney, five, acc...
Name: lower_text, dtype: object

### Build model

In [24]:
#The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 5000
#Max number of words in each text.
MAX_SEQUENCE_LENGTH = 250
#Embedding can be set fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-/:;<=>?[\]^_`{|}~', lower=True)
#tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
X = tokenizer.fit_on_texts(train_data['lower_text'].values)
X = sequences = tokenizer.texts_to_sequences(train_data['lower_text'].values)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', X.shape)


Found 1058 unique tokens.
Shape of data tensor: (800, 250)


Used same parameter for test_data

In [25]:
Y = pd.get_dummies(train_data['Labels']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (800, 8)


In [26]:
#Split Train Test 
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(640, 250) (640, 8)
(160, 250) (160, 8)


In [27]:

# #Embedding can be set fixed.
# EMBEDDING_DIM = 100

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, # size of the embedded vectors
                    input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))# returns a sequence of vectors of dimension 10
model.add(Dense(8, activation='softmax'))
model.compile(loss='categorical_crossentropy',#loss
              optimizer='adam', metrics=['accuracy'])
print(model.summary())


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 100)          500000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 250, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 8)                 808       
Total params: 581,208
Trainable params: 581,208
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
X_train.shape,Y_train.shape

((640, 250), (640, 8))

In [30]:
#Model fit
epochs = 5
batch_size = 8

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 1.034
  Accuracy: 0.550


In [33]:
new_text = ['my number is sreess@123.com']
seq = tokenizer.texts_to_sequences(new_text)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['SSN', 'Name', 'Address', 'CreditCardNumber', 'Email', 'Phone_number', 'Plates', 'None']
print(pred, labels[np.argmax(pred)])


[[0.05052647 0.2055744  0.0435201  0.20960784 0.22173166 0.04189555
  0.17518768 0.0519564 ]] Email


In [34]:
new_text = ['my number is (123) 45 5478']
seq = tokenizer.texts_to_sequences(new_text)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['SSN', 'Name', 'Address', 'CreditCardNumber', 'Email', 'Phone_number', 'Plates', 'None']
print(pred, labels[np.argmax(pred)])


[[0.05052647 0.2055744  0.0435201  0.20960784 0.22173166 0.04189555
  0.17518768 0.0519564 ]] SSN


In [35]:
new_text = ['ned wants the ghe and says he is in 7787 brown st apt. 123']
seq = tokenizer.texts_to_sequences(new_text)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['SSN', 'Name', 'Address', 'CreditCardNumber', 'Email', 'Phone_number', 'Plates', 'None']
print(pred, labels[np.argmax(pred)])


[[0.06509028 0.18201326 0.05197835 0.19698647 0.20959812 0.05382587
  0.17648517 0.0640225 ]] Address


In [36]:
new_text = ['ma indication to the nullable info in the street of nba is 12556344447777599']
seq = tokenizer.texts_to_sequences(new_text)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['SSN', 'Name', 'Address', 'CreditCardNumber', 'Email', 'Phone_number', 'Plates', 'None']
print(pred, labels[np.argmax(pred)])

[[0.06509028 0.18201326 0.05197835 0.19698647 0.20959812 0.05382587
  0.17648517 0.0640225 ]] CreditCardNumber


## Regular expresion functionality in PII and  human name parser (NLTK)

In [37]:
#Load the excel file pandas read the file using ExcelFile
xls = pd.ExcelFile('PII_Train_Large_Data_Test_Data.xlsx')
xls.sheet_names #See if the data is in one sheet or available in mutiple 

['Export Summary',
 'PII Train Large Data - PII Trai',
 'PII Test Data - PII Test Data']

In [38]:
#set the dataframe 
train_data = pd.read_excel(xls, 'PII Train Large Data - PII Trai', skiprows = 1)
train_data.columns

Index(['Text', 'Labels', 'PII'], dtype='object')

In [39]:
train_data.sample(10)

Unnamed: 0,Text,Labels,PII
110,Sound improve parent dream send language. Hund...,CreditCardNumber,2254280030993205
710,Sound improve parent dream send language. Hund...,,
330,Most nor star Monica Abbott remain.,Name,Monica Abbott
571,Road audience large this. Future artist condit...,Plates,815 5DI
796,Speech national especially available own black...,,
561,Certainly success training N56 4LY idea large....,Plates,N56 4LY
318,Community stand nice whatever film. Blood go p...,Name,Nelson
589,Describe 8-89413E question cover suggest actua...,Plates,8-89413E
349,Series seem Jeff Mason sister health. Identify...,Name,Jeff Mason
270,zyoung@drake.org Drop yet bad each whose. Mach...,Email,zyoung@drake.org


In [41]:
import re
def address(text):
    regexp = "[0-9]{1,5}.+, [A-Z]{2} [0-9]{1,5}"
    a = re.findall(regexp, text)
    if len(a) != 0:
        return a
    d=re.findall("[0-9]{1,5} [A-Z][A-Za-z]+ [[A-Z][A-Za-z]+]? [A[PT|pt]\.? ?[0-9]{1,4}]?",text)
    if len(d) !=0:
        return d
    b=re.findall("Apt. +[0-9]{1,4}",text)
    if len(b) != 0:
        return b
    c=re.findall("Suite +[0-9]{1,4}",text)
    if len(c) !=0:
        return c
    

train_data['Digit_data'] = train_data.Text.apply(lambda x: address(x))
train_data.head(150)

Unnamed: 0,Text,Labels,PII,Digit_data
0,Candidate economic character present money dau...,Address,Apt. 026,[Apt. 026]
1,Film range sound. People age Apt. 476 that.,Address,Apt. 476,[Apt. 476]
2,Back want myself class certain. Tree pretty ca...,Address,Suite 492,[Suite 492]
3,Bring guy 81627 Kimberly Squares Washingtonber...,Address,"81627 Kimberly Squares Washingtonberg, RI 13540","[81627 Kimberly Squares Washingtonberg, RI 13540]"
4,"52013 Jason Vista Lake Kathleen, PA 89168 May ...",Address,"52013 Jason Vista Lake Kathleen, PA 89168","[52013 Jason Vista Lake Kathleen, PA 89168]"
5,Themselves early our bit. Institution peace sp...,Address,"4058 Gordon Fields South Charlestown, NJ 40537","[4058 Gordon Fields South Charlestown, NJ 40537]"
6,11792 Stevens Ferry Career respond front happe...,Address,11792 Stevens Ferry,
7,Fine during five spring Apt. 174 page where la...,Address,Apt. 174,[Apt. 174]
8,"Cup 52013 Jason Vista Lake Kathleen, PA 89168 ...",Address,"52013 Jason Vista Lake Kathleen, PA 89168","[52013 Jason Vista Lake Kathleen, PA 89168]"
9,Question Suite 052 as should sign face. Memory...,Address,Suite 052,[Suite 052]


In [46]:
from nltk.corpus import stopwords
from nameparser.parser import HumanName
from nltk.corpus import wordnet
stop = stopwords.words('english')

def extract_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]

def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

#import pdb; pdb.set_trace()
person_list = []
person_names=person_list
def get_human_names(text):
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary = False)

    person = []
    name = ""
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        if len(person) > 1: #avoid grabbing lone surnames
            for part in person:
                name += part + ' '
            if name[:-1] not in person_list:
                person_list.append(name[:-1])
            name = ''
        person = []
        

if __name__ == '__main__':
    train_data['numbers'] = train_data.Text.apply(lambda x: extract_phone_numbers(x))
    train_data['emails'] = train_data.Text.apply(lambda x: extract_email_addresses(x))
    train_data['names'] = train_data.Text.apply(lambda x: get_human_names(x))
    for person in person_list:
            person_split = person.split(" ")
            for name in person_split:
                if wordnet.synsets(name):
                    if(name in person):
                        person_names.remove(person)
                        break
                        
                      

In [47]:
train_data.head(10)

Unnamed: 0,Text,Labels,PII,Digit_data,numbers,emails,names
0,Candidate economic character present money dau...,Address,Apt. 026,[Apt. 026],[],[],
1,Film range sound. People age Apt. 476 that.,Address,Apt. 476,[Apt. 476],[],[],
2,Back want myself class certain. Tree pretty ca...,Address,Suite 492,[Suite 492],[],[],
3,Bring guy 81627 Kimberly Squares Washingtonber...,Address,"81627 Kimberly Squares Washingtonberg, RI 13540","[81627 Kimberly Squares Washingtonberg, RI 13540]",[],[],
4,"52013 Jason Vista Lake Kathleen, PA 89168 May ...",Address,"52013 Jason Vista Lake Kathleen, PA 89168","[52013 Jason Vista Lake Kathleen, PA 89168]",[],[],
5,Themselves early our bit. Institution peace sp...,Address,"4058 Gordon Fields South Charlestown, NJ 40537","[4058 Gordon Fields South Charlestown, NJ 40537]",[],[],
6,11792 Stevens Ferry Career respond front happe...,Address,11792 Stevens Ferry,,[],[],
7,Fine during five spring Apt. 174 page where la...,Address,Apt. 174,[Apt. 174],[],[],
8,"Cup 52013 Jason Vista Lake Kathleen, PA 89168 ...",Address,"52013 Jason Vista Lake Kathleen, PA 89168","[52013 Jason Vista Lake Kathleen, PA 89168]",[],[],
9,Question Suite 052 as should sign face. Memory...,Address,Suite 052,[Suite 052],[],[],
