# Detecting PII using spaCy

In [1]:
import pandas as pd

import spacy
from spacy import displacy

In [2]:
# Load the small English pipeline by its installed package name instead of a
# user-specific absolute path into site-packages, so the notebook also runs on
# other machines where en_core_web_sm is installed.
nlp = spacy.load('en_core_web_sm')
nlp.pipe_names

['tagger', 'parser', 'ner']

### Importing datasets

**This dataset was generated with the Faker library**

In [3]:
# Read the generated training texts (with their PII labels) and preview them.
gen_pii = pd.read_csv(
    filepath_or_buffer='D:/DSBA/Project/Final-Project-2/data/Text files/train_text_with_pii_2020_10_18_16_36_29_513505.csv'
)
gen_pii.head()

Unnamed: 0,Text,Labels,PII
0,Suite 426 Produce education hand statement. St...,Address,Suite 426
1,252 Kelly Camp Imagine food analysis so. Reall...,Address,252 Kelly Camp
2,Education poor interview society on nice simpl...,Address,"9202 Jennifer Valleys Suite 890 Port Sara, ID ..."
3,Practice enough Apt. 480 learn instead read ro...,Address,Apt. 480
4,Cause example so serious mention. Reflect Amer...,Address,Suite 072


In [11]:
# Show only the rows labelled as a person name.
gen_pii.loc[gen_pii['Labels'].eq('Name')]

Unnamed: 0,Text,Labels,PII
3000,Produce education hand statement. Still talk M...,Name,Marcus
3001,Imagine food analysis so. Really population en...,Name,Angela Greene
3002,Education poor interview society on nice simpl...,Name,Amber
3003,Practice enough learn instead read room. Amy C...,Name,Amy Clay
3004,Cause example so serious mention. Reflect Amer...,Name,Pamela
...,...,...,...
3995,Today anyone Rhonda message year collection. V...,Name,Rhonda
3996,Smith Some hospital half mean order condition ...,Name,Smith
3997,Describe space mission performance. Resource r...,Name,Sanders
3998,Conference certain we condition only concern. ...,Name,Annette


***

### Predefined named entities

**Try with the generated dataset**

In [12]:
# Take the text of the row labelled 3999 and lower-case it.
text = gen_pii.loc[3999, 'Text'].lower()
text

'nicole allen mother himself time three brother.'

In [13]:
# Run the loaded spaCy pipeline over the sample text.
doc = nlp(text)

Learn more about annotation: https://spacy.io/api/annotation

In [49]:
# Visualise the named entities found in the document.
displacy.render(doc, style = 'ent')

In [51]:
# One line per detected entity: its text, character span and label.
for ent in doc.ents:
    print(f'{ent.text} : [ {ent.start_char} , {ent.end_char} ] - {ent.label_}')

nicole allen : [ 0 , 12 ] - PERSON
three : [ 33 , 38 ] - CARDINAL


In [19]:
# Inspect the IOB annotation of every token: the entity type (empty outside an
# entity) is printed immediately followed by the B/I/O marker.
for token in doc:
    print('Text: ' + token.text + ' | Entity: ' + token.ent_type_ + token.ent_iob_)

Text: nicole | Entity: PERSONB
Text: allen | Entity: PERSONI
Text: mother | Entity: O
Text: himself | Entity: O
Text: time | Entity: O
Text: three | Entity: CARDINALB
Text: brother | Entity: O
Text: . | Entity: O


In [55]:
# Swap every token that belongs to a named entity for a fixed placeholder.
new_tokens = ['xxxx' if token.ent_type_ else token.text for token in doc]

# Tokens are re-joined with single spaces, so punctuation ends up detached
# from the preceding word.
new_text = ' '.join(new_tokens)

print('new Text->', new_text)

new Text-> xxxx xxxx mother himself time xxxx brother .


**Try with the conversation dataset**

In [54]:
# Read the conversation transcripts and preview the first rows.
conv = pd.read_csv(
    filepath_or_buffer='D:/DSBA/Project/Final-Project-2/data/Text files/text.csv'
)
conv.head()

Unnamed: 0,Sentence
0,"Hello, you have called Virtual bank, this is N..."
1,"Hello, you have called Virtual bank, this is L..."
2,"Hello, you have called Virtual bank, this is M..."
3,"Hello, you have called Virtual bank, this is H..."
4,"Hello, you have called Virtual bank, this is L..."


In [60]:
def pii_concealer(sent, mask='xxxx', lowercase=True):
    """Replace every token that the spaCy pipeline tags as an entity.

    Parameters
    ----------
    sent : str
        Text to scan.
    mask : str, optional
        Placeholder written in place of each entity token. Defaults to
        ``'xxxx'``.
    lowercase : bool, optional
        Lower-case ``sent`` before running the pipeline. Defaults to ``True``,
        which is what this function has always done.

    Returns
    -------
    str
        The tokens joined by single spaces, with each entity token replaced
        by ``mask``. The original spacing of ``sent`` is not preserved.

    Notes
    -----
    NOTE(review): in this notebook's sample output the lower-cased names
    "nancy" and "sandra reed" pass through unmasked. Lower-casing may be
    hiding them from the entity recognizer -- compare against
    ``lowercase=False`` before relying on the default.
    """
    # `nlp` is the pipeline loaded at the top of the notebook.
    doc = nlp(sent.lower() if lowercase else sent)

    # A token with an empty ent_type_ is outside any entity and is kept as-is.
    return ' '.join(mask if token.ent_type_ else token.text for token in doc)

In [64]:
print('Original Sentence:')
# Inside Jupyter, displacy.render displays the markup itself and returns
# nothing, so wrapping it in print() only emitted the literal text "None".
displacy.render(nlp(conv['Sentence'][0].lower()), style = 'ent')
print()
print('After Conceal PII:')
print(pii_concealer(conv['Sentence'][0]))

Original Sentence:


None 

After Conceal PII:
hello , you have called virtual bank , this is nancy speaking . how may i help you ? 
 oh , i just had withdrawn some cash from the xxxx xxxx and xxxx transaction failed but money got debited . can you fix this problem ? 
 sure . what is your account number ? 
 it is xxxx . 
 just a moment … . okay and what is your name ma’am ? 
 my name is sandra reed . 
 okay , miss reed . can i have your identify number ? 
 okay . xxxx . 
 okay . i have xxxx . 
 correct . 
 where is the xxxx machine that you had withdrawn the cash ? 
 i do not know where exactly it is , but it is in the xxxx xxxx . 
 that is fine , we will check your withdrawal transaction and we will refund the money to your account . do you want to receive the message when we refunding the money ? 
 yes , please . 
 okay , what is your phone number ma’am ? 
 xxxx . 
 okay , i have xxxx . we will send the message when we refund the money to your account . 
 thanks , xxxx . 
 have xxxx xxxx xxxx ma’am . tha