# Detecting PII using spaCy

In [1]:
import pandas as pd

import spacy
from spacy import displacy

### Importing datasets

**This is generating dataset by using faker lib**

In [2]:
gen_pii = pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/train_text_with_pii_2020_10_18_16_36_29_513505.csv')
gen_pii.head()

Unnamed: 0,Text,Labels,PII
0,Suite 426 Produce education hand statement. St...,Address,Suite 426
1,252 Kelly Camp Imagine food analysis so. Reall...,Address,252 Kelly Camp
2,Education poor interview society on nice simpl...,Address,"9202 Jennifer Valleys Suite 890 Port Sara, ID ..."
3,Practice enough Apt. 480 learn instead read ro...,Address,Apt. 480
4,Cause example so serious mention. Reflect Amer...,Address,Suite 072


In [3]:
# Querying only person name
gen_pii[gen_pii['Labels'] == 'Name']

Unnamed: 0,Text,Labels,PII
3000,Produce education hand statement. Still talk M...,Name,Marcus
3001,Imagine food analysis so. Really population en...,Name,Angela Greene
3002,Education poor interview society on nice simpl...,Name,Amber
3003,Practice enough learn instead read room. Amy C...,Name,Amy Clay
3004,Cause example so serious mention. Reflect Amer...,Name,Pamela
...,...,...,...
3995,Today anyone Rhonda message year collection. V...,Name,Rhonda
3996,Smith Some hospital half mean order condition ...,Name,Smith
3997,Describe space mission performance. Resource r...,Name,Sanders
3998,Conference certain we condition only concern. ...,Name,Annette


In [4]:
# Real dataset
conv = pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/text.csv')
conv.head()

Unnamed: 0,Sentence
0,"Hello, you have called Virtual bank, this is N..."
1,"Hello, you have called Virtual bank, this is L..."
2,"Hello, you have called Virtual bank, this is M..."
3,"Hello, you have called Virtual bank, this is H..."
4,"Hello, you have called Virtual bank, this is L..."


***

### Predefined named entities

**Try with generating dataset**

More from: https://spacy.io/usage/spacy-101#annotations

In [26]:
text = gen_pii['Text'][3003].lower()
text

'practice enough learn instead read room. amy clay wrong remember no eat either offer person. physical real certain sister. charge late turn style fly.'

In [27]:
nlp = spacy.load(r'C:/Users/Namwater/anaconda3/Lib/site-packages/en_core_web_sm/en_core_web_sm-2.3.1')
nlp.pipe_names

['tagger', 'parser', 'ner']

In [28]:
doc = nlp(text)

Learn more about annotation: https://spacy.io/api/annotation

In [29]:
# Linguistic annotations
for token in doc:
    print(token.text, token.pos_, token.dep_)

practice NOUN nsubj
enough ADV amod
learn VERB ROOT
instead ADV advmod
read VERB xcomp
room NOUN dobj
. PUNCT punct
amy PROPN compound
clay PROPN compound
wrong NOUN nsubj
remember VERB ROOT
no DET det
eat NOUN dobj
either DET preconj
offer NOUN conj
person NOUN dobj
. PUNCT punct
physical ADJ amod
real ADJ amod
certain ADJ amod
sister NOUN ROOT
. PUNCT punct
charge VERB nsubj
late ADJ amod
turn NOUN compound
style NOUN nsubj
fly NOUN ROOT
. PUNCT punct


**Text:** The original word text.

**Lemma:** The base form of the word.

**POS:** The simple UPOS part-of-speech tag.

**Tag:** The detailed part-of-speech tag.

**Dep:** Syntactic dependency, i.e. the relation between tokens.

**Shape:** The word shape – capitalization, punctuation, digits.

**is alpha:** Is the token an alpha character?

**is stop:** Is the token part of a stop list, i.e. the most common words of the language?

In [30]:
# Part of speech
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

practice practice NOUN NN nsubj xxxx True False
enough enough ADV RB amod xxxx True True
learn learn VERB VB ROOT xxxx True False
instead instead ADV RB advmod xxxx True False
read read VERB VBD xcomp xxxx True False
room room NOUN NN dobj xxxx True False
. . PUNCT . punct . False False
amy amy PROPN NNP compound xxx True False
clay clay PROPN NNP compound xxxx True False
wrong wrong NOUN NN nsubj xxxx True False
remember remember VERB VB ROOT xxxx True False
no no DET DT det xx True True
eat eat NOUN NN dobj xxx True False
either either DET DT preconj xxxx True True
offer offer NOUN NN conj xxxx True False
person person NOUN NN dobj xxxx True False
. . PUNCT . punct . False False
physical physical ADJ JJ amod xxxx True False
real real ADJ JJ amod xxxx True False
certain certain ADJ JJ amod xxxx True False
sister sister NOUN NN ROOT xxxx True False
. . PUNCT . punct . False False
charge charge VERB VB nsubj xxxx True False
late late ADJ JJ amod xxxx True False
turn turn NOUN NN compoun

In [31]:
# Using spaCy’s built-in displaCy visualizer
displacy.render(doc, style = 'dep')

In [32]:
displacy.render(doc, style = 'ent')

In [33]:
for ent in doc.ents:
    print(ent.text, ': [', ent.start_char, ',', ent.end_char, '] -', ent.label_)

amy clay : [ 41 , 49 ] - PERSON


In [34]:
# Adding IOB Scheme
new_tokens = []

for token in doc:
    print('Text: ' + token.text + ' | Entity: ' + token.ent_type_ + token.ent_iob_)

Text: practice | Entity: O
Text: enough | Entity: O
Text: learn | Entity: O
Text: instead | Entity: O
Text: read | Entity: O
Text: room | Entity: O
Text: . | Entity: O
Text: amy | Entity: PERSONB
Text: clay | Entity: PERSONI
Text: wrong | Entity: O
Text: remember | Entity: O
Text: no | Entity: O
Text: eat | Entity: O
Text: either | Entity: O
Text: offer | Entity: O
Text: person | Entity: O
Text: . | Entity: O
Text: physical | Entity: O
Text: real | Entity: O
Text: certain | Entity: O
Text: sister | Entity: O
Text: . | Entity: O
Text: charge | Entity: O
Text: late | Entity: O
Text: turn | Entity: O
Text: style | Entity: O
Text: fly | Entity: O
Text: . | Entity: O


In [35]:
new_tokens = []

for token in doc:
    if not token.ent_type_:
        new_tokens.append(token.text)
    else:
        new_tokens.append('xxxx')
        
new_text = ' '.join(new_tokens)

print('new Text->', new_text)

new Text-> practice enough learn instead read room . xxxx xxxx wrong remember no eat either offer person . physical real certain sister . charge late turn style fly .


**Try with conversation dataset**

In [36]:
def pii_concealer(sent):
    
    doc = nlp(sent.lower())
    new_tokens = []
    
    for token in doc:
        if not token.ent_type_:
            new_tokens.append(token.text)
        else:
            new_tokens.append('xxxx')
        
    new_text = ' '.join(new_tokens)

    return new_text

In [37]:
print('Original Sentence:')
print(displacy.render(nlp(conv['Sentence'][0].lower()), style = 'ent'), '\n')
print('After Conceal PII:')
print(pii_concealer(conv['Sentence'][0]))

Original Sentence:


None 

After Conceal PII:
hello , you have called virtual bank , this is nancy speaking . how may i help you ? 
 oh , i just had withdrawn some cash from the xxxx xxxx and xxxx transaction failed but money got debited . can you fix this problem ? 
 sure . what is your account number ? 
 it is xxxx . 
 just a moment … . okay and what is your name ma’am ? 
 my name is sandra reed . 
 okay , miss reed . can i have your identify number ? 
 okay . xxxx . 
 okay . i have xxxx . 
 correct . 
 where is the xxxx machine that you had withdrawn the cash ? 
 i do not know where exactly it is , but it is in the xxxx xxxx . 
 that is fine , we will check your withdrawal transaction and we will refund the money to your account . do you want to receive the message when we refunding the money ? 
 yes , please . 
 okay , what is your phone number ma’am ? 
 xxxx . 
 okay , i have xxxx . we will send the message when we refund the money to your account . 
 thanks , xxxx . 
 have xxxx xxxx xxxx ma’am . tha

### Creating custom NER

In [40]:
# Getting the pipeline component
ner = nlp.get_pipe("ner")

In [62]:
# training data
train_data = [
              ("it is 6102651715", {"entities": [(6, 15, "CARDINAL")]}),
              ("that is a-p-p-l-e-b-a-u-m.", {"entities": [(8, 24, "PERSON")]}),
              ("c-l-a-r-k-s-o-n.", {"entities": [(0, 14, "PERSON")]}),
              ("you have called virtual bank", {"entities": [(16, 23, "ORG")]}),
              ("this is nancy speaking", {"entities": [(9, 12, 'PERSON')]}),
              ("nicole allen mother himself time three brother.", {"entities": [(0, 11, "PERSON")]}),
              ("my name is sandra reed.", {"entities": [(11, 21, "PERSON")]}),
              ("that is 874525400.", {"entities": [(8, 16, "CARDINAL")]}),
              ("8544702415996.", {"entities": [(0, 13, "CARDINAL")]}),
              ("hello debra.", {"entities": [(6, 10, "PERSON")]}),
              ("it is in the pattaya beach.", {"entities": [(13, 25, "GPE")]})
              ]

In [63]:
# Adding labels to the `ner`
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [64]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [65]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(100):

    # shuffling examples before every iteration
    random.shuffle(train_data)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(train_data, size = compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop = 0.5,  # dropout - make it harder to memorise data
                    losses = losses,
                )
        print("Losses", losses)

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Losses {'ner': 5.995546699639807}
Losses {'ner': 11.68803693775677}
Losses {'ner': 13.451207169130205}
Losses {'ner': 8.611730352036396}
Losses {'ner': 10.740479795702537}
Losses {'ner': 14.420244305079734}
Losses {'ner': 2.9626108988822786e-06}
Losses {'ner': 7.917507308623505}
Losses {'ner': 11.827005268469032}
Losses {'ner': 2.2232909267854737}
Losses {'ner': 8.137513815447072}
Losses {'ner': 11.929425302843956}
Losses {'ner': 7.86480909447666}
Losses {'ner': 11.836489334935058}
Losses {'ner': 16.20008281520984}
Losses {'ner': 0.2272561914640363}
Losses {'ner': 12.078115460747355}
Losses {'ner': 14.327992731681286}
Losses {'ner': 4.6304563062456054}
Losses {'ner': 9.382572567011607}
Losses {'ner': 11.197289885757868}
Losses {'ner': 3.1957346671260893}
Losses {'ner': 13.60275092079678}
Losses {'ner': 13.779308382993577}
Losses {'ner': 10.599806549027562}
Losses {'ner': 12.181551577907761}
Losses {'ner': 13.188278128602782}
Losses {'ner': 2.004512615774729}
Losses {'ner': 7.2579660884

Losses {'ner': 0.01329850312797544}
Losses {'ner': 0.007186283714034292}
Losses {'ner': 0.007251348798370954}
Losses {'ner': 0.007268795158309323}
Losses {'ner': 3.131591783805964e-08}
Losses {'ner': 0.1505017921184657}
Losses {'ner': 6.199883764542075}
Losses {'ner': 0.00228943816009642}
Losses {'ner': 0.0024572170184291656}
Losses {'ner': 0.0024648019418385264}
Losses {'ner': 0.006498326973238534}
Losses {'ner': 0.006946743446052551}
Losses {'ner': 0.006958850160604413}
Losses {'ner': 0.011132056136043289}
Losses {'ner': 0.011134116800371047}
Losses {'ner': 0.10033859208590515}
Losses {'ner': 0.010172019607686877}
Losses {'ner': 0.010174495181202128}
Losses {'ner': 0.010183934837890318}
Losses {'ner': 0.07316323723397034}
Losses {'ner': 0.07316509825738962}
Losses {'ner': 0.07317175906790076}
Losses {'ner': 3.856951105559739}
Losses {'ner': 3.8569511153018277}
Losses {'ner': 3.8570945540958976}
Losses {'ner': 0.0028542330000911435}
Losses {'ner': 0.004106165652684491}
Losses {'ner': 

In [66]:
# Testing the model
doc = nlp("this is nancy speaking.")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []


In [67]:
print(displacy.render(nlp(conv['Sentence'][0].lower()), style = 'ent'))

None


***