# Detecting PII using spaCy

In [1]:
import pandas as pd

import spacy
from spacy import displacy

### Importing datasets

**This dataset was generated using the Faker library**

In [2]:
# Synthetic PII dataset; the preview shows the columns Text, Labels and PII
gen_pii_csv = 'D:/DSBA/Project/Final-Project-2/data/Text files/train_text_with_pii_2020_10_18_16_36_29_513505.csv'
gen_pii = pd.read_csv(gen_pii_csv)
gen_pii.head()

Unnamed: 0,Text,Labels,PII
0,Suite 426 Produce education hand statement. St...,Address,Suite 426
1,252 Kelly Camp Imagine food analysis so. Reall...,Address,252 Kelly Camp
2,Education poor interview society on nice simpl...,Address,"9202 Jennifer Valleys Suite 890 Port Sara, ID ..."
3,Practice enough Apt. 480 learn instead read ro...,Address,Apt. 480
4,Cause example so serious mention. Reflect Amer...,Address,Suite 072


In [3]:
# Keep only the rows whose label is a person name
is_name = gen_pii['Labels'].eq('Name')
gen_pii.loc[is_name]

Unnamed: 0,Text,Labels,PII
3000,Produce education hand statement. Still talk M...,Name,Marcus
3001,Imagine food analysis so. Really population en...,Name,Angela Greene
3002,Education poor interview society on nice simpl...,Name,Amber
3003,Practice enough learn instead read room. Amy C...,Name,Amy Clay
3004,Cause example so serious mention. Reflect Amer...,Name,Pamela
...,...,...,...
3995,Today anyone Rhonda message year collection. V...,Name,Rhonda
3996,Smith Some hospital half mean order condition ...,Name,Smith
3997,Describe space mission performance. Resource r...,Name,Sanders
3998,Conference certain we condition only concern. ...,Name,Annette


In [4]:
# Real conversation dataset; the preview shows a single Sentence column
conv_csv = 'D:/DSBA/Project/Final-Project-2/data/Text files/text.csv'
conv = pd.read_csv(conv_csv)
conv.head()

Unnamed: 0,Sentence
0,"Hello, you have called Virtual bank, this is N..."
1,"Hello, you have called Virtual bank, this is L..."
2,"Hello, you have called Virtual bank, this is M..."
3,"Hello, you have called Virtual bank, this is H..."
4,"Hello, you have called Virtual bank, this is L..."


***

### Predefined named entities

**Try with the generated dataset**

More from: https://spacy.io/usage/spacy-101#annotations

In [5]:
# Lower-cased sample sentence taken from the row labelled 3999
text = gen_pii.loc[3999, 'Text'].lower()
text

'nicole allen mother himself time three brother.'

In [6]:
# Load the small English pipeline by its installed package name instead of a
# user-specific absolute site-packages path, so the notebook also runs on any
# other machine where the en_core_web_sm model package is installed.
nlp = spacy.load('en_core_web_sm')
nlp.pipe_names

['tagger', 'parser', 'ner']

In [7]:
# Run the loaded pipeline over the lower-cased sample sentence; the annotated
# result is reused by the token, entity and displaCy cells below.
doc = nlp(text)

Learn more about annotation: https://spacy.io/api/annotation

In [8]:
# One line per token: surface text, coarse POS tag, dependency relation
for tok in doc:
    row = (tok.text, tok.pos_, tok.dep_)
    print(*row)

nicole PROPN amod
allen PROPN compound
mother NOUN nsubj
himself PRON appos
time NOUN ROOT
three NUM nummod
brother NOUN dobj
. PUNCT punct


**Text:** The original word text.

**Lemma:** The base form of the word.

**POS:** The simple UPOS part-of-speech tag.

**Tag:** The detailed part-of-speech tag.

**Dep:** Syntactic dependency, i.e. the relation between tokens.

**Shape:** The word shape – capitalization, punctuation, digits.

**is alpha:** Is the token an alpha character?

**is stop:** Is the token part of a stop list, i.e. the most common words of the language?

In [9]:
# Per-token attribute dump: text, lemma, POS, tag, dependency, shape, is_alpha, is_stop
for tok in doc:
    attrs = (
        tok.text, tok.lemma_, tok.pos_, tok.tag_,
        tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop,
    )
    print(*attrs)

nicole nicole PROPN NNP amod xxxx True False
allen allen PROPN NNP compound xxxx True False
mother mother NOUN NN nsubj xxxx True False
himself -PRON- PRON PRP appos xxxx True True
time time NOUN NN ROOT xxxx True False
three three NUM CD nummod xxxx True True
brother brother NOUN NN dobj xxxx True False
. . PUNCT . punct . False False


In [10]:
# Draw the dependency parse of `doc` with spaCy's built-in displaCy visualizer
displacy.render(doc, style = 'dep')

In [11]:
# Same dependency view, this time with compact layout and custom colours/font
options = {
    "compact": True,
    "bg": "#09a3d5",
    "color": "white",
    "font": "Source Sans Pro",
}
displacy.render(doc, style='dep', options=options)

In [12]:
# Highlight the named entities recognised in `doc`
displacy.render(doc, style = 'ent')

In [None]:
# List every recognised entity span with its character offsets and label
for span in doc.ents:
    offsets = (span.start_char, ',', span.end_char)
    print(span.text, ': [', *offsets, '] -', span.label_)

In [None]:
# Show each token's entity annotation in the conventional IOB form
# "<IOB>-<TYPE>" (e.g. B-PERSON). Tokens with no entity type show only the
# bare IOB code, instead of the type and code being glued together unreadably.
for token in doc:
    if token.ent_type_:
        iob_tag = token.ent_iob_ + '-' + token.ent_type_
    else:
        iob_tag = token.ent_iob_
    print('Text: ' + token.text + ' | Entity: ' + iob_tag)

In [None]:
# Replace every token that carries an entity type with the 'xxxx' placeholder,
# then rebuild the sentence by joining the tokens with single spaces.
new_tokens = ['xxxx' if token.ent_type_ else token.text for token in doc]
new_text = ' '.join(new_tokens)

print('new Text->', new_text)

**Try with the conversation dataset**

In [None]:
def pii_concealer(sent):
    """Return `sent` lower-cased, with entity tokens masked as 'xxxx'.

    The lower-cased text is run through the notebook-level `nlp` pipeline.
    Every token that has an entity type is replaced by the placeholder; all
    other tokens keep their text. The tokens are re-joined with single spaces.
    """
    masked = ['xxxx' if tok.ent_type_ else tok.text for tok in nlp(sent.lower())]
    return ' '.join(masked)

In [None]:
# Under Jupyter, displacy.render() displays the markup itself and returns None,
# so it is called on its own here; wrapping it in print() would emit a stray
# "None" below the highlighted sentence.
print('Original Sentence:')
displacy.render(nlp(conv['Sentence'][0].lower()), style = 'ent')
print()
print('After Conceal PII:')
print(pii_concealer(conv['Sentence'][0]))

***