# Detecting PII using spaCy

In [1]:
import pandas as pd

import spacy
from spacy import displacy

### Importing datasets

**This dataset was generated with the Faker library**

In [2]:
# Synthetic PII dataset; note that the location below is specific to the author's machine
gen_pii_path = 'D:/DSBA/Project/Final-Project-2/data/Text files/train_text_with_pii_2020_10_18_16_36_29_513505.csv'
gen_pii = pd.read_csv(gen_pii_path)
gen_pii.head()

Unnamed: 0,Text,Labels,PII
0,Suite 426 Produce education hand statement. St...,Address,Suite 426
1,252 Kelly Camp Imagine food analysis so. Reall...,Address,252 Kelly Camp
2,Education poor interview society on nice simpl...,Address,"9202 Jennifer Valleys Suite 890 Port Sara, ID ..."
3,Practice enough Apt. 480 learn instead read ro...,Address,Apt. 480
4,Cause example so serious mention. Reflect Amer...,Address,Suite 072


In [3]:
# Keep only the rows whose label is a person name
is_name = gen_pii['Labels'].eq('Name')
gen_pii.loc[is_name]

Unnamed: 0,Text,Labels,PII
3000,Produce education hand statement. Still talk M...,Name,Marcus
3001,Imagine food analysis so. Really population en...,Name,Angela Greene
3002,Education poor interview society on nice simpl...,Name,Amber
3003,Practice enough learn instead read room. Amy C...,Name,Amy Clay
3004,Cause example so serious mention. Reflect Amer...,Name,Pamela
...,...,...,...
3995,Today anyone Rhonda message year collection. V...,Name,Rhonda
3996,Smith Some hospital half mean order condition ...,Name,Smith
3997,Describe space mission performance. Resource r...,Name,Sanders
3998,Conference certain we condition only concern. ...,Name,Annette


In [4]:
# Real conversation dataset; the location below is specific to the author's machine
conv_path = 'D:/DSBA/Project/Final-Project-2/data/Text files/text.csv'
conv = pd.read_csv(conv_path)
conv.head()

Unnamed: 0,Sentence
0,"Hello, you have called Virtual bank, this is N..."
1,"Hello, you have called Virtual bank, this is L..."
2,"Hello, you have called Virtual bank, this is M..."
3,"Hello, you have called Virtual bank, this is H..."
4,"Hello, you have called Virtual bank, this is L..."


***

### Predefined named entities

**Try with the generated dataset**

More from: https://spacy.io/usage/spacy-101#annotations

In [None]:
# Take the generated sample at index label 3003 and lowercase it
sample_text = gen_pii.loc[3003, 'Text']
text = sample_text.lower()
text

In [None]:
# Load the small English pipeline by its installed package name instead of a
# user-specific site-packages path, so the notebook is not tied to one machine.
# The original path pointed at release 2.3.1 of `en_core_web_sm`; install that
# release to reproduce the same model.
nlp = spacy.load('en_core_web_sm')
nlp.pipe_names

In [None]:
# Run the loaded pipeline on the sample text; the resulting `doc` is what the
# following cells iterate over and visualise
doc = nlp(text)

Learn more about annotation: https://spacy.io/api/annotation

In [None]:
# Linguistic annotations: one line per token with its text, POS tag and dependency relation
for token in doc:
    print(' '.join([token.text, token.pos_, token.dep_]))

**Text:** The original word text.

**Lemma:** The base form of the word.

**POS:** The simple UPOS part-of-speech tag.

**Tag:** The detailed part-of-speech tag.

**Dep:** Syntactic dependency, i.e. the relation between tokens.

**Shape:** The word shape – capitalization, punctuation, digits.

**is alpha:** Is the token an alpha character?

**is stop:** Is the token part of a stop list, i.e. the most common words of the language?

In [None]:
# Per-token attributes, printed in this order:
# text, lemma, POS, tag, dependency, shape, is-alpha flag, is-stop flag
for token in doc:
    fields = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*fields)

In [None]:
# Using spaCy’s built-in displaCy visualizer with the 'dep' style on `doc`
displacy.render(doc, style = 'dep')

In [None]:
# Same visualizer, this time with the 'ent' style to show the entities in `doc`
displacy.render(doc, style = 'ent')

In [None]:
# List every detected entity with its character span and label
for ent in doc.ents:
    char_span = f'[ {ent.start_char} , {ent.end_char} ]'
    print(f'{ent.text} : {char_span} - {ent.label_}')

In [None]:
# Adding IOB Scheme
# Print each token with a conventional IOB tag such as `B-PERSON` or `I-PERSON`.
# Tokens without an entity type show only their IOB code. The previous output
# glued the entity type and the IOB code together with no separator.
for token in doc:
    if token.ent_type_:
        iob_tag = token.ent_iob_ + '-' + token.ent_type_
    else:
        iob_tag = token.ent_iob_
    print('Text: ' + token.text + ' | Entity: ' + iob_tag)

In [None]:
# Swap every token that carries an entity type for a fixed mask, keep the rest
new_tokens = ['xxxx' if token.ent_type_ else token.text for token in doc]

# Rebuild the sentence with single spaces between tokens
new_text = ' '.join(new_tokens)

print('new Text->', new_text)

**Try with the conversation dataset**

In [None]:
def pii_concealer(sent, mask='xxxx'):
    """Replace every named-entity token of a sentence with a mask string.

    The sentence is lowercased and passed through the notebook-level ``nlp``
    pipeline. Each token that carries an entity type is replaced by ``mask``;
    a multi-token entity therefore produces one mask per token. The result is
    rebuilt by joining the tokens with single spaces, so spacing follows the
    tokenisation rather than the original text.

    Parameters
    ----------
    sent : str
        Sentence to process.
    mask : str, optional
        Replacement written in place of each entity token. Defaults to
        ``'xxxx'``, the value that was previously hard-coded.

    Returns
    -------
    str
        The lowercased sentence with entity tokens masked.
    """
    doc = nlp(sent.lower())
    return ' '.join(mask if token.ent_type_ else token.text for token in doc)

In [None]:
# Inside a notebook displacy.render shows the markup itself and returns None,
# so wrapping it in print() only added a stray "None" to the output.
print('Original Sentence:')
displacy.render(nlp(conv['Sentence'][0].lower()), style = 'ent')
print()
print('After Conceal PII:')
print(pii_concealer(conv['Sentence'][0]))

### Creating custom NER

In [None]:
# Getting the "ner" pipeline component from `nlp`; labels are added to it below
ner = nlp.get_pipe("ner")

In [None]:
# training data
def _example(text, entity, label):
    """Build one training example as ``(text, {"entities": [(start, end, label)]})``.

    ``start`` and ``end`` are the character offsets of the first occurrence of
    ``entity`` in ``text``, with ``end`` exclusive so that
    ``text[start:end] == entity``. Computing them here avoids the hand-counted
    offsets that were previously off by one.

    Raises ``ValueError`` if ``entity`` does not occur in ``text``.
    """
    start = text.index(entity)
    return (text, {"entities": [(start, start + len(entity), label)]})


train_data = [
    _example("it is 6102651715", "6102651715", "CARDINAL"),
    _example("that is a-p-p-l-e-b-a-u-m.", "a-p-p-l-e-b-a-u-m", "PERSON"),
    _example("c-l-a-r-k-s-o-n.", "c-l-a-r-k-s-o-n", "PERSON"),
    # NOTE(review): the old span covered only "virtual"; the full name is assumed to be intended
    _example("you have called virtual bank", "virtual bank", "ORG"),
    _example("this is nancy speaking", "nancy", "PERSON"),
    _example("nicole allen mother himself time three brother.", "nicole allen", "PERSON"),
    _example("my name is sandra reed.", "sandra reed", "PERSON"),
    _example("that is 874525400.", "874525400", "CARDINAL"),
    _example("8544702415996.", "8544702415996", "CARDINAL"),
    _example("hello debra.", "debra", "PERSON"),
    _example("it is in the pattaya beach.", "pattaya beach", "GPE"),
]

In [None]:
# Register the label of every annotated entity span with the `ner` component
for _text, annotation in train_data:
    for _start, _end, label in annotation.get("entities"):
        ner.add_label(label)

In [None]:
# Pipes named here are left alone; every other pipe currently in the pipeline
# is collected in `unaffected_pipes`
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = list(
    filter(lambda name: name not in pipe_exceptions, nlp.pipe_names)
)

In [None]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# Training settings, named so that comments cannot drift away from the values
N_ITERATIONS = 100  # number of passes over the training examples
DROPOUT = 0.5       # dropout - make it harder to memorise data
SHUFFLE_SEED = 0    # makes the shuffling order repeatable between runs

random.seed(SHUFFLE_SEED)

# TRAINING THE MODEL
# The pipes collected in `unaffected_pipes` are switched off inside this block.
with nlp.disable_pipes(*unaffected_pipes):

    for iteration in range(N_ITERATIONS):

        # shuffle a copy before every iteration so `train_data` keeps its
        # original order and re-running the cell starts from the same state
        examples = list(train_data)
        random.shuffle(examples)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(examples, size = compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop = DROPOUT,
                losses = losses,
            )
        # the same `losses` dict is handed to every batch of the iteration,
        # so it is reported once after the last batch
        print("Losses", losses)

In [None]:
# Quick check of the updated pipeline on a single sentence
doc = nlp("this is nancy speaking.")
found_entities = []
for ent in doc.ents:
    found_entities.append((ent.text, ent.label_))
print("Entities", found_entities)

In [None]:
# Inside a notebook displacy.render shows the markup itself and returns None,
# so it is called directly rather than through print(), which printed "None".
displacy.render(nlp(conv['Sentence'][0].lower()), style = 'ent')

***