## Presidio by Microsoft is an open-source framework for PII detection

We will use the default config as a baseline submission. Docs can be found [here](https://microsoft.github.io/presidio/analyzer/)

In [1]:
%%capture
!pip install presidio_analyzer --no-index --find-links=file:///kaggle/input/presidio-wheels/presidio

In [2]:
import json
from bisect import bisect_left

import pandas as pd
from presidio_analyzer import AnalyzerEngine
from tqdm.auto import tqdm

In [3]:
# Analyzer constructed with no arguments, i.e. Presidio's default configuration.
analyzer = AnalyzerEngine()

In [4]:
# Load the training documents. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection.
with open('/kaggle/input/pii-detection-removal-from-educational-data/train.json',
          encoding='utf-8') as f:
    train = json.load(f)

In [5]:
# Number of training documents.
len(train)

6807

In [6]:
# Fields available on the first document record.
train[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

## Data Description

* `document` is the document id (int)
* `full_text` is the text on which we need to perform PII detection (str)
* `tokens` is a list of tokens of `full_text` which we have to label (list[str])
* `trailing_whitespace` is a list of booleans, one per token, indicating whether that token is followed by whitespace (list[bool])
* `labels` is a list of BIO labels, one per token. "B-" marks the first token of an entity, "I-" marks a token inside (continuing) that same entity, and "O" marks a token outside any entity, i.e. not labeled as PII. (list[str])


**example**:

"Hi John Smith!"

tokens - "Hi", "John", "Smith", "!"

trailing_whitespace - true, true, false, false

labels - "O", "B-NAME_STUDENT", "I-NAME_STUDENT", "O"

In [7]:
# Raw text of the first training document.
train[0]['full_text']

"Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection\n\nThe tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.\n\nWhat exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1\n\nThis tool has many advantages:\n\n•  It is accessible to all and does not require significant material investment and can be done  quickly\n\n•  It is scalable\n\n•  It allows categorization and linking of information\n\n•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation of  new ideas\n\n•  It is suitable for all people and is easy to learn\n\n•  It is fun and encourages exchanges\n\n•  It makes visi

In [8]:
# Run Presidio on the first training document, restricted to the four listed entity types.
# Every "\n" is replaced by a single space, so the text length is unchanged and the
# start/end offsets of the results still index into the original full_text.
# NOTE(review): return_decision_process=True presumably attaches explanation details
# to each result — confirm against the Presidio docs; it is not read later in this cell.
results = analyzer.analyze(text=train[0]['full_text'].replace("\n"," "),
                           entities=["PHONE_NUMBER", "PERSON", "URL", "EMAIL_ADDRESS"],
                           language='en', return_decision_process = True)

## True Labels

In [9]:
# (token index, token, label) for every token of the first document whose label is not 'O'.
[
    (idx, token, label)
    for idx, (token, label) in enumerate(zip(train[0]['tokens'], train[0]['labels']))
    if label != 'O'
]

[(9, 'Nathalie', 'B-NAME_STUDENT'),
 (10, 'Sylla', 'I-NAME_STUDENT'),
 (482, 'Nathalie', 'B-NAME_STUDENT'),
 (483, 'Sylla', 'I-NAME_STUDENT'),
 (741, 'Nathalie', 'B-NAME_STUDENT'),
 (742, 'Sylla', 'I-NAME_STUDENT')]

## Predicted Labels

In [10]:
# (entity type, matched text, start offset, end offset) for every Presidio result,
# slicing the matched text out of the first document's original full_text.
[
    (res.entity_type, train[0]['full_text'][res.start:res.end], res.start, res.end)
    for res in results
]

[('PERSON', 'Nathalie Sylla\n\n', 52, 68),
 ('PERSON', 'Buzan T.', 263, 271),
 ('PERSON', 'Buzan B.', 276, 284),
 ('PERSON', "Dessine-moi  l'intelligence", 292, 319),
 ('PERSON', 'Cf Annex1', 533, 542),
 ('PERSON', 'Nathalie Sylla', 2281, 2295),
 ('PERSON', 'Nathalie Sylla\n\n', 3648, 3664)]

## Using Presidio predictions for token predictions [per token]

In [11]:
def tokens2index(row):
    """Locate every token of a document inside its full text.

    Tokens are searched for left to right; each search starts where the
    previous token ended, so repeated tokens map to successive occurrences.

    Args:
        row: mapping with a ``'full_text'`` string and a ``'tokens'`` list of
            strings that occur in ``'full_text'`` in that order.

    Returns:
        A ``(start_ind, end_ind)`` pair of lists. ``start_ind[k]`` is the
        inclusive start offset and ``end_ind[k]`` the exclusive end offset of
        token ``k`` within ``row['full_text']``.

    Raises:
        ValueError: if a token does not occur at or after the end of the
            previous token.
    """
    text = row['full_text']
    start_ind = []
    end_ind = []
    prev_ind = 0
    for tok in row['tokens']:
        # Search in place from prev_ind; slicing text[prev_ind:] would copy the
        # remainder of the document once per token.
        start = text.index(tok, prev_ind)
        end = start + len(tok)
        start_ind.append(start)
        end_ind.append(end)
        prev_ind = end
    return start_ind, end_ind
# Attach the per-token character offsets to every training document.
for doc in tqdm(train):
    start, end = tokens2index(doc)
    doc.update(start=start, end=end)

  0%|          | 0/6807 [00:00<?, ?it/s]

In [12]:
def find_or_next_larger(arr, target):
    """Return the index of ``target`` in ``arr``, or of the next larger element.

    Args:
        arr: list sorted in ascending order.
        target: value to look up.

    Returns:
        The index of the first element that is ``>= target``. When every
        element is smaller than ``target`` (or ``arr`` is empty) the result is
        ``len(arr)``, which is NOT a valid index; callers must check for it.
    """
    # bisect_left is the standard-library binary search with exactly this
    # contract; on duplicates it always returns the leftmost match.
    return bisect_left(arr, target)
def count_trailing_whitespaces(word):
    """Return how many whitespace characters ``word`` ends with."""
    stripped = word.rstrip()
    return len(word) - len(stripped)

## On train data

In [13]:
# Presidio entity type -> competition label (without the B-/I- prefix).
# The keys double as the list of entity types requested from the analyzer.
ENTITY_TO_LABEL = {
    'PHONE_NUMBER': 'PHONE_NUM',
    'PERSON': 'NAME_STUDENT',
    'URL': 'URL_PERSONAL',
    'EMAIL_ADDRESS': 'EMAIL',
}

# Token-level predictions for the first 10 training documents.
preds = []
for i, d in enumerate(tqdm(train[:10])):
    results = analyzer.analyze(text=d['full_text'],
                               entities=list(ENTITY_TO_LABEL),
                               language='en')
    pre_preds = []
    n_tokens = len(d['start'])
    for r in results:
        # First token whose start offset is at or after the entity start.
        s = find_or_next_larger(d['start'], r.start)
        if s >= n_tokens:
            # No token starts at or after this entity, so there is nothing to
            # label; emitting index n_tokens would be an invalid token id.
            continue
        word = d['full_text'][r.start:r.end]
        # Ignore whitespace at the end of the matched span when deciding
        # which tokens it covers.
        end = r.end - count_trailing_whitespaces(word)
        temp_preds = [s]
        # Extend over following tokens that end inside the span. The explicit
        # bound replaces a bare `except` that used to swallow the IndexError
        # at the end of the document (and any other error along with it).
        while s + 1 < n_tokens and d['end'][s + 1] <= end:
            s += 1
            temp_preds.append(s)
        if i == 0:
            # Show the token alignment for the first document only.
            print(temp_preds, r.entity_type, word, r.score)

        label = ENTITY_TO_LABEL[r.entity_type]
        for p in temp_preds:
            prev = pre_preds[-1] if pre_preds else None
            # A token continues an entity ("I-") when the previously emitted
            # prediction has the same entity type and is the token right
            # before it; otherwise it starts a new entity ("B-").
            if prev is not None and prev['rlabel'] == r.entity_type and p - prev['token'] == 1:
                label_f = "I-" + label
            else:
                label_f = "B-" + label
            pre_preds.append({
                "document": d['document'],
                "token": p,
                "label": label_f,
                "rlabel": r.entity_type,
            })
    preds.extend(pre_preds)

  0%|          | 0/10 [00:00<?, ?it/s]

[9, 10] PERSON Nathalie Sylla

 0.85
[52, 53] PERSON Buzan T. 0.85
[55, 56] PERSON Buzan B. 0.85
[60, 61, 62, 63, 64] PERSON Dessine-moi  l'intelligence 0.85
[482, 483] PERSON Nathalie Sylla 0.85
[741, 742] PERSON Nathalie Sylla 0.85


In [14]:
# Tabular view of the collected predictions, one row per predicted token.
pd.DataFrame(preds)

Unnamed: 0,document,token,label,rlabel
0,7,9,B-NAME_STUDENT,PERSON
1,7,10,I-NAME_STUDENT,PERSON
2,7,52,B-NAME_STUDENT,PERSON
3,7,53,I-NAME_STUDENT,PERSON
4,7,55,B-NAME_STUDENT,PERSON
...,...,...,...,...
97,123,1591,B-URL_PERSONAL,URL
98,123,1648,B-URL_PERSONAL,URL
99,123,1690,B-URL_PERSONAL,URL
100,123,1550,B-URL_PERSONAL,URL


## Inference and submission

In [15]:
# Load the test documents. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection.
with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json',
          encoding='utf-8') as f:
    test = json.load(f)
len(test)

10

In [16]:
# Attach the per-token character offsets to every test document.
for doc in tqdm(test):
    start, end = tokens2index(doc)
    doc.update(start=start, end=end)

  0%|          | 0/10 [00:00<?, ?it/s]

In [17]:
# Presidio entity type -> competition label (without the B-/I- prefix).
# The keys double as the list of entity types requested from the analyzer.
ENTITY_TO_LABEL = {
    'PHONE_NUMBER': 'PHONE_NUM',
    'PERSON': 'NAME_STUDENT',
    'URL': 'URL_PERSONAL',
    'EMAIL_ADDRESS': 'EMAIL',
}

# Token-level predictions for every test document.
preds = []
for i, d in enumerate(tqdm(test)):
    results = analyzer.analyze(text=d['full_text'],
                               entities=list(ENTITY_TO_LABEL),
                               language='en')
    pre_preds = []
    n_tokens = len(d['start'])
    for r in results:
        # First token whose start offset is at or after the entity start.
        s = find_or_next_larger(d['start'], r.start)
        if s >= n_tokens:
            # No token starts at or after this entity, so there is nothing to
            # label; emitting index n_tokens would be an invalid token id.
            continue
        word = d['full_text'][r.start:r.end]
        # Ignore whitespace at the end of the matched span when deciding
        # which tokens it covers.
        end = r.end - count_trailing_whitespaces(word)
        temp_preds = [s]
        # Extend over following tokens that end inside the span. The explicit
        # bound replaces a bare `except` that used to swallow the IndexError
        # at the end of the document (and any other error along with it).
        while s + 1 < n_tokens and d['end'][s + 1] <= end:
            s += 1
            temp_preds.append(s)
        if i == 0:
            # Show the token alignment for the first document only.
            print(temp_preds, r.entity_type, word, r.score)

        label = ENTITY_TO_LABEL[r.entity_type]
        for p in temp_preds:
            prev = pre_preds[-1] if pre_preds else None
            # A token continues an entity ("I-") when the previously emitted
            # prediction has the same entity type and is the token right
            # before it; otherwise it starts a new entity ("B-").
            if prev is not None and prev['rlabel'] == r.entity_type and p - prev['token'] == 1:
                label_f = "I-" + label
            else:
                label_f = "B-" + label
            pre_preds.append({
                "document": d['document'],
                "token": p,
                "label": label_f,
                "rlabel": r.entity_type,
            })
    preds.extend(pre_preds)

  0%|          | 0/10 [00:00<?, ?it/s]

[9, 10] PERSON Nathalie Sylla

 0.85
[52, 53] PERSON Buzan T. 0.85
[55, 56] PERSON Buzan B. 0.85
[60, 61, 62, 63, 64] PERSON Dessine-moi  l'intelligence 0.85
[482, 483] PERSON Nathalie Sylla 0.85
[741, 742] PERSON Nathalie Sylla 0.85


In [18]:
# Build the submission frame: row_id, document, token, label.
# Columns are selected by name, so the helper 'rlabel' field is dropped without
# relying on it being the last dict key, and an empty `preds` still yields a
# correctly shaped (empty) frame instead of a column-count error.
submission = (
    pd.DataFrame(preds, columns=['document', 'token', 'label'])
    .rename_axis('row_id')   # name the 0..n-1 index so reset_index emits 'row_id'
    .reset_index()
)
submission

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,52,B-NAME_STUDENT
3,3,7,53,I-NAME_STUDENT
4,4,7,55,B-NAME_STUDENT
...,...,...,...,...
97,97,123,1591,B-URL_PERSONAL
98,98,123,1648,B-URL_PERSONAL
99,99,123,1690,B-URL_PERSONAL
100,100,123,1550,B-URL_PERSONAL


In [19]:
# First 10 rows of the provided sample submission, to compare the expected column layout.
pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv').head(10)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT
5,5,7,742,I-NAME_STUDENT
6,6,10,0,B-NAME_STUDENT
7,7,10,1,I-NAME_STUDENT
8,8,10,464,B-NAME_STUDENT
9,9,10,465,I-NAME_STUDENT


In [20]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV
# (row_id is already a regular column).
submission.to_csv('submission.csv', index = False)

## Please upvote if you found it useful! 

2024 resolution:
* Competitions Master
* Datasets GM
* Notebooks GM
* Discussions Master