In [1]:
from presidio_evaluator.data_generator import PresidioDataGenerator
import pandas as pd
from presidio_evaluator.data_generator.faker_extensions import (
    FakerSpansResult,
    RecordsFaker,
    NationalityProvider,
    OrganizationProvider,
    AgeProvider,
    AddressProviderNew,
    PhoneNumberProviderNew,
)
import pprint
import numpy as np
from collections import Counter
from presidio_evaluator import InputSample
from typing import Dict, List
import tqdm
from presidio_evaluator.validation import split_dataset, save_to_json
from datetime import date


In [11]:
templates_file_path = './data/train_templates.txt'
sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)

In [None]:
fake_name_generator_file = 'data/FakeNameGenerator.com_3000.csv'
nationalities = ['French', 'American', 'Italian',
        'Norwegian', 'Spanish', 'Japanese', 'Swedish',
       'Scottish', 'Hungarian', 'Dutch', 'English', 'Brazil',
        'Russian',  'Danish', 
       'German', 'Australian']

In [3]:
fake_name_generator_df = pd.read_csv(fake_name_generator_file)
fake_name_generator_df = fake_name_generator_df[fake_name_generator_df["NameSet"].isin(nationalities)]

In [4]:
fake_name_generator_df

Unnamed: 0,Number,Gender,NameSet,Title,GivenName,MiddleInitial,Surname,StreetAddress,City,State,...,Birthday,Age,CCType,CCNumber,CVV2,CCExpires,NationalID,Occupation,Company,Domain
1,2,female,French,Ms.,Patricia,G,Desrosiers,Avenida Noruega 42,Vila Real,VR,...,2/28/1956,63,MasterCard,5.317250e+15,874,Mar-22,,Vascular technologist,Formula Gray,LostMillions.com.pt
2,3,female,American,Ms.,Debra,O,Neal,1659 Hoog St,Brakpan,GA,...,6/11/1957,62,Visa,4.916430e+15,315,May-20,5.70611E+12,Information architect librarian,Dahlkemper's,MediumTube.co.za
3,4,male,French,Mr.,Peverell,C,Racine,183 Epimenidou Street,Limassol,LI,...,6/14/1962,57,Visa,4.485420e+15,653,May-23,,Desk clerk,Quickbiz,ImproveLook.com.cy
5,6,male,Italian,Mr.,Domenico,D,Pisano,Via Pisanelli 104,Traversara,RA,...,6/1/1979,40,Visa,4.532870e+15,237,Jun-23,WK48391724,Professional scout,Littler's,HardDriveBlog.it
7,8,female,French,Mrs.,Ormazd,M,Jomphe,Mattenstrasse 108,Sissach,,...,1/14/1999,20,Visa,4.556600e+15,691,Jun-24,,Clinical psychologist,Linens 'n Things,CyclingMonthly.ch
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2993,2994,male,Scottish,Mr.,Codey,M,Ross,Gardabraut 63,Garður,,...,3/15/1966,53,Visa,4.929240e+15,563,Sep-21,,Animator,Henry's Hamburgers,PoliticalKing.is
2994,2995,male,Italian,Mr.,Leone,C,Mazzi,Via Cavour 17,Filorsi,CE,...,10/23/1978,41,Visa,4.556830e+15,980,Dec-22,MA79272200,Nurse,ABCO Foods,VoipObserver.it
2997,2998,female,American,Mrs.,Margaret,S,Stouffer,26 rue Ernest Renan,CHERBOURG,BN,...,3/18/1950,69,MasterCard,5.580810e+15,621,Jul-23,2500301468313 93,Food cooking machine operator,Kash n' Karry,DualBags.fr
2998,2999,male,Brazil,Mr.,Luís,C,Dias,4022 Oakridge Lane,Dallas,TX,...,3/18/1988,31,MasterCard,5.417480e+15,129,Sep-24,638-78-3230,Accounts receivable clerk,Greenwich IGA,wwwcrv.com


In [5]:
data_generator = PresidioDataGenerator()
fake_name_generator_df = PresidioDataGenerator.update_fake_name_generator_df(fake_name_generator_df)

In [6]:
fake = RecordsFaker(records=fake_name_generator_df)
fake.add_provider(NationalityProvider)
fake.add_provider(OrganizationProvider)
fake.add_provider(AgeProvider)
fake.add_provider(AddressProviderNew)
fake.add_provider(PhoneNumberProviderNew)



In [7]:
data_generator = PresidioDataGenerator(
    custom_faker=fake, lower_case_ratio=0.5
)

In [8]:
data_generator.add_provider_alias(
    provider_name="credit_card_number", new_name="credit_card"
)

In [12]:
fake_records = data_generator.generate_fake_data(
    templates=sentence_templates, n_samples=300
)

fake_records = list(fake_records)
pprint.pprint(fake_records[0])

Preparing sample sentences for ingestion


Sampling: 100%|██████████| 300/300 [00:00<00:00, 9653.99it/s]

{"fake": "lucinda, can i please speak to your boss?", "spans": [{"value": "lucinda", "start": 0, "end": 7, "type": "first_name"}], "template": "{{first_name}}, can I please speak to your boss?", "template_id": 5}





In [13]:
count_per_template_id = Counter([sample.template_id for sample in fake_records])

print(f"Total: {sum(count_per_template_id.values())}")
print(f"Avg # of records per template: {np.mean(list(count_per_template_id.values()))}")
print(f"Median # of records per template: {np.median(list(count_per_template_id.values()))}")
print(f"Std: {np.std(list(count_per_template_id.values()))}")

Total: 300
Avg # of records per template: 7.894736842105263
Median # of records per template: 8.0
Std: 2.899995223990718


In [14]:
count_per_entity = Counter()
for record in fake_records:
    count_per_entity.update(Counter([span.type for span in record.spans]))

count_per_entity

Counter({'first_name': 13,
         'address': 72,
         'last_name': 29,
         'date_of_birth': 28,
         'name': 46,
         'prefix_male': 19,
         'country': 30,
         'credit_card': 43,
         'day_of_week': 12,
         'city': 2,
         'prefix': 2,
         'email': 29,
         'company': 35,
         'phone_number': 34,
         'date_time': 11,
         'name_male': 10,
         'last_name_female': 8,
         'prefix_female': 8})

In [15]:
translator = {
    "iban": "IBAN_CODE",
    "company": "ORGANIZATION",
    "organization": "ORGANIZATION",
    "name_female": "PERSON",
    "address": "STREET_ADDRESS",
    "country": "GPE",
    "state": "GPE",
    "credit_card": "CREDIT_CARD",
    "city": "GPE",
    "street_name": "STREET_ADDRESS",
    "building_number": "STREET_ADDRESS",
    "name": "PERSON",
    "last_name": "PERSON",
    "last_name_male": "PERSON",
    "last_name_female": "PERSON",
    "first_name": "PERSON",
    "first_name_male": "PERSON",
    "first_name_female": "PERSON",
    "phone_number": "PHONE_NUMBER",
    "email": "EMAIL_ADDRESS",
    "date_time": "DATE_TIME",
    "date_of_birth": "DATE_TIME",
    "day_of_week": "DATE_TIME",
    "name_male": "PERSON",
    "prefix_male": "TITLE",
    "prefix_female": "TITLE",
    "prefix": "TITLE",
    "nationality": "NRP",
    "first_name_nonbinary": "PERSON",
    "postcode": "STREET_ADDRESS",
    "secondary_address": "STREET_ADDRESS",
    "job": "TITLE",
    "state_abbr": "GPE",
    "age": "AGE",
}

def update_entity_types(dataset:List[FakerSpansResult], entity_mapping:Dict[str,str]):
    """Replace entity types using a translator dictionary."""

    for sample in dataset:
        # update entity types on spans
        for span in sample.spans:
            span.type = entity_mapping[span.type]
        # update entity types on the template string
        for key, value in entity_mapping.items():
            sample.template = sample.template.replace("{{" + key + "}}", "{{" + value + "}}")

update_entity_types(fake_records, entity_mapping=translator)

In [16]:
count_per_entity_new = Counter()
for record in fake_records:
    for span in record.spans:
        count_per_entity_new[span.type] += 1

count_per_entity_new.most_common()

[('PERSON', 106),
 ('STREET_ADDRESS', 72),
 ('DATE_TIME', 51),
 ('CREDIT_CARD', 43),
 ('ORGANIZATION', 35),
 ('PHONE_NUMBER', 34),
 ('GPE', 32),
 ('TITLE', 29),
 ('EMAIL_ADDRESS', 29)]

In [18]:
input_samples = [
    InputSample.from_faker_spans_result(faker_spans_result=fake_record, scheme="BIO")
    for fake_record in tqdm.tqdm(fake_records)
]



loading model en_core_web_sm


100%|██████████| 300/300 [00:02<00:00, 107.89it/s]


In [19]:
TRAIN_TEST_RATIOS = [0.7,0.3]
train,test = split_dataset(input_samples, TRAIN_TEST_RATIOS)

In [20]:
train[0]

Full text: I'm originally from France
Spans: [Type: GPE, value: France, start: 20, end: 26]
Tokens: I'm originally from France
Tags: ['O', 'O', 'O', 'O', 'B-GPE']

In [21]:
DATE_DATE = date.today().strftime("%b-%d-%Y")

save_to_json(train, "./data/train_{}.json".format(DATE_DATE))
save_to_json(test, "./data/val_{}.json".format(DATE_DATE))

In [23]:
train_data = pd.read_json("./data/train_Apr-12-2023.json")
val_data = pd.read_json("./data/val_Apr-12-2023.json")

In [414]:
# train_data = pd.read_json("./data/train__300__final.json")
# val_data = pd.read_json("./data/val__300__final.json")

In [24]:
def entity_from_span(data, field="entity_type"):
    spans = data.spans
    entities = []
    for span in spans:
        if len(span) > 0:
            for s in span:
                entities.append(s[field])
    return entities

In [25]:
train_entity_count = Counter(entity_from_span(train_data, "entity_value"))
val_entity_count = Counter(entity_from_span(val_data, "entity_value"))

In [26]:
def test_unique_entity_count(data):
    entity_count = Counter(entity_from_span(data, "entity_value"))
    entities = list(entity_count.keys())
    print(sum(entity_count.values()))
    print(len(entities))
    assert sum(entity_count.values()) == len(entities)
    

In [27]:
def test_no_duplicate_entity(train, val):
    train_entities = entity_from_span(train, "entity_value")
    val_entities = entity_from_span(val, "entity_value")
    intersection = set(train_entities) & set(val_entities)
    assert len(intersection) == 0, print(intersection)

In [28]:
def test_missing_classes(train, val):
    train_entities = entity_from_span(train, "entity_type")
    val_entities = entity_from_span(val, "entity_type")
    print(set(val_entities))
    intersection = set(train_entities) & set(val_entities)
    print(intersection)
    print(set(train_entities))
    assert len(intersection) == len(set(train_entities)), f"{intersection}"
    

In [29]:
def test_unique_templates(train, val):
    train_templates = train.template_id.unique()
    val_templates = val.template_id.unique()

    intersection = set(train_templates) & set(val_templates)

    assert len(intersection) == 0, f"{intersection}"
    
    
    

In [31]:
# test_unique_entity_count(train_data)
# test_unique_entity_count(val_data)
test_no_duplicate_entity(train_data, val_data)
test_missing_classes(train_data, val_data)
test_unique_templates(train_data, val_data)

{'Denmark', 'mr.', 'Mr.', 'dr.', 'Dr.'}


AssertionError: None

In [491]:
val_data = pd.concat([val_data, train_data[train_data.template_id.isin([50])]], ignore_index=True)

In [492]:
train_data = train_data[~train_data.template_id.isin([50])]

In [502]:
val_data

Unnamed: 0,full_text,masked,spans,template_id,metadata
0,who's coming to spain with me?,Who's coming to {{GPE}} with me?,"[{'entity_type': 'GPE', 'entity_value': 'spain...",16,
1,"unlike the eriksen novel, it's not about necro...","Unlike the {{PERSON}} novel, it's not about ne...","[{'entity_type': 'PERSON', 'entity_value': 'er...",82,
2,have you been to a romina m ruelas concert bef...,Have you been to a {{PERSON}} concert before?,"[{'entity_type': 'PERSON', 'entity_value': 'ro...",78,
3,How can we reach you? You can call 435 19 927,How can we reach you? You can call {{PHONE_NUM...,"[{'entity_type': 'PHONE_NUMBER', 'entity_value...",59,
4,How can we reach you? You can call 25 470375,How can we reach you? You can call {{PHONE_NUM...,"[{'entity_type': 'PHONE_NUMBER', 'entity_value...",59,
...,...,...,...,...,...
97,"unlike the smidt novel, it's not about necroph...","Unlike the {{PERSON}} novel, it's not about ne...","[{'entity_type': 'PERSON', 'entity_value': 'sm...",82,
98,please tell me your date of birth. it's 6/28/1939,Please tell me your date of birth. It's {{DATE...,"[{'entity_type': 'DATE_TIME', 'entity_value': ...",50,
99,Please tell me your date of birth. It's 12/15/...,Please tell me your date of birth. It's {{DATE...,"[{'entity_type': 'DATE_TIME', 'entity_value': ...",50,
100,Please tell me your date of birth. It's 2/27/1949,Please tell me your date of birth. It's {{DATE...,"[{'entity_type': 'DATE_TIME', 'entity_value': ...",50,


In [503]:
train_data

Unnamed: 0,full_text,masked,spans,template_id,metadata
0,what are my options?,What are my options?,[],30,
1,"hi manlio, i'm contacting you about a problem ...","Hi {{PERSON}}, I'm contacting you about a prob...","[{'entity_type': 'PERSON', 'entity_value': 'ma...",64,
2,"in case of my child's account, we need to add ...","In case of my child's account, we need to add ...","[{'entity_type': 'PERSON', 'entity_value': 'be...",4,
3,I want to increase limit on my card # 65494197...,I want to increase limit on my card # {{CREDIT...,"[{'entity_type': 'CREDIT_CARD', 'entity_value'...",0,
4,Maybe it's under Aston Lind,Maybe it's under {{PERSON}},"[{'entity_type': 'PERSON', 'entity_value': 'As...",67,
...,...,...,...,...,...
193,It's like that since 11/28/1938,It's like that since {{DATE_TIME}},"[{'entity_type': 'DATE_TIME', 'entity_value': ...",68,
194,"hello, this is mr. mr. tyler sato. who are you?","Hello, this is {{TITLE}} {{PERSON}}. Who are you?","[{'entity_type': 'PERSON', 'entity_value': 'mr...",41,
195,"for my take on ms. ruud, see guilty pleasures:...","For my take on {{TITLE}} {{PERSON}}, see Guilt...","[{'entity_type': 'PERSON', 'entity_value': 'ru...",81,
197,my name appears incorrectly on credit card sta...,My name appears incorrectly on credit card sta...,"[{'entity_type': 'PERSON', 'entity_value': 'ja...",20,


In [494]:
val_data.to_json('./data/val_300_final.json', orient='records')

In [495]:
train_data.to_json('./data/train__300__final.json', orient='records')

In [496]:
train_input_samples = InputSample.read_dataset_json('./data/train__300__final.json')

tokenizing input: 100%|██████████| 195/195 [00:00<00:00, 232.86it/s]


In [498]:
val_input_samples = InputSample.read_dataset_json('./data/val_300_final.json')

tokenizing input: 100%|██████████| 102/102 [00:00<00:00, 220.30it/s]


In [504]:
def count_entity_types(data):
    count_per_entity_new = Counter()
    for record in data:
        for span in record.spans:
            count_per_entity_new[span.type] += 1

    print(count_per_entity_new.most_common())

In [505]:
count_entity_types(train_input_samples)

AttributeError: 'Span' object has no attribute 'type'

In [501]:
count_entity_types(val_input_samples)

[('PERSON', 136), ('STREET_ADDRESS', 66), ('CREDIT_CARD', 39), ('ORGANIZATION', 29), ('GPE', 27), ('TITLE', 19), ('EMAIL_ADDRESS', 16), ('PHONE_NUMBER', 15), ('DATE_TIME', 11), ('NRP', 7), ('IBAN_CODE', 3)]


In [519]:
val_input_samples[0].spans[0]

Type: GPE, value: spain, start: 16, end: 21

In [None]:
{'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1}, 'overall_precision': 0.5, 'overall_recall': 0.5, 'overall_f1': 0.5, 'overall_accuracy': 0.8}
