In [None]:
!pip install faker
!pip install presidio-analyzer
!pip install presidio-anonymizer

In [None]:
import re
from faker import Faker
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

In [None]:
# Shared objects used throughout the notebook:
#   analyzer   - detects PII spans in text
#   anonymizer - rewrites the detected spans
#   fake       - generates synthetic replacement values
analyzer, anonymizer = AnalyzerEngine(), AnonymizerEngine()
fake = Faker()

In [None]:
# Synthetic witness statement used as the sample input for every analysis
# below. It deliberately mixes several kinds of PII (names, dates, a credit
# card number, an IBAN, a driver's license number, an SSN, a Polish ID, phone
# numbers and e-mail addresses). Do not reformat it: the character offsets
# reported by the analyzer depend on the exact text.
document = """Date: October 19, 2021
 Witness: John Doe
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is John Doe and on October 19, 2021, my wallet was stolen in the vicinity of Kilmarnock during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number 4111 1111 1111 1111, which is registered under my name and linked to my bank account, PL61109010140000071219812874.

 Additionally, the wallet had a driver's license - DL No: 999000680 issued to my name. It also houses my Social Security Number, 602-76-4532.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 9:30 AM.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 999-888-7777, or through my personal email, johndoe@example.com.

 Please consider this information to be highly confidential and respect my privacy.

 The bank has been informed about the stolen credit card and necessary actions have been taken from their end. They will be reachable at their official email, support@bankname.com.
 My representative there is Victoria Cherry (her business phone: 987-654-3210).

 Thank you for your assistance,

 John Doe"""

In [None]:
def print_colored_pii(string):
    """Print ``string`` with every angle-bracketed placeholder shown in red.

    Args:
        string: Text to print. Each substring of the form ``<...>`` (no ``>``
            inside the brackets) is wrapped in ANSI red / reset escape codes.

    Returns:
        None. The highlighted text is written to standard output.
    """
    red_on = "\033[31m"
    red_off = "\033[0m"

    def _highlight(match):
        # Re-emit the captured placeholder surrounded by the colour codes.
        return red_on + match.group(1) + red_off

    print(re.sub(r"(<[^>]*>)", _highlight, string))

In [None]:
# Run the default recognizers over the statement and list every finding
# (entity type, character span and confidence score), one per line.
analyzer_results = analyzer.analyze(text=document, language="en")
for finding in analyzer_results:
    print(finding)

type: CREDIT_CARD, start: 360, end: 379, score: 1.0
type: IBAN_CODE, start: 446, end: 474, score: 1.0
type: EMAIL_ADDRESS, start: 950, end: 969, score: 1.0
type: EMAIL_ADDRESS, start: 1216, end: 1236, score: 1.0
type: UK_NHS, start: 1303, end: 1315, score: 1.0
type: DATE_TIME, start: 6, end: 22, score: 0.85
type: PERSON, start: 33, end: 41, score: 0.85
type: PERSON, start: 142, end: 150, score: 0.85
type: DATE_TIME, start: 158, end: 174, score: 0.85
type: LOCATION, start: 216, end: 226, score: 0.85
type: US_SSN, start: 606, end: 617, score: 0.85
type: DATE_TIME, start: 800, end: 807, score: 0.85
type: PERSON, start: 1266, end: 1281, score: 0.85
type: PERSON, start: 1353, end: 1361, score: 0.85
type: PHONE_NUMBER, start: 906, end: 918, score: 0.75
type: PHONE_NUMBER, start: 1303, end: 1315, score: 0.75
type: URL, start: 958, end: 969, score: 0.5
type: URL, start: 1224, end: 1236, score: 0.5
type: US_DRIVER_LICENSE, start: 535, end: 544, score: 0.4
type: IN_PAN, start: 216, end: 226, sco

In [None]:
# Pair each finding's matched text with its start/end offsets so the raw
# spans can be checked against the document by eye.
[(document[hit.start : hit.end], hit.start, hit.end) for hit in analyzer_results]


[('4111 1111 1111 1111', 360, 379),
 ('PL61109010140000071219812874', 446, 474),
 ('johndoe@example.com', 950, 969),
 ('support@bankname.com', 1216, 1236),
 ('987-654-3210', 1303, 1315),
 ('October 19, 2021', 6, 22),
 ('John Doe', 33, 41),
 ('John Doe', 142, 150),
 ('October 19, 2021', 158, 174),
 ('Kilmarnock', 216, 226),
 ('602-76-4532', 606, 617),
 ('9:30 AM', 800, 807),
 ('Victoria Cherry', 1266, 1281),
 ('John Doe', 1353, 1361),
 ('999-888-7777', 906, 918),
 ('987-654-3210', 1303, 1315),
 ('example.com', 958, 969),
 ('bankname.com', 1224, 1236),
 ('999000680', 535, 544),
 ('Kilmarnock', 216, 226),
 ('registered', 390, 400),
 ('999000680', 535, 544),
 ('999000680', 535, 544)]

In [None]:
# Anonymize with no explicit operators and print the result with the
# `<ENTITY_TYPE>` placeholders highlighted.
anonymized_results = anonymizer.anonymize(text=document, analyzer_results=analyzer_results)
print_colored_pii(anonymized_results.text)

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is [31m<IN_PAN>[0m under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<DATE_TIME>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, [31m<PHONE_NUMBER>

In [None]:
# Regex patterns for two entities the default run did not label the way we
# want: the Polish identity card number was left untouched, and the
# time of day was only reported as a generic DATE_TIME.
#
# The regexes are raw strings so that `\d` and `\b` reach the regex engine
# unchanged (in a normal string literal `\d` is an invalid escape sequence).
# The `\b` word boundaries keep these maximum-score patterns from firing inside
# a longer token, e.g. "9:30 AM" inside "19:30 AM".
polish_id_pattern = Pattern(
    name="polish_id_pattern",
    regex=r"\b[A-Z]{3}\d{6}\b",  # three letters followed by six digits
    score=1,
)
time_pattern = Pattern(
    name="time_pattern",
    regex=r"\b(1[0-2]|0?[1-9]):[0-5][0-9] (AM|PM)\b",  # 12-hour clock, e.g. 9:30 AM
    score=1,
)

# Define the recognizer with one or more patterns
polish_id_recognizer = PatternRecognizer(
    supported_entity="POLISH_ID", patterns=[polish_id_pattern]
)
time_recognizer = PatternRecognizer(supported_entity="TIME", patterns=[time_pattern])

In [None]:
# Build a fresh engine before registering, so re-running this cell never adds
# the same custom recognizer to an existing registry twice.
analyzer = AnalyzerEngine()
for custom_recognizer in (polish_id_recognizer, time_recognizer):
    analyzer.registry.add_recognizer(custom_recognizer)

In [None]:
# Re-run the analysis with the engine that now includes the custom recognizers.
analyzer_results = analyzer.analyze(text=document, language="en")

In [None]:
# Anonymize again using the latest analyzer results and print the text with
# the placeholders highlighted.
anonymized_results = anonymizer.anonymize(text=document, analyzer_results=analyzer_results)
print_colored_pii(anonymized_results.text)

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is [31m<IN_PAN>[0m under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number [31m<POLISH_ID>[0m.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<TIME>[0m.

 In case any information arises regarding my wallet, please reach out to me on my phone number, [31m<PHONE_N

In [None]:
# Browse what Faker can generate. Dunder and private attributes are filtered
# out so the listing starts directly with the public names (the generators
# used below, such as `name`, `email`, `phone_number` and `bothify`).
[attr for attr in dir(fake) if not attr.startswith("_")]

['__annotations__',
 '__class__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_factories',
 '_factory_map',
 '_locales',
 '_map_provider_method',
 '_optional_proxy',
 '_select_factory',
 '_select_factory_choice',
 '_select_factory_distribution',
 '_unique_proxy',
 '_weights',
 'aba',
 'add_provider',
 'address',
 'administrative_unit',
 'am_pm',
 'android_platform_token',
 'ascii_company_email',
 'ascii_email',
 'ascii_free_email',
 'ascii_safe_email',
 'bank_country',
 'basic_phone_number',
 'bban',
 'binary',
 'boolean',
 'bothify',
 'bs',
 'building_number',
 'cache_pattern',
 'catch_phrase',
 'century',
 'chr

In [None]:
def fake_name(_=None):
    """Return a synthetic person name produced by the shared ``fake`` instance.

    Args:
        _: Ignored. Accepted so the function can be called with or without
            one positional argument.

    Returns:
        The value of ``fake.name()``.
    """
    generated = fake.name()
    return generated


def fake_polish_id(_=None):
    """Return a synthetic ID shaped like the Polish ID in the sample text.

    Args:
        _: Ignored. Accepted so the function can be called with or without
            one positional argument.

    Returns:
        The upper-cased result of filling the template ``???######`` via
        ``fake.bothify`` (e.g. ``LFV414933``).
    """
    template = "???######"
    filled = fake.bothify(text=template)
    return filled.upper()


def fake_time(_=None):
    """Return a synthetic time of day formatted with ``%I:%M %p``.

    Args:
        _: Ignored. Accepted so the function can be called with or without
            one positional argument.

    Returns:
        The string produced by ``fake.time`` for that pattern
        (e.g. ``09:27 AM``).
    """
    clock_format = "%I:%M %p"
    return fake.time(pattern=clock_format)


# Print one sample from each generator, one per line.
print(fake_name(), fake_polish_id(), fake_time(), sep="\n")

Kelly Miller
LFV414933
09:27 AM


In [None]:
# Per-entity "custom" operators: each one swaps the detected value for a
# freshly generated fake. The callable receives the original text and ignores
# it. Entity types without an entry here are not given a custom operator.
fake_operators = {
    entity_type: OperatorConfig("custom", {"lambda": make_fake})
    for entity_type, make_fake in (
        ("PERSON", lambda _: fake.name()),
        ("PHONE_NUMBER", lambda _: fake.phone_number()),
        ("EMAIL_ADDRESS", lambda _: fake.email()),
        ("POLISH_ID", lambda _: fake_polish_id()),
        ("TIME", lambda _: fake_time()),
    )
}


In [None]:
# Anonymize once more, this time substituting fake values for the entity types
# configured in `fake_operators`, and print the result with any remaining
# `<ENTITY_TYPE>` placeholders highlighted.
anonymized_text = anonymizer.anonymize(
    text=document,
    analyzer_results=analyzer_results,
    operators=fake_operators,
)
print_colored_pii(anonymized_text.text)

Date: [31m<DATE_TIME>[0m
 Witness: Rhonda Davis
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is Joseph Cooper DVM and on [31m<DATE_TIME>[0m, my wallet was stolen in the vicinity of [31m<LOCATION>[0m during a bike trip. This wallet contains some very important things to me.

 Firstly, the wallet contains my credit card with number [31m<CREDIT_CARD>[0m, which is [31m<IN_PAN>[0m under my name and linked to my bank account, [31m<IBAN_CODE>[0m.

 Additionally, the wallet had a driver's license - DL No: [31m<US_DRIVER_LICENSE>[0m issued to my name. It also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number EVE651816.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 11:42 PM.

 In case any information arises regarding my wallet, please reach out to me on my phone number, 001-383-740-8395, or through my pers