In [1]:
# Install the notebook's dependencies. No versions are pinned, so whatever
# pip resolves at run time is what gets installed.
!pip install spacy
!pip install faker
# NOTE(review): the transformer pipeline is downloaded here, but the cells
# visible in this notebook only ever build `spacy.blank("en")` — confirm
# whether this large download is still needed.
!python -m spacy download en_core_web_trf
!pip install spacy-lookups-data


Collecting faker
  Downloading Faker-36.1.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-36.1.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-36.1.1
Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Coll

In [2]:
import spacy
import random
from faker import Faker
from spacy.tokens import DocBin
from spacy.util import filter_spans
from spacy.training.example import Example

In [3]:
# Shared Faker instance read by generate_fake_data below. No seed is set in
# the visible cells, so the generated data is presumably not reproducible
# across runs — TODO confirm / seed if reproducibility matters.
fake = Faker()

def generate_fake_data(n=100, faker=None):
    """Generate synthetic sentences containing PII, with entity annotations.

    Each sentence follows the template
    ``My name is <name>, you can reach me at <email> or call me at <phone>.
    I live in <city>.`` and is annotated with PERSON, EMAIL, PHONE and GPE
    character spans.

    Entity offsets are tracked while the sentence is assembled rather than
    recovered afterwards with ``str.find``. ``find`` returns the *first*
    occurrence, so a value that also appears earlier in the sentence (for
    example the city "Paris" inside the name "Paris Hilton") would be
    annotated at the wrong position.

    Args:
        n: Number of examples to generate.
        faker: Optional object exposing ``name()``, ``email()``,
            ``phone_number()`` and ``city()``. Defaults to the module-level
            ``fake`` instance; pass a seeded or stubbed generator for
            reproducible output.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples in the format used by the training cells.
    """
    source = fake if faker is None else faker
    training_data = []

    for _ in range(n):
        # (literal text preceding the value, generated value, entity label).
        # The generator calls stay in the order name -> email -> phone -> city.
        fields = [
            ("My name is ", source.name(), "PERSON"),
            (", you can reach me at ", source.email(), "EMAIL"),
            (" or call me at ", source.phone_number(), "PHONE"),
            (". I live in ", source.city(), "GPE"),
        ]

        pieces = []
        entities = []
        cursor = 0  # length of the text assembled so far
        for prefix, value, label in fields:
            pieces.append(prefix)
            cursor += len(prefix)
            if value:  # a zero-width entity span is meaningless; skip it
                entities.append((cursor, cursor + len(value), label))
            pieces.append(value)
            cursor += len(value)
        pieces.append(".")

        training_data.append(("".join(pieces), {"entities": entities}))

    return training_data

# Build the 200-example synthetic training set consumed by the DocBin cell.
TRAIN_DATA = generate_fake_data(200)

In [4]:
# Blank English pipeline; its tokenizer is what turns the raw text into Docs.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Entity labels shared with the training cell.
labels = ["PERSON", "EMAIL", "PHONE", "GPE"]
for label in labels:
    ner.add_label(label)

doc_bin = DocBin()

for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    candidate_spans = [
        doc.char_span(start, end, label)
        for start, end, label in annotations["entities"]
    ]
    # char_span returns None when the character offsets do not line up with
    # token boundaries; filter_spans cannot handle None, so drop those
    # annotations instead of crashing.
    aligned_spans = [span for span in candidate_spans if span is not None]
    # filter_spans resolves overlaps (longest span wins) so that doc.ents
    # receives a non-overlapping set.
    doc.ents = filter_spans(aligned_spans)
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")  # Save the annotated docs to disk


In [5]:
# Start from a fresh blank English pipeline with an untrained NER component.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Register the entity labels defined in the data-preparation cell.
for label in labels:
    ner.add_label(label)

# Rebuild training examples from the serialized docs. `Example` is already
# imported in the notebook's import cell, so it is not re-imported here.
doc_bin = DocBin().from_disk("train.spacy")
examples = [
    Example.from_dict(
        nlp.make_doc(doc.text),
        {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]},
    )
    for doc in doc_bin.get_docs(nlp.vocab)
]

# Initialize the pipeline. `initialize` is the spaCy v3 replacement for the
# deprecated `begin_training`; it returns the optimizer used for updates.
optimizer = nlp.initialize(lambda: examples)

# Train model
for epoch in range(10):  # Adjust number of epochs
    random.shuffle(examples)
    losses = {}
    for batch in spacy.util.minibatch(examples, size=2):
        # Pass the optimizer explicitly so it is clear which one is stepping.
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1}, Loss: {losses}")

# Save model
nlp.to_disk("/content/drive/MyDrive/Sandbox")

print("Training complete! Model saved.")


Epoch 1, Loss: {'ner': 1491.441457826491}
Epoch 2, Loss: {'ner': 476.83335663121096}
Epoch 3, Loss: {'ner': 100.53799797239544}
Epoch 4, Loss: {'ner': 49.25246650983737}
Epoch 5, Loss: {'ner': 35.95915503957156}
Epoch 6, Loss: {'ner': 28.026148835909666}
Epoch 7, Loss: {'ner': 12.40874790891195}
Epoch 8, Loss: {'ner': 7.022409385059929}
Epoch 9, Loss: {'ner': 14.442039040160223}
Epoch 10, Loss: {'ner': 3.3278634473429127}
Training complete! Model saved.


In [7]:
# Load trained model
# NOTE(review): the execution counter jumps from In [5] to In [7], so a cell
# that was run is missing from this notebook. The path looks like a Google
# Drive mount, which no visible cell performs — confirm the notebook survives
# Restart & Run All.
nlp = spacy.load("/content/drive/MyDrive/Sandbox")

# Hand-written sentence that does not follow the training template.
test_text = "Nikita lives in India and his email is sknkpk@example.com. Call him at 987-93842-4334."

doc = nlp(test_text)

# Print every detected entity with its predicted label.
# NOTE(review): the recorded output labels both the name and the e-mail
# address as GPE, so the model does not generalize beyond the template.
for ent in doc.ents:
    print(f"Detected: {ent.text} -> {ent.label_}")


Detected: Nikita -> GPE
Detected: India -> GPE
Detected: sknkpk@example.com -> GPE
Detected: 987-93842-4334 -> PHONE


In [8]:
def redact_pii(text, pipeline=None, placeholder="[REDACTED]"):
    """Return ``text`` with every detected entity replaced by a placeholder.

    Redaction is done by character offset (``ent.start_char`` /
    ``ent.end_char``) instead of ``str.replace`` on the entity text.
    Replacing by text is order-dependent: once "John" has been replaced, a
    later entity "John Doe" no longer matches and "Doe" leaks through. It
    also rewrites occurrences the model never flagged.

    Args:
        text: The raw string to redact.
        pipeline: Optional callable mapping a string to an object with an
            ``ents`` iterable whose items expose ``start_char`` and
            ``end_char``. Defaults to the notebook-level ``nlp`` pipeline.
        placeholder: Replacement string inserted for each entity.

    Returns:
        The redacted string. ``text`` is returned unchanged when no entities
        are detected.
    """
    analyze = nlp if pipeline is None else pipeline
    doc = analyze(text)

    pieces = []
    cursor = 0  # index in `text` up to which output has been emitted
    for ent in sorted(doc.ents, key=lambda span: span.start_char):
        if ent.start_char < cursor:
            # Overlaps a span that is already redacted: extend the redacted
            # region so the tail of this entity is not emitted.
            cursor = max(cursor, ent.end_char)
            continue
        pieces.append(text[cursor:ent.start_char])
        pieces.append(placeholder)
        cursor = ent.end_char
    pieces.append(text[cursor:])
    return "".join(pieces)

# Test redaction on a synthetic paragraph that mixes many kinds of PII.
# Several of them (SSN, credit card, passport and licence numbers, street
# address) are not among the four labels the model was trained on
# (PERSON, EMAIL, PHONE, GPE).
test_text = "John Doe, a 34-year-old software engineer, recently moved to 5678 Maple Street, Springfield, IL, 62704. His Social Security Number is 123-45-6789, and his personal email is johndoe89@gmail.com. John often uses his work email, j.doe@techsolutions.com, for professional communications. His mobile number, +1-312-555-0198, is linked to his Chase Bank account, which uses the routing number 071000013. On March 15, 1990, John was born in Los Angeles, CA. His driver's license number is D1234567, issued by the state of Illinois. He recently booked a flight using his passport number X12345678, and his credit card details (Visa: 4111-1111-1111-1111, Exp: 12/26, CVV: 456) were saved in the airline's database. His wife, Emily Doe, born on July 9, 1992, works at Global Marketing Ltd. with the email emily.doe@globalmkt.com. Their home WiFi is named 'DoeHomeWiFi' and is secured with the password 'Springfield2024!'. John’s medical records indicate he has an appointment on April 10, 2025, at Springfield General Hospital. His insurance policy number is A123456789 with BlueCross BlueShield. His LinkedIn profile, linkedin.com/in/johndoe89, has all his professional details, while his Twitter handle @johndtweets is used for casual updates.."

# Show the input and the redacted result one after the other for comparison.
print("Original:", test_text)
print("Redacted:", redact_pii(test_text))


Original: John Doe, a 34-year-old software engineer, recently moved to 5678 Maple Street, Springfield, IL, 62704. His Social Security Number is 123-45-6789, and his personal email is johndoe89@gmail.com. John often uses his work email, j.doe@techsolutions.com, for professional communications. His mobile number, +1-312-555-0198, is linked to his Chase Bank account, which uses the routing number 071000013. On March 15, 1990, John was born in Los Angeles, CA. His driver's license number is D1234567, issued by the state of Illinois. He recently booked a flight using his passport number X12345678, and his credit card details (Visa: 4111-1111-1111-1111, Exp: 12/26, CVV: 456) were saved in the airline's database. His wife, Emily Doe, born on July 9, 1992, works at Global Marketing Ltd. with the email emily.doe@globalmkt.com. Their home WiFi is named 'DoeHomeWiFi' and is secured with the password 'Springfield2024!'. John’s medical records indicate he has an appointment on April 10, 2025, at Sp