In [None]:
!pip install spacy
!pip install faker
!python -m spacy download en_core_web_trf
!pip install spacy-lookups-data


[0mCollecting spacy-lookups-data
  Using cached spacy_lookups_data-1.0.5-py2.py3-none-any.whl.metadata (4.8 kB)
Downloading spacy_lookups_data-1.0.5-py2.py3-none-any.whl (98.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.5


In [None]:
import spacy
import random
from faker import Faker
from spacy.tokens import DocBin
from spacy.util import filter_spans
from spacy.training.example import Example


In [None]:
fake = Faker()

def generate_fake_data(n=100):
    """Generates fake text data containing PII with labeled entity annotations."""
    training_data = []

    for _ in range(n):
        name = fake.name()
        email = fake.email()
        phone = fake.phone_number()
        city = fake.city()

        text = f"My name is {name}, you can reach me at {email} or call me at {phone}. I live in {city}."

        entities = []
        start = text.find(name)
        if start != -1:
            entities.append((start, start+len(name), "PERSON"))

        start = text.find(email)
        if start != -1:
            entities.append((start, start+len(email), "EMAIL"))

        start = text.find(phone)
        if start != -1:
            entities.append((start, start+len(phone), "PHONE"))

        start = text.find(city)
        if start != -1:
            entities.append((start, start+len(city), "GPE"))

        training_data.append((text, {"entities": entities}))

    return training_data

TRAIN_DATA = generate_fake_data(200)

NameError: name 'Faker' is not defined

In [None]:
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

labels = ["PERSON", "EMAIL", "PHONE", "GPE"]
for label in labels:
    ner.add_label(label)

doc_bin = DocBin()

for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    ents = [doc.char_span(start, end, label) for start, end, label in annotations["entities"]]
    ents = filter_spans(ents)
    doc.ents = ents
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")  # Saveing to disk here


In [1]:
# Load blank spaCy model and add NER pipeline
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Add entity labels
for label in labels:
    ner.add_label(label)

# Load training data
from spacy.training.example import Example

doc_bin = DocBin().from_disk("train.spacy")
examples = [Example.from_dict(nlp.make_doc(doc.text), {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in doc_bin.get_docs(nlp.vocab)]

# Train model
optimizer = nlp.begin_training()
for epoch in range(10):  # Adjust number of epochs
    random.shuffle(examples)
    losses = {}
    for batch in spacy.util.minibatch(examples, size=2):
        nlp.update(batch, drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1}, Loss: {losses}")

# Save model
nlp.to_disk("/content/drive/MyDrive/Sandbox 2025")

print("Training complete! Model saved.")


NameError: name 'spacy' is not defined

In [None]:
# Load trained model
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")

test_text = "Nikita lives in India and his email is sknkpk@example.com. Call him at 987-93842-4334."

doc = nlp(test_text)

for ent in doc.ents:
    print(f"Detected: {ent.text} -> {ent.label_}")


Detected: Nikita -> GPE
Detected: India -> GPE
Detected: sknkpk@example.com -> EMAIL
Detected: 987-93842-4334 -> PHONE


In [None]:
def redact_pii(text):
    doc = nlp(text)
    redacted_text = text
    for ent in doc.ents:
        redacted_text = redacted_text.replace(ent.text, "[REDACTED]")
    return redacted_text

# Test redaction
test_text = "John Doe, a 34-year-old software engineer, recently moved to 5678 Maple Street, Springfield, IL, 62704. His Social Security Number is 123-45-6789, and his personal email is johndoe89@gmail.com. John often uses his work email, j.doe@techsolutions.com, for professional communications. His mobile number, +1-312-555-0198, is linked to his Chase Bank account, which uses the routing number 071000013. On March 15, 1990, John was born in Los Angeles, CA. His driver's license number is D1234567, issued by the state of Illinois. He recently booked a flight using his passport number X12345678, and his credit card details (Visa: 4111-1111-1111-1111, Exp: 12/26, CVV: 456) were saved in the airline's database. His wife, Emily Doe, born on July 9, 1992, works at Global Marketing Ltd. with the email emily.doe@globalmkt.com. Their home WiFi is named 'DoeHomeWiFi' and is secured with the password 'Springfield2024!'. John’s medical records indicate he has an appointment on April 10, 2025, at Springfield General Hospital. His insurance policy number is A123456789 with BlueCross BlueShield. His LinkedIn profile, linkedin.com/in/johndoe89, has all his professional details, while his Twitter handle @johndtweets is used for casual updates.."

print("Original:", test_text)
print("Redacted:", redact_pii(test_text))


Original: John Doe, a 34-year-old software engineer, recently moved to 5678 Maple Street, Springfield, IL, 62704. His Social Security Number is 123-45-6789, and his personal email is johndoe89@gmail.com. John often uses his work email, j.doe@techsolutions.com, for professional communications. His mobile number, +1-312-555-0198, is linked to his Chase Bank account, which uses the routing number 071000013. On March 15, 1990, John was born in Los Angeles, CA. His driver's license number is D1234567, issued by the state of Illinois. He recently booked a flight using his passport number X12345678, and his credit card details (Visa: 4111-1111-1111-1111, Exp: 12/26, CVV: 456) were saved in the airline's database. His wife, Emily Doe, born on July 9, 1992, works at Global Marketing Ltd. with the email emily.doe@globalmkt.com. Their home WiFi is named 'DoeHomeWiFi' and is secured with the password 'Springfield2024!'. John’s medical records indicate he has an appointment on April 10, 2025, at Sp

NameError: name 'nlp' is not defined

In [None]:
pip install fastapi uvicorn spacy



In [None]:
from fastapi import FastAPI
import spacy

# Load trained model
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")  # Update with your model path

app = FastAPI()

@app.post("/redact/")
async def redact_pii(data: dict):
    text = data.get("text", "")
    doc = nlp(text)

    redacted_text = text
    for ent in doc.ents:
        redacted_text = redacted_text.replace(ent.text, "[REDACTED]")

    return {"redacted_text": redacted_text}

# Run the API with: uvicorn app:app --host 0.0.0.0 --port 8000


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install onnx onnxruntime torch torchvision tf2onnx spacy



In [None]:
import spacy

nlp = spacy.blank("en")  # Replace with your trained model
nlp.add_pipe("ner")  # Add Named Entity Recognition component
nlp.to_disk("pii_model")  # Save the trained model


In [None]:
!python -m spacy convert pii_model/ pii_model.onnx --converter onnx

[33mUsage: [0mpython [1;32m-m[0m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR]
[2mTry [0m[2;34m'python [0m[1;2;34m-m[0m[2;34m spacy convert [0m[1;2;34m-[0m[1;2;34m-help[0m[2;34m'[0m[2m for help.[0m
[31m╭─[0m[31m Error [0m[31m─────────────────────────────────────────────────────────────────────────────────────────[0m[31m─╮[0m
[31m│[0m Invalid value for '[OUTPUT_DIR]': Path 'pii_model.onnx' does not exist.                          [31m│[0m
[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯[0m


In [None]:
pip install spacy-legacy onnx onnxruntime




In [None]:
import spacy
import os

# Load trained spaCy model
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")

# Define ONNX export directory
onnx_dir = "onnx_model"
os.makedirs(onnx_dir, exist_ok=True)

# Convert and save the model in ONNX-compatible format
nlp.to_disk(onnx_dir)

print(f"ONNX-compatible model saved in: {onnx_dir}")


ONNX-compatible model saved in: onnx_model


In [None]:
pip install onnx onnxruntime fastapi uvicorn numpy




In [None]:
pip install fastapi uvicorn spacy



In [None]:
from fastapi import FastAPI
import spacy

# Load your trained spaCy model (replace with your model's path)
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")

app = FastAPI()

@app.post("/redact_pii/")
async def redact_pii(text: str):
    """
    API endpoint to redact PII from a given text.
    """
    doc = nlp(text)
    redacted_text = text

    # Replace detected PII entities with "[REDACTED]"
    for ent in doc.ents:
        if ent.label_ in ["PERSON", "EMAIL", "GPE", "ORG", "PHONE", "DATE"]:  # Modify as per your model
            redacted_text = redacted_text.replace(ent.text, "[REDACTED]")

    return {"original": text, "redacted": redacted_text}


In [None]:
pip install fastapi uvicorn spacy onnxruntime




In [None]:
import spacy
import torch
import onnx
import onnxruntime as ort

# Load your trained spaCy model
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")

# Convert to ONNX
dummy_input = ["This is a sample input text."]
torch.onnx.export(nlp, dummy_input, "pii_model.onnx", opset_version=12)

print("ONNX model saved as pii_model.onnx")


AttributeError: 'English' object has no attribute 'modules'

In [None]:
!pip install onnxruntime
!pip install spacy[transformers]
import spacy
import torch
import onnx
import onnxruntime as ort
from spacy.tokens import Doc

# Load your trained spaCy model
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")

# Access the PyTorch component of the NER pipe, if using a transformer-based model
ner = nlp.get_pipe("ner")
if hasattr(ner.model, "to"):  # Check if it's a PyTorch component
    # Create a dummy input for the NER component
    dummy_input = torch.randint(0, 100, (1, 10))  # Adjust shape as needed
    # Export the NER component to ONNX
    torch.onnx.export(ner.model, dummy_input, "pii_model_ner.onnx", opset_version=12)
    print("ONNX model (NER component) saved as pii_model_ner.onnx")
else:
    print("NER model is not a PyTorch component and cannot be directly exported to ONNX.")

NER model is not a PyTorch component and cannot be directly exported to ONNX.


In [None]:
from fastapi import FastAPI
import onnxruntime as ort
import spacy

# Load ONNX model
# Updated path to reflect the NER component's ONNX file
onnx_model_path = "pii_model_ner.onnx"
session = ort.InferenceSession(onnx_model_path)

# Load original NLP model (optional, for comparison)
# Replace with the actual path to your trained model
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")

app = FastAPI()

@app.get("/")
def home():
    return {"message": "PII Redaction API is running!"}

@app.post("/redact/")
def redact_text(text: str):
    # Process text using your NLP model
    doc = nlp(text)

    redacted_text = text
    for ent in doc.ents:
        redacted_text = redacted_text.replace(ent.text, "[REDACTED]")

    return {"original": text, "redacted": redacted_text}

NoSuchFile: [ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from pii_model_ner.onnx failed:Load model pii_model_ner.onnx failed. File doesn't exist

In [None]:
import spacy
import json

# Load your trained spaCy model
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")

# Save entity labels & patterns
entity_patterns = {}
for label in nlp.get_pipe("ner").labels:
    entity_patterns[label] = []

for text in ["John Doe", "123-456-7890", "johndoe@email.com"]:  # Add more test cases
    doc = nlp(text)
    for ent in doc.ents:
        entity_patterns[ent.label_].append(ent.text)

# Save as JSON
with open("nlp_model.json", "w") as f:
    json.dump(entity_patterns, f)


In [None]:
from google.colab import files
files.download("nlp_model.json")  # Replace with your actual filename


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import spacy
import json

# Load your trained model (update the path)
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")  # Update path if needed

# Extract NLP model data
model_data = {
    "labels": list(nlp.pipe_labels.get("ner", [])),  # Entity labels
    "vocab": list(nlp.vocab.strings),  # Vocabulary
}

# Save as JSON
json_file_path = "nlp_model.json"
with open(json_file_path, "w") as f:
    json.dump(model_data, f, indent=4)

print(f"Model JSON saved as {json_file_path}")


Model JSON saved as nlp_model.json


In [None]:
from google.colab import files
files.download("nlp_model.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import spacy
import json

# Load your trained spaCy model (Update the path accordingly)
nlp = spacy.load("/content/drive/MyDrive/Sandbox 2025")

# Extract entity labels
labels = list(nlp.pipe_labels["ner"])

# Save model info to JSON
model_data = {
    "labels": labels,
    "patterns": []  # Optional: Manually add regex patterns if needed
}

# Save as JSON file
json_file_path = "pii_model.json"
with open(json_file_path, "w") as f:
    json.dump(model_data, f, indent=4)

print(f"Model JSON saved as {json_file_path}")


Model JSON saved as pii_model.json


In [None]:
from google.colab import files
files.download("pii_model.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json

# Define the model configuration
model_config = {
    "name": "text_processor_model",
    "version": "1.0.0",
    "description": "Sample spaCy model configuration",
    "language": "en",
    "author": "Text Processor Extension",
    "rules": [
        {
            "type": "capitalization",
            "probability": 0.2
        },
        {
            "type": "punctuation",
            "fix_spacing": True
        }
    ],
    "note": "This is a placeholder for an actual spaCy model. In a real implementation, you would load a trained model exported from spaCy."
}

# Save as JSON file
json_file_path = "pii_model.json"
with open(json_file_path, "w") as f:
    json.dump(model_config, f, indent=4)

print(f"Model JSON saved as {json_file_path}")


Model JSON saved as pii_model.json


In [None]:
from google.colab import files
files.download("pii_model.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>