In [7]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import spacy

# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

# Example text
text = "Dr. John Smith from Kathmandu University met with UNICEF officials on June 1, 2025."

# Process the text
doc = nlp(text)

# Create a new anonymized text by replacing entities with their type
anonymized_text = text
offset = 0  # To track character offset after replacements

for ent in doc.ents:
    start = ent.start_char + offset
    end = ent.end_char + offset
    label = f"[{ent.label_}]"  # e.g., [PERSON], [ORG], [DATE]
    anonymized_text = anonymized_text[:start] + label + anonymized_text[end:]
    offset += len(label) - (end - start)

print("Original Text:")
print(text)
print("\nAnonymized Text:")
print(anonymized_text)


Original Text:
Dr. John Smith from Kathmandu University met with UNICEF officials on June 1, 2025.

Anonymized Text:
Dr. [PERSON] from [ORG] met with [ORG] officials on [DATE].


In [4]:
!pip install spacy faker
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import spacy
from faker import Faker

# Initialize
nlp = spacy.load("en_core_web_sm")
fake = Faker()

# Example text
text = "Dr. John Smith from Kathmandu University met with UNICEF officials on June 1, 2025. His phone number is 9841234567."

# Process the text
doc = nlp(text)

# Map entity type to faker methods
entity_fakers = {
    "PERSON": lambda: fake.name(),
    "ORG": lambda: fake.company(),
    "GPE": lambda: fake.city(),
    "DATE": lambda: fake.date(),
    "TIME": lambda: fake.time(),
    "PHONE_NUMBER": lambda: fake.phone_number(),  # Not all NER models detect phone numbers; you may need regex.
}

# Collect replacement values for each unique entity (to keep replacements consistent if repeated)
entity_replacements = {}

for ent in doc.ents:
    ent_text = ent.text
    ent_label = ent.label_
    if ent_label in entity_fakers:
        # If this entity hasn't been replaced before, generate a fake one
        if ent_text not in entity_replacements:
            entity_replacements[ent_text] = entity_fakers[ent_label]()

# Replace each entity in the text with its pseudonym (in reverse order so positions don't shift)
anonymized_text = text
for ent in sorted(doc.ents, key=lambda e: e.start_char, reverse=True):
    ent_text = ent.text
    ent_label = ent.label_
    if ent_text in entity_replacements:
        anonymized_text = (anonymized_text[:ent.start_char] +
                           entity_replacements[ent_text] +
                           anonymized_text[ent.end_char:])

print("Original Text:")
print(text)
print("\nPseudonymized Text:")
print(anonymized_text)


Original Text:
Dr. John Smith from Kathmandu University met with UNICEF officials on June 1, 2025. His phone number is 9841234567.

Pseudonymized Text:
Dr. Jessica Collins from Woods-Zamora met with Jackson Inc officials on 1970-04-18. His phone number is 1997-11-04.


In [10]:
import pandas as pd
import numpy as np
from faker import Faker
from google.colab import files
from io import BytesIO

fake = Faker()

# Upload CSV
uploaded = files.upload()
file_name = next(iter(uploaded))
df = pd.read_csv(BytesIO(uploaded[file_name]))

# Make a copy to avoid altering the original data
df_anon = df.copy()

# Pseudonymize Name, Email, and Account No
if "Name" in df_anon.columns:
    df_anon["Name"] = [fake.name() for _ in range(len(df_anon))]
if "Email" in df_anon.columns:
    df_anon["Email"] = [fake.email() for _ in range(len(df_anon))]
if "Account No" in df_anon.columns:
    df_anon["Account No"] = [fake.bban() for _ in range(len(df_anon))]

# Pseudonymize City (keep realistic city names)
if "City" in df_anon.columns:
    df_anon["City"] = [fake.city() for _ in range(len(df_anon))]

# Add random noise to Salary (but keep column, type, header)
if "Salary" in df_anon.columns:
    df_anon["Salary"] = (df_anon["Salary"] * np.random.uniform(0.95, 1.05, size=len(df_anon))).round(2)

# Age (option 1: keep as is; option 2: optionally add randomization)
# If you want, you can also pseudonymize age:
# if "Age" in df_anon.columns:
#     df_anon["Age"] = [fake.random_int(min=20, max=65) for _ in range(len(df_anon))]

# Print anonymized DataFrame with the same column order and headers
print("\nAnonymized DataFrame:\n")
print(df_anon.to_string(index=False))


Saving input.csv to input (2).csv

Anonymized DataFrame:

            Name  Age                       Email    Salary        City         Account No
 Andrea Petersen   32     isaacacosta@example.org  62589.76 Adkinsville NSYT71686352464935
Catherine Thomas   41          tmoran@example.org  83023.43   Toddshire IGQF09989377219469
   Andrew Bailey   28 trujillogregory@example.com  56735.42   Sherylton SEUH48572224931564
  Samantha Mccoy   36       kenneth59@example.com  72367.14  Fowlerfort MLMZ13791471059597
  Stephanie Gill   52   dylanmcintyre@example.org 125373.53   Nealburgh NYKJ42844268051368
