In [1]:
!pip install transformers datasets seqeval faker numpy scikit-learn

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faker
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-38.2.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=084664745040132d0411f077d1b7fff789899fd75f3e0cdc04f60ad894535271
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeva

To load files from Google Drive, you first need to mount your Drive to this Colab notebook. This will allow you to access your files as if they were on the local file system.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


After running the cell above and following the authentication steps, your Google Drive will be mounted at `/content/drive`. You can then navigate to your files. For example, to list the contents of your Drive's root directory, you can use:

In [3]:
import os

os.chdir('/content/drive/MyDrive/pii_ner_assignment_IITB')
print(os.getcwd())

/content/drive/MyDrive/pii_ner_assignment_IITB


In [4]:
import os

files_in_pwd = os.listdir('.')
print(files_in_pwd)

['assignment.md', 'requirements.txt', 'README.md', 'data', 'src', 'out']


## Setup and Quick Baseline

### Subtask:
Install dependencies, train a baseline model, predict on dev and stress sets, and evaluate the performance.


In [5]:
# Check if requirements.txt exists before installing
if os.path.exists('requirements.txt'):
    !pip install -r requirements.txt
else:
    print('requirements.txt not found. Please ensure it is in the current working directory.')




**Reasoning**:
The next step is to train a baseline model using the provided `train.py` script, as specified in the assignment's suggested workflow. This will create the `out` directory and save the trained model.



In [7]:
# Create the output directory if it doesn't exist
output_dir = 'out'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


## Creating random data for Dev set

In [60]:
import json
import random
from faker import Faker
from datetime import datetime

fake = Faker()

# 1. HARDCODED "SPOKEN" ARTIFACTS
# STT often transcribes these specific ways
INDIAN_NAMES = [
    "ramesh", "suresh", "amit", "priya", "rahul", "sharma", "patel", "singh",
    "kumar", "aditya", "sneha", "rohit", "vikram", "anjali", "deepak", "neha",
    "gupta", "verma", "reddy", "nair", "khan", "mishra", "joshi"
]

DIGIT_MAP = {
    "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
    "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine"
}

MONTHS = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december"
]

# 2. HELPER FUNCTIONS FOR "SPOKEN" CONVERSION
def to_spoken_digits(text):
    """Converts '9820' -> 'nine eight two zero'"""
    out = []
    for char in text:
        if char in DIGIT_MAP:
            out.append(DIGIT_MAP[char])
        else:
            out.append(char)
    return " ".join(out)

def get_spoken_date():
    """Generates 'january first', '10th of may', 'march 23'"""
    # Random date object
    d = fake.date_object()
    day = d.day
    month = MONTHS[d.month - 1]
    year = d.year

    style = random.choice([1, 2, 3, 4])

    if style == 1:
        return f"{month} {day}"  # "march 12"
    elif style == 2:
        return f"{day} of {month}" # "12 of march"
    elif style == 3:
        # Ordinal approximate (stt often misses this, but useful)
        suffix = "th"
        if day in [1, 21, 31]: suffix = "st"
        elif day in [2, 22]: suffix = "nd"
        elif day in [3, 23]: suffix = "rd"
        return f"{month} {day}{suffix}" # "march 1st"
    else:
        return f"{month} {day} {year}" # "march 12 2022"

def make_email_spoken(email):
    """Converts 'bob@gmail.com' -> 'bob at gmail dot com'"""
    return email.replace("@", " at ").replace(".", " dot ")

# 3. GENERATOR
def generate_v4_example(uid):
    templates = [
        # TEMPLATE PATTERNS (Subject + Verb + Entity)
        ("my credit card is {CREDIT_CARD}", "CREDIT_CARD"),
        ("card number {CREDIT_CARD}", "CREDIT_CARD"),
        ("{CREDIT_CARD} is my card", "CREDIT_CARD"),

        ("call me at {PHONE}", "PHONE"),
        ("phone number is {PHONE}", "PHONE"),
        ("dial {PHONE}", "PHONE"),

        ("email address is {EMAIL}", "EMAIL"),
        ("contact {EMAIL}", "EMAIL"),
        ("mail to {EMAIL}", "EMAIL"),

        ("my name is {PERSON_NAME}", "PERSON_NAME"),
        ("this is {PERSON_NAME}", "PERSON_NAME"),
        ("i am {PERSON_NAME}", "PERSON_NAME"),

        ("date of birth {DATE}", "DATE"),
        ("meeting on {DATE}", "DATE"),
        ("scheduled for {DATE}", "DATE"),
        ("today is {DATE}", "DATE"),

        ("live in {CITY}", "CITY"),
        ("from {CITY}", "CITY"),
        ("visit {LOCATION}", "LOCATION"),

        # MULTI-ENTITY
        ("name {PERSON_NAME} card {CREDIT_CARD}", ["PERSON_NAME", "CREDIT_CARD"]),
        ("call {PERSON_NAME} at {PHONE}", ["PERSON_NAME", "PHONE"]),
    ]

    # 20% Negative Examples (No PII) to fix Precision
    if random.random() < 0.2:
        text = fake.sentence()
        # Remove punctuation to look like STT
        text = text.replace(".", "").replace(",", "").lower()
        return {"id": f"train_{uid}", "text": text, "entities": []}

    template, ent_types = random.choice(templates)
    if not isinstance(ent_types, list): ent_types = [ent_types]

    entity_data = {}
    for et in ent_types:
        if et == "CREDIT_CARD":
            val = fake.credit_card_number()
            # 80% Spoken digits (Stress set is heavy on this)
            if random.random() < 0.8:
                val = to_spoken_digits(val)
            entity_data[et] = val

        elif et == "PHONE":
            val = fake.phone_number()
            # 80% Spoken digits
            if random.random() < 0.8:
                val = to_spoken_digits(val)
            else:
                val = val.replace("-", " ").replace("(", "").replace(")", "")
            entity_data[et] = val

        elif et == "EMAIL":
            # 90% Spoken emails (Stress set is heavy on this)
            val = fake.email()
            if random.random() < 0.9:
                val = make_email_spoken(val)
            entity_data[et] = val

        elif et == "PERSON_NAME":
            # 50% Indian Names
            if random.random() < 0.5:
                # Create full name "Ramesh Kumar"
                val = f"{random.choice(INDIAN_NAMES)} {random.choice(INDIAN_NAMES)}"
            else:
                val = fake.name()
            entity_data[et] = val

        elif et == "DATE":
            # 100% Spoken Dates (CRITICAL FIX for your 0.00 score)
            entity_data[et] = get_spoken_date()

        elif et == "CITY": entity_data[et] = fake.city()
        elif et == "LOCATION": entity_data[et] = fake.address()

    # Build Text
    full_text = ""
    spans = []
    parts = template.split("{")

    for part in parts:
        if "}" in part:
            etype, rest = part.split("}")
            val = str(entity_data[etype]).lower()

            # STT Noise: Randomly remove spaces in digit sequences
            if etype in ["CREDIT_CARD", "PHONE"] and random.random() < 0.3:
                val = val.replace(" ", "")

            start = len(full_text)
            full_text += val
            end = len(full_text)

            spans.append({"start": start, "end": end, "label": etype})
            full_text += rest.lower()
        else:
            full_text += part.lower()

    return {
        "id": f"train_{uid}",
        "text": full_text,
        "entities": spans
    }

# GENERATE 3000 EXAMPLES (Need volume to learn patterns)
print("Generating V4 'Spoken' Data...")
with open("data/train.jsonl", "w") as f:
    for i in range(3000):
        f.write(json.dumps(generate_v4_example(i)) + "\n")

with open("data/dev.jsonl", "w") as f:
    for i in range(400):
        f.write(json.dumps(generate_v4_example(i + 10000)) + "\n")


Generating V4 'Spoken' Data...


In [67]:
!python src/train.py \
  --model_name distilroberta-base \
  --train data/train.jsonl \
  --dev data/dev.jsonl \
  --out_dir out \
  --epochs 5 \
  --batch_size 16 \
  --lr 3e-5 \
  --max_length 128 \
  --freeze_layers 3

Loading tokenizer: distilroberta-base
Loading training data from data/train.jsonl...
Loading dev data from data/dev.jsonl...
2025-11-25 12:25:58.500495: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764073558.559859   17377 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764073558.569219   17377 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764073558.604719   17377 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764073558.604761   17377 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid li

In [68]:
!python src/measure_latency.py \
  --model_dir out \
  --input data/dev.jsonl \
  --max_length 128 \
  --runs 100


2025-11-25 12:27:17.603570: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764073637.632457   17811 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764073637.641388   17811 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764073637.666100   17811 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764073637.666131   17811 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764073637.666138   17811 computation_placer.cc:177] computation placer alr

In [69]:
!python src/predict.py \
  --model_dir out \
  --input data/dev.jsonl \
  --output out/dev_pred.json \
  --max_length 128


2025-11-25 12:27:33.468190: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764073653.485046   17907 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764073653.490271   17907 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764073653.506764   17907 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764073653.506794   17907 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764073653.506799   17907 computation_placer.cc:177] computation placer alr

In [70]:
!python src/eval_span_f1.py \
  --gold data/dev.jsonl \
  --pred out/dev_pred.json



Per-entity metrics:
CITY            P=0.966 R=1.000 F1=0.982
CREDIT_CARD     P=1.000 R=1.000 F1=1.000
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=0.955 R=0.977 F1=0.966
LOCATION        P=1.000 R=1.000 F1=1.000
PERSON_NAME     P=1.000 R=1.000 F1=1.000
PHONE           P=1.000 R=1.000 F1=1.000

Macro-F1: 0.993

PII-only metrics: P=0.993 R=0.997 F1=0.995
Non-PII metrics: P=0.978 R=1.000 F1=0.989


## Prediction and evaluation on the **Stress Set**

In [71]:
# 1. Predict on Stress Set
!python src/predict.py \
  --model_dir out \
  --input data/stress.jsonl \
  --output out/stress_pred.json \
  --max_length 128


2025-11-25 12:27:50.696156: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764073670.713683   17992 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764073670.719011   17992 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764073670.741449   17992 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764073670.741484   17992 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764073670.741488   17992 computation_placer.cc:177] computation placer alr

In [72]:
# 2. Evaluate Stress Set
!python src/eval_span_f1.py \
  --gold data/stress.jsonl \
  --pred out/stress_pred.json


Per-entity metrics:
CITY            P=0.857 R=0.750 F1=0.800
CREDIT_CARD     P=0.277 R=0.450 F1=0.343
DATE            P=0.952 R=1.000 F1=0.976
EMAIL           P=0.000 R=0.000 F1=0.000
PERSON_NAME     P=0.262 R=0.975 F1=0.413
PHONE           P=0.257 R=0.450 F1=0.327

Macro-F1: 0.476

PII-only metrics: P=0.368 R=0.730 F1=0.489
Non-PII metrics: P=0.857 R=0.750 F1=0.800
