In [1]:
from datasets import Dataset, DatasetDict

def read_conll(path):
    """Read a CoNLL-style file into a Hugging Face ``Dataset``.

    Every non-blank line holds whitespace-separated columns. The first column
    is taken as the token and the last column as its NER tag, so the
    two-column ``token tag`` layout and wider layouts such as CoNLL-2003
    (``token POS chunk tag``) are both accepted. Blank lines and
    ``-DOCSTART-`` marker lines close the current sentence.

    Args:
        path: Path of the UTF-8 encoded CoNLL file (a leading BOM is ignored).

    Returns:
        A ``Dataset`` with two columns, ``tokens`` and ``ner_tags``. Each row
        is one sentence, stored as two parallel lists of strings.

    Warns:
        UserWarning: once, if any line had fewer than two columns and was
            therefore skipped.
    """
    import warnings  # local import keeps this cell self-contained

    sentences, ner_tags = [], []
    tokens, labels = [], []
    skipped_lines = []  # 1-based line numbers of lines we could not parse

    # "utf-8-sig" reads plain UTF-8 identically but strips a BOM, which would
    # otherwise be glued to the very first token.
    with open(path, "r", encoding="utf-8-sig") as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()

            # Sentence boundary: blank line or document separator.
            if not fields or fields[0] == "-DOCSTART-":
                if tokens:
                    sentences.append(tokens)
                    ner_tags.append(labels)
                    tokens, labels = [], []
                continue

            # A token without a tag cannot be used; remember it and move on.
            if len(fields) < 2:
                skipped_lines.append(line_no)
                continue

            tokens.append(fields[0])
            labels.append(fields[-1])

    # Catch the last sentence if the file doesn't end with a blank line.
    if tokens:
        sentences.append(tokens)
        ner_tags.append(labels)

    if skipped_lines:
        warnings.warn(
            f"read_conll: skipped {len(skipped_lines)} malformed line(s) in "
            f"{path!r} (first at line {skipped_lines[0]})",
            stacklevel=2,
        )

    return Dataset.from_dict({"tokens": sentences, "ner_tags": ner_tags})

import os

# Load your dataset.
# Set the NAMES_CONLL_PATH environment variable to point at the file on another
# machine; when it is unset the original local path is used unchanged.
conll_path = os.environ.get(
    "NAMES_CONLL_PATH",
    r"C:\Users\1hchu\OneDrive\Documents\GitHub\redact-demon\training\training data\names_conll_shuffled.conll",
)
full_dataset = read_conll(conll_path)

# Hold out 20% for validation; the fixed seed keeps the split reproducible.
# Each stage gets its own name so re-running the cell never splits an
# already-split object.
split_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    "train": split_dataset["train"],
    "validation": split_dataset["test"],
})

print(dataset)
print(dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 39990
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 9998
    })
})
{'tokens': ['Dr', 'Susan', 'Si', 'Shu', 'Ying', 'treated', 'the', 'patient', 'Susan', 'Si', 'Shu', 'Ying.'], 'ner_tags': ['O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [2]:
# Collect all unique labels from every split, not just "train": a tag that only
# occurs in the validation split would otherwise be missing from label2id and
# raise KeyError when the labels are aligned to sub-word tokens.
labels_list = sorted({
    tag
    for split in dataset.values()
    for tags in split["ner_tags"]
    for tag in tags
})
# Sorted order makes the label <-> id assignment deterministic across runs.
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = {i: label for label, i in label2id.items()}

print(label2id)


{'B-PER': 0, 'I-PER': 1, 'O': 2}


In [3]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer and, later, the token-classification model.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Args:
        examples: A batch (dict of lists) with ``tokens`` (list of word lists)
            and ``ner_tags`` (list of tag-string lists, parallel to ``tokens``).

    Returns:
        The tokenizer output with an added ``labels`` entry: one list of label
        ids per sentence, aligned to the sub-word tokens.

        * Positions with no source word (special tokens, padding) get ``-100``
          so the loss ignores them.
        * The first sub-word of a word gets the word's own label.
        * Continuation sub-words repeat the word's label, except that a ``B-X``
          word continues as ``I-X`` when ``I-X`` is a known label. Repeating
          ``B-X`` would mark every word piece as the start of a new entity,
          which is invalid BIO and makes entity grouping split single names.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",   # for batching
        max_length=128
    )

    labels = []
    for i, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # ignore in loss
            else:
                tag = word_tags[word_idx]
                # Continuation piece of the same word: switch B-X -> I-X.
                if word_idx == previous_word_idx and tag.startswith("B-"):
                    inside_tag = "I-" + tag[2:]
                    if inside_tag in label2id:
                        tag = inside_tag
                label_ids.append(label2id[tag])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the alignment function batch-wise to every split of the DatasetDict.
tokenized_dataset = dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)


Map:   0%|          | 0/39990 [00:00<?, ? examples/s]

Map:   0%|          | 0/9998 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoModelForTokenClassification

# Pretrained encoder plus a fresh token-classification head sized to our label
# set; the label maps are stored in the model config.
label_config = {
    "num_labels": len(label2id),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForTokenClassification.from_pretrained(model_name, **label_config)



Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import evaluate

# seqeval metric object; compute_metrics calls its .compute() method.
metric = evaluate.load(path="seqeval")


Downloading builder script: 0.00B [00:00, ?B/s]

In [11]:
import numpy as np

def compute_metrics(p):
    """Turn Trainer predictions into seqeval precision/recall/F1/accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over the last axis); ``labels`` holds
            the gold label ids, with ``-100`` marking positions to ignore.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the metric's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=-1)

    def _decode(id_row, gold_row):
        # Keep only positions whose gold label is real (not the -100 filler).
        return [
            id2label[label_id]
            for label_id, gold_id in zip(id_row, gold_row)
            if gold_id != -100
        ]

    true_labels = [_decode(gold_row, gold_row) for gold_row in gold_ids]
    true_preds = [
        _decode(pred_row, gold_row)
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
    ]

    results = metric.compute(predictions=true_preds, references=true_labels)

    wanted = ("precision", "recall", "f1", "accuracy")
    return {key: results["overall_" + key] for key in wanted}


In [12]:
from transformers import TrainingArguments, Trainer

import inspect

# Evaluate and checkpoint once per epoch; training logs go to Weights & Biases.
args = TrainingArguments(
    output_dir="./pii-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    report_to=["wandb"],  # Enable wandb logging
)

# Recent transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class` (the old name emits a FutureWarning). Pick
# whichever name the installed version accepts so the cell runs on both.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)


  trainer = Trainer(


In [None]:
import wandb

# Log in to Weights & Biases using your API key. The TrainingArguments in this
# notebook set report_to=["wandb"], so this should run before trainer.train().
# NOTE(review): presumably reuses cached credentials when present — confirm.
wandb.login()


True

In [13]:
# Fine-tune, then write the final model to the directory that the inference
# cells load from.
trainer.train()
trainer.save_model(output_dir="./pii-model")


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0,1.5e-05,1.0,1.0,1.0,1.0
2,0.0,1e-05,1.0,1.0,1.0,1.0
3,0.0,7e-06,1.0,1.0,1.0,1.0


In [16]:
# Run a final evaluation pass and print the resulting metrics dict.
# NOTE(review): called without arguments, so this presumably scores the
# eval_dataset passed to the Trainer (the validation split) — confirm.
metrics = trainer.evaluate()
print(metrics)



{'eval_loss': 6.794568889745278e-06, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 24.8886, 'eval_samples_per_second': 401.71, 'eval_steps_per_second': 25.112, 'epoch': 3.0}


In [18]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Reload tokenizer and model from the directory written after training.
# Note: this rebinds the notebook-level `tokenizer` and `model` names.
model_path = "./pii-model"  # path where you saved your trained model
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path,
)
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=model_path,
)


In [19]:
# Inference pipeline around the reloaded model; "simple" aggregation groups
# subword tokens into entities.
pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "aggregation_strategy": "simple",
}
ner_pipeline = pipeline(task="token-classification", **pipeline_options)


Device set to use cuda:0
