In [None]:
!pip install -q transformers datasets seqeval   # seqeval is used for sequence labeling tasks


In [None]:
# Generate training data. This data is only a demo; it creates a dataset in CoNLL format.
import random

# Pools of entity surface forms; each synthetic report draws one value per pool.
names = [
    "John Smith",
    "Emily Johnson",
    "Michael Brown",
]
orgs = [
    "ACME Corp",
    "Global Logistics",
]
vehicles = [
    "Toyota Camry",
    "Ford F-150",
]
locations = [
    "New York",
    "Los Angeles",
]
dates = [
    "March 5, 2022",
    "April 10, 2023",
]

def format_conll(tokens, labels):
    """Render one sentence as a CoNLL block.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of tag strings, paired positionally with ``tokens``.

    Returns:
        One ``"<token> <label>"`` line per pair, joined by newlines and
        terminated by a blank line. Pairing uses ``zip``, so any surplus
        items in the longer sequence are ignored.
    """
    rows = []
    for token, label in zip(tokens, labels):
        rows.append(f"{token} {label}")
    body = "\n".join(rows)
    # The trailing blank line is the sentence separator.
    return body + "\n\n"

# Punctuation that whitespace tokenisation can leave glued to the end of a word.
_TRAILING_PUNCT = ".,;:!?"


def _bare_words(text):
    """Split ``text`` on whitespace and strip trailing punctuation from each piece."""
    return [piece.rstrip(_TRAILING_PUNCT) for piece in text.split()]


def annotate_report():
    """Generate one synthetic accident-report sentence in CoNLL format.

    One value is drawn at random from each of the module-level pools
    (``names``, ``orgs``, ``vehicles``, ``locations``, ``dates``) and inserted
    into a fixed sentence template. The sentence is split on whitespace and
    every token is tagged by comparing it with the words of the chosen
    entities.

    Tokens are compared with their trailing punctuation removed. The template
    ends with ``"{location}."``, so the last location word always carries the
    sentence's full stop ("York.", "Angeles."); comparing the raw token would
    leave it tagged ``O``.

    Returns:
        The sentence as a CoNLL block (see ``format_conll``). The emitted
        tokens keep their original punctuation.
    """
    name = random.choice(names)
    org = random.choice(orgs)
    vehicle = random.choice(vehicles)
    location = random.choice(locations)
    date = random.choice(dates)

    sentence = f"On {date}, {name} was driving a {vehicle} registered to {org} near {location}."
    tokens = sentence.split()

    # Split every entity once, up front, rather than on each loop iteration.
    date_words = _bare_words(date)
    name_words = _bare_words(name)
    vehicle_words = _bare_words(vehicle)
    org_words = _bare_words(org)
    location_words = _bare_words(location)

    labels = []
    for tok in tokens:
        word = tok.rstrip(_TRAILING_PUNCT)
        if word == date_words[0]:
            # Only the first date word is tagged: the notebook's label_list has
            # no I-DATE tag, so the remaining date tokens fall through to "O".
            # Exact word equality replaces the earlier prefix test, which would
            # also have matched any token that is merely a prefix of the date.
            labels.append("B-DATE")
        elif word in name_words:
            labels.append("B-PER" if word == name_words[0] else "I-PER")
        elif word in vehicle_words:
            labels.append("B-VEH" if word == vehicle_words[0] else "I-VEH")
        elif word in org_words:
            labels.append("B-ORG" if word == org_words[0] else "I-ORG")
        elif word in location_words:
            # Every location word gets B-GPE because label_list defines no I-GPE.
            labels.append("B-GPE")
        else:
            labels.append("O")

    return format_conll(tokens, labels)    # format tokens and labels into CoNLL format

# Seed the generator so that Restart & Run All writes the same synthetic
# corpus every time; without it train.txt differs on every run.
random.seed(42)

# Write 100 randomly generated reports, one CoNLL sentence block each.
with open("train.txt", "w") as f:
    for _ in range(100):
        f.write(annotate_report())


In [None]:
# Convert the CoNLL data to a Hugging Face dataset
from datasets import Dataset
import pandas as pd

# Load train.txt
def read_conll(filename):
    """Parse a two-column CoNLL file into a list of sentence dictionaries.

    Every non-blank line must hold exactly two whitespace-separated fields,
    ``token tag``. Blank lines separate sentences; consecutive blank lines are
    ignored.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one ``{"tokens": [...], "ner_tags": [...]}`` dictionary per
        sentence, in file order. Tags are returned as the strings found in the
        file.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    data = []
    tokens, labels = [], []
    with open(filename) as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line: close the current sentence, if there is one.
                if tokens:
                    data.append({"tokens": tokens, "ner_tags": labels})
                    tokens, labels = [], []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}:{line_no}: expected 'token tag', got {line.rstrip()!r}"
                )
            token, tag = fields
            tokens.append(token)
            labels.append(tag)

    # A file that does not end with a blank line would otherwise lose its
    # final sentence.
    if tokens:
        data.append({"tokens": tokens, "ner_tags": labels})
    return data

# Tag vocabulary: a tag's position in this list is its integer id.
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-DATE', 'B-GPE', 'B-VEH', 'I-VEH']
label_to_id = dict(zip(label_list, range(len(label_list))))

data = read_conll("train.txt")
# Replace each sentence's string tags with their integer ids, in place.
for example in data:
    example["ner_tags"] = [label_to_id[tag_name] for tag_name in example["ner_tags"]]

dataset = Dataset.from_list(data)


In [None]:
#Tokenization and Data preparation
from transformers import AutoTokenizer

# Tokenizer for the "bert-base-cased" checkpoint, the same checkpoint the
# model is loaded from in the fine-tuning cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: A batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of per-word tag ids per
            sentence).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
        For each sentence, the first sub-token of every word carries that
        word's tag id; positions with no source word and the remaining
        sub-tokens of a word carry ``-100``.
    """
    encodings = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        sentence_labels = []
        last_word = None
        for word in encodings.word_ids(batch_index=batch_idx):
            # -100 marks positions to be left out of training; presumably the
            # loss ignores this value (confirm against the Transformers docs).
            if word is None or word == last_word:
                sentence_labels.append(-100)
            else:
                sentence_labels.append(word_tags[word])
            last_word = word
        aligned_labels.append(sentence_labels)

    encodings["labels"] = aligned_labels
    return encodings

# Tokenize and align labels for the whole dataset. batched=True passes the
# function a batch of examples at a time, which its per-sentence loop expects.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


In [None]:
#Finetuning
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

# Register the tag names on the model when it is created, so any pipeline
# built from it reports tags such as "B-PER" instead of generic "LABEL_<n>"
# placeholders.
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10
)

# The collator pads each batch; the dataset itself was tokenized without padding.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE(review): newer transformers releases appear to rename the Trainer's
# `tokenizer` argument to `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

In [None]:
from transformers import pipeline

# aggregation_strategy="simple" replaces the deprecated grouped_entities=True:
# adjacent sub-tokens with the same tag are merged into one entity span.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

text = "On March 5, 2022, Emily Johnson was driving a Toyota Camry registered to ACME Corp in Los Angeles."
results = ner_pipeline(text)

# Print the raw dictionary returned for each detected entity.
for entity in results:
    print(entity)

In [None]:
from transformers import pipeline
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification # Ensure all necessary classes are imported

# Define your label list (ensure it matches what you used during training)
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-DATE', 'B-GPE', 'B-VEH', 'I-VEH']
id2label = {i: label for i, label in enumerate(label_list)}


# Associate the id2label mapping with the model's configuration
model.config.id2label = id2label
model.config.label2id = {label: i for i, label in enumerate(label_list)} # It's also good practice to set label2id


# Now create the pipeline without passing id2label directly.
# aggregation_strategy="simple" replaces the deprecated grouped_entities=True:
# adjacent sub-tokens with the same tag are merged into one entity span.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example accident report text
text = "On March 5, 2022, Emily Johnson was driving a Toyota Camry registered to ACME Corp in Los Angeles."
results = ner_pipeline(text)

# Print each detected entity with its label name
for entity in results:
    # The 'entity_group' key will now contain the human-readable label name
    print(f"Entity: {entity['word']} | Label: {entity['entity_group']} | Score: {entity['score']:.2f}")