In [15]:
# !pip install transformers datasets evaluate torch

In [3]:
import pandas as pd
from datasets import Dataset

# Load the dataset
file_path = "/content/ner_dataset.csv"  # Update the path if needed
df = pd.read_csv(file_path)

df

Unnamed: 0,Text,Entities
0,Emma 2022 New York.,"[{'entity': 'PERSON', 'start': 0, 'end': 4}, {..."
1,Emma.,"[{'entity': 'PERSON', 'start': 0, 'end': 4}]"
2,January 2023 Amazon Boston.,"[{'entity': 'DATE', 'start': 0, 'end': 12}, {'..."
3,Google Google.,"[{'entity': 'ORG', 'start': 0, 'end': 6}, {'en..."
4,New York.,"[{'entity': 'LOC', 'start': 0, 'end': 8}]"
...,...,...
2995,Amazon.,"[{'entity': 'ORG', 'start': 0, 'end': 6}]"
2996,Google San Francisco.,"[{'entity': 'ORG', 'start': 0, 'end': 6}, {'en..."
2997,David.,"[{'entity': 'PERSON', 'start': 0, 'end': 5}]"
2998,London David Paris.,"[{'entity': 'LOC', 'start': 0, 'end': 6}, {'en..."


In [4]:

# Convert the 'Entities' column from string to list
import ast
df['Entities'] = df['Entities'].apply(ast.literal_eval)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)


In [5]:
# Create a list of unique labels
unique_labels = ['O'] + [ent['entity'] for record in df['Entities'] for ent in record]
label_to_id = {label: idx for idx, label in enumerate(set(unique_labels))}
id_to_label = {idx: label for label, idx in label_to_id.items()}

In [7]:
from transformers import AutoTokenizer
from datasets import ClassLabel
import numpy as np

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

# Preprocessing function
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["Text"],
        truncation=True,
        is_split_into_words=False,
        padding="max_length",
        max_length=128,
        return_offsets_mapping=True,  # Enable offset mapping
    )

    labels = []
    for batch_index in range(len(examples["Text"])):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        example_labels = [label_to_id['O']] * len(word_ids)
        offset_mapping = tokenized_inputs['offset_mapping'][batch_index]

        for entity in examples["Entities"][batch_index]:
            start = entity['start']
            end = entity['end']
            label = label_to_id[entity['entity']]

            # Align labels with tokens
            for idx, word_id in enumerate(word_ids):
                if word_id is None:
                    continue
                token_start, token_end = offset_mapping[idx]
                if token_start >= start and token_end <= end:
                    example_labels[idx] = label

        labels.append(example_labels)

    tokenized_inputs["labels"] = labels
    tokenized_inputs.pop("offset_mapping")  # Remove offset mapping before returning
    return tokenized_inputs


# Apply preprocessing
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [8]:
from torch.utils.data import DataLoader

train_dataset = tokenized_dataset.train_test_split(test_size=0.1)['train']
val_dataset = tokenized_dataset.train_test_split(test_size=0.1)['test']

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)


In [10]:
from transformers import AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments
import numpy as np

# Load the model
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=len(label_to_id)
)

# Define metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (-100)
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Flatten predictions and labels for metric computation
    flat_predictions = [item for sublist in true_predictions for item in sublist]
    flat_labels = [item for sublist in true_labels for item in sublist]

    # Calculate metrics
    true_positive = sum([1 for p, l in zip(flat_predictions, flat_labels) if p == l])
    precision = true_positive / len(flat_predictions) if flat_predictions else 0.0
    accuracy = true_positive / len(flat_labels) if flat_labels else 0.0

    return {
        "precision": precision,
        "accuracy": accuracy,
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Accuracy
1,0.0002,8.9e-05,1.0,1.0
2,0.0001,4.4e-05,1.0,1.0
3,0.0001,3.6e-05,1.0,1.0


# Save and Load Model

In [16]:
# Save model and tokenizer separately
output_dir = "./ner_finetuned_model"

# Save the fine-tuned model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to ./ner_finetuned_model


In [18]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model = AutoModelForTokenClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [19]:
import torch
# Batch NER Detection Function
def ner_detection_batch(sentences):
    """
    Detects named entities in a batch of sentences using the fine-tuned NER model.

    Args:
    sentences (list): A list of input sentences.

    Returns:
    list: A list of results for each sentence. Each result is a list of tuples containing the token and its predicted entity label.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    results = []
    for text in sentences:
        # Tokenize the input text and move tokens to the same device
        tokens = tokenizer(text, return_tensors="pt", truncation=True, is_split_into_words=False).to(device)

        with torch.no_grad():
            output = model(**tokens)

        # Get predictions
        predictions = np.argmax(output.logits.detach().cpu().numpy(), axis=2)

        # Convert token IDs to words and labels
        tokens = tokenizer.convert_ids_to_tokens(tokens["input_ids"].squeeze().tolist())
        labels = [id_to_label[label] for label in predictions[0]]

        # Combine tokens and their predicted labels
        sentence_results = []
        for token, label in zip(tokens, labels):
            if token not in ["[CLS]", "[SEP]", "[PAD]"]:  # Skip special tokens
                sentence_results.append((token, label))

        results.append(sentence_results)

    return results

# Example Usage
example_sentences = [
    "Barack Obama was born in Hawaii.",
    "Microsoft was founded by Bill Gates and Paul Allen.",
    "Apple Inc. is headquartered in Cupertino, California.",
    "Elon Musk leads SpaceX and Tesla.",
    "The Eiffel Tower is located in Paris, France."
] * 20  # Repeat to make 100 sentences

# Process the sentences
detected_entities_batch = ner_detection_batch(example_sentences)

# Print results for the first 5 sentences
for i, entities in enumerate(detected_entities_batch[:5]):
    print(f"Sentence {i+1}: {entities}")


Sentence 1: [('Barack', 'PERSON'), ('Obama', 'LOC'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('Hawaii', 'LOC'), ('.', 'O')]
Sentence 2: [('Microsoft', 'ORG'), ('was', 'O'), ('founded', 'O'), ('by', 'DATE'), ('Bill', 'LOC'), ('Gates', 'LOC'), ('and', 'O'), ('Paul', 'PERSON'), ('Allen', 'LOC'), ('.', 'O')]
Sentence 3: [('Apple', 'ORG'), ('Inc', 'ORG'), ('.', 'O'), ('is', 'O'), ('headquartered', 'DATE'), ('in', 'DATE'), ('Cup', 'ORG'), ('##ert', 'DATE'), ('##ino', 'O'), (',', 'DATE'), ('California', 'LOC'), ('.', 'O')]
Sentence 4: [('El', 'ORG'), ('##on', 'O'), ('Mu', 'ORG'), ('##sk', 'O'), ('leads', 'O'), ('Space', 'ORG'), ('##X', 'ORG'), ('and', 'O'), ('Te', 'ORG'), ('##sla', 'ORG'), ('.', 'O')]
Sentence 5: [('The', 'LOC'), ('E', 'ORG'), ('##iff', 'O'), ('##el', 'O'), ('Tower', 'O'), ('is', 'O'), ('located', 'O'), ('in', 'O'), ('Paris', 'LOC'), (',', 'DATE'), ('France', 'LOC'), ('.', 'O')]


# Dowload To your pc

In [20]:
import shutil
from google.colab import files

# Specify the model directory
model_dir = "/content/ner_finetuned_model"

# Create a zip file of the model directory
shutil.make_archive("/content/ner_finetuned_model", 'zip', model_dir)

# Download the zip file
files.download("/content/ner_finetuned_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>