# Named Entity Recognition with BERT

In [None]:
!pip install transformers wordcloud scikit-learn pandas matplotlib seaborn tqdm



In [None]:
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from wordcloud import WordCloud


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the token-level NER corpus and forward-fill missing cells in every column.
# NOTE(review): presumably "Sentence #" is only populated on the first row of each
# sentence, which is what the forward fill compensates for — confirm against the CSV.
df = pd.read_csv("ner_dataset.csv", encoding="latin1").ffill()
df.head()

In [None]:
# NOTE(review): this references the built-in `print` without calling it, so no data
# is printed — looks like an unfinished statement; confirm what was meant to be shown.
print

In [None]:
# Count missing values per column (last expression of the cell, so it is displayed).
df.isnull().sum()

In [None]:
# Propagate each sentence id down to the rows that follow it, then drop rows
# that have no word. `Series.ffill()` replaces the deprecated
# `fillna(method='ffill')` and matches the loading cell, which already uses `.ffill()`.
df['Sentence #'] = df['Sentence #'].ffill()
df = df.dropna(subset=['Word'])

# An expression in the middle of a cell is evaluated and discarded, so print
# the per-column null counts explicitly.
print(df.isnull().sum())

print(f"Total sentences: {df['Sentence #'].nunique()}")
print(f"Total tokens: {len(df)}")

In [None]:
# Collect one list of words and one list of tags per sentence id.
by_sentence = df.groupby("Sentence #")
sentences = by_sentence["Word"].apply(list).tolist()
tags = by_sentence["Tag"].apply(list).tolist()

# Tag name <-> integer id mappings, ordered alphabetically by tag name.
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
label2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2label = dict(enumerate(unique_tags))

# Hold out 10% of the sentences for validation (fixed seed).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    sentences, tags, test_size=0.1, random_state=42
)

In [None]:
# Distinct NER and POS tags present in the corpus
ner_tags = df['Tag'].unique()
pos_tags = df['POS'].unique()

print("NER Tags:", ner_tags)
print("POS Tags:", pos_tags)

# How many distinct tags of each kind there are
print("Number of NER Tags:", len(ner_tags))
print("Number of POS Tags:", len(pos_tags))

In [None]:
# Show both directions of the tag <-> id mapping, each under its own heading.
print("Label to ID mapping:", label2id, sep="\n")
print("\nID to Label mapping:", id2label, sep="\n")


In [None]:
class NERDataset(Dataset):
    """Map-style dataset that tokenizes pre-split sentences and aligns word tags.

    Each item is a dict of tensors produced by the tokenizer plus a "labels"
    tensor. Only the first sub-word piece of every word carries that word's
    label id; every other position (special tokens, padding, continuation
    pieces) is set to -100, the value the evaluation loops in this file skip.

    Args:
        texts: sequence of sentences, each a list of words.
        labels: sequence of tag lists, parallel to ``texts``.
        tokenizer: callable tokenizer whose output exposes ``word_ids()``.
        label2id: mapping from tag name to integer id.
        max_len: length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, label2id, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sentence ``idx`` with aligned integer labels."""
        words = self.texts[idx]
        word_tags = self.labels[idx]
        # Offsets are not needed for alignment (word_ids() is enough), so they
        # are no longer requested from the tokenizer.
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
        )

        aligned_labels = [-100] * len(encoding["input_ids"])
        previous_word_idx = None
        for position, word_idx in enumerate(encoding.word_ids()):
            if word_idx is None:
                continue
            if word_idx != previous_word_idx:
                # Tags missing from the mapping fall back to id 0, as before.
                aligned_labels[position] = self.label2id.get(word_tags[word_idx], 0)
            previous_word_idx = word_idx

        item = {key: torch.tensor(val) for key, val in encoding.items()}
        # Explicit int64: the loss expects long targets, and the platform
        # default integer width is not guaranteed to be 64 bits.
        item["labels"] = torch.tensor(aligned_labels, dtype=torch.long)
        return item

In [None]:
from huggingface_hub import login
from google.colab import userdata


from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint name shared by the tokenizer and the token-classification model.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# One output class per distinct tag; the model is moved to the selected device.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
).to(device)


In [None]:
# Wrap the train/validation splits so items are tokenized and label-aligned on access.
train_dataset = NERDataset(train_texts, train_labels, tokenizer, label2id)
val_dataset = NERDataset(val_texts, val_labels, tokenizer, label2id)

# Batches of 16; only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [None]:
# Single source of truth for the epoch count. The linear schedule must span
# exactly the steps the loop runs: it was previously sized for 5 epochs while
# the loop ran 10, so the learning rate hit zero halfway through and the
# remaining epochs no longer updated the weights.
num_epochs = 10

optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)

# Per-epoch history, plotted by a later cell.
train_losses, val_f1s = [], []

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # ---- validation pass ----
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            true = batch["labels"].cpu().numpy()
            # Score only positions that carry a real label (-100 marks the
            # special, padding and continuation tokens).
            keep = true != -100
            preds.extend(id2label[i] for i in pred[keep])
            trues.extend(id2label[i] for i in true[keep])
    f1 = f1_score(trues, preds, average="weighted")
    val_f1s.append(f1)
    print(f"Epoch {epoch+1}: Train Loss = {avg_loss:.4f}, Validation F1 = {f1:.4f}")

In [None]:
# Draw the per-epoch training loss and validation F1 on one set of axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label="Train Loss")
ax.plot(val_f1s, label="Validation F1")
ax.set_xlabel("Epoch")
ax.set_ylabel("Score")
ax.legend()
ax.set_title("Training Loss and Validation F1")
ax.grid(True)
plt.show()

In [None]:
# Run inference on our custom sentence
tokens = ["Elon", "Musk", "is", "the", "CEO", "of", "Tesla", "and", "SpaceX", "based", "in", "the", "United", "States"]
inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True).to(device)

# Inference only: switch off dropout and gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=2)[0].cpu().numpy()

# The model emits one prediction per sub-word piece, plus the special tokens,
# so the prediction list is longer than (and shifted against) the word list.
# Keep the prediction of each word's first piece so words and tags line up.
predicted_labels = []
previous_word_idx = None
for position, word_idx in enumerate(inputs.word_ids(batch_index=0)):
    if word_idx is not None and word_idx != previous_word_idx:
        predicted_labels.append(id2label[predictions[position]])
    previous_word_idx = word_idx

print("Predicted NER Tags:")
print(list(zip(tokens, predicted_labels)))


In [None]:
# Join every word whose tag is not 'O' into one string and render it as a word cloud.
entity_rows = df['Tag'] != 'O'
text = " ".join(df.loc[entity_rows, 'Word'].values)

cloud_options = dict(width=1000, height=600, background_color="white", colormap="viridis")
wordcloud = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(14, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud of Named Entities")
plt.show()

In [None]:

# Installing seqeval
!pip install transformers datasets seqeval


In [None]:
# Importing libraries
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
from datasets import DatasetDict, Dataset  # only import DatasetDict and Dataset here
!pip install evaluate
import evaluate # for importing metrics, this is the correct import.
import numpy as np
import matplotlib.pyplot as plt

In [None]:

# Loading and preprocessing the dataset
df = pd.read_csv("ner_dataset.csv", encoding="latin1")
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")`.
df = df.ffill()

# Creating the grouped sentences and labels (one row per sentence id)
grouped = df.groupby("Sentence #").agg({"Word": list, "Tag": list}).reset_index()
sentences = grouped["Word"].tolist()
labels = grouped["Tag"].tolist()

# Unique labels for the dataset, with tag <-> id mappings in sorted tag order
label_list = sorted(set(tag for seq in labels for tag in seq))
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

# Encoding labels to IDs
encoded_labels = [[label2id[tag] for tag in seq] for seq in labels]


In [None]:
# `Series.ffill()` replaces the deprecated `fillna(method='ffill')`.
df['Sentence #'] = df['Sentence #'].ffill()
# Number of non-null words in each sentence.
sent_lengths = df.groupby("Sentence #")['Word'].count()

plt.figure(figsize=(10, 5))
sns.histplot(sent_lengths, bins=50, kde=True)
plt.title('Sentence Length Distribution')
plt.xlabel('Number of Tokens')
plt.ylabel('Number of Sentences')
plt.tight_layout()
plt.show()

In [None]:
from collections import Counter

# Frequency of every word that carries an entity tag (anything other than 'O')
entity_words = df.loc[df['Tag'] != 'O', 'Word']
entity_counts = Counter(entity_words)

# Top 20 named entities, unpacked into parallel tuples for plotting
most_common_entities = entity_counts.most_common(20)
words, counts = zip(*most_common_entities)

plt.figure(figsize=(12, 6))
sns.barplot(x=list(words), y=list(counts))
plt.title('Top 20 Named Entities')
plt.xlabel('Entity Word')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# The twenty most frequent part-of-speech tags, as a bar chart
top_pos = df['POS'].value_counts().head(20)

plt.figure(figsize=(12, 6))
sns.barplot(x=top_pos.index, y=top_pos.values)
plt.title('Top 20 POS Tags')
plt.xlabel('POS Tag')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:

# Tokenize inputs
# Checkpoint name and the matching tokenizer used by the alignment function below.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-word tokens.

    Args:
        examples: batch mapping with "tokens" (a list of word lists) and
            "ner_tags" (a parallel list of per-word label-id lists).

    Returns:
        The tokenizer output with an added "labels" entry. Each row holds the
        word's label at the first piece of every word and -100 at special
        tokens, padding and continuation pieces.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
        max_length=128,
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # `label` is already sentence i's tag list, so it is indexed by
                # word only (the former `label[i][word_idx]` subscripted an int).
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [None]:

# Prepare Hugging Face dataset
dataset = Dataset.from_dict({
    "tokens": sentences,
    "ner_tags": encoded_labels
})
# Seeded so the 90/10 split is reproducible, like the other splits in this notebook.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences (no padding) and align word tags.

    Args:
        examples: batch mapping with "tokens" (a list of word lists) and
            "ner_tags" (a parallel list of per-word label-id lists).

    Returns:
        The tokenizer output with an added "labels" entry. Each row holds the
        word's label at the first piece of every word and -100 elsewhere.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    # The dataset built in this cell stores the word labels under "ner_tags";
    # it has no "labels" column, which is what was read here before.
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
print("manee-checking for the fuctionality")


In [None]:
# Check sample label encoding
# Prints the first sentence's string tags, its integer encoding, and the
# encoding mapped back through id2label so the round trip can be compared by eye.
# NOTE(review): `tags` and `encoded_labels` are built in different cells — confirm
# they come from the same sentence ordering before trusting this comparison.
print("Sample original tags:", tags[0])
print("Encoded tags:", encoded_labels[0])
print("Back to label names:", [id2label[i] for i in encoded_labels[0]])


In [None]:

# Load model
# NOTE(review): this rebinds `model` to freshly loaded pretrained weights, so any
# model trained under the same name in an earlier cell is no longer referenced here.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id)
# Collator built from the current tokenizer for token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)


In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Rebuild the tag <-> id mappings from the string tags
label_list = sorted({tag for sentence_tags in tags for tag in sentence_tags})
label2id = {name: index for index, name in enumerate(label_list)}
id2label = dict(enumerate(label_list))

# Encode each sentence's string tags as integer ids
encoded_labels = [[label2id[tag] for tag in sentence_tags] for sentence_tags in tags]

# Split into train and validation sets (10% held out, fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    sentences, encoded_labels, test_size=0.1, random_state=42
)

# Build the Hugging Face dataset with a "train" and a "test" split
train_split = Dataset.from_dict({"tokens": train_texts, "ner_tags": train_labels})
test_split = Dataset.from_dict({"tokens": val_texts, "ner_tags": val_labels})
dataset = DatasetDict({"train": train_split, "test": test_split})

# Tokenization and label alignment function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word lists to length 128 and align their label ids.

    Reads "tokens" and "ner_tags" from ``examples`` and returns the tokenizer
    output with a "labels" entry: the word's label on the first piece of each
    word, -100 on special tokens, padding and continuation pieces.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    aligned = []
    for batch_index, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word_id in tokenized_inputs.word_ids(batch_index=batch_index):
            starts_new_word = word_id is not None and word_id != last_word
            row.append(word_labels[word_id] if starts_new_word else -100)
            last_word = word_id
        aligned.append(row)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

# Apply the tokenization and label alignment
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Expose only the tokenizer/label columns, as torch tensors. The raw "tokens"
# and "ner_tags" columns are variable-length Python lists (strings / ints) that
# the DataLoader's default collate function cannot stack into a batch.
# Stacking the remaining columns relies on the padding="max_length" used by
# the alignment function above.
model_columns = [
    name
    for name in tokenized_datasets["train"].column_names
    if name not in ("tokens", "ner_tags")
]
tokenized_datasets.set_format(type="torch", columns=model_columns)

# Convert to PyTorch DataLoaders
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)


In [None]:

# Saving model and testing
model.save_pretrained("bert-ner-model")
tokenizer.save_pretrained("bert-ner-model")

# Example prediction
test_sentence = "Elon Musk is the CEO of Tesla and SpaceX, based in the United States."
# We have taken a sample sentence for our prediction. If you want to try more,
# here are some of the random sentences for prediction testing:
# "Elon Musk mowa is the CEO of Tesla and SpaceX, based in the United States."
# "Mr. Mohan Das Karamchand Gandhi was the person behind betrayal of electing Prime Minister of India"
# "Steve Jobs founded Apple in California."
# "Madam Marie Curie won the Nobel Prize for her work in radioactivity."
# "The Amazon River flows through Brazil and Peru."
# "Christopher Nolan directed Inception which was released in 2010."
# "Lionel Messi joined Inter Miami after leaving Paris Saint-Germain."
# "Harvard University is located in Cambridge, Massachusetts."

words = test_sentence.split()
# Put the inputs on whatever device the model currently lives on.
tokens = tokenizer(words, return_tensors="pt", is_split_into_words=True).to(model.device)

# Inference only: switch off dropout and gradient tracking.
model.eval()
with torch.no_grad():
    output = model(**tokens)
logits = output.logits
predictions = torch.argmax(logits, dim=-1)

# A word can be split into several sub-word pieces, so slicing the prediction
# list by word count misaligns tags. Keep the prediction made on the first
# piece of each word instead.
predicted_labels = []
previous_word_idx = None
for position, word_idx in enumerate(tokens.word_ids(batch_index=0)):
    if word_idx is not None and word_idx != previous_word_idx:
        predicted_labels.append(id2label[predictions[0, position].item()])
    previous_word_idx = word_idx

for token, label in zip(words, predicted_labels):
    print(f"{token:15s} --> {label}")


# distilbert-base-cased

In [None]:
from transformers import DistilBertTokenizerFast

# Checkpoint name and fast tokenizer for the DistilBERT experiment.
distilbert_model_name = "distilbert-base-cased"
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained(distilbert_model_name)

# Tokenize inputs
def tokenize_and_align_labels_distilbert(examples, label_all_tokens = False):
    """Tokenize a batch with the DistilBERT tokenizer and align word label ids.

    Args:
        examples: batch mapping with "tokens" (word lists) and "ner_tags"
            (parallel per-word label-id lists).
        label_all_tokens: when true, continuation pieces of a word receive the
            word's label; otherwise they receive -100.

    Returns:
        The tokenizer output (unpadded) with an added "labels" entry; special
        tokens always receive -100.
    """
    tokenized_inputs = distilbert_tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    aligned = []
    for batch_index, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word_id in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_id is None:
                row.append(-100)
            elif word_id != last_word or label_all_tokens:
                row.append(word_labels[word_id])
            else:
                row.append(-100)
            last_word = word_id
        aligned.append(row)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

tokenized_distil = dataset.map(tokenize_and_align_labels_distilbert, batched=True)


In [None]:
from transformers import DistilBertForTokenClassification

# DistilBERT token classifier with one class per entry in label2id; the tag
# mappings are passed in alongside, and the model is moved to the selected device.
distilbert_model = DistilBertForTokenClassification.from_pretrained(
    distilbert_model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
).to(device)

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import DataCollatorForTokenClassification

# Pads inputs and labels per batch using the DistilBERT tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=distilbert_tokenizer)

# Use the splits tokenized with the DistilBERT tokenizer. The loaders previously
# wrapped `train_dataset` / `val_dataset`, which were built with the BERT
# tokenizer, while these DistilBERT splits were defined afterwards and never used.
# The raw text / word-level columns are dropped because the collator cannot
# pad or tensorize them.
train_dataset_distil = tokenized_distil["train"].remove_columns(["tokens", "ner_tags"])
val_dataset_distil = tokenized_distil["test"].remove_columns(["tokens", "ner_tags"])

# Shuffle the training data each epoch, as the BERT training loader does.
train_dataloader_distil = DataLoader(
    train_dataset_distil, batch_size=32, shuffle=True, collate_fn=data_collator
)
val_dataloader_distil = DataLoader(val_dataset_distil, batch_size=32, collate_fn=data_collator)


# Define optimizer and scheduler
from transformers import get_scheduler

optimizer = AdamW(distilbert_model.parameters(), lr=5e-5)

# Linear decay over all optimizer steps of the 10-epoch training loop.
num_training_steps = len(train_dataloader_distil) * 10
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


In [None]:
# Training for DistilBERT
from tqdm.auto import tqdm

# Per-epoch history of training loss and validation F1.
train_losses, val_f1s = [], []

for epoch in range(10):
    # ---- training pass ----
    distilbert_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader_distil, desc=f"DistilBERT Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Drop segment ids if a batch carries them before calling the model.
        batch.pop('token_type_ids', None)
        outputs = distilbert_model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    # Mean loss over the epoch, computed once after the last batch rather
    # than recomputed on every step.
    avg_loss = total_loss / len(train_dataloader_distil)
    train_losses.append(avg_loss)

    # ---- validation pass ----
    distilbert_model.eval()
    distil_preds, distil_trues = [], []
    with torch.no_grad():
        for batch in val_dataloader_distil:
            batch = {k: v.to(device) for k, v in batch.items()}
            batch.pop('token_type_ids', None)
            outputs = distilbert_model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            true = batch["labels"].cpu().numpy()
            # Score only positions that carry a real label (-100 is ignored).
            keep = true != -100
            distil_preds.extend(id2label[i] for i in pred[keep])
            distil_trues.extend(id2label[i] for i in true[keep])
    distil_f1 = f1_score(distil_trues, distil_preds, average="weighted")
    val_f1s.append(distil_f1)
    print(f"[DistilBERT] Epoch {epoch+1}: Train Loss = {avg_loss:.4f}, Validation F1 = {distil_f1:.4f}")


In [None]:
# Saving model and testing
# Save and run the DistilBERT model/tokenizer. This cell previously used
# `model` / `tokenizer`, i.e. the BERT objects, under the DistilBERT directory name.
distilbert_model.save_pretrained("distilbert-bert-ner-model")
distilbert_tokenizer.save_pretrained("distilbert-bert-ner-model")

# Example prediction
test_sentence = "Elon Musk is the CEO of Tesla and SpaceX, based in the United States."
# We have taken a sample sentence for our prediction. If you want to try more,
# here are some of the random sentences for prediction testing:
# "Elon Musk mowa is the CEO of Tesla and SpaceX, based in the United States."
# "Mr. Mohan Das Karamchand Gandhi was the person behind betrayal of electing Prime Minister of India"
# "Steve Jobs founded Apple in California."
# "Madam Marie Curie won the Nobel Prize for her work in radioactivity."
# "The Amazon River flows through Brazil and Peru."
# "Christopher Nolan directed Inception which was released in 2010."
# "Lionel Messi joined Inter Miami after leaving Paris Saint-Germain."
# "Harvard University is located in Cambridge, Massachusetts."

words = test_sentence.split()
# Put the inputs on whatever device the model currently lives on.
tokens = distilbert_tokenizer(
    words, return_tensors="pt", is_split_into_words=True
).to(distilbert_model.device)

# Inference only: switch off dropout and gradient tracking.
distilbert_model.eval()
with torch.no_grad():
    output = distilbert_model(**tokens)
logits = output.logits
predictions = torch.argmax(logits, dim=-1)

# A word can be split into several sub-word pieces; keep the prediction made
# on the first piece of each word so words and tags stay aligned.
predicted_labels = []
previous_word_idx = None
for position, word_idx in enumerate(tokens.word_ids(batch_index=0)):
    if word_idx is not None and word_idx != previous_word_idx:
        predicted_labels.append(id2label[predictions[0, position].item()])
    previous_word_idx = word_idx

for token, label in zip(words, predicted_labels):
    print(f"{token:15s} --> {label}")


In [None]:

# Multilingual BERT Implementation (bert-base-multilingual-cased)
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# Tokenizer & model
tokenizer_multi = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
# Pass the tag mappings, as the other model loads in this notebook do, so the
# model config carries tag names rather than generic label names.
model_multi = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Tokenize and align labels again for multilingual model
def tokenize_and_align_labels_multilingual(examples):
    """Tokenize a batch with the multilingual tokenizer and align word label ids.

    Args:
        examples: batch mapping with "tokens" (word lists) and "ner_tags"
            (parallel per-word lists of integer label ids).

    Returns:
        The tokenizer output, padded/truncated to 128 tokens, with an added
        "labels" entry: the word's label id on the first piece of each word and
        -100 on special tokens, padding and continuation pieces.
    """
    tokenized_inputs = tokenizer_multi(
        examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # "ner_tags" already holds integer ids, so they are used as-is;
                # looking them up in label2id (keyed by tag name) raised KeyError.
                label_ids.append(label[word_idx])
            else:
                # Continuation pieces are ignored, matching the other alignment
                # functions in this notebook (both branches used to be identical).
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Prepare multilingual dataset
tokenized_multi_dataset = dataset.map(tokenize_and_align_labels_multilingual, batched=True)
# `dataset` is a DatasetDict, which has no `train_test_split`; split its "train"
# part 80/20 instead (seeded for reproducibility). The existing "test" split is
# left untouched as held-out data.
train_test_multi = tokenized_multi_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_multi = train_test_multi["train"]
eval_multi = train_test_multi["test"]

# Compute metrics
def compute_metrics_multilingual(p):
    """Turn raw predictions into tag sequences and score them.

    Args:
        p: pair ``(predictions, labels)``; the predictions are arg-maxed over
            axis 2 and positions whose label is -100 are dropped.

    Returns:
        Dict with "f1", "precision" and "recall". The classification report is
        printed as a side effect.
    """
    logits, label_ids = p
    pred_ids = np.argmax(logits, axis=2)

    true_preds, true_labels = [], []
    for pred_row, label_row in zip(pred_ids, label_ids):
        # Keep only the positions that carry a real label.
        kept = [(pi, li) for pi, li in zip(pred_row, label_row) if li != -100]
        true_preds.append([id2label[pi] for pi, _ in kept])
        true_labels.append([id2label[li] for _, li in kept])

    print(classification_report(true_labels, true_preds))
    scores = {}
    scores["f1"] = f1_score(true_labels, true_preds)
    scores["precision"] = precision_score(true_labels, true_preds)
    scores["recall"] = recall_score(true_labels, true_preds)
    return scores

# TrainingArguments
# Run configuration for the multilingual model: 3 epochs, batch size 8 per
# device, logging every 100 steps and a checkpoint every 500 steps.
training_args_multi = TrainingArguments(
    output_dir="./results-multilingual",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
)

# Trainer
# Wires the model, the splits prepared above and the metric function together.
trainer_multi = Trainer(
    model=model_multi,
    args=training_args_multi,
    train_dataset=train_multi,
    eval_dataset=eval_multi,
    tokenizer=tokenizer_multi,
    compute_metrics=compute_metrics_multilingual
)

# Train and evaluate
# Order matters: evaluate() scores the model as left by train().
trainer_multi.train()
trainer_multi.evaluate()


# bert-base-multilingual-cased

In [None]:

# Install required packages
!pip install transformers datasets seqeval


In [None]:
# Imports
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments


In [None]:

# Load the dataset
df = pd.read_csv("ner_dataset.csv", encoding="latin1")
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")`.
df = df.ffill()

# Preprocess the dataset
class EntityDataset:
    """Group a token-level frame into per-sentence word and tag lists.

    After construction, ``sentences[i]`` and ``labels[i]`` hold the "Word" and
    "Tag" values of the i-th "Sentence #" group, in group order.
    """

    def __init__(self, df):
        self.sentences = []
        self.labels = []
        # One group per sentence id; collect its words and tags side by side.
        for _, sentence_rows in df.groupby("Sentence #"):
            self.sentences.append(list(sentence_rows["Word"]))
            self.labels.append(list(sentence_rows["Tag"]))

entity_data = EntityDataset(df)

# Sorted tag inventory and the two lookup directions
label_list = sorted({tag for sentence_tags in entity_data.labels for tag in sentence_tags})
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

# One record per sentence: its words plus the integer-encoded tags
data = [
    {"tokens": words, "ner_tags": [label_to_id[tag] for tag in sentence_tags]}
    for words, sentence_tags in zip(entity_data.sentences, entity_data.labels)
]
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)




In [None]:
# Tokenizer and Model
# Multilingual BERT checkpoint; the classifier head gets one class per tag in label_list.
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

# Tokenization and alignment
def tokenize_and_align_labels(example):
    """Tokenize ONE example (not a batch) and align its word label ids.

    Reads "tokens" and "ner_tags" from ``example`` and returns the unpadded
    tokenizer output with a "labels" list: the word's label on the first piece
    of each word, -100 on special tokens and continuation pieces.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_labels = example["ner_tags"]

    aligned = []
    last_word = None
    for word_id in tokenized_inputs.word_ids():
        starts_new_word = word_id is not None and word_id != last_word
        aligned.append(word_labels[word_id] if starts_new_word else -100)
        last_word = word_id

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

# Apply example by example (the function expects a single example).
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=False)



In [None]:
# Training arguments
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old name. Pick whichever keyword the installed version
# accepts so this cell runs on both.
_eval_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_key: "epoch"},  # evaluate at the end of every epoch
)


In [None]:
# Metrics
import evaluate
import numpy as np

# `datasets.load_metric` is deprecated (and absent from recent `datasets`
# releases); the `evaluate` package, already used in this notebook, is its
# replacement.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Convert raw predictions to tag sequences and score them with seqeval.

    Args:
        p: pair ``(predictions, labels)``; predictions are arg-maxed over
            axis 2 and positions whose label is -100 are dropped.

    Returns:
        Whatever ``metric.compute`` returns for the filtered tag sequences.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, label_row in zip(predictions, labels):
        # Plain ints are used as dict keys. The earlier version looked ids up
        # with 0-dim torch tensors, which hash by identity and never match the
        # integer keys of id_to_label.
        kept = [(int(pi), int(li)) for pi, li in zip(pred_row, label_row) if li != -100]
        true_predictions.append([id_to_label[pi] for pi, _ in kept])
        true_labels.append([id_to_label[li] for _, li in kept])

    return metric.compute(predictions=true_predictions, references=true_labels)



In [None]:
# Trainer
from transformers import DataCollatorForTokenClassification

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    # The examples were tokenized without padding, so both the inputs and the
    # per-token "labels" must be padded per batch; the default collator does
    # not pad "labels".
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()