# Dataset preparation

In [1]:
from gc import set_debug

import pandas as pd
import itertools

ds = pd.read_json('filtered_data.json')

ds['tokens'] = ds['Text'].apply(lambda t: t.split(' '))
ds['ne'] = ds['tokens'].apply(lambda l: list(itertools.repeat('O', len(l))))

ds.to_json('./output.json', orient="records")

# Training

In [2]:
from sklearn.model_selection import train_test_split

dataset = pd.read_json("ner_data.json")

ne_label_col = dataset['ne']
label_list = set(itertools.chain.from_iterable(ne_label_col))

label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}
num_labels = len(label_list)

# Split the dataset into train and temporary datasets (80% train, 20% temporary)
train_dataset, temp_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

# Split the temporary dataset into validation and test datasets (50% validation, 50% test)
val_dataset, test_dataset = train_test_split(temp_dataset, test_size=0.5, random_state=42)

# Save the datasets to JSON files
train_dataset.to_json('train_data.json', orient="records")
val_dataset.to_json('val_data.json', orient="records")
test_dataset.to_json('test_data.json', orient="records")

In [3]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'].tolist(), truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples['ne']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                label_ids.append(label_to_id[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


label_all_tokens = True
train_dataset = tokenize_and_align_labels(train_dataset)
val_dataset = tokenize_and_align_labels(val_dataset)
test_dataset = tokenize_and_align_labels(test_dataset)

In [4]:
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)
model.to('cuda')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [5]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)


In [6]:
import numpy as np
from seqeval.metrics import classification_report, f1_score


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = classification_report(true_labels, true_predictions)
    return {'f1': f1_score(true_labels, true_predictions)}


In [7]:
import accelerate
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./ner_training_results',  # Output directory
    overwrite_output_dir=True,  # Overwrite the content of the output directory
    num_train_epochs=3,  # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=64,  # Batch size for evaluation
    eval_strategy='epoch',  # Evaluation strategy to adopt during training
    save_strategy='epoch',  # Save the model after each epoch
    logging_dir='./ner_training_results/logs',  # Directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

eval_results = trainer.evaluate()
print(eval_results)


  trainer = Trainer(


AttributeError: 'tokenizers.Encoding' object has no attribute 'keys'