In [None]:
!pip install accelerate -U
!pip install evaluate
!pip install seqeval




In [None]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/100035.ann         
  inflating: data/100035.txt         
  inflating: data/100039.ann         
  inflating: data/100039.txt         
  inflating: data/100187.ann         
  inflating: data/100187.txt         
  inflating: data/100229.ann         
  inflating: data/100229.txt         
  inflating: data/100564.ann         
  inflating: data/100564.txt         
  inflating: data/100579.ann         
  inflating: data/100579.txt         
  inflating: data/100590.ann         
  inflating: data/100590.txt         
  inflating: data/100677.ann         
  inflating: data/100677.txt         
  inflating: data/100847.ann         
  inflating: data/100847.txt         
  inflating: data/100883.ann         
  inflating: data/100883.txt         
  inflating: data/100922.ann         
  inflating: data/100922.txt         
  inflating: data/101092.ann         
  inflating: data/101092.txt         
  inflating: data/101136.ann         
  inflating:

In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
import evaluate
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')

def read_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

def parse_ann(ann_content):
    annotations = []
    for line in ann_content.strip().split('\n'):
        if line.startswith('T'):
            parts = line.split('\t')
            ann_id = parts[0]
            label_info = parts[1]
            text = parts[2]
            label_info_parts = label_info.split()
            label = label_info_parts[0]
            start = int(label_info_parts[1].split(';')[0])
            end = int(label_info_parts[2].split(';')[0])
            annotations.append({
                'id': ann_id,
                'label': label,
                'start': start,
                'end': end,
                'text': text
            })
    return annotations

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    return filtered_tokens

def format_biobert_input(text, annotations):
    tokens = preprocess_text(text)
    token_annotations = ['O'] * len(tokens)
    text_offset = 0

    for ann in annotations:
        ann_tokens = word_tokenize(ann['text'])
        ann_label = ann['label']

        while text_offset < len(tokens):
            try:
                if tokens[text_offset] == ann_tokens[0]:
                    match = True
                    for i in range(len(ann_tokens)):
                        if text_offset + i >= len(tokens) or tokens[text_offset + i] != ann_tokens[i]:
                            match = False
                            break
                    if match:
                        for i in range(len(ann_tokens)):
                            if i == 0:
                                token_annotations[text_offset + i] = f'B-{ann_label}'
                            else:
                                token_annotations[text_offset + i] = f'I-{ann_label}'
                        text_offset += len(ann_tokens)
                        break
                text_offset += 1
            except:
                print(ann_tokens)
                print(tokens[text_offset])

    return tokens, token_annotations

def split_into_chunks(tokens, labels, max_length=509):
    chunks = []
    chunk_labels = []
    current_chunk = []
    current_chunk_labels = []
    current_length = 0

    for i in range(len(tokens)):
        current_chunk.append(tokens[i])
        current_chunk_labels.append(labels[i])
        current_length += 1

        if current_length >= max_length:
            while i < len(tokens) and not labels[i] == 'O':
                current_chunk.append(tokens[i])
                current_chunk_labels.append(labels[i])
                current_length += 1
                i += 1

            chunks.append(current_chunk)
            chunk_labels.append(current_chunk_labels)
            current_chunk = []
            current_chunk_labels = []
            current_length = 0

    if current_chunk:
        chunks.append(current_chunk)
        chunk_labels.append(current_chunk_labels)

    return chunks, chunk_labels

def process_files(txt_file, ann_file):
    text = read_file(txt_file)
    ann_content = read_file(ann_file)
    annotations = parse_ann(ann_content)
    tokens, labels = format_biobert_input(text, annotations)
    token_chunks, label_chunks = split_into_chunks(tokens, labels)
    return token_chunks, label_chunks

tokend_text = []
cor_labels = []

def process_all_files(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            txt_file = os.path.join(directory, filename)
            ann_file = txt_file.replace(".txt", ".ann")
            if os.path.exists(ann_file):
                token_chunks, label_chunks = process_files(txt_file, ann_file)
                tokend_text.extend(token_chunks)
                cor_labels.extend(label_chunks)

directory = '/content/data'
process_all_files(directory)

labels = ["O", "B-Drug", "I-Drug", "B-Strength", "I-Strength", "B-Form", "I-Form", "B-Dosage", "I-Dosage",
          "B-Duration", "I-Duration", "B-Frequency", "I-Frequency", "B-Route", "I-Route", "B-ADE", "I-ADE",
          "B-Reason", "I-Reason"]

label_map = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label_map.items()}

tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = AutoModelForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.1', num_labels=len(id2label), id2label=id2label, label2id=label_map, ignore_mismatched_sizes=False)

texts = tokend_text
labels = [[label_map[label] for label in doc_labels] for doc_labels in cor_labels]

class NERDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        word_labels = self.labels[index]

        encoding = self.tokenizer(text,
                                  truncation=True,
                                  padding='max_length',
                                  max_length=self.max_len,
                                  is_split_into_words=True,
                                  return_tensors='pt')

        word_ids = encoding.word_ids(batch_index=0)

        labels = [-100 if word_id is None else word_labels[word_id] for word_id in word_ids]

        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(labels, dtype=torch.long)

        return item

MAX_LEN = 512
BATCH_SIZE = 16

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.1)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
all_metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1_micro': [],
    'f1_macro': [],
    'confusion_matrix': []
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="no"
)

metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1_micro": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

for fold, (train_index, val_index) in enumerate(kf.split(train_texts)):
    print(f"Fold {fold + 1}")

    train_fold_texts = [train_texts[i] for i in train_index]
    val_fold_texts = [train_texts[i] for i in val_index]
    train_fold_labels = [train_labels[i] for i in train_index]
    val_fold_labels = [train_labels[i] for i in val_index]

    train_dataset = NERDataset(train_fold_texts, train_fold_labels, tokenizer, MAX_LEN)
    val_dataset = NERDataset(val_fold_texts, val_fold_labels, tokenizer, MAX_LEN)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    metrics = trainer.evaluate(val_dataset)
    predictions, labels, _ = trainer.predict(val_dataset)
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    pred_labels = []

    for i in range(len(labels)):
        true_labels.extend(labels[i])
        pred_labels.extend(predictions[i])

    true_labels = [label for label in true_labels if label != -100]
    pred_labels = [pred for label, pred in zip(true_labels, pred_labels) if label != -100]

    cm = confusion_matrix(true_labels, pred_labels, labels=list(label_map.values()))

    all_metrics['accuracy'].append(metrics['eval_accuracy'])
    all_metrics['precision'].append(metrics['eval_precision'])
    all_metrics['recall'].append(metrics['eval_recall'])
    all_metrics['f1_micro'].append(metrics['eval_f1_micro'])
    all_metrics['f1_macro'].append(precision_recall_fscore_support(true_labels, pred_labels, average='macro')[2])
    all_metrics['confusion_matrix'].append(cm)

    print(f"Fold {fold + 1} Metrics:")
    print(f"Accuracy: {metrics['eval_accuracy']}")
    print(f"Precision: {metrics['eval_precision']}")
    print(f"Recall: {metrics['eval_recall']}")
    print(f"Micro F1: {metrics['eval_f1_micro']}")
    print(f"Macro F1: {all_metrics['f1_macro'][-1]}")


confusion_matrices = np.sum(all_metrics['confusion_matrix'], axis=0)

metrics_mean_std = {metric: (np.mean(all_metrics[metric]), np.std(all_metrics[metric])) for metric in all_metrics if metric != 'confusion_matrix'}

print("\nMetrics Mean and Standard Deviation:")
for metric, (mean, std) in metrics_mean_std.items():
    print(f"{metric.capitalize()} - Mean: {mean}, Std: {std}")

print("\nCombined Confusion Matrix:")
print(confusion_matrices)

trainer.save_model("./model")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.1744,0.087936,0.0,0.0,0.0,0.992003
2,0.0762,0.060562,0.0,0.0,0.0,0.992003
3,0.0709,0.059565,0.0,0.0,0.0,0.992003
4,0.0668,0.058264,0.0,0.0,0.0,0.992003
5,0.0551,0.051077,0.0,0.0,0.0,0.992003
6,0.0414,0.047971,0.0,0.0,0.0,0.992003
7,0.0407,0.04129,0.0,0.0,0.0,0.992003
8,0.0463,0.053277,0.0,0.0,0.0,0.991978
9,0.0338,0.052913,0.0,0.0,0.0,0.991242
10,0.0157,0.055006,0.23913,0.038062,0.065672,0.991414


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 Metrics:
Accuracy: 0.991413992738691
Precision: 0.2391304347826087
Recall: 0.03806228373702422
Micro F1: 0.06567164179104477
Macro F1: 0.0765742882056337
Fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.0161,0.028408,0.689815,0.774697,0.729796,0.992953
2,0.0139,0.025168,0.721582,0.759099,0.739865,0.993548
3,0.0147,0.025858,0.712681,0.769497,0.74,0.993424
4,0.0179,0.027392,0.69462,0.760832,0.72622,0.992953
5,0.0128,0.024688,0.811203,0.677643,0.738432,0.99397
6,0.0067,0.023842,0.760943,0.783362,0.77199,0.994045
7,0.0084,0.032779,0.63881,0.781629,0.70304,0.99129
8,0.0056,0.024339,0.717868,0.793761,0.753909,0.993449
9,0.006,0.025639,0.802158,0.772964,0.78729,0.994417
10,0.0088,0.033934,0.620056,0.760832,0.683268,0.990074


  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Fold 2 Metrics:
Accuracy: 0.9900744416873449
Precision: 0.6200564971751412
Recall: 0.7608318890814558
Micro F1: 0.6832684824902724
Macro F1: 0.12872301510910017
Fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.0054,0.005008,0.915138,0.956835,0.935522,0.998777
2,0.0098,0.004544,0.932558,0.961631,0.946871,0.998969
3,0.0068,0.003876,0.941315,0.961631,0.951364,0.999089
4,0.0026,0.003978,0.945755,0.961631,0.953627,0.999137
5,0.0025,0.004239,0.945882,0.964029,0.954869,0.999113
6,0.0033,0.005361,0.906818,0.956835,0.931155,0.998729
7,0.0034,0.006127,0.874459,0.968825,0.919226,0.998226
8,0.0024,0.006625,0.864629,0.94964,0.905143,0.998058
9,0.0028,0.006299,0.885393,0.944844,0.914153,0.998273
10,0.009,0.006087,0.914081,0.918465,0.916268,0.998393


Fold 3 Metrics:
Accuracy: 0.9983933624286605
Precision: 0.9140811455847255
Recall: 0.9184652278177458
Micro F1: 0.9162679425837319
Macro F1: 0.07336795867230382
Fold 4


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.0027,0.002893,0.87251,0.986486,0.926004,0.999148
2,0.0021,0.001623,0.92437,0.990991,0.956522,0.999513
3,0.0015,0.00103,0.940426,0.995495,0.967177,0.999635
4,0.0019,0.002203,0.894737,0.995495,0.942431,0.999318
5,0.0015,0.001561,0.932489,0.995495,0.962963,0.99944
6,0.0011,0.001163,0.940171,0.990991,0.964912,0.999659
7,0.0013,0.002191,0.901639,0.990991,0.944206,0.999367
8,0.0014,0.001087,0.968037,0.954955,0.961451,0.999586
9,0.0068,0.006215,0.815574,0.896396,0.854077,0.998393
10,0.0041,0.006397,0.876068,0.923423,0.899123,0.998466


  _warn_prf(average, modifier, msg_start, len(result))


Fold 4 Metrics:
Accuracy: 0.9984662576687117
Precision: 0.8760683760683761
Recall: 0.9234234234234234
Micro F1: 0.8991228070175438
Macro F1: 0.0661621303371889
Fold 5


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.0027,0.001654,0.978979,0.990881,0.984894,0.999729
2,0.0013,0.001824,0.967456,0.993921,0.98051,0.999655
3,0.0014,0.001394,0.996942,0.990881,0.993902,0.999877
4,0.0009,0.001341,1.0,0.987842,0.993884,0.999877
5,0.0004,0.001697,0.961765,0.993921,0.977578,0.999606
6,0.0004,0.001636,0.964497,0.990881,0.977511,0.999606
7,0.0004,0.001683,0.978916,0.987842,0.983359,0.999705
8,0.0017,0.001691,0.993865,0.984802,0.989313,0.999803
9,0.0024,0.005721,0.884615,0.978723,0.929293,0.998695
10,0.0045,0.008925,0.858726,0.942249,0.898551,0.998178


  _warn_prf(average, modifier, msg_start, len(result))


Fold 5 Metrics:
Accuracy: 0.998178327014918
Precision: 0.8587257617728532
Recall: 0.9422492401215805
Micro F1: 0.8985507246376812
Macro F1: 0.07073346980879822
Fold 6


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.0046,0.001405,0.975104,0.987395,0.981211,0.999633
2,0.0033,0.000807,0.983368,0.993697,0.988506,0.999804
3,0.0012,0.000504,0.995807,0.997899,0.996852,0.999902
4,0.0007,0.000791,0.983402,0.995798,0.989562,0.999755
5,0.0006,0.000293,1.0,0.997899,0.998948,0.999976
6,0.0004,0.002561,0.946,0.993697,0.969262,0.99929
7,0.0004,0.000985,0.985263,0.983193,0.984227,0.999682
8,0.0008,0.000424,0.987526,0.997899,0.992685,0.999878
9,0.0011,0.001544,0.977226,0.991597,0.984359,0.999559
10,0.0027,0.006444,0.922619,0.976891,0.94898,0.998628


Fold 6 Metrics:
Accuracy: 0.9986283586842041
Precision: 0.9226190476190477
Recall: 0.976890756302521
Micro F1: 0.9489795918367347
Macro F1: 0.07553511334109085
Fold 7


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.0017,0.001966,0.919231,0.987603,0.952191,0.999393
2,0.0006,0.001885,0.933594,0.987603,0.959839,0.999368
3,0.0002,0.001263,0.964143,1.0,0.981744,0.999621
4,0.0003,0.00068,0.979675,0.995868,0.987705,0.999823
5,0.0004,0.00058,0.971888,1.0,0.985743,0.999823
6,0.0004,0.001802,0.941634,1.0,0.96994,0.999621
7,0.0002,0.000612,0.979757,1.0,0.989775,0.999874
8,0.0005,0.001062,0.974468,0.946281,0.960168,0.999671
9,0.0022,0.001904,0.941176,0.991736,0.965795,0.999444
10,0.0025,0.002291,0.923077,0.991736,0.956175,0.999418


  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Fold 7 Metrics:
Accuracy: 0.9994184429441958
Precision: 0.9230769230769231
Recall: 0.9917355371900827
Micro F1: 0.9561752988047809
Macro F1: 0.07264986758531941
Fold 8


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.002,0.001611,0.93617,0.972376,0.95393,0.999562
2,0.0007,0.001277,0.937824,1.0,0.967914,0.999684
3,0.0012,0.001149,0.947644,1.0,0.973118,0.999733
4,0.0002,0.000914,0.947644,1.0,0.973118,0.999733
5,0.0001,0.00111,0.952632,1.0,0.975741,0.999757
6,0.0002,0.000681,0.962766,1.0,0.98103,0.999805
7,0.0002,0.000741,0.957672,1.0,0.978378,0.999781
8,0.0002,0.00046,0.973118,1.0,0.986376,0.999854
9,0.0002,0.000609,0.962766,1.0,0.98103,0.999805
10,0.0019,0.000495,0.983516,0.98895,0.986226,0.999903


  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Fold 8 Metrics:
Accuracy: 0.9999027308318945
Precision: 0.9835164835164835
Recall: 0.988950276243094
Micro F1: 0.9862258953168045
Macro F1: 0.10797994141115101
Fold 9


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.0005,0.000359,0.988571,1.0,0.994253,0.999898
2,0.0008,0.000175,0.994253,1.0,0.997118,0.999949
3,0.0001,4.8e-05,1.0,1.0,1.0,1.0
4,0.0001,2.4e-05,1.0,1.0,1.0,1.0
5,0.0001,1e-05,1.0,1.0,1.0,1.0
6,0.0001,1.1e-05,1.0,1.0,1.0,1.0
7,0.0001,8e-06,1.0,1.0,1.0,1.0
8,0.0001,1.5e-05,1.0,1.0,1.0,1.0
9,0.0001,4.8e-05,1.0,1.0,1.0,1.0
10,0.0069,0.003427,0.932203,0.953757,0.942857,0.999465


Fold 9 Metrics:
Accuracy: 0.9994653495595499
Precision: 0.9322033898305084
Recall: 0.953757225433526
Micro F1: 0.9428571428571428
Macro F1: 0.10197136090226915
Fold 10


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Micro,Accuracy
1,0.0025,0.002269,0.909396,1.0,0.952548,0.999325
2,0.0013,0.00104,0.960993,1.0,0.980108,0.999725
3,0.0005,0.001031,0.954225,1.0,0.976577,0.999675
4,0.0002,0.000876,0.954225,1.0,0.976577,0.999675
5,0.0001,0.000575,0.967857,1.0,0.983666,0.999775
6,0.0004,0.001593,0.931271,1.0,0.964413,0.9995
7,0.0005,0.001351,0.978339,1.0,0.989051,0.99985
8,0.0004,0.001151,0.954225,1.0,0.976577,0.999675
9,0.0007,0.001133,0.940767,0.99631,0.967742,0.9996
10,0.0003,0.000854,0.957597,1.0,0.978339,0.9997


Fold 10 Metrics:
Accuracy: 0.9996998499249625
Precision: 0.9575971731448764
Recall: 1.0
Micro F1: 0.9783393501805053
Macro F1: 0.07180481551604348

Metrics Mean and Standard Deviation:
Accuracy - Mean: 0.9973641113483133, Std: 0.003370671919924848
Precision - Mean: 0.8227075232571543, Std: 0.21684646706920616
Recall - Mean: 0.8494365859350452, Std: 0.2783059484104994
F1_micro - Mean: 0.8275458877516243, Std: 0.26679791043321444
F1_macro - Mean: 0.08455019608888985, Std: 0.019765380285472322

Combined Confusion Matrix:
[[398446   1893     67    266    124     54      0     38     10      8
       0    131     19    122      0     38      7    199    166]
 [  2074    120      0      6      0      0      0      1      2      0
       0      2      0      3      0      0      0      0      3]
 [    78      8      4      0      0      0      0      0      0      0
       0      0      0      0      0      0      0      0      0]
 [   291     16      4      3      0      0      0      0     

In [None]:
import numpy as np
import evaluate
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

metric = evaluate.load("seqeval")

def compute_test_metrics(predictions, labels):
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    pred_labels = []

    for i in range(len(labels)):
        true_labels.append(labels[i])
        pred_labels.append(predictions[i])

    true_labels = [[label for label in sublist if label != -100] for sublist in true_labels]
    pred_labels = [[pred for pred, label in zip(pred_sublist, label_sublist) if label != -100]
                   for pred_sublist, label_sublist in zip(pred_labels, true_labels)]

    flat_true_labels = [item for sublist in true_labels for item in sublist]
    flat_pred_labels = [item for sublist in pred_labels for item in sublist]

    cm = confusion_matrix(flat_true_labels, flat_pred_labels, labels=list(label_map.values()))

    results = metric.compute(predictions=[[id2label[p] for p in sublist] for sublist in pred_labels],
                             references=[[id2label[l] for l in sublist] for sublist in true_labels])

    precision, recall, f1_weighted, _ = precision_recall_fscore_support(
        flat_true_labels, flat_pred_labels, average='weighted', zero_division=0
    )
    _, _, f1_micro, _ = precision_recall_fscore_support(
        flat_true_labels, flat_pred_labels, average='micro', zero_division=0
    )
    _, _, f1_macro, _ = precision_recall_fscore_support(
        flat_true_labels, flat_pred_labels, average='macro', zero_division=0
    )

    return {
        "confusion_matrix": cm,
        "accuracy": accuracy_score(flat_true_labels, flat_pred_labels),
        "precision": precision,
        "recall": recall,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro
    }

test_dataset = NERDataset(test_texts, test_labels, tokenizer, MAX_LEN)

predictions, labels, _ = trainer.predict(test_dataset)

test_metrics = compute_test_metrics(predictions, labels)

for metric_name, metric_value in test_metrics.items():
    print(f"{metric_name}: {metric_value}")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


confusion_matrix: [[45811    64     5     7     4     0     0     0     0     0     0     1
      0     0     0     8     0     9     3]
 [  262     5     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [   10     1     1     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [   45     1     0     4     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [   25     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    8     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    7     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    1     0     0     0     

In [None]:
import evaluate

# Load evaluation metric
metric = evaluate.load("seqeval")

# Compute metrics for test dataset
def compute_test_metrics(predictions, labels):
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    pred_labels = []

    for i in range(len(labels)):
        true_labels.append(labels[i])
        pred_labels.append(predictions[i])

    # Remove ignored index (-100)
    true_labels = [[label for label in sublist if label != -100] for sublist in true_labels]
    pred_labels = [[pred for pred, label in zip(pred_sublist, label_sublist) if label != -100]
                   for pred_sublist, label_sublist in zip(pred_labels, true_labels)]

    # Flatten the lists for confusion matrix
    flat_true_labels = [item for sublist in true_labels for item in sublist]
    flat_pred_labels = [item for sublist in pred_labels for item in sublist]

    # Calculate confusion matrix
    cm = confusion_matrix(flat_true_labels, flat_pred_labels, labels=list(label_map.values()))

    results = metric.compute(predictions=[[id2label[p] for p in sublist] for sublist in pred_labels],
                             references=[[id2label[l] for l in sublist] for sublist in true_labels])

    return {
        "confusion_matrix": cm,
        "accuracy": accuracy_score(flat_true_labels, flat_pred_labels),
        "precision": precision_recall_fscore_support(flat_true_labels, flat_pred_labels, average='weighted')[0],
        "recall": precision_recall_fscore_support(flat_true_labels, flat_pred_labels, average='weighted')[1],
        "f1_micro": precision_recall_fscore_support(flat_true_labels, flat_pred_labels, average='micro')[2],
        "f1_macro": precision_recall_fscore_support(flat_true_labels, flat_pred_labels, average='macro')[2]
    }

# Test dataset preparation
test_dataset = NERDataset(test_texts, test_labels, tokenizer, MAX_LEN)

# Predict on test dataset
predictions, labels, _ = trainer.predict(test_dataset)

# Compute metrics
test_metrics = compute_test_metrics(predictions, labels)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r model/ drive/MyDrive