## Training Model

In [68]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("Clinical-AI-Apollo/Medical-NER")
model = AutoModelForTokenClassification.from_pretrained("Clinical-AI-Apollo/Medical-NER")
print(model.config.id2label)


loading file spm.model from cache at /home/sammartj/.cache/huggingface/hub/models--Clinical-AI-Apollo--Medical-NER/snapshots/a9d5061193e969de80b24225f926cb224caac1ce/spm.model
loading file tokenizer.json from cache at /home/sammartj/.cache/huggingface/hub/models--Clinical-AI-Apollo--Medical-NER/snapshots/a9d5061193e969de80b24225f926cb224caac1ce/tokenizer.json
loading file added_tokens.json from cache at /home/sammartj/.cache/huggingface/hub/models--Clinical-AI-Apollo--Medical-NER/snapshots/a9d5061193e969de80b24225f926cb224caac1ce/added_tokens.json
loading file special_tokens_map.json from cache at /home/sammartj/.cache/huggingface/hub/models--Clinical-AI-Apollo--Medical-NER/snapshots/a9d5061193e969de80b24225f926cb224caac1ce/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/sammartj/.cache/huggingface/hub/models--Clinical-AI-Apollo--Medical-NER/snapshots/a9d5061193e969de80b24225f926cb224caac1ce/tokenizer_config.json
loading configuration file config.json fro

{0: 'O', 1: 'B-ACTIVITY', 2: 'I-ACTIVITY', 3: 'I-ADMINISTRATION', 4: 'B-ADMINISTRATION', 5: 'B-AGE', 6: 'I-AGE', 7: 'I-AREA', 8: 'B-AREA', 9: 'B-BIOLOGICAL_ATTRIBUTE', 10: 'I-BIOLOGICAL_ATTRIBUTE', 11: 'I-BIOLOGICAL_STRUCTURE', 12: 'B-BIOLOGICAL_STRUCTURE', 13: 'B-CLINICAL_EVENT', 14: 'I-CLINICAL_EVENT', 15: 'B-COLOR', 16: 'I-COLOR', 17: 'I-COREFERENCE', 18: 'B-COREFERENCE', 19: 'B-DATE', 20: 'I-DATE', 21: 'I-DETAILED_DESCRIPTION', 22: 'B-DETAILED_DESCRIPTION', 23: 'I-DIAGNOSTIC_PROCEDURE', 24: 'B-DIAGNOSTIC_PROCEDURE', 25: 'I-DISEASE_DISORDER', 26: 'B-DISEASE_DISORDER', 27: 'B-DISTANCE', 28: 'I-DISTANCE', 29: 'B-DOSAGE', 30: 'I-DOSAGE', 31: 'I-DURATION', 32: 'B-DURATION', 33: 'I-FAMILY_HISTORY', 34: 'B-FAMILY_HISTORY', 35: 'B-FREQUENCY', 36: 'I-FREQUENCY', 37: 'I-HEIGHT', 38: 'B-HEIGHT', 39: 'B-HISTORY', 40: 'I-HISTORY', 41: 'I-LAB_VALUE', 42: 'B-LAB_VALUE', 43: 'I-MASS', 44: 'B-MASS', 45: 'I-MEDICATION', 46: 'B-MEDICATION', 47: 'I-NONBIOLOGICAL_LOCATION', 48: 'B-NONBIOLOGICAL_LOCATIO

In [87]:
new_tags = {
    83 : "B-PROTEIN",
    84 : "I-PROTEIN",
    85 : "B-GENE",
    86 : "I-GENE"
}
existing_labels = model.config.id2label

updated_labels = {**existing_labels, **new_tags}

# Update model's config
model.config.id2label = updated_labels
model.config.label2id = {v: k for k, v in updated_labels.items()}

In [88]:
import torch.nn as nn

num_labels = 87  # 83 original + 4 new entities
model.classifier = nn.Linear(in_features=model.config.hidden_size, out_features=num_labels)

# Update model config to reflect new number of labels
model.config.num_labels = num_labels

In [89]:
model.save_pretrained("updated_model")

Configuration saved in updated_model/config.json
Model weights saved in updated_model/pytorch_model.bin


In [90]:
model = AutoModelForTokenClassification.from_pretrained("updated_model")

loading configuration file updated_model/config.json
Model config DebertaV2Config {
  "_name_or_path": "updated_model",
  "architectures": [
    "DebertaV2ForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-ACTIVITY",
    "2": "I-ACTIVITY",
    "3": "I-ADMINISTRATION",
    "4": "B-ADMINISTRATION",
    "5": "B-AGE",
    "6": "I-AGE",
    "7": "I-AREA",
    "8": "B-AREA",
    "9": "B-BIOLOGICAL_ATTRIBUTE",
    "10": "I-BIOLOGICAL_ATTRIBUTE",
    "11": "I-BIOLOGICAL_STRUCTURE",
    "12": "B-BIOLOGICAL_STRUCTURE",
    "13": "B-CLINICAL_EVENT",
    "14": "I-CLINICAL_EVENT",
    "15": "B-COLOR",
    "16": "I-COLOR",
    "17": "I-COREFERENCE",
    "18": "B-COREFERENCE",
    "19": "B-DATE",
    "20": "I-DATE",
    "21": "I-DETAILED_DESCRIPTION",
    "22": "B-DETAILED_DESCRIPTION",
    "23": "I-DIAGNOSTIC_PROCEDURE",
    "24": "B-DIAGNOSTIC_PROCEDURE",
    "25": 

In [91]:
nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

In [6]:
from datasets import load_dataset
dataset = load_dataset("../downloaded_txt_files/testing", data_files={"train": "training.json", "test": "testing.json"})


Using custom data configuration testing-7fba87adafe2b0c7
Found cached dataset json (/home/sammartj/.cache/huggingface/datasets/json/testing-7fba87adafe2b0c7/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, padding='max_length', max_length = 128, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:  
                label_ids.append(-100)  # Ignore special tokens
            elif word_id >= len(label):  # Ensure word_id is within label range
                label_ids.append(-100)  
            elif word_id != word_ids[word_id - 1]:  # Assign label only to the first subword
                label_ids.append(label[word_id])
            else:
                label_ids.append(-100)  # Assign -100 to subword tokens
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels  # Add labels to the dataset
    return tokenized_inputs
   
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [94]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import torch
# Define evaluation metric
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_predictions = [[model.config.id2label[p] for p, l in zip(pred, label) if l != -100] 
                        for pred, label in zip(predictions, labels)]
    true_labels = [[model.config.id2label[l] for p, l in zip(pred, label) if l != -100] 
                   for pred, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./ner_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [95]:
def convert_labels_to_tensor(example):
    example["labels"] = torch.tensor(example["labels"], dtype=torch.long)
    return example

tokenized_datasets = tokenized_datasets.map(convert_labels_to_tensor)

  0%|          | 0/105 [00:00<?, ?ex/s]

  0%|          | 0/208 [00:00<?, ?ex/s]

In [96]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


The following columns in the training set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 105
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 70
  Number of trainable parameters = 183898455


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.6984,0.827919,0.566061,0.652235,0.6061,0.843764
2,0.9663,0.459125,0.675971,0.777933,0.723377,0.886988
3,0.2474,0.376611,0.816976,0.860335,0.838095,0.911301
4,0.1502,0.369659,0.822222,0.878492,0.849426,0.906799
5,0.1056,0.367208,0.836436,0.878492,0.856948,0.911751


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 208
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./ner_results/checkpoint-14
Configuration saved in ./ner_results/checkpoint-14/config.json
Model weights saved in ./ner_results/checkpoint-14/pytorch_model.bin
tokenizer config file saved in ./ner_results/checkpoint-14/tokenizer_config.json
Special tokens file saved in ./ner_results/checkpoint-14/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `DebertaV2ForTokenClassification.fo

TrainOutput(global_step=70, training_loss=0.6776569928441729, metrics={'train_runtime': 1187.1766, 'train_samples_per_second': 0.442, 'train_steps_per_second': 0.059, 'total_flos': 34322173804800.0, 'train_loss': 0.6776569928441729, 'epoch': 5.0})

In [97]:
model.save_pretrained("finetuned-model")

Configuration saved in finetuned-model/config.json
Model weights saved in finetuned-model/pytorch_model.bin


## Loading Model

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

ner_model =  AutoModelForTokenClassification.from_pretrained("finetuned-model")
tokenizer = AutoTokenizer.from_pretrained("Clinical-AI-Apollo/Medical-NER")

2025-04-01 18:45:39.255788: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-01 18:45:39.809733: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-01 18:45:46.479389: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-04-01 18:45:46.479649: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [14]:
from pathlib import Path

directory = Path('../downloaded_txt_files/target')

In [5]:
def chunk_text(text, tokenizer, chunk_size=100, stride=20):
    tokens = tokenizer(text, add_special_tokens=False)["input_ids"]
    chunks = []
    
    for i in range(0, len(tokens), chunk_size - stride):
        chunk = tokens[i : i + chunk_size]
        chunks.append(tokenizer.decode(chunk))  # Convert tokens back to text
    
    return chunks

In [None]:
import sys
import re
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import torch
import pandas as pd

df = pd.DataFrame(columns=['pmcid', 'entity', 'score', 'index', 'word', 'start', 'end'])
counter = 0 
for file in directory.iterdir():
    if file.suffix == '.txt':
        with open(file, 'r') as f:
            content = f.read()
        
        chunks = chunk_text(content, tokenizer, chunk_size=100, stride=20)

        # Process each chunk with NER
        results = []
        for chunk in chunks:
            results.extend(ner_pipeline(chunk))
            
        specific = []
        for obj in results:
            if obj['score'] > 0.5:
                specific.append(obj)
                  
        for obj in specific:
            if obj['word'] not in df['word'].unique():
                   df.loc[len(df.index)] = [file.name, obj['entity'], obj['score'], obj['index'], obj['word'], obj['start'], obj['end']]
        counter+=1
        if counter%100 == 0:
            print(counter)
                  
df.to_csv('../ner_results.csv')
print('all done !')



100
