In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
pip install seqeval datasets

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=9c9300244e3d0052b415c188ee438dece6fd8fa7f7b762e38dca18a94becf626
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
from datasets import Dataset
import re
from datasets import load_dataset
import numpy as np
import datasets
import torch

Load the pubmed dataset. the below code will convert the dataset into tokens and tags format.

In [5]:
data = []
file_path = '/kaggle/input/pubmed/output-combined3.txt'
count = 0
with open(file_path, 'r') as file:
    for line in file:
        data.append(eval(line))

The pubmed dataset extracted and it is in format suitable for spacy model. this format is changed into word tokens and each word is annotated as whether the word is disease or not. 

In [6]:
def convert_to_dataset(data):
    ncbi_data = []
    for text, annotations in data:
        tokens = re.findall(r'\S+', text)  # Tokenize the text using whitespace as delimiter
        ner_tags = [0] * len(tokens)  # Initialize with "O" (outside) tags
        start, end, label = annotations['entities'][0]  # Assuming there's only one entity
        char_pos = 0
        for i, token in enumerate(tokens):
            if char_pos == start:
                ner_tags[i] = 1 # "B-Disease" for beginning of the entity
            elif char_pos > start and char_pos < end:
                ner_tags[i] = 1  # "I-Disease" for inside of the entity
            char_pos += len(tokens[i]) + 1  # Update character position for next token
        example = {
            "id": str(len(ncbi_data)),  # Generate a unique ID for each example
            "tokens": tokens,
            "ner_tags": ner_tags
        }
        ncbi_data.append(example)
    return Dataset.from_dict({"id": [example["id"] for example in ncbi_data],
                              "tokens": [example["tokens"] for example in ncbi_data],
                              "ner_tags": [example["ner_tags"] for example in ncbi_data]})

pubmed_dataset = convert_to_dataset(data)

due to computing resource contraint, we shuffled the entire dataset and selected first 100000 samples as train dataset and then 50000 as test dataset. base on this BERT model is fine tuned.

In [7]:
pubmed_dataset = pubmed_dataset.shuffle()
pubmed_dataset[10000]

{'id': '88898',
 'tokens': ['Probiotics',
  'may',
  'make',
  'little',
  'or',
  'no',
  'difference',
  'in',
  'QoL',
  'for',
  'people',
  'with',
  'eczema',
  'nor',
  'in',
  'investigator-rated',
  'eczema',
  'severity',
  'score',
  '(combined',
  'with',
  'participant',
  'scoring',
  'for',
  'eczema',
  'symptoms',
  'of',
  'itch',
  'and',
  'sleep',
  'loss);',
  'for',
  'the',
  'latter,',
  'the',
  'observed',
  'effect',
  'was',
  'small',
  'and',
  'of',
  'uncertain',
  'clinical',
  'significance.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [8]:
from transformers import BertTokenizerFast, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(examples, label_all_tokens=True): 
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) 
    labels = [] 
    for i, label in enumerate(examples["ner_tags"]): 
        word_ids = tokenized_inputs.word_ids(batch_index=i) 
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token. 
        previous_word_idx = None 
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None 
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids: 
            if word_idx is None: 
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token                 
                label_ids.append(label[word_idx]) 
            else: 
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100) 
                # mask the subword representations after the first subword
                 
            previous_word_idx = word_idx 
        labels.append(label_ids) 
    tokenized_inputs["labels"] = labels 
    return tokenized_inputs 
train_data = Dataset.from_dict(pubmed_dataset[:100000])
validation_data = Dataset.from_dict(pubmed_dataset[100000:150000])
tokenized_datasets = train_data.map(tokenize_and_align_labels, batched=True)
validation_datasets = validation_data.map(tokenize_and_align_labels, batched=True)
tokenized_datasets[0]

2024-05-03 04:43:17.495645: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-03 04:43:17.495756: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-03 04:43:17.629927: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'id': '367466',
 'tokens': ['Four',
  'RCTs',
  'and',
  'a',
  'quasiexperimental',
  'study',
  'indicate',
  'that',
  'some',
  'interventions',
  'can',
  'enhance',
  'SSE',
  'activity',
  'and',
  'so',
  'are',
  'more',
  'likely',
  'to',
  'aid',
  'early',
  'detection',
  'of',
  'skin',
  'cancer.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1],
 'input_ids': [101,
  2176,
  22110,
  3215,
  1998,
  1037,
  17982,
  10288,
  4842,
  14428,
  15758,
  2817,
  5769,
  2008,
  2070,
  19388,
  2064,
  11598,
  7020,
  2063,
  4023,
  1998,
  2061,
  2024,
  2062,
  3497,
  2000,
  4681,
  2220,
  10788,
  1997,
  3096,
  4456,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  

The model training by default store the epoch results in wandb, so Wandb project api key is needed. a free account in weights and biases with project created in that account should generate an apikey that need to be given below, during execution.

In [9]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=7)

args = TrainingArguments( 
"test-ner",
evaluation_strategy = "epoch", 
learning_rate=2e-5, 
per_device_train_batch_size=16, 
per_device_eval_batch_size=16, 
num_train_epochs=1, 
weight_decay=0.01, 
) 

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = datasets.load_metric("seqeval") 
label_list = ['O','Skin'] 

def compute_metrics(eval_preds): 
    pred_logits, labels = eval_preds 
    
    pred_logits = np.argmax(pred_logits, axis=2) 
    predictions = [ 
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(pred_logits, labels) 
    ] 
    true_labels = [ 
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100] 
       for prediction, label in zip(pred_logits, labels) 
   ] 
    results = metric.compute(predictions=predictions, references=true_labels)

    return { 
          "precision": results["overall_precision"], 
          "recall": results["overall_recall"], 
          "f1": results["overall_f1"], 
          "accuracy": results["overall_accuracy"], 
  } 
     

    
trainer = Trainer( 
   model, 
   args, 
   train_dataset=tokenized_datasets, 
   eval_dataset=validation_datasets, 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 

trainer.train() 


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = datasets.load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0091,0.009009,0.982219,0.985097,0.983656,0.996615




TrainOutput(global_step=6250, training_loss=0.020296326637268068, metrics={'train_runtime': 1417.9167, 'train_samples_per_second': 70.526, 'train_steps_per_second': 4.408, 'total_flos': 4540905982214976.0, 'train_loss': 0.020296326637268068, 'epoch': 1.0})

In [15]:
model.save_pretrained("/kaggle/working/ner_model")

In [16]:
tokenizer.save_pretrained("/kaggle/working/tokenizer")

('/kaggle/working/tokenizer/tokenizer_config.json',
 '/kaggle/working/tokenizer/special_tokens_map.json',
 '/kaggle/working/tokenizer/vocab.txt',
 '/kaggle/working/tokenizer/added_tokens.json',
 '/kaggle/working/tokenizer/tokenizer.json')

In [17]:
!zip -r tokeniser.zip /kaggle/working/tokenizer/
!zip -r ner_model.zip /kaggle/working/ner_model/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/tokenizer/ (stored 0%)
  adding: kaggle/working/tokenizer/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/tokenizer/tokenizer_config.json (deflated 76%)
  adding: kaggle/working/tokenizer/vocab.txt (deflated 53%)
  adding: kaggle/working/tokenizer/tokenizer.json (deflated 71%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/ner_model/ (stored 0%)
  adding: kaggle/working/ner_model/config.json (deflated 54%)
  adding: kaggle/working/ner_model/model.safetensors (deflated 7%)


In [20]:

id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [21]:
import json
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("/kaggle/working/ner_model/config.json","w"))

In [22]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("/kaggle/working/ner_model", ignore_mismatched_sizes=True)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at /kaggle/working/ner_model and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:

from transformers import pipeline

nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


example = "Eczema (Atopic Dermatitis): Eczema is a common inflammatory skin condition that causes itching, redness, and rash. It often occurs in individuals with a family history of allergies or asthma and requires careful management to prevent flare-ups and maintain skin health"
ner_results = nlp(example)
ner_results

[]