In [None]:
!pip install datasets evaluate seqeval[gpu]

In [None]:
!pip install -U accelerate
!pip install -U transformers seqeval[gpu]

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [None]:
from pathlib import Path
import re

def read_data(file_path, doc_separator=r'\n', encoding='utf-8'):
    """Load a tab-separated token/tag file into parallel token and tag lists.

    Every line of at least three characters must contain exactly three
    tab-separated fields: the token, its tag, and a third column that is
    parsed but not returned. Shorter lines are skipped.

    Args:
        file_path: Path (str or Path) of the TSV file to read.
        doc_separator: Regular expression used to split the file into
            documents. The default (a single newline) keeps the historical
            behaviour where every line becomes its own one-token document.
            Pass e.g. r'\n\s*\n' to group lines separated by blank lines
            into multi-token documents.
        encoding: Text encoding of the file. Given explicitly so non-ASCII
            tokens are decoded the same way on every platform.

    Returns:
        A ``(token_docs, tag_docs)`` tuple of equally shaped lists of lists.
        Documents that end up with no tokens (blank lines) are dropped.

    Raises:
        ValueError: If a non-skipped line does not have exactly three fields.
    """
    file_path = Path(file_path)

    raw_text = file_path.read_text(encoding=encoding).strip()
    raw_docs = re.split(doc_separator, raw_text)

    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            # Blank or near-empty lines carry no token/tag pair.
            if len(line) < 3:
                continue
            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    f"Expected 3 tab-separated fields (token, tag, sentence) "
                    f"but found {len(fields)} in line {line!r}"
                )
            token, tag, _sentence = fields
            tokens.append(token)
            tags.append(tag)
        # An empty document would only become a padding-only training example.
        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

# Load the training file: `texts` holds the token lists, `tags` the matching tag lists
texts, tags = read_data('./data/train.tsv')

In [None]:
from torch import cuda
# Use the GPU when one is visible to torch, otherwise fall back to the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for validation; the fixed seed keeps the split repeatable
train_text, val_text, train_tags, val_tags = train_test_split(
    texts,
    tags,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Map every tag to an integer label id (and back).
# The tags are sorted so the mapping is identical on every run: iterating a
# set of strings directly yields an order that changes between interpreter
# sessions, which would silently change what each label id means.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [None]:
# Tag names ordered by label id, so label_list[i] is the tag whose id is i
label_list = [id2tag[label_id] for label_id in range(len(id2tag))]
label_list

In [None]:
from transformers import BertTokenizerFast

# Load the tokenizer of the same checkpoint that is fine-tuned later in this
# notebook ('bert-base-multilingual-cased'). The uncased checkpoint has a
# different vocabulary, so its token ids would not match the model's embeddings.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Tokenize the pre-split words of the training and validation documents.
# Offsets are requested because the label-alignment step reads them.
train_encodings = tokenizer(
    train_text,
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)
val_encodings = tokenizer(
    val_text,
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)

In [None]:
import numpy as np

def encode_tags(tags, encodings):
    """Align word-level tags with the sub-word tokens of a tokenized batch.

    The first sub-token of every word receives that word's label id (looked
    up in the module-level ``tag2id``). Special tokens, padding and the
    remaining sub-tokens of a word receive -100 so the loss ignores them.

    Alignment uses ``encodings.word_ids`` rather than character offsets, so
    words removed by truncation, or words that produce no sub-tokens, are
    skipped instead of causing a shape-mismatch error.

    Args:
        tags: List of documents, each a list of tag strings (one per word).
        encodings: Batch encoding from a fast tokenizer called with
            ``is_split_into_words=True``; must provide
            ``word_ids(batch_index=...)``.

    Returns:
        List of label-id lists, one per document, each as long as that
        document's token sequence.

    Raises:
        KeyError: If a tag that is actually used is missing from ``tag2id``.
    """
    ignore_index = -100
    encoded_labels = []
    for doc_index, doc_tags in enumerate(tags):
        doc_enc_labels = []
        previous_word_id = None
        for word_id in encodings.word_ids(batch_index=doc_index):
            if word_id is None or word_id == previous_word_id:
                # Special/padding token, or a continuation piece of the same word.
                doc_enc_labels.append(ignore_index)
            else:
                doc_enc_labels.append(tag2id[doc_tags[word_id]])
            previous_word_id = word_id
        encoded_labels.append(doc_enc_labels)

    return encoded_labels

# Encode the training and validation tags into per-token label ids aligned with the encodings
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [None]:
import torch

class MEDOCCANDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with per-token label ids.

    Args:
        encodings: Mapping from feature name to a per-example sequence.
        labels: Sequence of label-id lists, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert each feature of the requested example to a tensor,
        # then attach its labels under the 'labels' key.
        example = {}
        for feature_name, feature_values in self.encodings.items():
            example[feature_name] = torch.tensor(feature_values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# The offset mapping must not be passed to the model. Popping with a default
# keeps this cell safe to re-run after the key has already been removed.
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)
train_dataset = MEDOCCANDataset(train_encodings, train_labels)
val_dataset = MEDOCCANDataset(val_encodings, val_labels)

In [None]:
from transformers import BertForTokenClassification

# Load the pretrained checkpoint with a token-classification head and move it to the selected device.
# The id<->tag mappings are stored in the model config so the saved model
# reports real tag names instead of generic LABEL_n placeholders.
model = BertForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)
model = model.to(device)

In [None]:
import evaluate

# Load the "seqeval" metric through the `evaluate` library; compute_metrics calls it
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute seqeval precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (the class axis is axis 2); ``labels`` holds the
            gold label ids, with -100 marking positions to ignore.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from seqeval's overall scores.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {name: results[source] for name, source in reported}

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.model_selection import KFold
from transformers import Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup

# Build the held-out test dataset the same way as the train/validation datasets.
# NOTE(review): the training file is read from './data/train.tsv' but this path
# has no './data/' prefix - confirm the test file really lives next to the notebook.
test_texts, test_tags = read_data('test.tsv')
test_encodings = tokenizer(
    test_texts,
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)
test_labels = encode_tags(test_tags, test_encodings)
# Drop the offsets before building the dataset, exactly as is done for the
# train/validation encodings: they are not a model input.
test_encodings.pop("offset_mapping", None)
test_dataset = MEDOCCANDataset(test_encodings, test_labels)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and deprecated the old name. Pick whichever keyword the installed version
# declares so this cell works on both sides of the rename.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Training configuration
training_args = TrainingArguments(
    output_dir='./results',               # Directory for checkpoints and results
    num_train_epochs=5,                   # Total number of training epochs
    per_device_train_batch_size=12,       # Batch size per device
    logging_dir='./logs',                 # Directory for storing logs
    save_strategy="epoch",                # Save a checkpoint at the end of each epoch
    logging_steps=100,                    # Log metrics every 100 steps
    learning_rate=3e-5,                   # Learning rate
    gradient_accumulation_steps=1,        # Batches accumulated per optimizer update
    weight_decay=0.0,                     # Weight decay
    adam_beta1=0.9,                       # AdamW beta1
    adam_beta2=0.999,                     # AdamW beta2
    adam_epsilon=1e-8,                    # AdamW epsilon
    max_grad_norm=1.0,                    # Gradient clipping threshold
    warmup_steps=500,                     # Number of warmup steps for the scheduler
    load_best_model_at_end=True,          # Load the best model when training ends
    metric_for_best_model='eval_loss',    # Metric used to pick the best model
    greater_is_better=False,              # Lower eval_loss is better
    **{eval_strategy_key: "epoch"},       # Evaluate at the end of each epoch
)

import math

# AdamW optimizer configured from training_args. torch's implementation is
# used (transformers.AdamW is deprecated); weight_decay is passed explicitly
# because torch's default is not 0.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
    weight_decay=training_args.weight_decay,
)

# Total number of optimizer updates over the whole run.
# The count must come from the dataset: len(train_encodings) is the number of
# keys in the encoding, not the number of examples, which made the schedule
# length 0 and drove the learning rate to zero right after warmup.
batches_per_epoch = math.ceil(len(train_dataset) / training_args.train_batch_size)
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_training_steps = math.ceil(updates_per_epoch * training_args.num_train_epochs)

# Linear warmup followed by linear decay to zero at num_training_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps,
)

# Wire the model, datasets, metric function and the optimizer/scheduler pair into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

# Fine-tune the model on the training split
trainer.train()

# Score the validation split and show the resulting metrics
evaluation_result = trainer.evaluate()
print("Evaluation result:", evaluation_result, sep="\n")

# Run the trained model on the test split, then save the model locally
test_predictions = trainer.predict(test_dataset)
model.save_pretrained("./model/")


In [None]:
!git lfs install --system --skip-repo

In [None]:
# Save the tokenizer files locally under ./tokenizer
tokenizer.save_pretrained('./tokenizer')

In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login
# Prompt for Hugging Face Hub credentials; the push_to_hub cells that follow rely on this login
notebook_login()

In [None]:
# Upload the model to the Hub repository 'sravn/NER-BERT-MEDOCCAN'
model.push_to_hub('sravn/NER-BERT-MEDOCCAN')

In [None]:
# Upload the tokenizer to the same Hub repository as the model
tokenizer.push_to_hub('sravn/NER-BERT-MEDOCCAN')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (only available inside Google Colab)
drive.mount('/content/drive')

In [None]:
# Save the model and tokenizer to Google Drive; the paths assume Drive is mounted at /content/drive
model.save_pretrained("/content/drive/My Drive/NER_BERT/model/")
tokenizer.save_pretrained("/content/drive/My Drive/NER_BERT/tokenizer/")