In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import transformers
#from transformers import AutoTokenizer
from transformers import DistilBertForTokenClassification
from torch.optim import AdamW
import torch
import torch.nn as nn
from torch.optim import SGD
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
import re
from torchcrf import CRF

from transformers import AutoTokenizer, AutoModelForMaskedLM

#import part

using_biobert = False #biobert option
using_bilstmcrf = True #biLSTM + CRF option

In [2]:
N = 43821 #columns of dataset
df = pd.read_csv("NER_dataset.csv", encoding="cp949").sample(frac=1)[:N] #load dataset

#change field name
df.rename(columns = {'text':'sentence', 'labels':'tags'}, inplace = True)

#split train, dev, test data (dev data is used for cross validation)
df_train, df_dev, df_test = np.split(df.sample(frac=1, random_state=0), [int(.8 * len(df)), int(.9 * len(df))])

In [3]:
df.head()
#show head part of dataset

Unnamed: 0,sentence,tags
40113,"AZEL HIBBSKAYS , M.D. CF66",O O O O O
6458,2) Azotemia presumed secondary to sepsis and d...,O B-PR O O O B-PR O B-PR O B-TE O O O O B-TR I...
22986,"White count was 14.4 , hematocrit was 36.2 , p...",B-TE I-TE O O O B-TE O O O B-TE I-TE O O O B-T...
25131,There was artificial rupture of membranes was ...,O O B-TR I-TR I-TR I-TR O O O B-TR I-TR O
35653,( End of Report ),O O O O O


In [4]:
df.tail()
#show tail part of dataset

Unnamed: 0,sentence,tags
33065,2013-05-24 07:40 AM 628 *,O O O O O
7792,Angioectasias in the fundus,B-PR I-PR I-PR I-PR
17931,BP 157/85,B-TE O
24,At 9am the morning of admission he passed a la...,O O O O O O O O B-PR I-PR I-PR I-PR I-PR I-PR ...
33842,Heart Failure Service was involved .,O O O O O O


In [5]:
# tansfrom label to list (delimiter is " ")
labels = [i.split() for i in df['tags'].values.tolist()]

# count the number of label
unique_labels = set()

for lb in labels:
    [unique_labels.add(i) for i in lb if i not in unique_labels]
 
print(unique_labels)
# {'B-TE', 'I-TR', 'I-PR', 'I-TE', 'O', 'B-TR', 'B-PR'}

# mappingg unique_lables to their id
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(labels_to_ids)
# {'B-PR': 0, 'B-TE': 1, 'B-TR': 2, 'I-PR': 3, 'I-TE': 4, 'I-TR': 5, 'O': 6}

{'I-TR', 'I-TE', 'O', 'I-PR', 'B-TE', 'B-PR', 'B-TR'}
{'B-PR': 0, 'B-TE': 1, 'B-TR': 2, 'I-PR': 3, 'I-TE': 4, 'I-TR': 5, 'O': 6}


In [6]:
#main class for doing NER with Distilbert

class DistilbertNER(nn.Module):
  
    def __init__(self, tokens_dim): #constructor 
        
        super(DistilbertNER,self).__init__()
        #constructor of parent
    
        if type(tokens_dim) != int:
            raise TypeError('tokens_dim should be an integer')
        if tokens_dim <= 0:
            raise ValueError('Classification layer dimension should be at least 1')
        #exception handling part
            
        if using_biobert == False :        
            self.pretrained = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels = tokens_dim) #for using distilBERT model
        else :
            self.pretrained = AutoModelForMaskedLM.from_pretrained("dmis-lab/biobert-base-cased-v1.2", num_labels = tokens_dim) #for using bioBERT model
       
        if using_bilstmcrf == True :
            hidden_size=768
            num_classes=7 #7 labels
            dr_rate=0.3
    
            self.dropout = nn.Dropout(dr_rate)
            self.bilstm = nn.LSTM(hidden_size, (hidden_size) // 2, dropout=dr_rate, batch_first=True, bidirectional=True)
            self.position_wise_ff = nn.Linear(hidden_size, num_classes)
            self.crf = CRF(num_tags=num_classes, batch_first = True)
    
            #set the output of each token classifier = unique_lables

    def forward(self, input_ids, attention_mask, labels = None): #define the way for getting ouput by given input
        if labels == None:
            out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask )
            return out #return without label

        else: #labels != None
            out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask , labels = labels)
            return out #return with label

In [7]:
#generalized function for get information from dataset

class NerDataset(torch.utils.data.Dataset):
  
  def __init__(self, df):
    if not isinstance(df, pd.DataFrame):
      raise TypeError('Input should be a dataframe')
    
    if "tags" not in df.columns or "sentence" not in df.columns:
      raise ValueError("Dataframe should contain 'tags' and 'sentence' columns")
    
    tags_list = [i.split() for i in df["tags"].values.tolist()]
    texts = df["sentence"].values.tolist()
    
    #for change float(nan) -> string("nan")
    i = 0
    for string in texts:
        i += 1
        if(isinstance(string, float)):
            texts[i - 1] = "nan"

    self.texts = [tokenizer(text, padding = "max_length", truncation = True, return_tensors = "pt") for text in texts]
    self.labels = [match_tokens_labels(text, tags) for text,tags in zip(self.texts, tags_list)]

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    batch_text = self.texts[idx]
    batch_labels = self.labels[idx]

    return batch_text, torch.LongTensor(batch_labels)

In [8]:
class MetricsTracking():
  def __init__(self):

    self.total_acc = 0
    self.total_f1 = 0
    self.total_precision = 0
    self.total_recall = 0

  def update(self, predictions, labels , ignore_token = -100):  
    predictions = predictions.flatten()
    labels = labels.flatten()
    
    predictions = predictions[labels != ignore_token]
    labels = labels[labels != ignore_token]

    predictions = predictions.to("cpu")
    labels = labels.to("cpu")

    acc = accuracy_score(labels,predictions)
    f1 = f1_score(labels, predictions, average = "macro")
    precision = precision_score(labels, predictions, average = "macro")
    recall = recall_score(labels, predictions, average = "macro")

    self.total_acc  += acc
    self.total_f1 += f1
    self.total_precision += precision
    self.total_recall  += recall

  def return_avg_metrics(self,data_loader_size):
    n = data_loader_size
    metrics = {
        "acc": round(self.total_acc / n ,3), 
        "f1": round(self.total_f1 / n, 3), 
        "precision" : round(self.total_precision / n, 3), 
        "recall": round(self.total_recall / n, 3)
          }
    return metrics

In [9]:
#create label

def tags_2_labels(tags : str, tag2idx : dict):
  return [tag2idx[tag] if tag in tag2idx else unseen_label for tag in tags.split()] 

In [10]:
#map words to tag

def tags_mapping(tags_series : pd.Series):
  if not isinstance(tags_series, pd.Series):
      raise TypeError('Input should be a padas Series')

  unique_tags = set()
  
  for tag_list in df_train["tags"]:
    for tag in tag_list.split():
      unique_tags.add(tag)

  tag2idx = {k:v for v,k in enumerate(sorted(unique_tags))}
  idx2tag = {k:v for v,k in tag2idx.items()}

  unseen_label = tag2idx["O"]

  return tag2idx, idx2tag, unseen_label, unique_tags

In [11]:
#-100 means CLS or PAD

def match_tokens_labels(tokenized_input, tags, ignore_token = -100):
        word_ids = tokenized_input.word_ids()

        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:

            if word_idx is None:
                label_ids.append(ignore_token)

            else :
                try:
                  reference_tag = tags[word_idx]
                  label_ids.append(tag2idx[reference_tag])
                except:
                  label_ids.append(ignore_token)
            
            previous_word_idx = word_idx

        return label_ids

In [12]:
#train & evaluation function

def train_loop(model, train_dataset, dev_dataset, optimizer,  batch_size, epochs):
  
  train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
  dev_dataloader = DataLoader(dev_dataset, batch_size = batch_size, shuffle = True)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  for epoch in range(epochs) : 
    
    train_metrics = MetricsTracking()
    total_loss_train = 0

    model.train() #core function for train

    for train_data, train_label in tqdm(train_dataloader):

      train_label = train_label.to(device)

      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad()
      
      output = model(input_id, mask, train_label)
      loss, logits = output.loss, output.logits
      predictions = logits.argmax(dim= -1) 

      train_metrics.update(predictions, train_label)
      total_loss_train += loss.item()

      loss.backward()
      optimizer.step()
    
    model.eval() #core function for evaluation

    dev_metrics = MetricsTracking()
    total_loss_dev = 0
    
    with torch.no_grad():
      for dev_data, dev_label in dev_dataloader:

        dev_label = dev_label.to(device)

        mask = dev_data['attention_mask'].squeeze(1).to(device)
        input_id = dev_data['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask, dev_label)
        loss, logits = output.loss, output.logits

        predictions = logits.argmax(dim= -1)     

        dev_metrics.update(predictions, dev_label)
        total_loss_dev += loss.item()
    
    train_results = train_metrics.return_avg_metrics(len(train_dataloader))
    dev_results = dev_metrics.return_avg_metrics(len(dev_dataloader))

    print(f"TRAIN \nLoss: {total_loss_train / len(train_dataset)} \nMetrics {train_results}\n" ) 
    print(f"VALIDATION \nLoss {total_loss_dev / len(dev_dataset)} \nMetrics{dev_results}\n" )   

In [13]:
#create mapp between label and tag
tag2idx, idx2tag , unseen_label, unique_tags = tags_mapping(df_train["tags"])

#change label to tag (surplus label will be changed to "O" tag)
for df in [df_train, df_dev, df_test]:
  df["labels"] = df["tags"].apply(lambda tags : tags_2_labels(tags, tag2idx))

In [14]:
if using_biobert == False :
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") #needed to useing distilBERT model
else :
    tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2", model_max_length = 512) #needed to useing bioBERT model

In [15]:
text = df_train["sentence"].values.tolist()

#for change float(nan) -> string("nan")

i = 0
for string in text:
    i += 1
    if(isinstance(string, float)):
        text[i - 1] = "nan"

text_tokenized = tokenizer(text, padding = "max_length", truncation = True, return_tensors = "pt")
#text_tokenized = text_tokenized.remove_columns(books_dataset["train"].column_names) #needed to useing bioBERT tokenizer

#map tokens to corresponding words
word_ids = text_tokenized.word_ids()


In [16]:
model = DistilbertNER(len(unique_tags))
learn = False

#determine whether new train & learn is needed or not

try :
    model = torch.load("NER_model", map_location=torch.device('cpu'))
except FileNotFoundError as e : 
    print("MODEL is not exist so new MODEL will be created")
    learn = True

model.eval()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

MODEL is not exist so new MODEL will be created


DistilbertNER(
  (pretrained): DistilBertForTokenClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn):

In [17]:
#set the hyperparameters

train_dataset = NerDataset(df_train)
dev_dataset = NerDataset(df_dev)

lr = 1e-2
optimizer = SGD(model.parameters(), lr=lr, momentum = 0.9)  

#MAIN

if using_biobert == False :
    parameters = {
        "model": model,
        "train_dataset": train_dataset,
        "dev_dataset" : dev_dataset,
        "optimizer" : optimizer,
        "batch_size" : 16,
        "epochs" : 5
    }
else :
    parameters = {
        "model": model,
        "train_dataset": train_dataset,
        "dev_dataset" : dev_dataset,
        "optimizer" : optimizer,
        "batch_size" : 8, #to avoid out of memory error
        "epochs" : 5
    }

if learn == True: #do train & test if NER_model not exist
    train_loop(**parameters)

100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:32<00:00,  4.84it/s]


TRAIN 
Loss: 0.029421302068907137 
Metrics {'acc': 0.839, 'f1': 0.609, 'precision': 0.689, 'recall': 0.604}

VALIDATION 
Loss 0.02195437626920841 
Metrics{'acc': 0.877, 'f1': 0.704, 'precision': 0.785, 'recall': 0.686}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:37<00:00,  4.79it/s]


TRAIN 
Loss: 0.019063821510254098 
Metrics {'acc': 0.892, 'f1': 0.747, 'precision': 0.811, 'recall': 0.736}

VALIDATION 
Loss 0.017555131884757217 
Metrics{'acc': 0.9, 'f1': 0.768, 'precision': 0.813, 'recall': 0.771}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:37<00:00,  4.79it/s]


TRAIN 
Loss: 0.014570196061179387 
Metrics {'acc': 0.916, 'f1': 0.808, 'precision': 0.856, 'recall': 0.8}

VALIDATION 
Loss 0.016656909990543096 
Metrics{'acc': 0.908, 'f1': 0.79, 'precision': 0.829, 'recall': 0.791}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:35<00:00,  4.81it/s]


TRAIN 
Loss: 0.01214027072993519 
Metrics {'acc': 0.929, 'f1': 0.838, 'precision': 0.88, 'recall': 0.831}

VALIDATION 
Loss 0.01603966121149935 
Metrics{'acc': 0.916, 'f1': 0.809, 'precision': 0.838, 'recall': 0.818}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:34<00:00,  4.82it/s]


TRAIN 
Loss: 0.010589892369773538 
Metrics {'acc': 0.937, 'f1': 0.856, 'precision': 0.89, 'recall': 0.85}

VALIDATION 
Loss 0.017373026489135215 
Metrics{'acc': 0.911, 'f1': 0.794, 'precision': 0.849, 'recall': 0.784}



In [18]:
if learn == True: #do export the result of train if NER_model not exist
    torch.save(model, "NER_model")