In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import transformers
from transformers import AutoTokenizer
from transformers import DistilBertForTokenClassification
from torch.optim import AdamW
import torch
import torch.nn as nn
from torch.optim import SGD
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
import re

In [2]:
#N = 10000
N = 43821
df = pd.read_csv("NER_dataset.csv", encoding="cp949").sample(frac=1)[:N]

#change columns names
df.rename(columns = {'text':'sentence', 'labels':'tags'}, inplace = True)

#split train, dev , test sets
df_train, df_dev, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

In [3]:
df2 = pd.read_csv('NER_dataset.csv', encoding = "cp949")
df2.head()

Unnamed: 0,text,labels
0,Admission Date :,O O O
1,2012-10-31,O
2,Discharge Date :,O O O
3,2012-11-07,O
4,Date of Birth :,O O O O


In [4]:
df2.tail()

Unnamed: 0,text,labels
43817,05/16/2004,O
43818,TD :,O O
43819,05/16/2004 7:28 A 265076,O O O O
43820,cc :,O O
43821,"OIE LIMOR WARM , M.D. CH GILD WARM , M.D.",O O O O O O O O O O


In [5]:
# Split labels based on whitespace and turn them into a list
labels2 = [i.split() for i in df2['labels'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels2 = set()

for lb in labels2:
  [unique_labels2.add(i) for i in lb if i not in unique_labels2]
 
print(unique_labels2)
# {'B-tim', 'B-art', 'I-art', 'O', 'I-gpe', 'I-per', 'I-nat', 'I-geo', 'B-eve', 'B-org', 'B-gpe', 'I-eve', 'B-per', 'I-tim', 'B-nat', 'B-geo', 'I-org'}

# Map each label into its id representation and vice versa
labels_to_ids2 = {k: v for v, k in enumerate(sorted(unique_labels2))}
ids_to_labels2 = {v: k for v, k in enumerate(sorted(unique_labels2))}
print(labels_to_ids2)
# {'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}

{'B-PR', 'I-TE', 'I-TR', 'O', 'B-TR', 'I-PR', 'B-TE'}
{'B-PR': 0, 'B-TE': 1, 'B-TR': 2, 'I-PR': 3, 'I-TE': 4, 'I-TR': 5, 'O': 6}


In [6]:
class DistilbertNER(nn.Module):
  """
  Implement NN class based on distilbert pretrained from Hugging face.
  Inputs : 
    tokens_dim : int specifyng the dimension of the classifier
  """
  
  def __init__(self, tokens_dim):
    super(DistilbertNER,self).__init__()
    
    if type(tokens_dim) != int:
            raise TypeError('Please tokens_dim should be an integer')

    if tokens_dim <= 0:
          raise ValueError('Classification layer dimension should be at least 1')

    self.pretrained = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels = tokens_dim) #set the output of each token classifier = unique_lables

  def forward(self, input_ids, attention_mask, labels = None): #labels are needed in order to compute the loss
    """
  Forwad computation of the network
  Input:
    - inputs_ids : from model tokenizer
    - attention :  mask from model tokenizer
    - labels : if given the model is able to return the loss value
  """

    #inference time no labels
    if labels == None:
      out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask )
      return out

    out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask , labels = labels)
    return out

In [7]:
class NerDataset(torch.utils.data.Dataset):
  """
  Custom dataset implementation to get (text,labels) tuples
  Inputs:
   - df : dataframe with columns [tags, sentence]
  """
  
  def __init__(self, df):
    if not isinstance(df, pd.DataFrame):
      raise TypeError('Input should be a dataframe')
    
    if "tags" not in df.columns or "sentence" not in df.columns:
      raise ValueError("Dataframe should contain 'tags' and 'sentence' columns")
    
    tags_list = [i.split() for i in df["tags"].values.tolist()]
    texts = df["sentence"].values.tolist()
    
    i = 0
    for string in texts:
        i += 1
        if(isinstance(string, float)):
            #print("float")
            texts[i - 1] = "nan"
        #print(str(i) + " = " + str(string))

    self.texts = [tokenizer(text, padding = "max_length", truncation = True, return_tensors = "pt") for text in texts]
    self.labels = [match_tokens_labels(text, tags) for text,tags in zip(self.texts, tags_list)]

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    batch_text = self.texts[idx]
    batch_labels = self.labels[idx]

    return batch_text, torch.LongTensor(batch_labels)

In [8]:
class MetricsTracking():
  """
  In order make the train loop lighter I define this class to track all the metrics that we are going to measure for our model.
  """
  def __init__(self):

    self.total_acc = 0
    self.total_f1 = 0
    self.total_precision = 0
    self.total_recall = 0

  def update(self, predictions, labels , ignore_token = -100):
    '''
    Call this function every time you need to update your metrics.
    Where in the train there was a -100, were additional token that we dont want to label, so remove them.
    If we flatten the batch its easier to access the indexed = -100
    '''  
    predictions = predictions.flatten()
    labels = labels.flatten()
    
    predictions = predictions[labels != ignore_token]
    labels = labels[labels != ignore_token]

    predictions = predictions.to("cpu")
    labels = labels.to("cpu")

    acc = accuracy_score(labels,predictions)
    f1 = f1_score(labels, predictions, average = "macro")
    precision = precision_score(labels, predictions, average = "macro")
    recall = recall_score(labels, predictions, average = "macro")

    self.total_acc  += acc
    self.total_f1 += f1
    self.total_precision += precision
    self.total_recall  += recall

  def return_avg_metrics(self,data_loader_size):
    n = data_loader_size
    metrics = {
        "acc": round(self.total_acc / n ,3), 
        "f1": round(self.total_f1 / n, 3), 
        "precision" : round(self.total_precision / n, 3), 
        "recall": round(self.total_recall / n, 3)
          }
    return metrics

In [9]:
def tags_2_labels(tags : str, tag2idx : dict):
  '''
  Method that takes a list of tags and a dictionary mapping and returns a list of labels (associated).
  Used to create the "label" column in df from the "tags" column.
  '''
  return [tag2idx[tag] if tag in tag2idx else unseen_label for tag in tags.split()] 

In [10]:
def tags_mapping(tags_series : pd.Series):
  """
  tag_series = df column with tags for each sentence.
  Returns:
    - dictionary mapping tags to indexes (label)
    - dictionary mappign inedexes to tags
    - The label corresponding to tag 'O'
    - A set of unique tags ecountered in the trainind df, this will define the classifier dimension
  """

  if not isinstance(tags_series, pd.Series):
      raise TypeError('Input should be a padas Series')

  unique_tags = set()
  
  for tag_list in df_train["tags"]:
    for tag in tag_list.split():
      unique_tags.add(tag)

  tag2idx = {k:v for v,k in enumerate(sorted(unique_tags))}
  idx2tag = {k:v for v,k in tag2idx.items()}

  unseen_label = tag2idx["O"]

  return tag2idx, idx2tag, unseen_label, unique_tags

In [11]:
def match_tokens_labels(tokenized_input, tags, ignore_token = -100):
        '''
        Used in the custom dataset.
        -100 will be tha label used to match additional tokens like [CLS] [PAD] that we dont care about. 
        Inputs : 
          - tokenized_input : tokenizer over the imput text -> {input_ids, attention_mask}
          - tags : is a single label array -> [O O O O O O O O O O O O O O B-tim O]
        
        Returns a list of labels that match the tokenized text -> [-100, 3,5,6,-100,...]
        '''

        #gives an array [ None , 0 , 1 ,2 ,... None]. Each index tells the word of reference of the token
        word_ids = tokenized_input.word_ids()

        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:

            if word_idx is None:
                label_ids.append(ignore_token)

            #if its equal to the previous word we can add the same label id of the provious or -100 
            else :
                try:
                  reference_tag = tags[word_idx]
                  label_ids.append(tag2idx[reference_tag])
                except:
                  label_ids.append(ignore_token)
            
            previous_word_idx = word_idx

        return label_ids

In [12]:
def freeze_model(model,num_layers = 1):
  """
  Freeze last num_layers of a model to prevent ctastrophic forgetting.
  Doesn't seem to work weel, its better to fine tune the entire netwok
  """
  for id , params in enumerate(model.parameters()):
    if id == len(list(model.parameters())) - num_layers: 
      print("last layer unfreezed")
      params.requires_grad = True
    else:
      params.requires_grad = False
  return model

In [13]:
def train_loop(model, train_dataset, dev_dataset, optimizer,  batch_size, epochs):
  
  train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
  dev_dataloader = DataLoader(dev_dataset, batch_size = batch_size, shuffle = True)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  for epoch in range(epochs) : 
    
    train_metrics = MetricsTracking()
    total_loss_train = 0

    model.train() #train mode

    for train_data, train_label in tqdm(train_dataloader):

      train_label = train_label.to(device)
      '''
      squeeze in order to match the sizes. From [batch,1,seq_len] --> [batch,seq_len] 
      '''
      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad()
      
      output = model(input_id, mask, train_label)
      loss, logits = output.loss, output.logits
      predictions = logits.argmax(dim= -1) 

      #compute metrics
      train_metrics.update(predictions, train_label)
      total_loss_train += loss.item()

      #grad step
      loss.backward()
      optimizer.step()
    
    '''
    EVALUATION MODE
    '''            
    model.eval()

    dev_metrics = MetricsTracking()
    total_loss_dev = 0
    
    with torch.no_grad():
      for dev_data, dev_label in dev_dataloader:

        dev_label = dev_label.to(device)

        mask = dev_data['attention_mask'].squeeze(1).to(device)
        input_id = dev_data['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask, dev_label)
        loss, logits = output.loss, output.logits

        predictions = logits.argmax(dim= -1)     

        dev_metrics.update(predictions, dev_label)
        total_loss_dev += loss.item()
    
    train_results = train_metrics.return_avg_metrics(len(train_dataloader))
    dev_results = dev_metrics.return_avg_metrics(len(dev_dataloader))

    print(f"TRAIN \nLoss: {total_loss_train / len(train_dataset)} \nMetrics {train_results}\n" ) 
    print(f"VALIDATION \nLoss {total_loss_dev / len(dev_dataset)} \nMetrics{dev_results}\n" )   

In [14]:
#create tag-label mapping
tag2idx, idx2tag , unseen_label, unique_tags = tags_mapping(df_train["tags"])

#create the label column from tag. Unseen labels will be tagged as "O"
for df in [df_train, df_dev, df_test]:
  df["labels"] = df["tags"].apply(lambda tags : tags_2_labels(tags, tag2idx))

In [15]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [16]:

#original text
text = df_train["sentence"].values.tolist()
#df.stack().groupby(level=0).apply(text).tolist()

i = 0
for string in text:
    i += 1
    if(isinstance(string, float)):
        #print("float")
        text[i - 1] = "nan"
    #print(str(i) + " = " + str(string))

#toeknized text
text_tokenized = tokenizer(text , padding = "max_length" , truncation = True, return_tensors = "pt" )

#mapping token to original word
word_ids = text_tokenized.word_ids()


In [17]:
model = DistilbertNER(len(unique_tags))
learn = False

try :
    model = torch.load("NER_model", map_location=torch.device('cpu'))
except FileNotFoundError as e : 
    print("MODEL is not exist so new MODEL will be created")
    learn = True

model.eval()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

MODEL is not exist so new MODEL will be created


DistilbertNER(
  (pretrained): DistilBertForTokenClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn):

In [18]:

#model = DistilbertNER(len(unique_tags))
#Prevent Catastrofic Forgetting
#model = freeze_model(model, num_layers = 2)

#datasets

train_dataset = NerDataset(df_train)
dev_dataset = NerDataset(df_dev)

lr = 1e-2
optimizer = SGD(model.parameters(), lr=lr, momentum = 0.9)  

#MAIN
parameters = {
    "model": model,
    "train_dataset": train_dataset,
    "dev_dataset" : dev_dataset,
    "optimizer" : optimizer,
    "batch_size" : 16,
    "epochs" : 8
}

if learn == True:
    train_loop(**parameters)


100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:40<00:00,  4.75it/s]


TRAIN 
Loss: 0.027788051847510452 
Metrics {'acc': 0.848, 'f1': 0.631, 'precision': 0.711, 'recall': 0.623}

VALIDATION 
Loss 0.019319590822538927 
Metrics{'acc': 0.893, 'f1': 0.752, 'precision': 0.827, 'recall': 0.735}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:33<00:00,  4.83it/s]


TRAIN 
Loss: 0.0181666704406728 
Metrics {'acc': 0.898, 'f1': 0.759, 'precision': 0.82, 'recall': 0.75}

VALIDATION 
Loss 0.017211641984537333 
Metrics{'acc': 0.904, 'f1': 0.772, 'precision': 0.834, 'recall': 0.758}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:34<00:00,  4.82it/s]


TRAIN 
Loss: 0.014746015793023029 
Metrics {'acc': 0.916, 'f1': 0.802, 'precision': 0.85, 'recall': 0.794}

VALIDATION 
Loss 0.01709186677943825 
Metrics{'acc': 0.908, 'f1': 0.786, 'precision': 0.831, 'recall': 0.779}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:34<00:00,  4.82it/s]


TRAIN 
Loss: 0.012443940102177637 
Metrics {'acc': 0.928, 'f1': 0.831, 'precision': 0.873, 'recall': 0.823}

VALIDATION 
Loss 0.01606104381279174 
Metrics{'acc': 0.912, 'f1': 0.8, 'precision': 0.833, 'recall': 0.804}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:32<00:00,  4.84it/s]


TRAIN 
Loss: 0.010300264004251044 
Metrics {'acc': 0.939, 'f1': 0.861, 'precision': 0.896, 'recall': 0.854}

VALIDATION 
Loss 0.01612658021969033 
Metrics{'acc': 0.918, 'f1': 0.812, 'precision': 0.853, 'recall': 0.81}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:31<00:00,  4.86it/s]


TRAIN 
Loss: 0.00917808968121351 
Metrics {'acc': 0.945, 'f1': 0.873, 'precision': 0.905, 'recall': 0.867}

VALIDATION 
Loss 0.016207100921605905 
Metrics{'acc': 0.922, 'f1': 0.819, 'precision': 0.861, 'recall': 0.812}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:30<00:00,  4.87it/s]


TRAIN 
Loss: 0.009242506893972406 
Metrics {'acc': 0.946, 'f1': 0.873, 'precision': 0.902, 'recall': 0.868}

VALIDATION 
Loss 0.01626957368806161 
Metrics{'acc': 0.92, 'f1': 0.822, 'precision': 0.851, 'recall': 0.825}



100%|██████████████████████████████████████████████████████████████████████████████| 2191/2191 [07:30<00:00,  4.87it/s]


TRAIN 
Loss: 0.007562014873320333 
Metrics {'acc': 0.955, 'f1': 0.897, 'precision': 0.922, 'recall': 0.893}

VALIDATION 
Loss 0.017565901645743818 
Metrics{'acc': 0.92, 'f1': 0.819, 'precision': 0.844, 'recall': 0.822}



In [19]:
if learn == True:
    torch.save(model, "NER_model")