In [1]:
# Install the two third-party packages the notebook needs that are fetched here.
# No versions are pinned; the captured output below shows which ones pip resolved at the time.
!pip install transformers
!pip install seqeval

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.5 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 48.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 52.4 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [2]:
from abc import ABC, abstractmethod

import requests
import seqeval
import torch
from keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import classification_report
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, AdamW, BertTokenizer
from transformers import get_linear_schedule_with_warmup

# Seed torch's random number generator so repeated runs start from the same state.
torch.manual_seed(28)

<torch._C.Generator at 0x7f6841bc1a90>

In [3]:
class Config:
  """Settings shared by the notebook cells (device, model and training knobs)."""

  # Run on the GPU when torch can see one, otherwise fall back to the CPU.
  DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  # Name of the pretrained checkpoint used for both the tokenizer and the model.
  MODEL_NAME = "bert-base-uncased"

  # Training loop settings.
  TRAIN_BATCH_SIZE = 32
  TRAIN_EPOCHS = 4
  MAX_GRAD_NORM = 1.0  # upper bound passed to gradient-norm clipping

  # Every token sequence is truncated or padded to this many positions.
  MAX_SEQUENCE_LENGTH = 270

  # Switch read when the optimizer parameter groups are built.
  FULL_FINE_TUNING = True

In [4]:
# label_map = {"1": "O", "2": "B-MISC", "3": "I-MISC", "4": "B-PER", "5": "I-PER", "6": "B-ORG", 
#              "7": "I-ORG", "8": "B-LOC", "9": "I-LOC", "10": "[CLS]", "11": "[SEP]", "12": "PAD"}

In [5]:
class InputSample:
  """Plain record holding an identifier, a text and its label(s)."""

  def __init__(self, guid, text, label):
    # The arguments are stored as given; nothing is copied or validated.
    self.guid, self.text, self.label = guid, text, label

In [6]:
class FileReader(ABC):
  """Abstract interface for objects that load data from ``file_name``."""

  @abstractmethod
  def read_file(self, file_name):
    """Load ``file_name``; concrete readers define what is returned."""
    return None

In [7]:
class TSVFileReader(FileReader):
  """Downloads a token-per-line file and groups its lines into sentences.

  Each non-blank line holds space-separated columns; the first column is the
  word and the last column is its label. A blank line or a line starting with
  ``-DOCSTART`` ends the current sentence.
  """

  # Seconds to wait for the server before giving up instead of hanging the notebook.
  REQUEST_TIMEOUT_SECONDS = 30

  def read_file(self, file_name):
    """Fetch ``file_name`` over HTTP and parse it.

    Args:
      file_name: URL of the file to download.

    Returns:
      A list of ``(words, labels)`` tuples, one per sentence, where both
      elements are lists of strings of equal length.

    Raises:
      requests.HTTPError: if the server answers with an error status.
      requests.RequestException: for connection problems or a timeout.
    """
    response = requests.get(file_name, timeout=self.REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on 4xx/5xx; otherwise the error page would be parsed as if it were data.
    response.raise_for_status()

    data = []
    sentence = []
    label = []

    for raw_line in response.text.split('\n'):
      # strip() drops a trailing '\r' (CRLF files) so it cannot end up inside a label,
      # and turns whitespace-only lines into sentence separators.
      line = raw_line.strip()
      if not line or line.startswith('-DOCSTART'):
        if sentence:
          data.append((sentence, label))
          sentence = []
          label = []
        continue
      splits = line.split(' ')
      sentence.append(splits[0])
      label.append(splits[-1])

    # The file may not end with a blank line; keep the sentence still being built.
    if sentence:
      data.append((sentence, label))
    return data

In [8]:
# stri = "EU NNP B-NP B-ORG"

# splits = stri.split(' ')
# print(splits)
# print(splits[0])
# print(splits[-1][:-1])

In [9]:
class DataProcessor(ABC):
  """Base class for dataset processors.

  Subclasses provide the per-split loaders and the label inventory;
  ``read_file`` is the shared helper that picks a reader by file type.
  """

  @abstractmethod
  def fetch_train_samples(self, file_path):
    """Return the training samples found at ``file_path``."""
    pass

  @abstractmethod
  def fetch_validation_samples(self, file_path):
    """Return the validation samples found at ``file_path``."""
    pass

  @abstractmethod
  def fetch_test_samples(self, file_path):
    """Return the test samples found at ``file_path``."""
    pass

  @abstractmethod
  def fetch_labels(self):
    """Return the labels known to this processor."""
    pass

  def read_file(self, file_name, file_type):
    """Read ``file_name`` with the reader registered for ``file_type``.

    Args:
      file_name: location handed to the reader unchanged.
      file_type: reader selector; only ``'tsv'`` is supported.

    Returns:
      Whatever the selected reader's ``read_file`` returns.

    Raises:
      ValueError: if ``file_type`` has no reader. Previously this case fell
        through and returned ``None``, which only surfaced later as an
        unrelated error in the caller.
    """
    if file_type == 'tsv':
      tsv = TSVFileReader()
      return tsv.read_file(file_name)
    raise ValueError(f"Unsupported file type: {file_type!r} (supported: 'tsv')")

In [10]:
class NERDataProcessor(DataProcessor):
  """Processor that turns parsed 'tsv' data into ``InputSample`` objects."""

  def __init__(self):
    self.labels = [
        "O",
        "B-MISC", "I-MISC",
        "B-PER", "I-PER",
        "B-ORG", "I-ORG",
        "B-LOC", "I-LOC",
        "[CLS]", "[SEP]",
    ]

  def fetch_train_samples(self, file_path):
    """Read ``file_path`` and return samples whose guid starts with ``train-``."""
    return self.__load_split("train", file_path)

  def fetch_validation_samples(self, file_path):
    """Read ``file_path`` and return samples whose guid starts with ``valid-``."""
    return self.__load_split("valid", file_path)

  def fetch_test_samples(self, file_path):
    """Read ``file_path`` and return samples whose guid starts with ``test-``."""
    return self.__load_split("test", file_path)

  def fetch_labels(self):
    """Return the processor's label list (the same list object, not a copy)."""
    return self.labels

  def __load_split(self, dataset_type, file_path):
    # Every split is read as 'tsv'; only the guid prefix differs between splits.
    parsed = self.read_file(file_path, "tsv")
    return self.__create_input_samples(dataset_type, parsed)

  def __create_input_samples(self, dataset_type, lines):
    # One InputSample per (words, labels) pair; the guid encodes split and position.
    return [
        InputSample(guid=f"{dataset_type}-{position}", text=words, label=tags)
        for position, (words, tags) in enumerate(lines)
    ]

In [13]:
# Scratch check: converting a comparison result to float gives 1.0 (or 0.0),
# the same expression form used for the attention-mask entries.
float(0.1!=0.0)

1.0

In [15]:
ner_processor = NERDataProcessor()
labels = ner_processor.fetch_labels()
# fetch_labels() returns the processor's own list, so this append also changes ner_processor.labels.
labels.append("PAD")
# "PAD" is already counted in len(labels). The extra 1 is for id 0: label ids are
# assigned starting at 1 (enumerate(..., 1)), so the largest id equals len(labels).
num_labels = len(labels) + 1

In [16]:
# All three splits live in the same folder of the dataset repository.
DATASET_BASE_URL = "https://raw.githubusercontent.com/saishiva024/BERT_NamedEntityRecognition/master/dataset"

# Each split goes through its own loader so the sample guids carry the right prefix
# ("train-", "valid-", "test-"); previously all three were loaded as "train-".
train_samples = ner_processor.fetch_train_samples(f"{DATASET_BASE_URL}/train.txt")
valid_samples = ner_processor.fetch_validation_samples(f"{DATASET_BASE_URL}/valid.txt")
test_samples = ner_processor.fetch_test_samples(f"{DATASET_BASE_URL}/test.txt")

In [17]:
# Peek at the first three training samples (InputSample has no custom repr, so only object addresses show).
train_samples[:3]

[<__main__.InputSample at 0x7f67fa0ed210>,
 <__main__.InputSample at 0x7f67f79fe890>,
 <__main__.InputSample at 0x7f67f79feb50>]

In [18]:
# Show the words and the per-word labels of the first training sample side by side.
train_samples[0].text, train_samples[0].label

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'])

In [19]:
def _pad_to_length(sequences, length, pad_value):
  """Return a (len(sequences), length) long tensor; each row is cut or right-padded to ``length``."""
  padded = torch.full((len(sequences), length), pad_value, dtype=torch.long)
  for row, sequence in enumerate(sequences):
    kept = sequence[:length]
    if kept:
      padded[row, :len(kept)] = torch.tensor(kept, dtype=torch.long)
  return padded


def represent_features_for_samples(samples, all_labels, max_seq_length, tokenizer):
  """Convert samples into fixed-length id, label and attention-mask tensors.

  Every word is split by ``tokenizer.tokenize`` and its label is repeated for
  each resulting piece. Sequences longer than ``max_seq_length`` are cut at the
  end; shorter ones are padded at the end.

  Args:
    samples: iterable of objects with ``text`` (list of words) and ``label``
      (list of label strings, one per word).
    all_labels: label strings; ids are assigned in order starting at 1.
      Must contain ``"PAD"``, whose id fills the padded label positions.
    max_seq_length: number of positions in every returned row.
    tokenizer: object providing ``tokenize(word)`` and
      ``convert_tokens_to_ids(tokens)``.

  Returns:
    ``(input_ids, label_ids, attention_masks)``: two long tensors and one float
    tensor, each of shape ``(len(samples), max_seq_length)``.

  Raises:
    ValueError: if ``"PAD"`` is missing from ``all_labels`` or a sample has a
      different number of words and labels.
    KeyError: if a sample uses a label that is not in ``all_labels``.
  """
  labels_map = {label: i for i, label in enumerate(all_labels, 1)}
  if "PAD" not in labels_map:
    raise ValueError('all_labels must contain "PAD"; its id is used to pad the label rows')

  token_id_rows = []
  label_id_rows = []
  for sample in samples:
    if len(sample.text) != len(sample.label):
      raise ValueError(
          f"sample {getattr(sample, 'guid', None)!r} has {len(sample.text)} words "
          f"but {len(sample.label)} labels")

    token_ids = []
    label_ids = []
    for word, word_label in zip(sample.text, sample.label):
      pieces = tokenizer.tokenize(word)
      token_ids.extend(tokenizer.convert_tokens_to_ids(pieces))
      # Every piece of a split word carries the label of the whole word.
      label_ids.extend([labels_map[word_label]] * len(pieces))

    token_id_rows.append(token_ids)
    label_id_rows.append(label_ids)

  # Token id 0 is the padding value; the mask below marks every 0 as padding.
  input_ids = _pad_to_length(token_id_rows, max_seq_length, 0)
  label_ids = _pad_to_length(label_id_rows, max_seq_length, labels_map["PAD"])
  attention_masks = (input_ids != 0).float()

  return input_ids, label_ids, attention_masks

In [20]:
# Load the tokenizer belonging to the checkpoint named in Config.MODEL_NAME
# (its files are downloaded when they are not available locally, as the output below shows).
tokenizer = BertTokenizer.from_pretrained(Config.MODEL_NAME)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [21]:
# Build (ids, labels, attention mask) tensors for each split with the same labels, length and tokenizer.
train_input_ids, train_input_labels, train_attention_masks = represent_features_for_samples(
    samples=train_samples,
    all_labels=labels,
    max_seq_length=Config.MAX_SEQUENCE_LENGTH,
    tokenizer=tokenizer,
)
valid_input_ids, valid_input_labels, valid_attention_masks = represent_features_for_samples(
    samples=valid_samples,
    all_labels=labels,
    max_seq_length=Config.MAX_SEQUENCE_LENGTH,
    tokenizer=tokenizer,
)
test_input_ids, test_input_labels, test_attention_masks = represent_features_for_samples(
    samples=test_samples,
    all_labels=labels,
    max_seq_length=Config.MAX_SEQUENCE_LENGTH,
    tokenizer=tokenizer,
)

In [22]:
# Each dataset row is (input ids, attention mask, label ids); the loops unpack batches in this order.
train_data = TensorDataset(train_input_ids, train_attention_masks, train_input_labels)
valid_data = TensorDataset(valid_input_ids, valid_attention_masks, valid_input_labels)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_input_labels)

# Shuffle only the training data. Validation and test are read in a fixed order so the
# batches, and therefore the per-batch averaged evaluation loss, are the same on every run.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)
test_sampler = SequentialSampler(test_data)

train_data_loader = DataLoader(train_data, sampler=train_sampler, batch_size=Config.TRAIN_BATCH_SIZE)
valid_data_loader = DataLoader(valid_data, sampler=valid_sampler, batch_size=Config.TRAIN_BATCH_SIZE)
test_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=Config.TRAIN_BATCH_SIZE)

In [23]:
# Load the pretrained encoder with a fresh token-classification head of num_labels outputs.
# The model is moved to Config.DEVICE here, before the optimizer is created, because the
# training loop moves every batch to that device and model and data must live on the same one.
bert_model = BertForTokenClassification.from_pretrained(
    Config.MODEL_NAME,
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False,
).to(Config.DEVICE)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [25]:
# bert_model.cuda()

In [26]:
if Config.FULL_FINE_TUNING:
  # Train every parameter; biases and LayerNorm weights are excluded from weight decay.
  param_optimizer = list(bert_model.named_parameters())
  # Substrings matched against parameter names such as "...LayerNorm.weight" and "...bias".
  no_decay = ['bias', 'LayerNorm.weight']
  # The per-group option must be spelled 'weight_decay'; AdamW does not read any other key,
  # so a differently named key leaves the group on the optimizer's default decay.
  optimizer_grouped_params = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
       'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
       'weight_decay': 0.0},
  ]
else:
  # Not full fine-tuning: only the classification head is handed to the optimizer,
  # so the pretrained encoder weights are never updated.
  param_optimizer = list(bert_model.classifier.named_parameters())
  optimizer_grouped_params = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(optimizer_grouped_params, lr=3e-5, eps=1e-8)

In [27]:
# The training loop steps the scheduler once per batch, so the schedule covers
# every batch of every epoch.
total_steps = Config.TRAIN_EPOCHS * len(train_data_loader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [28]:
# Lookup from label id to label string. Ids start at 1, the same numbering
# represent_features_for_samples uses when it encodes the labels.
labels_map = {i : label for i, label in enumerate(labels,1)}
print(labels_map)

{1: 'O', 2: 'B-MISC', 3: 'I-MISC', 4: 'B-PER', 5: 'I-PER', 6: 'B-ORG', 7: 'I-ORG', 8: 'B-LOC', 9: 'I-LOC', 10: '[CLS]', 11: '[SEP]', 12: 'PAD'}


In [None]:
#%%time

losses = []
eval_losses = []

# Label ids are 1-based (id 0 is never assigned), matching how the label tensors were encoded.
id_to_label = dict(enumerate(labels, 1))
pad_label_id = labels.index("PAD") + 1
# Bookkeeping labels are not entity tags; a prediction of one of them is reported as "O".
non_entity_labels = {"[CLS]", "[SEP]", "PAD"}

for epoch in range(Config.TRAIN_EPOCHS):
  # ---- training ----
  bert_model.train()

  total_loss = 0

  for step, batch in enumerate(train_data_loader):
    batch = tuple(t.to(Config.DEVICE) for t in batch)
    batch_input_ids, batch_attention_mask, batch_labels = batch

    bert_model.zero_grad()

    output = bert_model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_mask, labels=batch_labels)

    # With labels supplied, the first element of the model output is the loss.
    loss = output[0]
    loss.backward()

    total_loss += loss.item()

    torch.nn.utils.clip_grad_norm_(parameters=bert_model.parameters(), max_norm=Config.MAX_GRAD_NORM)

    optimizer.step()
    scheduler.step()

  avg_train_loss = total_loss / len(train_data_loader)
  print(f"Average Training Loss - {avg_train_loss}\n")

  losses.append(avg_train_loss)

  # ---- validation ----
  bert_model.eval()

  eval_loss = 0
  predictions = []
  actual_labels = []

  for batch in valid_data_loader:
    batch = tuple(t.to(Config.DEVICE) for t in batch)
    batch_input_ids, batch_attention_mask, batch_labels = batch

    with torch.no_grad():
      outputs = bert_model(batch_input_ids, token_type_ids=None, attention_mask=batch_attention_mask, labels=batch_labels)

    eval_loss += outputs[0].mean().item()

    # outputs[1] holds the per-token logits. Softmax does not change which class is
    # largest, so the argmax is taken on the logits directly.
    predicted_ids = torch.argmax(outputs[1], dim=2).cpu().numpy()
    label_ids = batch_labels.cpu().numpy()

    for gold_row, predicted_row in zip(label_ids, predicted_ids):
      gold_tags = []
      predicted_tags = []
      # Position 0 is a real token (no [CLS] is added when the features are built), so the
      # walk starts at the first position and stops at the first padded label.
      for gold_id, predicted_id in zip(gold_row, predicted_row):
        if gold_id == pad_label_id:
          break
        gold_tags.append(id_to_label[int(gold_id)])
        # The classifier can also emit id 0 or a bookkeeping label; neither is an entity tag.
        predicted_tag = id_to_label.get(int(predicted_id), "O")
        predicted_tags.append("O" if predicted_tag in non_entity_labels else predicted_tag)
      # Appended after the walk so sequences that fill every position (no padding) are kept too.
      actual_labels.append(gold_tags)
      predictions.append(predicted_tags)

  eval_loss = eval_loss / len(valid_data_loader)
  eval_losses.append(eval_loss)

  print(f"Eval Loss - {eval_loss}")

  report = classification_report(actual_labels, predictions, digits=4)
  print(report)
  print('\n')