<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/nlp-in-real-world/02-advanced-nlp-applications/02_ner_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup

In [1]:
%%capture

!pip install transformers
!pip -q install spacy
!pip install spacy-transformers==1.1.5 -f https://download.pytorch.org/whl/torch_stable.html

!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_trf

In [2]:
import re
import random
import numpy as np

from pathlib import Path
from sklearn.model_selection import train_test_split

from transformers import pipeline
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForTokenClassification
from transformers import TFDistilBertForSequenceClassification

import tensorflow as tf

import spacy
from spacy.training.example import Example
from spacy import displacy

In [3]:
!wget http://noisy-text.github.io/2017/files/wnut17train.conll

--2023-11-22 09:53:49--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.109.153, 185.199.111.153, 185.199.108.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.109.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll’


2023-11-22 09:53:49 (10.0 MB/s) - ‘wnut17train.conll’ saved [493781/493781]



##spaCy fine-tuning

In [4]:
train_data = [
    (
        "Chef added some salt and pepper to the rice.",
        {
            "entities": [
            (16, 20, 'SPICE'),
            (25, 31, 'SPICE'),
            (39, 43, 'INGREDIENT')
          ]
        }
    ),
    (
        "The pasta was set to boil with some salt.",
        {
            "entities": [
            (4, 9, 'INGREDIENT'),
            (36, 40, 'SPICE')
          ]
        }
    ),
    (
        "Adding egg to the rice dish with some pepper.",
        {
            "entities": [
            (7, 10, 'INGREDIENT'),
            (18, 22, 'INGREDIENT'),
            (38, 44, 'SPICE')
          ]
        }
    )
]

In [None]:
nlp = spacy.blank("en")
print("Created a blank en model")

nlp.add_pipe("ner", last=True)
ner = nlp.get_pipe("ner")
print("pipe_names", nlp.pipe_names)

for _, annotations in train_data:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

# begin training
optimizer = nlp.begin_training()

Created a blank en model
pipe_names ['ner']


In [None]:
n_iter = 100
pipe_exceptions = ["ner", "trf_wordpiece", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

with nlp.disable_pipes(*other_pipes):
  for _ in range(n_iter):
    random.shuffle(train_data)
    losses = {}
    for batch in spacy.util.minibatch(train_data, size=2):
      for text, annots in batch:
        doc = nlp.make_doc(text)
        nlp.update([Example.from_dict(doc, annots)], drop=0.5, sgd=optimizer, losses=losses)
    print(f"losses: {losses}")

In [None]:
def get_entities(raw_text):
  doc = nlp(raw_text)
  result = []
  for word in doc.ents:
    result.append((word.text, word.label_))
  return result

In [None]:
print(get_entities("Add water to the spaghetti"))

[('water', 'SPICE'), ('spaghetti', 'SPICE')]


In [None]:
print(get_entities("Add some paprika on top to your pasta."))

[('paprika', 'SPICE'), ('pasta', 'INGREDIENT')]


##Transformers fine-tuning

In [5]:
def split_into_tokens(raw_text):
  raw_docs = re.split(r"\n\t?\n", raw_text)
  token_docs = []
  tag_docs = []

  for doc in raw_docs:
    tokens = []
    tags = []
    for line in doc.split("\n"):
      row = line.split("\t")
      if len(row) == 1:
        token = row[0]
        tag = None
      else:
        token, tag = line.split("\t")
      tokens.append(token)
      tags.append(tag)
    token_docs.append(tokens)
    tag_docs.append(tags)
  return token_docs, tag_docs

In [6]:
def read_wnut(file_path):
  file_path = Path(file_path)

  raw_text = file_path.read_text().strip()
  token_docs, tag_docs = split_into_tokens(raw_text)
  return token_docs, tag_docs

In [7]:
texts, tags = read_wnut("wnut17train.conll")

In [8]:
print(texts[0][10:17], tags[0][10:17], sep="\n")

['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']


In [32]:
# Splitting our data into training and validation set
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=0.2)

In [33]:
# let's encode the tokens, using pre-trained DistilBert tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")

In [40]:
train_encodings = tokenizer(
    train_texts,
    is_split_into_words=True,     # we have ready-split tokens
    return_offsets_mapping=True,
    padding=True,
    truncation=True
)

val_encodings = tokenizer(
    val_texts,
    is_split_into_words=True,     # we have ready-split tokens
    return_offsets_mapping=True,
    padding=True,
    truncation=True
)

In [41]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [42]:
def encode_tags(tags, encodings):
  labels = [[tag2id[tag] for tag in doc] for doc in tags]
  encoded_labels = []
  for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
    # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
    arr_offset = np.array(doc_offset)
    # set labels whose first offset position is 0 and the second is not 0
    doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
    encoded_labels.append(doc_enc_labels.tolist())
  return encoded_labels

In [43]:
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
print(f"There are total {len(tag2id.keys())} entity tags in the data: {tag2id.keys()}")

There are total 13 entity tags in the data: dict_keys(['I-group', 'O', 'I-product', 'B-product', 'B-group', 'B-creative-work', 'I-corporation', 'B-location', 'I-person', 'B-person', 'I-creative-work', 'I-location', 'B-corporation'])


In [44]:
train_encodings.pop("offset_mapping")  # we don't want to pass this to the model
val_encodings.pop("offset_mapping")

train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)

val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels)
)

Now we can load in a token classification model and specify the number of labels.

Then, our model is ready for fine-tuning.

In [None]:
model = TFDistilBertForTokenClassification.from_pretrained("distilbert-base-cased", num_labels=len(unique_tags))

In [None]:
# let's do fine-tuning
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(optimizer=optimizer, loss=model.compute_loss)
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)