<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/nlp-in-real-world/02-advanced-nlp-applications/02_ner_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
%%capture

!pip install transformers
!pip -q install spacy
!pip install spacy-transformers==1.1.5 -f https://download.pytorch.org/whl/torch_stable.html

!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_trf

In [1]:
import re
import random

from transformers import pipeline

import spacy
from spacy.training.example import Example
from spacy import displacy

In [None]:
!wget http://noisy-text.github.io/2017/files/wnut17train.conll

## spaCy fine-tuning

In [2]:
# Toy NER corpus: each item is (text, annotations), where annotations["entities"]
# lists (start_char, end_char, label) spans. Offsets are character indices into
# the text, with end_char exclusive.
train_data = [
    (
        "Chef added some salt and pepper to the rice.",
        {
            "entities": [
                (16, 20, "SPICE"),       # salt
                (25, 31, "SPICE"),       # pepper
                (39, 43, "INGREDIENT"),  # rice
            ],
        },
    ),
    (
        "The pasta was set to boil with some salt.",
        {
            "entities": [
                (4, 9, "INGREDIENT"),  # pasta
                (36, 40, "SPICE"),     # salt
            ],
        },
    ),
    (
        "Adding egg to the rice dish with some pepper.",
        {
            "entities": [
                (7, 10, "INGREDIENT"),   # egg
                (18, 22, "INGREDIENT"),  # rice
                (38, 44, "SPICE"),       # pepper
            ],
        },
    ),
]

In [3]:
# Start from an empty English pipeline and attach a fresh, untrained NER
# component as its last pipe.
nlp = spacy.blank("en")
print("Created a blank en model")

# add_pipe returns the component it creates, so no separate get_pipe lookup
# is needed.
ner = nlp.add_pipe("ner", last=True)
print("pipe_names", nlp.pipe_names)

# Register every label that occurs in the training annotations. Each entity is
# a (start_char, end_char, label) tuple, so index 2 is the label. Defaulting to
# an empty list keeps an item without an "entities" key from raising here.
for _, annotations in train_data:
  for ent in annotations.get("entities", []):
    ner.add_label(ent[2])

# Initialise the model weights and obtain the optimizer used for training.
# `nlp.initialize()` is the spaCy v3 replacement for the deprecated
# `nlp.begin_training()`; both return the optimizer.
optimizer = nlp.initialize()

Created a blank en model
pipe_names ['ner']


In [None]:
# Number of passes over the training data.
n_iter = 100

# Pipes that should stay active while training; everything else in the
# pipeline is disabled for the duration of the loop.
pipe_exceptions = ["ner", "trf_wordpiece", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Build the Example objects once instead of re-tokenising every text on every
# epoch. Shuffling this separate list also leaves `train_data` in its original
# order, so re-running the cell starts from the same state.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data
]

# `select_pipes(disable=...)` is the spaCy v3 replacement for the deprecated
# `disable_pipes(...)`; the disabled pipes are restored when the block exits.
with nlp.select_pipes(disable=other_pipes):
  for epoch in range(n_iter):
    random.shuffle(train_examples)
    losses = {}  # reset per epoch; nlp.update accumulates into this dict
    for batch in spacy.util.minibatch(train_examples, size=2):
      # One update per minibatch. The earlier per-example updates made the
      # minibatch size irrelevant.
      nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
    print(f"losses: {losses}")

In [5]:
def get_entities(raw_text):
  """Run the module-level ``nlp`` pipeline on ``raw_text`` and collect its entities.

  Returns a list of ``(entity_text, entity_label)`` tuples, one for each span
  in ``doc.ents``, in the order the pipeline yields them.
  """
  return [(span.text, span.label_) for span in nlp(raw_text).ents]

In [6]:
# Neither "water" nor "spaghetti" appears in train_data, so this probes how the
# model labels unseen words.
print(get_entities(raw_text="Add water to the spaghetti"))

[('water', 'SPICE'), ('spaghetti', 'SPICE')]


In [7]:
# "pasta" occurs in train_data as INGREDIENT; "paprika" is not in train_data.
print(get_entities(raw_text="Add some paprika on top to your pasta."))

[('paprika', 'SPICE'), ('pasta', 'INGREDIENT')]


## Transformers fine-tuning