# Named Entity Recognition with SpaCy and 🔭 Galileo

In this tutorial, we'll train a model with SpaCy and explore the results in Galileo.

In [None]:
#@title Install `dataquality`

try:
    import dataquality as dq
except ImportError:
    # dataquality is not importable yet: install the pinned dependencies below,
    # then terminate the process so the runtime restarts with them available.

    # Upgrade pip
    !pip install -U pip &> /dev/null
    # A higher version of spacy comes preinstalled on colab, so remove its
    # English model. `pip uninstall` has no -U option (that flag belongs to
    # `pip install`); -y confirms the removal without an interactive prompt.
    !pip uninstall -y en-core-web-sm &> /dev/null
    # Install HF datasets for downloading the example datasets
    !pip install -U "dataquality==0.8.55.2" datasets "spacy==3.2.1" &> /dev/null

    print('👋 Installed necessary libraries.')
    print('🙏 Continue with the rest of the notebook or hit "Run All" again!')

    # Restart the runtime
    import os, time
    time.sleep(1) # gives the print statements time to flush
    os._exit(0) # exits without allowing the next cell to run

# SpaCy Data Preparation 
We load an NER dataset from the HuggingFace 🤗 registry, which provides word-indexed NER spans. The data is made compatible with SpaCy pipelines by converting those spans to character-indexed spans.

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can select any dataset from [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:token-classification&task_ids=task_ids:named-entity-recognition&sort=downloads).

# Name of the HuggingFace dataset to load; also reused later in the Galileo run name.
dataset_name = 'conllpp' #@param ["wnut_17", "conllpp", "wikiann"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

from IPython.utils import io
from datasets import load_dataset, get_dataset_config_names

# Try to load the data. If a config (subset) is needed, pick one
# First attempt: load the dataset without naming a config, hiding the loader's output.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script run locally, and dataset_name accepts free-form input — confirm this is
# acceptable for the environments this notebook runs in.
try:
  with io.capture_output() as captured:
    data = load_dataset(dataset_name, trust_remote_code=True)
except ValueError as e:
  # Only the "missing config" error is handled here; any other ValueError is re-raised.
  if "Config name is missing" not in repr(e):
    raise e

  # Ask which config (subset) to use; a blank answer falls back to the first one listed.
  configs = get_dataset_config_names(dataset_name)
  print(f"The dataset {dataset_name} has multiple subsets {configs}.")
  config = input(f"🖖 Enter the name of the subset to pick (or leave blank for any): ")
  if config:
    assert config in configs, f"{config} is not a valid subset"
  else:
    config = configs[0]
  # Second attempt, this time with the chosen config.
  with io.capture_output() as captured:
    data = load_dataset(dataset_name, name=config, trust_remote_code=True)

# Check that the dataset has at least train and either of validation/test
assert "train" in data and {"validation", "test"}.intersection(data), \
f"💾 The dataset {dataset_name} has either no train, or no validation or test splits, select another one."

print(f"\n🏆 Dataset {dataset_name} loaded succesfully")

# A small function for minimizing the dataset for testing
import os

def _minimize_for_ci() -> bool:
    return os.getenv("MINIMIZE_FOR_CI", "false") == "true"

In [None]:
#@title
#@markdown Converting HF-formatted NER datasets to the Spacy format

import pandas as pd

class NerDataset:
    """
    Helper class to prepare the HF files for model input and output.

    Reads one split of the module-level HuggingFace ``data`` object and exposes:

    * ``text_inputs`` - each sample's tokens joined with single spaces
    * ``gold_spans``  - per sample, a list of ``{"start", "end", "label"}``
      dicts holding character offsets into the matching ``text_inputs`` entry
      (``end`` is exclusive)
    """

    def __init__(self, split, labels, label_col="ner_tags"):
        """
        Args:
            split: name of the split to read from the global ``data``.
            labels: tag names; position ``i`` is the name of tag id ``i``.
            label_col: column of the split that holds the per-token tag ids.
        """
        self.idx2label = dict(enumerate(labels))
        self.list_of_labels = labels
        self.ner_token_tags = data[split][label_col]
        self.tokens = data[split]['tokens']
        self.split = split
        self.text_inputs = [' '.join(_tokens) for _tokens in self.tokens]
        self.gold_spans = [
            self.extract_spans(sample_tokens, sample_ner_token_tags)
            for sample_tokens, sample_ner_token_tags in zip(self.tokens, self.ner_token_tags)
        ]

    def extract_spans(self, sample_tokens, sample_ner_token_tags):
        """
        HF uses word indexed spans.
        Extract character indexed spans for SpaCy. Compatible with BIOES, BILOU, BIO schema

        Args:
            sample_tokens: the tokens of one sample.
            sample_ner_token_tags: one tag id per token, keys of ``self.idx2label``.

        Returns:
            A list of ``{"start", "end", "label"}`` dicts. Offsets index into
            ``' '.join(sample_tokens)``; ``end`` is exclusive, so slicing the
            joined text with ``[start:end]`` yields exactly the entity text.

        Raises:
            ValueError: if a tag name is not part of ``self.list_of_labels``.
        """
        # token_offsets[n] is the character offset at which token n starts in
        # the space-joined text: each token contributes its length plus one
        # separator. The final entry is one past the end of the text.
        token_offsets = [0]
        for _token in sample_tokens:
            token_offsets.append(token_offsets[-1] + len(_token) + 1)
        gold_sequence = [self.idx2label[ner_token_tag] for ner_token_tag in sample_ner_token_tags]

        gold_spans = []
        idx = 0
        while idx < len(gold_sequence):
            ner_label = gold_sequence[idx]
            if ner_label not in self.list_of_labels:
                raise ValueError(f"Unknown NER label {ner_label!r}")

            # Single-token entity (U- in BILOU, S- in BIOES)
            if ner_label.startswith(("U", "S")):
                _, ner_class = ner_label.split("-", 1)
                gold_spans.append(
                    {
                        "start": token_offsets[idx],
                        # - 1 drops the separator counted after the token, the
                        # same way multi-token spans are closed below
                        "end": token_offsets[idx + 1] - 1,
                        "label": ner_class,
                    }
                )
                idx += 1
                continue

            # Anything that does not open an entity is skipped
            if not ner_label.startswith("B"):
                idx += 1
                continue

            # Multi-token entity: extend over following I- tags of the same
            # class, and include a closing L-/E- tag of that class if present
            _, ner_class = ner_label.split("-", 1)
            next_idx = idx + 1
            for next_tok in gold_sequence[idx + 1:]:
                if next_tok not in self.list_of_labels:
                    raise ValueError(f"Unknown NER label {next_tok!r}")
                if next_tok.startswith("I") and next_tok.split("-", 1)[1] == ner_class:
                    next_idx += 1
                elif next_tok.startswith(("L", "E")) and next_tok.split("-", 1)[1] == ner_class:
                    next_idx += 1
                    break
                else:
                    break
            gold_spans.append(
                {
                    "start": token_offsets[idx],
                    "end": token_offsets[next_idx] - 1,
                    "label": ner_class,
                }
            )
            idx = next_idx

        return gold_spans

# Find the name of the ground truth column
# Candidate columns are those whose name contains "tags"
good_col_names = [name for name in list(data['train'].features) if "tags" in name]
if len(good_col_names) == 1:
  label_col = good_col_names[0]
elif "ner_tags" in good_col_names:
  label_col = "ner_tags"
else:
  # No unambiguous candidate: ask for the column name
  col_names = list(data['train'].features)
  print(f"The name of the columns are {col_names}.")
  label_col = input(f"🏅 Please enter the name of the column containing the ner tags: ")
  assert label_col in col_names, f"{label_col} is not an existing column"

# Tag names of the chosen column; position i is the name of tag id i
labels = data["train"].features[label_col].feature.names
# Both splits must read the detected column, not NerDataset's "ner_tags" default
train_data = NerDataset(split="train", labels=labels, label_col=label_col)
# Evaluate on "validation" when the dataset has it, otherwise on "test"
test_split_name = "validation" if "validation" in data else "test"
test_data = NerDataset(split=test_split_name, labels=labels, label_col=label_col)

In [None]:
#@markdown Convert datasets to Spacy examples
import spacy
from typing import List
from spacy.training import Example

def generate_examples(texts, samples_annotations, nlp):
  """Pair each text with its span annotations and build one spaCy ``Example`` per pair.

  Args:
    texts: the input strings.
    samples_annotations: per text, a list of dicts with ``"start"``, ``"end"``
      and ``"label"`` keys.
    nlp: object whose ``make_doc`` turns a text into a doc.

  Returns:
    A list with one ``Example`` per (text, annotations) pair, in input order.
  """
  def _to_example(text, spans):
    # spacy requires the entities as (start, end, label) tuples
    entities = [(span["start"], span["end"], span["label"]) for span in spans]
    return Example.from_dict(nlp.make_doc(text), {"entities": entities})

  return [_to_example(text, spans) for text, spans in zip(texts, samples_annotations)]

# Blank English pipeline with an "ner" component added at the end
nlp = spacy.blank("en")
nlp.add_pipe("ner", last=True)

# For CI runs, keep only the first 1000 texts and span lists of each split
if _minimize_for_ci():
  train_data.text_inputs = train_data.text_inputs[:1000]
  train_data.gold_spans = train_data.gold_spans[:1000]
  test_data.text_inputs = test_data.text_inputs[:1000]
  test_data.gold_spans = test_data.gold_spans[:1000]

# Turn both splits into spaCy examples
train_examples = generate_examples(
    train_data.text_inputs, train_data.gold_spans, nlp
)
test_examples = generate_examples(
    test_data.text_inputs, test_data.gold_spans, nlp
)

# Training with Galileo
Input samples are logged to Galileo using `log_input_examples`. Model data is logged by wrapping the `nlp` object using `watch`. This automatically logs the logits and embeddings from your model to Galileo with just 1 line of code. 

We complete the training pipeline by using a standard SpaCy training setup. While training, we log the current `epoch` and `split`. To complete logging, we call `dq.finish()` after training.

In [None]:
import random
import spacy
from spacy.util import minibatch
import dataquality as dq
from dataquality.integrations.spacy import log_input_examples, watch

# Training settings: number of passes over train_examples and the minibatch size
num_epochs = 5
batch_size = 64

# 🔭🌕 Initializing a new run in Galileo. Each run is part of a project.
# Any "/" in the dataset name is replaced with "-" in the run name.
dq.init(task_type="text_ner", 
        project_name="named_entity_recognition_spacy", 
        run_name=f"example_run_{dataset_name.replace('/', '-')}")

# Initialize the pipeline from the train and test examples together; the
# returned optimizer is handed to nlp.update in the loop below.
optimizer = nlp.initialize(lambda: train_examples+test_examples)

watch(nlp) # 🔭🌕 One line of Galileo code to capture the model's predictions on the inputs
log_input_examples(train_examples, "training") # 🔭🌕 Logging the training examples with Galileo
log_input_examples(test_examples, "test") # 🔭🌕 Logging the test examples  with Galileo

# One iteration per epoch: a training pass followed by an evaluation on the test examples
for itn in range(num_epochs):
    dq.set_epoch(itn) # 🔭🌕 Setting the epoch
    print(f"Starting Epoch {itn}")

    dq.set_split("training") # 🔭🌕 Setting split to training
    # Shuffles train_examples in place, so batch composition differs between epochs
    random.shuffle(train_examples)
    batches = minibatch(train_examples, batch_size)
    # Passed to nlp.update for each batch; not otherwise read in this cell
    losses = {}
    for batch in batches:
        nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)

    dq.set_split("test") # 🔭🌕 Setting split to test
    scores = nlp.evaluate(test_examples)
    print(f"Score is {scores} for epoch: {itn}")

# 🔭🌕 Complete the Galileo workflow with a call to dq.finish()
dq.finish()

# Logging Inference Data

To log inference data, save the model to disk and check out the NER Inference with Spacy and Galileo notebook.

In [None]:
# Save the trained pipeline under the relative path "my_model"
nlp.to_disk("my_model")

# General Help and Docs
- To get help with your task's requirements, call `dq.get_data_logger().doc()`
- To see more general data and model logging docs, run `dq.docs()`

In [None]:
# Show help for the requirements of the current task's data logger
dq.get_data_logger().doc()
# Show the built-in help text for dq.log_dataset
help(dq.log_dataset)