# Inference Named Entity Recognition with SpaCy and 🔭 Galileo

In this tutorial, we'll run a pre-trained spaCy model on an inference dataset, log the results, and explore them in Galileo.

In [None]:
#@title Install `dataquality` and other dependencies

# Upgrade pip (command output is redirected to /dev/null)
!pip install -U pip &> /dev/null
# Install pinned versions of dataquality and spaCy, plus HF datasets for
# downloading the example datasets.
# NOTE(review): `datasets` is the only package here without a version pin.
!pip install -U "dataquality==0.8.55.2" datasets "spacy==3.2.1" &> /dev/null

# Log in to Galileo

In [None]:
import os

import dataquality as dq
import spacy

# Set enterprise env variables, you cannot use this in Galileo Cloud!
# Each GALILEO_* variable is copied from its *_ENTERPRISE counterpart only when
# that counterpart is defined: os.environ accepts str values only, so assigning
# the None returned by os.getenv for an unset variable raises a TypeError.
for _galileo_var in ("GALILEO_CONSOLE_URL", "GALILEO_USERNAME", "GALILEO_PASSWORD"):
    _enterprise_value = os.getenv(f"{_galileo_var}_ENTERPRISE")
    if _enterprise_value is not None:
        os.environ[_galileo_var] = _enterprise_value

dq.configure()


# Inference Dataset Preparation
We load text samples from a Hugging Face 🤗 dataset. This data can be thought of as production data that our model is making predictions on.

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can select any dataset from [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:token-classification&task_ids=task_ids:named-entity-recognition&sort=downloads).

# Name of the Hugging Face dataset to load. The trailing annotation exposes it as
# a notebook form field listing three suggested datasets, with free-text input allowed.
dataset_name = 'conllpp' #@param ["wnut_17", "conllpp", "wikiann"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

from IPython.utils import io
from tqdm import tqdm
from datasets import load_dataset, get_dataset_config_names

# Try to load the data. If a config (subset) is needed, pick one.
# NOTE(review): trust_remote_code=True allows the dataset repository's own
# loading script to execute on this machine; since `dataset_name` accepts free
# input, only select datasets you trust.
try:
  # Anything printed while loading is captured and discarded
  with io.capture_output():
    data = load_dataset(dataset_name, trust_remote_code=True)
except ValueError as e:
  # Only the "missing config" error is handled here; anything else is re-raised
  # unchanged (bare `raise` keeps the original traceback intact).
  if "Config name is missing" not in repr(e):
    raise

  configs = get_dataset_config_names(dataset_name)
  print(f"The dataset {dataset_name} has multiple subsets {configs}.")
  # Strip surrounding whitespace so a stray space does not fail validation
  config = input("🖖 Enter the name of the subset to pick (or leave blank for any): ").strip()
  if config:
    assert config in configs, f"{config} is not a valid subset"
  else:
    # Blank answer: fall back to the first listed subset
    config = configs[0]
  with io.capture_output():
    data = load_dataset(dataset_name, name=config, trust_remote_code=True)

# Check that the dataset has at least a validation/test set to use as inference data
assert {"validation", "test"}.intersection(data), \
f"💾 The dataset {dataset_name} has no validation or test splits, select another one."

print(f"\n🏆 Dataset {dataset_name} loaded successfully")

# A small function for minimizing the dataset for testing
import os

def _minimize_for_ci() -> bool:
    return os.getenv("MINIMIZE_FOR_CI", "false") == "true"

In [None]:
def get_inference_docs(data, nlp) -> list:
    """Build the docs that will be fed to the model for inference.

    Args:
        data: Mapping from split name to a split exposing a ``'tokens'`` column,
            where each entry is a sequence of token strings.
        nlp: Object providing ``make_doc(text)`` (the loaded spaCy pipeline).

    Returns:
        A list with one ``nlp.make_doc`` result per sample of the ``validation``
        split, or of the ``test`` split when there is no ``validation`` split.
        Only the first 50 samples are used when ``_minimize_for_ci()`` is true.
    """
    # Prefer the validation split; otherwise fall back to test
    if "validation" in data:
        split_name = "validation"
    else:
        split_name = "test"

    # Rebuild each sample's raw text by joining its tokens with single spaces
    sentences = []
    for words in data[split_name]['tokens']:
        sentences.append(' '.join(words))

    if _minimize_for_ci():
        sentences = sentences[:50]

    return list(map(nlp.make_doc, sentences))

# Load the model

Inference data can be logged as part of an existing run with previously logged training, validation, and test splits, or it can be logged in isolation.

If you want to access the full suite of inference features, including automated drift detection, first run the [NER Spacy and Galileo training notebook](https://colab.research.google.com/github/rungalileo/examples/blob/main/examples/named_entity_recognition/Named_Entity_Recognition_with_SpaCy_and_%F0%9F%94%AD_Galileo.ipynb#scrollTo=QkUoYnK0oFK_).

Otherwise, we will download a small pretrained model to use for inference.

In [1]:
import spacy

# Which model to run inference with. The trailing annotation exposes this as a
# notebook form field restricted to the two listed options.
model_type = "pretrained_web" #@param ["pretrained_web", "my_model"] {allow-input: false}

if model_type == "pretrained_web":
    # Download a small English model for running inference
    # (command output is redirected to /dev/null)
    !python -m spacy download en_core_web_sm &> /dev/null
    nlp = spacy.load("en_core_web_sm")
elif model_type == "my_model":
    # Load trained model from NER Spacy training notebook
    nlp = spacy.load("my_model")  # TODO: Update path to model
else:
    # Guard against a value outside the two supported options
    raise ValueError(f"Unknown model type {model_type}")

# Inference with Galileo
Inference samples are logged to Galileo using `log_input_docs`. Model data is logged by wrapping the `nlp` object using `watch`. This automatically logs the logits and embeddings from your model to Galileo with just 1 line of code. 

We complete the inference pipeline by setting the split to `inference` and then passing inference Doc objects through our `nlp` model. To complete logging, we call `dq.finish()` after logging inference data.

Each inference run must have a unique `inference_name` that must be consistent for input and model logs. You can log multiple inference datasets with different `inference_name`s on the same Project/Run pair.

**Note:** If you want to connect this inference run to an existing training run, make sure to use the same `project_name` and `run_name`.

In [None]:
import random
from dataquality.integrations.spacy import log_input_docs, unwatch, watch

# 🔭🌕 Initializing a new run in Galileo. Each run is part of a project.
# The run name embeds the dataset name, with any "/" replaced by "-".
dq.init(task_type="text_ner", 
        project_name="named_entity_recognition_inference_spacy", 
        run_name=f"example_run_{dataset_name.replace('/', '-')}")

inference_docs = get_inference_docs(data, nlp)
# The same inference name is passed to both log_input_docs and set_split below
inference_name = "example1"
# Example metadata: each column holds one randomly drawn value per inference doc.
# No random seed is set, so these values differ from run to run.
meta = {
    "color": random.choices(["red", "blue", "green"], k=len(inference_docs)),
    "ranking": random.choices(range(1,101), k=len(inference_docs))
}

watch(nlp) # 🔭🌕 One line of Galileo code to capture the model's predictions on the inputs
log_input_docs(inference_docs, inference_name=inference_name, meta=meta) # 🔭🌕 Logging the inference docs with Galileo

# Switch to the inference split, then pass every doc through the model
dq.set_split("inference", inference_name=inference_name)
for doc in inference_docs:
    # The return value is discarded; the call is made so the watched model's output gets logged
    nlp(doc)

# Complete logging for the run, then undo `watch` on the model
# NOTE(review): presumably `unwatch` restores the unwrapped `nlp` object — confirm
dq.finish()
unwatch(nlp)

In [None]:
# Call the active data logger's `doc()` helper
# NOTE(review): presumably prints usage documentation for the current task type — confirm
dq.get_data_logger().doc()
# Print the built-in help text for dq.log_dataset
help(dq.log_dataset)