In [None]:
import spacy
# Download and install the large English pipeline so that
# spacy.load("en_core_web_lg") can find it. As the command output notes, the
# kernel/runtime may need a restart before the new package can be loaded.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m891.9 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# Load the pretrained large English pipeline and display the resulting object.
# NOTE(review): `nlp` is rebound to spacy.blank("en") in later cells, so this
# loaded pipeline is not the one used to build the DocBin files below.
nlp = spacy.load("en_core_web_lg")
nlp

<spacy.lang.en.English at 0x7d41d4be7e50>

In [None]:
from datasets import load_dataset

# Load the MedMentions-ZS dataset (all available splits) by its hub identifier
ds = load_dataset("ibm/MedMentions-ZS")

# Switch the output format to pandas so that slicing a split (e.g. ds['train'][:])
# yields a DataFrame that can be iterated row by row further down
ds.set_format(type="pandas")

import spacy
from spacy.training import Example
from spacy.tokens import Doc, Span
from spacy import displacy
import matplotlib.pyplot as plt

# Optional: uncomment to make spaCy run on the GPU.
# spacy.require_gpu()

# Start from an empty English pipeline (tokenizer only, no trained components).
nlp = spacy.blank("en")

# Reuse the "ner" component when the pipeline already has one; otherwise
# append a new one at the end of the pipeline.
ner = (
    nlp.get_pipe("ner")
    if "ner" in nlp.pipe_names
    else nlp.add_pipe("ner", last=True)
)

# Function to convert tokens and ner_tags to TRAIN_DATA format
def convert_to_train_data(tokens, ner_tags):
    """Convert one BIO-tagged token sequence into a ``(text, annotations)`` pair.

    The text is the tokens joined by single spaces; each entity is reported as
    a ``(start_char, end_char, label)`` tuple of character offsets into that
    text (end exclusive).

    Tag handling:
      * ``B-<label>`` always opens a new entity.
      * ``I-<label>`` extends the entity opened on the immediately preceding
        tokens, provided the label matches.
      * An ``I-<label>`` that has nothing to continue (it follows an outside
        tag, follows an entity with a different label, or is the first tag)
        opens a new entity instead of stretching an earlier entity across the
        gap or relabelling it.
      * Any other tag (e.g. ``O``) closes the current entity.

    Args:
        tokens: Sequence of token strings.
        ner_tags: Sequence of string tags aligned one-to-one with ``tokens``.

    Returns:
        ``(text, {"entities": [(start, end, label), ...]})``.
    """
    text = " ".join(tokens)
    entities = []
    start_idx = 0
    # Label of the entity the previous token belonged to; None when the
    # previous token was outside any entity.
    open_label = None

    for token, tag in zip(tokens, ner_tags):
        end_idx = start_idx + len(token)

        if tag.startswith("I-") and open_label == tag[2:]:
            # Genuine continuation: move the end offset of the open entity.
            entity_start, _, entity_label = entities[-1]
            entities[-1] = (entity_start, end_idx, entity_label)
        elif tag.startswith(("B-", "I-")):
            # Explicit beginning, or an I- tag with no matching open entity.
            open_label = tag[2:]
            entities.append((start_idx, end_idx, open_label))
        else:
            open_label = None

        # +1 accounts for the single space inserted between joined tokens.
        start_idx = end_idx + 1

    return (text, {"entities": entities})

# Prepare spaCy-style (text, annotations) tuples from the MedMentions splits.
def _split_to_examples(split_name):
    """Convert every row of the named dataset split with convert_to_train_data."""
    frame = ds[split_name][:]
    # Walk the two columns in lockstep instead of using iterrows(), which
    # builds a Series object for every row.
    return [
        convert_to_train_data(tokens, ner_tags)
        for tokens, ner_tags in zip(frame["tokens"], frame["ner_tags"])
    ]

TRAIN_DATA = _split_to_examples("train")
VAL_DATA = _split_to_examples("validation")



In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm

from spacy.util import filter_spans

# A fresh blank English pipeline; it is only used here to tokenize the text.
nlp = spacy.blank("en")
doc_bin = DocBin()

for text, annotations in tqdm(TRAIN_DATA):
    doc = nlp.make_doc(text)
    spans = []
    for start, end, label in annotations["entities"]:
        # char_span returns None when the character offsets cannot be mapped
        # onto the doc's tokens under the "contract" alignment mode.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            spans.append(span)
        else:
            print("Skipping entity")
    # Pass the spans through filter_spans before assigning them as entities.
    doc.ents = filter_spans(spans)
    doc_bin.add(doc)

# Serialize the annotated docs for `spacy train`.
doc_bin.to_disk("train.spacy")

100%|██████████| 26770/26770 [00:08<00:00, 3150.42it/s]


In [None]:
from spacy.util import filter_spans

# Start a new container so the validation docs are stored separately.
doc_bin = DocBin()

for text, annotations in tqdm(VAL_DATA):
    doc = nlp.make_doc(text)
    spans = []
    for start, end, label in annotations["entities"]:
        # char_span returns None when the character offsets cannot be mapped
        # onto the doc's tokens under the "contract" alignment mode.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            spans.append(span)
        else:
            print("Skipping entity")
    # Pass the spans through filter_spans before assigning them as entities.
    doc.ents = filter_spans(spans)
    doc_bin.add(doc)

# Serialize the annotated validation docs.
doc_bin.to_disk("val.spacy")

100%|██████████| 1289/1289 [00:00<00:00, 2250.17it/s]


In [None]:
# Expand ./base_config.cfg into a complete ./config.cfg with all values filled in.
# NOTE(review): base_config.cfg is not created anywhere in the visible notebook;
# it has to be present in the working directory before this cell is run.
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     41.48    0.00    0.00    0.00    0.00
  0     200         68.17   5085.03   15.15   30.56   10.07    0.15
  0     400        216.59   5560.31   21.73   41.88   14.67    0.22
  0     600        187.98   6478.01   32.81   50.08   24.40    0.33
  0     800        210.39   7486.32   34.95   54.25   25.78    0.35
  0    1000        391.25   9095.20   43.94   49.78   39.33    0.44
  0    1200        429.98  10886.62   45.99   56.14   38.94    0.46
  0    1400        524.90  12488.96   51.64   58.73   46.07    0.52
  0    1600        662.51  14818.63   53.74   62.64   47.06    0.54
  0    1800        829.76  17543.65   54.65   61

In [None]:
# Load the best-scoring checkpoint written by `spacy train --output ./`.
nlp_ner = spacy.load("model-best")

# Running the pipeline on an empty string yields a doc with no tokens and
# therefore no entities to visualize. Replace SAMPLE_TEXT with any text you
# want to inspect.
SAMPLE_TEXT = "The patient was treated with metformin for type 2 diabetes mellitus."
doc = nlp_ner(SAMPLE_TEXT)

# Render the predicted entities inline in the notebook.
spacy.displacy.render(doc, style="ent", jupyter=True)

