In [None]:
!pip install spacy_transformers
!pip install -U spacy

Let's Create functions to parse and import the dataset

In [52]:
import spacy, json
from spacy.tokens import DocBin

def json_parser(data):
  parsed_data = []
  for item in data:
    text = item["text"]
    labels = []
    for entity in item["label"]:
      label = entity["labels"][0].capitalize()
      start = entity["start"]
      end = entity["end"]
      labels.append((start, end, label))
    parsed_data.append((text, labels))
  return parsed_data

def pubtator_extractor(data):
  """Converts data from pubtator to Spacy's JSON Format"""
  parsed_data = []
  for line in data:
    line = line.strip()
    if line == "":
       if parsed_entity:
        parsed_data.append(tuple(parsed_entity))
    if "|t|" in line:
      parsed_entity = []
      current_title = line.split("|t|")[1] + " "
    elif "|a|" in line:
      parsed_entity.append(current_title + line.split("|a|")[1])
    elif "Disease" in line or "Chemical" in line:
      if len(parsed_entity) == 1:
        parsed_entity.append([])
      start = int(line.split("\t")[1])
      end = int(line.split("\t")[2])
      label = line.split("\t")[4]
      parsed_entity[1].append((start, end, label))
  return parsed_data

def db_creator(data):
  db = DocBin()
  nlp = spacy.blank("en")

  for text, annotations in data:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations:
      span = doc.char_span(start, end, label=label)
      if span is None:
        continue
      else:
        ents.append(span)
    doc.ents = ents
    db.add(doc)
  return db

def db_creator_spans(data):
  db = DocBin()
  nlp = spacy.blank("en")

  for text, annotations in data:
    doc = nlp(text)
    spans = []
    for start, end, label in annotations:
      span = doc.char_span(start, end, label=label)
      if span is None:
        continue
      else:
        spans.append(span)
    doc.spans["sc"] = spans
    db.add(doc)
  return db

def sorted_scores(nlp , data):
  whole_text = []
  for text_tuple in data:
      text = text_tuple[0]
      doc = nlp(text)
      single_text = []
      if "sc" in doc.spans:
          for i, span in enumerate(doc.spans["sc"]):
              score = doc.spans["sc"].attrs["scores"][i]
              span_text = span.text
              single_text.append((span_text, score))
      else:
          print(f"No spans found in: '{text}'")
      if len(single_text) > 0:
        whole_text.append([text , sorted(single_text, key=lambda x: x[1])])
  text_list = sorted(whole_text, key=lambda x: x[1][0][1])
  text_percent = []
  for data in text_list:
    text = data[0]
    percent = sum(x[1] for x in data[1]) / len(data[1]) * 100
    text_percent.append([text, percent])
  return sorted(text_percent, key=lambda x: x[1])

def get_sorted_false_negatives(nlp, examples):
    """
    Identifies and sorts false negatives in a spaCy spancat model for active learning.

    Args:
        nlp (spacy.Language): The trained spaCy model.
        examples (list): List of (text, gold_spans) tuples, where
                         gold_spans is a list of (start, end, label).

    Returns:
        list: Sorted list of (text, missing_spans) tuples prioritized for active learning.
    """
    false_negatives = []

    for text, gold_spans in examples:
        doc = nlp(text)
        predicted_spans = set((span.start_char, span.end_char, span.label_) for span in doc.spans.get("sc", []))
        gold_spans_set = set(gold_spans)

        missing_spans = gold_spans_set - predicted_spans  # False negatives

        if missing_spans:
            # Store example with count and average span length
            avg_span_length = sum(e - s for s, e, _ in missing_spans) / len(missing_spans)
            false_negatives.append((text, list(missing_spans), len(missing_spans), avg_span_length))

    # Sort first by number of false negatives, then by average span length
    false_negatives.sort(key=lambda x: (-x[2], -x[3]))

    # Return sorted results without the sorting metadata
    return [(text, missing_spans) for text, missing_spans, _, _ in false_negatives]

In [53]:
with open("data/pubtator_files/CDR_DevelopmentSet.PubTator.txt", mode="r") as f:
  dev_data = pubtator_extractor(f.readlines())
  f.close()

with open("data/pubtator_files/CDR_TrainingSet.PubTator.txt", mode="r") as f:
  train_data_full = pubtator_extractor(f.readlines())
  f.close()

with open("data/training_data/initial_annotated_train.json", mode="r") as f:
  train_data = json_parser(json.load(f))
  f.close()

In [3]:
train = db_creator_spans(train_data)
train.to_disk("train_spans.spacy")

dev = db_creator_spans(dev_data)
dev.to_disk("dev_spans.spacy")

In [4]:
!python -m spacy train ./spacy/config_span.cfg --output ./models/initial_model --paths.train ./data/spacy_db/train_spans.spacy --paths.dev ./data/spacy_db/dev_spans.spacy

^C


In [4]:
!python -m spacy benchmark accuracy .\models\initial_model\model-best\ .\data\spacy_db\dev_spans.spacy -o results/result_initial_model.json -P

[38;5;4m[i] Using CPU[0m
[38;5;4m[i] Per-component scores will be saved to output JSON file.[0m
[38;5;2m[+] Saved results to results\result_initial_model.json[0m


In [None]:

train_text = [text[0] for text in train_data_full]

Let's take 25 texts from the data with the least overall confidence scores


In [None]:
nlp = spacy.load("models/initial_model/model-best")
datas = sorted_scores(nlp, train_data_full)
with open("data/training_data/active_learning_1_test.txt", mode="w") as f:
    for line in datas[:25]:
        f.write(line[0])
        f.write("\n")
    f.close()

In [5]:
with open("data/training_data/active_learning_1.json", mode="r") as f:
    active_learning_data_1 = json_parser(json.load(f))
    f.close()

In [6]:
training_data_active_1 = train_data + active_learning_data_1
training_data_active_1 = db_creator_spans(training_data_active_1)
training_data_active_1.to_disk("data/spacy_db/training_data_active_1.spacy")

In [None]:
!python -m spacy train ./spacy/config_span.cfg --output ./models/active_1 --paths.train ./data/spacy_db/training_data_active_1.spacy --paths.dev ./data/spacy_db/dev_spans.spacy

In [13]:
!python -m spacy benchmark accuracy .\models\active_1\model-best\ .\data\spacy_db\dev_spans.spacy -o results/result_active_1.json -P

[38;5;4m[i] Using CPU[0m
[38;5;4m[i] Per-component scores will be saved to output JSON file.[0m
[38;5;2m[+] Saved results to result_active_1.json[0m


Let's try getting the scores after the first iteration

In [None]:
# nlp = spacy.load("models/output_spans_1/model-best")
datas = sorted_scores(nlp, train_data_full)
datas[:25]

In [None]:
with open("data/training_data/active_learning_2_test.txt", mode="w") as f:
    for line in datas[:25]:
        f.write(line[0])
        f.write("\n")
    f.close()

In [14]:
with open("data/training_data/active_learning_2.json", mode="r") as f:
    active_learning_data_2 = json_parser(json.load(f))
    f.close()
training_data_active_2 = train_data + active_learning_data_1 + active_learning_data_2
training_data_active_2 = db_creator_spans(training_data_active_2)
training_data_active_2.to_disk("data/spacy_db/training_data_active_2.spacy")

In [None]:
!python -m spacy train ./spacy/config_span.cfg --output ./models/active_2 --paths.train ./data/spacy_db/training_data_active_2.spacy --paths.dev ./data/spacy_db/dev_spans.spacy

In [None]:
!python -m spacy benchmark accuracy .\models\active_2\model-best\ .\data\spacy_db\dev_spans.spacy -o results/result_active_2.json -P

Let's try to improve the recall score for this is cycle by targeting on reducing the false negatives

In [25]:
nlp = spacy.load("models/active_2/model-best")
data = get_sorted_false_negatives(nlp, train_data_full)

In [27]:
with open("data/training_data/active_learning_3.txt", mode="w") as f:
    for line in data[:25]:
        f.write(line[0])
        f.write("\n")

In [42]:
with open("data/training_data/active_learning_3.json", mode="r") as f:
    active_learning_data_3 = json_parser(json.load(f))
    f.close()
training_data_active_3 = train_data + active_learning_data_1 + active_learning_data_2 + active_learning_data_3
training_data_active_3 = db_creator_spans(training_data_active_3)
training_data_active_3.to_disk("data/spacy_db/training_data_active_3.spacy")

In [45]:
active_3_ent = train_data + active_learning_data_1 + active_learning_data_2 + active_learning_data_3
active_3_ent = db_creator(active_3_ent)
active_3_ent.to_disk("data/spacy_db/active_3_ent.spacy")

In [59]:
import numpy as np

def entropy(probabilities):
    """Compute entropy of probability distribution."""
    probabilities = np.array(probabilities)
    probabilities = probabilities[probabilities > 0]  # Avoid log(0) errors
    return -np.sum(probabilities * np.log(probabilities))

def get_active_learning_samples_ner(nlp, examples, training_data, uncertainty_threshold=0.5):
    """
    Identifies false negatives & uncertain entities for active learning.

    Args:
        nlp (spacy.Language): Trained custom NER model.
        examples (list): List of (text, gold_entities) tuples.
        training_data (set): Already trained examples to avoid redundancy.
        uncertainty_threshold (float): Minimum entropy score to consider an entity uncertain.

    Returns:
        list: Sorted (text, missing_entities + uncertain_entities) tuples.
    """
    false_negatives = []
    uncertain_entities = []

    for text, gold_entities in examples:
        doc = nlp(text)
        predicted_entities = set((ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
        gold_entities_set = set(gold_entities)

        # **False Negatives**: Gold entities missed by the model
        missing_entities = gold_entities_set - predicted_entities
        missing_entities = [ent for ent in missing_entities if (text, ent) not in training_data]

        # **Uncertain Predictions**: Entities with low confidence
        for ent in doc.ents:
            if ent.label_ in doc.cats:
                entity_probs = doc.cats[ent.label_]  # Get class probabilities
                if entity_probs:
                    ent_entropy = entropy(entity_probs)
                    if ent_entropy >= uncertainty_threshold:  # High entropy = uncertain
                        uncertain_entities.append((text, (ent.start_char, ent.end_char, ent.label_)))

        if missing_entities:
            avg_entity_length = sum(e - s for s, e, _ in missing_entities) / len(missing_entities)
            false_negatives.append((text, missing_entities, len(missing_entities), avg_entity_length))

    # **Sorting Criteria**:
    # - First by number of false negatives (most gaps)
    # - Then by avg entity length (longer entities are harder)
    false_negatives.sort(key=lambda x: (-x[2], -x[3]))

    return [(text, missing_entities + uncertain_entities) for text, missing_entities, _, _ in false_negatives]

# Example usage
nlp = spacy.load("models/active_3_acc/model-best")


In [62]:
# Get best examples for active learning
active_samples = get_active_learning_samples_ner(nlp, train_data_full, train_data)

In [64]:
len(active_samples)

390

In [68]:
active_samples[100]

('Cardioprotective effect of salvianolic acid A on isoproterenol-induced myocardial infarction in rats. The present study was designed to evaluate the cardioprotective potential of salvianolic acid A on isoproterenol-induced myocardial infarction in rats. Hemodynamic parameters and lead II electrocardiograph were monitored and recorded continuously. Cardiac marker enzymes and antioxidative parameters in serum and heart tissues were measured. Assay for mitochondrial respiratory function and histopathological examination of heart tissues were performed. Isoproterenol-treated rats showed significant increases in the levels of lactate dehydrogenase, aspartate transaminase, creatine kinase and malondialdehyde and significant decreases in the activities of superoxide dismutase, catalase and glutathione peroxidase in serum and heart. These rats also showed declines in left ventricular systolic pressure, maximum and minimum rate of developed left ventricular pressure, and elevation of left ven

In [79]:
doc = nlp(train_data[0][0])

In [80]:
ent = doc.ents[0]


In [81]:
doc.cats

{}