In [None]:
!git clone https://github.com/sri-sandeep108/Spacy_NER_data data
!mkdir data/results
!mkdir data/data/spacy_db

Cloning into 'data'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 16 (delta 2), reused 12 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (16/16), 1.01 MiB | 3.83 MiB/s, done.
Resolving deltas: 100% (2/2), done.


Let's create functions to parse and import the dataset.

In [43]:
import spacy, json
from spacy.tokens import DocBin
def json_parser(data):
  """Turn exported annotation records into ``(text, spans)`` tuples.

  Args:
    data: iterable of dicts, each with a "text" entry and a "label" list
      whose items carry "start", "end" and a "labels" list.

  Returns:
    A list of ``(text, [(start, end, label), ...])`` tuples, where ``label``
    is the first name in "labels" passed through ``str.capitalize``.
  """
  def as_span(annotation):
    # Only the first label name is used for a span.
    name = annotation["labels"][0].capitalize()
    return (annotation["start"], annotation["end"], name)

  return [
    (record["text"], [as_span(annotation) for annotation in record["label"]])
    for record in data
  ]

def pubtator_extractor(data):
  """Convert PubTator-formatted lines into ``(text, annotations)`` tuples.

  A document is a ``...|t|<title>`` line, usually followed by a
  ``...|a|<abstract>`` line and tab-separated annotation lines; a blank line
  (or the next title line, or the end of input) closes it.

  Args:
    data: iterable of lines (e.g. the result of ``file.readlines()``).

  Returns:
    A list of ``(text, annotations)`` tuples. ``text`` is the title and the
    abstract joined by a single space; ``annotations`` is a (possibly empty)
    list of ``(start, end, label)`` tuples taken from columns 1, 2 and 4 of
    every annotation line that mentions "Disease" or "Chemical".

  Every document is returned as a 2-tuple, even without annotations, so
  callers can always unpack ``for text, annotations in ...``. A document with
  a title but no abstract is returned with the title alone as its text.
  """
  parsed_data = []
  text = None        # text of the open document; None while no document is open
  annotations = []   # (start, end, label) tuples collected for the open document

  def flush():
    # Emit the open document (if any) and reset the per-document state, so a
    # run of blank lines cannot emit the same document twice.
    nonlocal text, annotations
    if text is not None:
      parsed_data.append((text, annotations))
    text, annotations = None, []

  for line in data:
    line = line.strip()
    if line == "":
      flush()
    elif "|t|" in line:
      # A title always starts a new document, even without a separating blank line.
      flush()
      text = line.split("|t|", 1)[1]
    elif "|a|" in line:
      # maxsplit=1 keeps abstracts that themselves contain the marker intact.
      abstract = line.split("|a|", 1)[1]
      text = abstract if text is None else text + " " + abstract
    elif "Disease" in line or "Chemical" in line:
      fields = line.split("\t")
      # Skip stray lines that are not well-formed annotations or that appear
      # before any document has been opened.
      if text is None or len(fields) < 5:
        continue
      annotations.append((int(fields[1]), int(fields[2]), fields[4]))

  # The last document is not necessarily followed by a blank line.
  flush()
  return parsed_data

def db_creator(data):
  """Build a ``DocBin`` whose docs carry the annotations as ``doc.ents``.

  Args:
    data: iterable of ``(text, annotations)`` pairs, where ``annotations`` is
      an iterable of ``(start, end, label)`` character offsets.

  Returns:
    A ``DocBin`` with one tokenised doc (blank English pipeline) per text.

  Annotations whose offsets do not map onto token boundaries are skipped
  (``Doc.char_span`` returns ``None`` for them). ``doc.ents`` cannot hold
  overlapping spans, so overlapping or duplicated annotations are reduced
  with ``spacy.util.filter_spans`` instead of raising.
  """
  db = DocBin()
  nlp = spacy.blank("en")

  for text, annotations in data:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations:
      span = doc.char_span(start, end, label=label)
      if span is not None:
        ents.append(span)
    # filter_spans prefers the longest span and, on ties, the first one.
    doc.ents = spacy.util.filter_spans(ents)
    db.add(doc)
  return db

def db_creator_spans(data):
  """Build a ``DocBin`` whose docs carry the annotations in ``doc.spans["sc"]``.

  Args:
    data: iterable of ``(text, annotations)`` pairs, where ``annotations`` is
      an iterable of ``(start, end, label)`` character offsets.

  Returns:
    A ``DocBin`` with one tokenised doc (blank English pipeline) per text.
    Annotations for which ``Doc.char_span`` returns ``None`` are left out.
  """
  tokenizer = spacy.blank("en")
  doc_bin = DocBin()

  for text, annotations in data:
    doc = tokenizer(text)
    candidates = (
      doc.char_span(start, end, label=label)
      for start, end, label in annotations
    )
    doc.spans["sc"] = [span for span in candidates if span is not None]
    doc_bin.add(doc)
  return doc_bin

def sorted_scores(nlp, data):
  """Rank texts by the mean score of the spans found under the "sc" key.

  Args:
    nlp: callable returning a doc-like object with a ``spans`` mapping; the
      "sc" group must expose per-span scores in ``attrs["scores"]``.
    data: iterable of records whose first element is the text.

  Returns:
    A list of ``[text, percent]`` pairs in ascending ``percent`` order, where
    ``percent`` is the mean span score times 100. Texts without an "sc" group
    are reported with a message, and texts with no spans are left out.
  """
  scored_texts = []
  for record in data:
    text = record[0]
    doc = nlp(text)
    if "sc" not in doc.spans:
      print(f"No spans found in: '{text}'")
      continue
    group = doc.spans["sc"]
    pairs = [
      (span.text, group.attrs["scores"][position])
      for position, span in enumerate(group)
    ]
    if pairs:
      pairs.sort(key=lambda pair: pair[1])
      scored_texts.append([text, pairs])

  # Ordering by the lowest span score first decides how equal means are
  # ordered by the (stable) final sort.
  scored_texts.sort(key=lambda entry: entry[1][0][1])

  averaged = [
    [text, sum(score for _, score in pairs) / len(pairs) * 100]
    for text, pairs in scored_texts
  ]
  averaged.sort(key=lambda entry: entry[1])
  return averaged

def get_sorted_false_negatives(nlp, examples):
    """Rank texts by how many gold spans the model failed to predict.

    Args:
        nlp: callable returning a doc-like object whose ``spans`` mapping may
            hold an "sc" group of spans with ``start_char``, ``end_char`` and
            ``label_`` attributes.
        examples: iterable of ``(text, gold_spans)`` pairs, where each gold
            span is a ``(start, end, label)`` sequence.

    Returns:
        A list of ``(text, missing_spans)`` tuples for every text with at
        least one missed gold span, ordered by the number of missed spans
        (descending) and then by their average character length (descending).
        ``missing_spans`` is sorted so the result does not depend on set
        iteration order.
    """
    false_negatives = []

    for text, gold_spans in examples:
        doc = nlp(text)
        predicted_spans = {
            (span.start_char, span.end_char, span.label_)
            for span in doc.spans.get("sc", [])
        }
        # tuple() lets gold spans arrive as lists (e.g. loaded from JSON).
        gold_spans_set = {tuple(span) for span in gold_spans}

        # A gold span counts as missed unless offsets and label match exactly.
        missing_spans = sorted(gold_spans_set - predicted_spans)

        if missing_spans:
            avg_span_length = sum(end - start for start, end, _ in missing_spans) / len(missing_spans)
            false_negatives.append((text, missing_spans, len(missing_spans), avg_span_length))

    # Most missed spans first; longer average span length breaks ties.
    false_negatives.sort(key=lambda item: (-item[2], -item[3]))

    # Drop the sorting metadata from the returned records.
    return [(text, missing_spans) for text, missing_spans, _, _ in false_negatives]

In [3]:
# Parse the CDR development and training sets into (text, annotations) records.
# The `with` block closes each file on exit, so no explicit close() is needed.
with open("data/data/pubtator_files/CDR_DevelopmentSet.PubTator.txt", mode="r", encoding="utf-8") as f:
  dev_data = pubtator_extractor(f.readlines())

with open("data/data/pubtator_files/CDR_TrainingSet.PubTator.txt", mode="r", encoding="utf-8") as f:
  train_data_full = pubtator_extractor(f.readlines())


In [5]:
import random

# Fixed seed so the same 25 texts are drawn on every run; the annotated
# initial_training.json loaded afterwards must correspond to this export.
INITIAL_SAMPLE_SEED = 42
initial_data = random.Random(INITIAL_SAMPLE_SEED).sample(train_data_full, 25)

# One text per line, ready to be imported into the annotation tool.
with open("data/data/training_data/initial_training.txt", mode="w", encoding="utf-8") as f:
  for record in initial_data:
      f.write(record[0])
      f.write("\n")

In [6]:
# Load the manually annotated initial batch; the `with` block closes the file.
with open("data/data/training_data/initial_training.json", mode="r", encoding="utf-8") as f:
  train_data = json_parser(json.load(f))

In [7]:
# Serialise the annotated initial batch as the span-categoriser training set.
train = db_creator_spans(train_data)
train.to_disk("data/data/spacy_db/train_spans.spacy")

# The full development set is the evaluation corpus for every spancat run.
dev = db_creator_spans(dev_data)
dev.to_disk("data/data/spacy_db/dev_spans.spacy")

In [10]:
# Train the initial model from config.cfg on train_spans.spacy, evaluating on dev_spans.spacy, using GPU 0.
!python3 -m spacy train ./data/spacy/config.cfg --output ./data/models/initial_model --paths.train ./data/data/spacy_db/train_spans.spacy --paths.dev ./data/data/spacy_db/dev_spans.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: data/models/initial_model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'spancat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS SPANCAT  SPANS_SC_F  SPANS_SC_P  SPANS_SC_R  SCORE 
---  ------  -------------  ------------  ----------  ----------  ----------  ------
  0       0        2936.91       2553.37        1.57        0.80       39.10    0.02
 50     200       54966.35      49404.17       77.02       73.06       81.44    0.77
100     400          16.55        494.16       77.91       72.65       83.97    0.78
150     600           3.30        192.04       77.

In [11]:
# Score the best initial checkpoint on the dev set; -o saves the results as JSON and -P adds per-component scores.
!python3 -m spacy benchmark accuracy ./data/models/initial_model/model-best/ ./data/data/spacy_db/dev_spans.spacy -o data/results/result_initial_model.json -P --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[38;5;4mℹ Per-component scores will be saved to output JSON file.[0m
[38;5;2m✔ Saved results to data/results/result_initial_model.json[0m


Let's take the 25 texts with the lowest overall confidence scores.


In [12]:
# Rank every training text by the initial model's mean span score and export
# the 25 lowest-scoring texts (one per line) for annotation.
nlp = spacy.load("data/models/initial_model/model-best")
datas = sorted_scores(nlp, train_data_full)
with open("data/data/training_data/active_learning_1_test.txt", mode="w", encoding="utf-8") as f:
    for line in datas[:25]:
        f.write(line[0])
        f.write("\n")

In [14]:
# Load the annotated first active-learning batch; the `with` block closes the file.
with open("data/data/training_data/active_learning_1.json", mode="r", encoding="utf-8") as f:
    active_learning_data_1 = json_parser(json.load(f))

In [15]:
# Training set for iteration 1: the initial batch plus the first active-learning batch.
training_data_active = train_data + active_learning_data_1
# Overwrites the previous train_spans.spacy with the enlarged set.
db_creator_spans(training_data_active).to_disk("data/data/spacy_db/train_spans.spacy")

In [16]:
# Retrain from config.cfg on the enlarged train_spans.spacy; the model is written to data/models/active_1.
!python3 -m spacy train ./data/spacy/config.cfg --output ./data/models/active_1 --paths.train ./data/data/spacy_db/train_spans.spacy --paths.dev ./data/data/spacy_db/dev_spans.spacy --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;2m✔ Created output directory: data/models/active_1[0m
[38;5;4mℹ Saving to output directory: data/models/active_1[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'spancat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS SPANCAT  SPANS_SC_F  SPANS_SC_P  SPANS_SC_R  SCORE 
---  ------  -------------  ------------  ----------  ----------  ----------  ------
  0       0        3566.40       2850.62        1.57        0.80       39.10    0.02
 28     200       61716.21      56715.90       77.70       86.92       70.24    0.78
 57     400          70.38        963.51       80.78       81.41       80.17  

In [17]:
# Score the best active_1 checkpoint on the dev set; results (with per-component scores) go to result_active_1.json.
!python3 -m spacy benchmark accuracy ./data/models/active_1/model-best/ ./data/data/spacy_db/dev_spans.spacy -o data/results/result_active_1.json -P --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Using GPU: 0[0m
[38;5;4mℹ Per-component scores will be saved to output JSON file.[0m
[38;5;2m✔ Saved results to data/results/result_active_1.json[0m


Let's try getting the scores after the first iteration

In [18]:
# Re-rank the full training set with the iteration-1 model (ascending mean span score).
nlp = spacy.load("data/models/active_1/model-best")
datas = sorted_scores(nlp, train_data_full)

In [19]:
# Export the 25 lowest-scoring texts (one per line) for annotation.
with open("data/data/training_data/active_learning_2_test.txt", mode="w", encoding="utf-8") as f:
    for line in datas[:25]:
        f.write(line[0])
        f.write("\n")

In [20]:
# Load the annotated second active-learning batch; the `with` block closes the file.
with open("data/data/training_data/active_learning_2.json", mode="r", encoding="utf-8") as f:
    active_learning_data_2 = json_parser(json.load(f))
# Rebuild the cumulative set from its parts instead of `+=`, so re-running
# this cell cannot append the same batch twice.
training_data_active = train_data + active_learning_data_1 + active_learning_data_2
db_creator_spans(training_data_active).to_disk("data/data/spacy_db/train_spans.spacy")

In [21]:
# Retrain from config.cfg on the enlarged train_spans.spacy; the model is written to data/models/active_2.
!python3 -m spacy train ./data/spacy/config.cfg --output ./data/models/active_2 --paths.train ./data/data/spacy_db/train_spans.spacy --paths.dev ./data/data/spacy_db/dev_spans.spacy --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;2m✔ Created output directory: data/models/active_2[0m
[38;5;4mℹ Saving to output directory: data/models/active_2[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'spancat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS SPANCAT  SPANS_SC_F  SPANS_SC_P  SPANS_SC_R  SCORE 
---  ------  -------------  ------------  ----------  ----------  ----------  ------
  0       0        3518.08       2891.66        1.57        0.80       39.10    0.02
 20     200       63995.60      59011.56       79.02       72.81       86.38    0.79
 40     400         117.00       1329.44       81.30       81.93       80.68  

In [22]:
# Score the best active_2 checkpoint on the dev set; results (with per-component scores) go to result_active_2.json.
!python3 -m spacy benchmark accuracy ./data/models/active_2/model-best/ ./data/data/spacy_db/dev_spans.spacy -o data/results/result_active_2.json -P --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Using GPU: 0[0m
[38;5;4mℹ Per-component scores will be saved to output JSON file.[0m
[38;5;2m✔ Saved results to data/results/result_active_2.json[0m


Let's try to improve the recall score in this cycle by focusing on reducing the false negatives.

In [23]:
# Rank the training texts by the gold spans the iteration-2 model fails to predict (most misses first).
nlp = spacy.load("data/models/active_2/model-best")
data = get_sorted_false_negatives(nlp, train_data_full)

In [24]:
# Export the 25 texts with the most missed gold spans (one per line) for annotation.
with open("data/data/training_data/active_learning_3_test.txt", mode="w", encoding="utf-8") as f:
    for line in data[:25]:
        f.write(line[0])
        f.write("\n")

In [25]:
# Load the annotated third active-learning batch; the `with` block closes the file.
with open("data/data/training_data/active_learning_3.json", mode="r", encoding="utf-8") as f:
    active_learning_data_3 = json_parser(json.load(f))
# Rebuild the cumulative set from its parts instead of `+=`, so re-running
# this cell cannot append the same batch twice.
training_data_active = (
    train_data + active_learning_data_1 + active_learning_data_2 + active_learning_data_3
)
db_creator_spans(training_data_active).to_disk("data/data/spacy_db/train_spans.spacy")

In [28]:
# Retrain from config.cfg on the enlarged train_spans.spacy; the model is written to data/models/active_3.
!python3 -m spacy train ./data/spacy/config.cfg --output ./data/models/active_3 --paths.train ./data/data/spacy_db/train_spans.spacy --paths.dev ./data/data/spacy_db/dev_spans.spacy --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;2m✔ Created output directory: data/models/active_3[0m
[38;5;4mℹ Saving to output directory: data/models/active_3[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'spancat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS SPANCAT  SPANS_SC_F  SPANS_SC_P  SPANS_SC_R  SCORE 
---  ------  -------------  ------------  ----------  ----------  ----------  ------
  0       0        2941.08       2588.75        1.57        0.80       39.10    0.02
 13     200       65942.20      62243.63       79.43       77.00       82.03    0.79
 26     400         217.57       2185.33       84.11       88.24       80.35  

In [29]:
# Score the best active_3 checkpoint on the dev set; results (with per-component scores) go to result_active_3.json.
!python3 -m spacy benchmark accuracy ./data/models/active_3/model-best/ ./data/data/spacy_db/dev_spans.spacy -o data/results/result_active_3.json -P --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Using GPU: 0[0m
[38;5;4mℹ Per-component scores will be saved to output JSON file.[0m
[38;5;2m✔ Saved results to data/results/result_active_3.json[0m


In [30]:
# Rank the training texts by the gold spans the iteration-3 model fails to predict (most misses first).
nlp = spacy.load("data/models/active_3/model-best")
data = get_sorted_false_negatives(nlp, train_data_full)

In [31]:
# Export the 25 texts with the most missed gold spans (one per line) for annotation.
with open("data/data/training_data/active_learning_4_test.txt", mode="w", encoding="utf-8") as f:
    for line in data[:25]:
        f.write(line[0])
        f.write("\n")

In [32]:
# Load the annotated fourth active-learning batch; the `with` block closes the file.
with open("data/data/training_data/active_learning_4.json", mode="r", encoding="utf-8") as f:
    active_learning_data_4 = json_parser(json.load(f))
# Rebuild the cumulative set from its parts instead of `+=`, so re-running
# this cell cannot append the same batch twice.
training_data_active = (
    train_data
    + active_learning_data_1
    + active_learning_data_2
    + active_learning_data_3
    + active_learning_data_4
)
db_creator_spans(training_data_active).to_disk("data/data/spacy_db/train_spans.spacy")

Let's do a final training with the NER component instead of spancat

In [41]:
# Re-serialise the accumulated training data and the dev set with doc.ents
# (via db_creator) for the NER run, into separate files from the spancat ones.
db_creator(training_data_active).to_disk("data/data/spacy_db/train.spacy")
db_creator(dev_data).to_disk("data/data/spacy_db/dev.spacy")

In [42]:
# Train the final model from config_final.cfg (transformer + ner, per the log below) on train.spacy, evaluating on dev.spacy.
!python3 -m spacy train ./data/spacy/config_final.cfg --output ./data/models/final --paths.train ./data/data/spacy_db/train.spacy --paths.dev ./data/data/spacy_db/dev.spacy --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Saving to output directory: data/models/final[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0         330.66    712.49    7.48    4.05   48.30    0.07
 11     200       47790.93  70766.52   82.52   86.01   79.30    0.83
 23     400        1562.14   2875.16   82.50   81.36   83.67    0.82
 35     600         509.21    877.58   84.12   86.43   81.92    0.84
 47     800         294.72    498.32   84.22   84.68   83.75    0.84
 58 

In [44]:
# Score the best final checkpoint on the dev set; results (with per-component scores) go to result_final.json.
!python3 -m spacy benchmark accuracy ./data/models/final/model-best/ ./data/data/spacy_db/dev.spacy -o data/results/result_final.json -P --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Using GPU: 0[0m
[38;5;4mℹ Per-component scores will be saved to output JSON file.[0m
[38;5;2m✔ Saved results to data/results/result_final.json[0m


Let's evaluate our model against the test dataset.

In [45]:
# Parse the CDR test set and serialise it with doc.ents for the final evaluation.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("data/data/pubtator_files/CDR_TestSet.PubTator.txt", mode="r", encoding="utf-8") as f:
  test_data = pubtator_extractor(f.readlines())
db_creator(test_data).to_disk("data/data/spacy_db/test.spacy")

In [46]:
# Score the best final checkpoint on the held-out test set; results (with per-component scores) go to result_test.json.
!python3 -m spacy benchmark accuracy ./data/models/final/model-best/ ./data/data/spacy_db/test.spacy -o data/results/result_test.json -P --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Using GPU: 0[0m
[38;5;4mℹ Per-component scores will be saved to output JSON file.[0m
[38;5;2m✔ Saved results to data/results/result_test.json[0m
