In [1]:
import os
import glob
import json
import string

import spacy
from src.NER.spacy.spacy import SpacyModel
from nervaluate import Evaluator
from tqdm import tqdm

from src.NLP.datasets.spacy import SpacyDataset
from src.NLP.tokenizers.spacy import SpacyTokenizer
from src.tools.general_tools import get_filepath
from src.tools.text_tools import preprocess_text

  from .autonotebook import tqdm as notebook_tqdm


## Spacy NER Evaluation

In [2]:
ds = SpacyDataset("../data/trainset/annotations.jsonl")

In [3]:
st = SpacyTokenizer()

In [4]:
_, evalset = ds.split_dataset()
evalset[0]

{'data': 'Title 21: Food and Drugs PART 556-TOLERANCES FOR RESIDUES OF NEW ANIMAL DRUGS IN FOOD Subpart B-Specific Tolerances for Residues of New Animal Drugs $556.513 Piperazine. A tolerance of 0.1 part per million piperazine base is established for edible tissues of poultry and swine. [64 FR 23019, Apr. 29, 1999]',
 'label': [(10, 14, 'SKIP'),
  (19, 24, 'SKIP'),
  (65, 77, 'SKIP'),
  (81, 85, 'SKIP'),
  (136, 148, 'SKIP'),
  (158, 168, 'Substance'),
  (185, 188, 'Value'),
  (189, 205, 'Unit'),
  (206, 216, 'Substance'),
  (241, 276, 'Usage')]}

In [5]:
true_eval_data = []
all_tags = set()
for file in evalset:
    text, labels = file['data'], file['label']
    utt_specific_entries = []
    for label in labels:
        start, end, tag = label[0], label[1], label[2]
        all_tags.add(tag)
        utt_specific_entries.append({
            "label": tag,
            "start": start,
            "end": end
        })
    true_eval_data.append(utt_specific_entries)

### Get predictions

In [6]:
_dataset_base_path: str = os.path.join('results', 'dataset', 'spacy')
train_path: str = get_filepath(_dataset_base_path, 'train.spacy')
eval_path: str = get_filepath(_dataset_base_path, 'eval.spacy')
config_path: str = get_filepath('config', 'spacy_config.cfg')
_evaluation_base_path: str = os.path.join('results', 'evaluation', 'spacy')

spacy_model = SpacyModel(
    dataset_base_path=_dataset_base_path,
    eval_base_path= os.path.join('results', 'evaluation', 'spacy_ner+tok2vec'),
    mode="evaluation"
)
# spacy_model.output_path = "results/evaluation/spacy_ner+tok2vec"

In [None]:
# extract dataset to spacy format type
# spacy_model.train()

2023-03-21 17:15:02.207 | INFO     | src.NER.spacy.spacy:train:72 - Training model...


[38;5;2m✔ Created output directory: results/evaluation/spacy[0m
[38;5;4mℹ Saving to output directory: results/evaluation/spacy[0m
[38;5;4mℹ Using CPU[0m
[1m


[2023-03-21 17:15:03,047] [INFO] Set up nlp object from config
2023-03-21 17:15:03 spacy INFO: Set up nlp object from config
[2023-03-21 17:15:03,070] [INFO] Pipeline: ['ner']
2023-03-21 17:15:03 spacy INFO: Pipeline: ['ner']
[2023-03-21 17:15:03,072] [INFO] Resuming training for: ['ner']
2023-03-21 17:15:03 spacy INFO: Resuming training for: ['ner']
[2023-03-21 17:15:03,094] [INFO] Created vocabulary
2023-03-21 17:15:03 spacy INFO: Created vocabulary
[2023-03-21 17:15:03,099] [INFO] Finished initializing nlp object
2023-03-21 17:15:03 spacy INFO: Finished initializing nlp object
[2023-03-21 17:15:03,101] [INFO] Initialized pipeline components: []
2023-03-21 17:15:03 spacy INFO: Initialized pipeline components: []


[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --------  ------  ------  ------  ------
  0       0    415.14    0.00    0.00    0.00    0.00
  1     200  92397.56    0.00    0.00    0.00    0.00
  2     400  64807.72    0.00    0.00    0.00    0.00
  3     600  54139.68    0.00    0.00    0.00    0.00
  4     800  51913.77    0.00    0.00    0.00    0.00
  5    1000  48252.51    0.00    0.00    0.00    0.00
  6    1200  47430.17    0.24    4.14    0.13    0.00
  8    1400  46353.07    0.83    8.05    0.44    0.01
  9    1600  46761.16    1.61    9.57    0.88    0.02
 10    1800  45090.09    7.24   24.76    4.24    0.07
 11    2000  44778.53    7.60   25.98    4.45    0.08
 12    2200  44392.44   12.19   31.40    7.56    0.12
 13    2400  44271.08   13.59   34.86    8.44    0.14
 14    2600  40983.22   14.67   34.52    9.31    0.15
 16    2800  40451.39  

In [7]:
for file in evalset:
    doc = spacy_model.model(file['data'])
    [tok.idx for tok in doc]
    print([(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents], doc)
    break

[('FOOD', 'SKIP', 81, 85), ('Piperazine', 'Substance', 158, 168), ('0.1', 'Value', 185, 188), ('part per million', 'Unit', 189, 205), ('piperazine base', 'SKIP', 206, 221), ('edible tissues of poultry and swine', 'Usage', 241, 276)] Title 21: Food and Drugs PART 556-TOLERANCES FOR RESIDUES OF NEW ANIMAL DRUGS IN FOOD Subpart B-Specific Tolerances for Residues of New Animal Drugs $556.513 Piperazine. A tolerance of 0.1 part per million piperazine base is established for edible tissues of poultry and swine. [64 FR 23019, Apr. 29, 1999]


In [7]:
spacy_model.evaluate()

{'ent_type': {'correct': 2992,
  'incorrect': 755,
  'partial': 0,
  'missed': 1620,
  'spurious': 1529,
  'possible': 5367,
  'actual': 5276,
  'precision': 0.5670962850644428,
  'recall': 0.5574809018073411,
  'f1': 0.562247486610918},
 'partial': {'correct': 2997,
  'incorrect': 0,
  'partial': 750,
  'missed': 1620,
  'spurious': 1529,
  'possible': 5367,
  'actual': 5276,
  'precision': 0.6391205458680819,
  'recall': 0.6282839575181666,
  'f1': 0.6336559240815559},
 'strict': {'correct': 2471,
  'incorrect': 1276,
  'partial': 0,
  'missed': 1620,
  'spurious': 1529,
  'possible': 5367,
  'actual': 5276,
  'precision': 0.4683472327520849,
  'recall': 0.46040618595118316,
  'f1': 0.4643427604998591},
 'exact': {'correct': 2997,
  'incorrect': 750,
  'partial': 0,
  'missed': 1620,
  'spurious': 1529,
  'possible': 5367,
  'actual': 5276,
  'precision': 0.5680439727065959,
  'recall': 0.558412520961431,
  'f1': 0.563187071314479}}

In [11]:
pred_eval_data = spacy_model.predict([file['data'] for file in evalset], False)
# for file in evalset:
#     pred_eval_data.append(
#         spacy_model.predict_as_dict(file['data'])
#     )

In [12]:
pred_eval_data[0]

[{'label': 'SKIP', 'start': 81, 'end': 85, 'text': 'FOOD'},
 {'label': 'Substance', 'start': 158, 'end': 168, 'text': 'Piperazine'},
 {'label': 'Value', 'start': 185, 'end': 188, 'text': '0.1'},
 {'label': 'Unit', 'start': 189, 'end': 205, 'text': 'part per million'},
 {'label': 'SKIP', 'start': 206, 'end': 221, 'text': 'piperazine base'},
 {'label': 'Usage',
  'start': 241,
  'end': 276,
  'text': 'edible tissues of poultry and swine'}]

In [16]:
evaluator = Evaluator(true_eval_data, pred_eval_data, tags=list(all_tags))

# Returns overall metrics and metrics for each tag
results, results_per_tag = evaluator.evaluate()
results

{'ent_type': {'correct': 2992,
  'incorrect': 755,
  'partial': 0,
  'missed': 1620,
  'spurious': 1529,
  'possible': 5367,
  'actual': 5276,
  'precision': 0.5670962850644428,
  'recall': 0.5574809018073411,
  'f1': 0.562247486610918},
 'partial': {'correct': 2997,
  'incorrect': 0,
  'partial': 750,
  'missed': 1620,
  'spurious': 1529,
  'possible': 5367,
  'actual': 5276,
  'precision': 0.6391205458680819,
  'recall': 0.6282839575181666,
  'f1': 0.6336559240815559},
 'strict': {'correct': 2471,
  'incorrect': 1276,
  'partial': 0,
  'missed': 1620,
  'spurious': 1529,
  'possible': 5367,
  'actual': 5276,
  'precision': 0.4683472327520849,
  'recall': 0.46040618595118316,
  'f1': 0.4643427604998591},
 'exact': {'correct': 2997,
  'incorrect': 750,
  'partial': 0,
  'missed': 1620,
  'spurious': 1529,
  'possible': 5367,
  'actual': 5276,
  'precision': 0.5680439727065959,
  'recall': 0.558412520961431,
  'f1': 0.563187071314479}}

In [12]:
results_per_tag

{'Function': {'ent_type': {'correct': 216,
   'incorrect': 42,
   'partial': 0,
   'missed': 41,
   'spurious': 18,
   'possible': 299,
   'actual': 276,
   'precision': 0.782608695652174,
   'recall': 0.7224080267558528,
   'f1': 0.7513043478260871},
  'partial': {'correct': 233,
   'incorrect': 0,
   'partial': 25,
   'missed': 41,
   'spurious': 18,
   'possible': 299,
   'actual': 276,
   'precision': 0.8894927536231884,
   'recall': 0.8210702341137124,
   'f1': 0.8539130434782609},
  'strict': {'correct': 200,
   'incorrect': 58,
   'partial': 0,
   'missed': 41,
   'spurious': 18,
   'possible': 299,
   'actual': 276,
   'precision': 0.7246376811594203,
   'recall': 0.6688963210702341,
   'f1': 0.6956521739130435},
  'exact': {'correct': 233,
   'incorrect': 25,
   'partial': 0,
   'missed': 41,
   'spurious': 18,
   'possible': 299,
   'actual': 276,
   'precision': 0.8442028985507246,
   'recall': 0.7792642140468228,
   'f1': 0.8104347826086956}},
 'SKIP': {'ent_type': {'correc

#### Other tests

In [16]:
_data = []
with open("../data/trainset/annotations.jsonl", "r") as f:
    for line in f:
        _data.append(json.loads(line))

In [17]:
idx = 98
tmp = _data[idx].copy()
tmp['label'] = sorted([(a[0], a[1], a[2], tmp['data'][a[0]: a[1]]) for a in tmp['label']])
tmp

{'id': 1278,
 'data': '"Commodity": "Beans, dry"||"Corn, field, grain"||"Corn, field, grain"||"Animal feed, nongrass, group 18, forage"||"Corn, field, pop"\n"Parts per million": "0.1"||"0.2"||"0.2"||"1.4"||"0.2"',
 'label': [(14, 24, 'Usage', 'Beans, dry'),
  (28, 46, 'Usage', 'Corn, field, grain'),
  (50, 68, 'Usage', 'Corn, field, grain'),
  (72, 111, 'Usage', 'Animal feed, nongrass, group 18, forage'),
  (115, 131, 'Usage', 'Corn, field, pop'),
  (134, 151, 'Unit', 'Parts per million'),
  (155, 158, 'Value', '0.1'),
  (162, 165, 'Value', '0.2'),
  (169, 172, 'Value', '0.2'),
  (176, 179, 'Value', '1.4'),
  (183, 186, 'Value', '0.2')]}

In [21]:
tmp2  = tmp.copy()
tmp2['label'] = [a[:3] for a in tmp2['label']]
tmp2 = preprocess_text(tmp2)
tmp2['label'] = sorted([(a[0], a[1], a[2], tmp['data'][a[0]: a[1]]) for a in tmp2['label']])
tmp2

{'data': '"Commodity": "Beans, dry" "Corn, field, grain" "Corn, field, grain" "Animal feed, nongrass, group 18, forage" "Corn, field, pop" "Parts per million": "0.1" "0.2" "0.2" "1.4" "0.2"',
 'label': [(14, 24, 'Usage', 'Beans, dry'),
  (27, 45, 'Usage', '"Corn, field, grai'),
  (48, 66, 'Usage', '|"Corn, field, gra'),
  (69, 108, 'Usage', '||"Animal feed, nongrass, group 18, for'),
  (111, 127, 'Usage', '"||"Corn, field,'),
  (130, 147, 'Unit', 'p"\n"Parts per mil'),
  (151, 154, 'Value', '": '),
  (157, 160, 'Value', '1"|'),
  (163, 166, 'Value', '.2"'),
  (169, 172, 'Value', '0.2'),
  (175, 178, 'Value', '"1.')]}

In [18]:
import re
from config.nlp_models import SYMBOLS_TO_REMOVE

def preprocess_text(entry):
    text = entry['data']
    # new_text = ""
    # print(entry['label'])
    labels = sorted([x[:-1] for x in entry['label']])
    for sym, sym_length in SYMBOLS_TO_REMOVE.items():
        # We will remove the symbol from the text and add a space in its place
        length_to_remove = sym_length - 1
        matches = re.finditer(sym, text)
        n_removed = 0
        tmp_labels = labels.copy()
        for match in matches:
            start, end = match.span()
            for i in range(len(labels)):
                label = labels[i]
                label_start, label_end, _ = label
                # print(label_start, end, i, sym, label)
                if label_start >= end:
                    # print("=>", sym)
                    for j in range(i, len(labels)):
                        label = tmp_labels[j]
                        label_start, label_end, label_type = label
                        label_start -= length_to_remove
                        label_end -= length_to_remove
                        # print(f"removing from {i=} {j=} {label_start=} {label_end=} {sym=}, {label_type=}")
                        tmp_labels[j] = (label_start, label_end, label_type)
                        n_removed += 1
                    break
        labels = tmp_labels
        text = re.sub(sym, " ", text)
        # print([(a[0], a[1], a[2], text[a[0]: a[1]]) for a in labels], f"{length_to_remove=} {sym=}, {n_removed=}")

    return {'data': text, 'label': labels}
ptmp = preprocess_text(tmp)
# ptmp['label'] = [(a[0], a[1], a[2], tmp['data'][a[0]: a[1]]) for a in ptmp['label']]

ptmp['label'] = [(a[0], a[1], a[2], ptmp['data'][a[0]: a[1]]) for a in ptmp['label']]
for l1, l2 in zip(tmp['label'], ptmp['label']):
    assert l1[2] == l2[2] and l1[3] == l2[3]
ptmp

{'data': '"Commodity": "Beans, dry" "Corn, field, grain" "Corn, field, grain" "Animal feed, nongrass, group 18, forage" "Corn, field, pop" "Parts per million": "0.1" "0.2" "0.2" "1.4" "0.2"',
 'label': [(14, 24, 'Usage', 'Beans, dry'),
  (27, 45, 'Usage', 'Corn, field, grain'),
  (48, 66, 'Usage', 'Corn, field, grain'),
  (69, 108, 'Usage', 'Animal feed, nongrass, group 18, forage'),
  (111, 127, 'Usage', 'Corn, field, pop'),
  (130, 147, 'Unit', 'Parts per million'),
  (151, 154, 'Value', '0.1'),
  (157, 160, 'Value', '0.2'),
  (163, 166, 'Value', '0.2'),
  (169, 172, 'Value', '1.4'),
  (175, 178, 'Value', '0.2')]}