In [1]:
import os
import glob
import json

import spacy
from spacy.tokens import DocBin
from spacy.cli.train import train

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Relative path of the annotated dataset (JSON Lines: one record per line).
path = "../data/json/all.jsonl"
# Fail early, reporting the offending path, if the file is missing.
assert os.path.isfile(path), path

In [3]:
# Read the JSONL file: one JSON document per line. The encoding is given
# explicitly because the platform default is not necessarily UTF-8, and blank
# lines (e.g. extra newlines at the end of the file) are skipped because
# json.loads would reject them.
with open(path, 'r', encoding='utf-8') as fr:
    data = [json.loads(line) for line in fr if line.strip()]
# Show the first record to inspect the schema.
data[0]

{'id': 1164,
 'data': '"Commodity": "Oat, hay"||"Barley, forage"||"Rye, straw"||"Barley, straw"||"Barley, forage"\n"Parts per million": "1.0"||"0.3"||"0.3"||"0.3"||"0.3"',
 'label': [[14, 22, 'Usage'],
  [26, 40, 'Usage'],
  [44, 54, 'Usage'],
  [58, 71, 'Usage'],
  [75, 89, 'Usage'],
  [92, 109, 'Unit'],
  [113, 116, 'Value'],
  [120, 123, 'Value'],
  [127, 130, 'Value'],
  [134, 137, 'Value'],
  [141, 144, 'Value']]}

In [4]:
# Sanity check: the label offsets are character indices into the 'data' string.
data[0]['data'][14:22]

'Oat, hay'

In [5]:
def to_spacy_format(data, ids_to_ignore=None, ids_to_include=None):
    """Convert annotated records into a spaCy ``DocBin``.

    Args:
        data: Iterable of dicts with the keys ``'id'``, ``'data'`` (the raw
            text) and ``'label'`` (a list of ``[start, end, label]``
            character-offset annotations).
        ids_to_ignore: Optional collection of record ids to leave out.
            ``None`` (the default) ignores nothing.
        ids_to_include: Optional collection of record ids to keep. When it is
            ``None`` or empty, every record that is not ignored is kept.

    Returns:
        A ``DocBin`` with one ``Doc`` per kept record. Only annotations that
        ``Doc.char_span`` could resolve in strict alignment mode are set as
        entities; the others are reported on stdout and counted as skipped.

    Raises:
        ValueError: Re-raised when assigning the collected spans to
            ``doc.ents`` fails (for example because spans overlap).
    """
    # Avoid a mutable default argument: normalise None to an empty collection.
    if ids_to_ignore is None:
        ids_to_ignore = ()

    nlp = spacy.blank("en")
    doc_bin = DocBin()
    n_skipped = 0
    n_valid = 0
    for record in data:
        if record['id'] in ids_to_ignore:
            continue
        if ids_to_include and record['id'] not in ids_to_include:
            continue
        doc = nlp(record['data'])
        ents = []
        # Remove duplicate annotations while keeping their original order, so
        # that the log output is reproducible from run to run (a plain set()
        # would iterate in hash order).
        annots = list(dict.fromkeys(tuple(annot) for annot in record['label']))
        for annot in annots:
            start, end, label = annot[:3]
            # With strict alignment char_span returns None when the character
            # offsets cannot be mapped onto the tokenizer's tokens; such
            # annotations are dropped.
            span = doc.char_span(start, end, label=label, alignment_mode="strict")
            if span is None:
                print(f"Skipping entity {annot} -- {record['data'][start: end]}.")
                n_skipped += 1
                continue
            n_valid += 1
            ents.append(span)
        try:
            doc.ents = ents
        except ValueError as e:
            # Log the offending record for context, then propagate the error
            # with its original traceback (the conversion is aborted).
            print(f"Skipping file {record}: {e}.")
            raise
        doc_bin.add(doc)
    print(f"{n_skipped=}, {n_valid=}")
    return doc_bin

In [6]:
# Record 68 shows why annotations must be de-duplicated: it has fewer unique
# annotations than it has annotations in total.
len({tuple(annot) for annot in data[68]['label']}), len(data[68]['label'])

(14, 15)

In [7]:
# Fractions of the dataset used for training and for evaluation.
train_perc = 0.8
dev_perc = 0.2
# The dev set is whatever the training slice leaves over, so the two fractions
# have to add up to 1 for `dev_perc` to describe the actual split.
assert abs(train_perc + dev_perc - 1.0) < 1e-9, (train_perc, dev_perc)
# Compute the cut point once so that both slices agree on it.
split_idx = int(len(data) * train_perc)
doc_bin_train = to_spacy_format(data[:split_idx])
doc_bin_dev = to_spacy_format(data[split_idx:])

Skipping entity (58, 71, 'Usage') -- Barley, straw.
Skipping entity (120, 123, 'Value') -- 0.3.
Skipping entity (75, 89, 'Usage') -- Barley, forage.
Skipping entity (127, 130, 'Value') -- 0.3.
Skipping entity (14, 22, 'Usage') -- Oat, hay.
Skipping entity (141, 144, 'Value') -- 0.3.
Skipping entity (44, 54, 'Usage') -- Rye, straw.
Skipping entity (113, 116, 'Value') -- 1.0.
Skipping entity (134, 137, 'Value') -- 0.3.
Skipping entity (26, 40, 'Usage') -- Barley, forage.
Skipping entity (60, 67, 'Usage') -- Avocado.
Skipping entity (46, 56, 'Usage') -- Strawberry.
Skipping entity (37, 42, 'Usage') -- Peach.
Skipping entity (28, 33, 'Usage') -- Apple.
Skipping entity (111, 113, 'Value') -- 13.
Skipping entity (14, 24, 'Usage') -- Strawberry.
Skipping entity (117, 119, 'Value') -- 15.
Skipping entity (93, 95, 'Value') -- 13.
Skipping entity (99, 100, 'Value') -- 5.
Skipping entity (104, 107, 'Value') -- 7.0.
Skipping entity (77, 88, 'Substance') -- Acetamiprid.
Skipping entity (179, 212, '

In [8]:
# Save the data in the spacy format.
# Create the directory the files are actually written to (the previous code
# created "../data/spacy_data", which is not where the files go), so that
# to_disk has an existing directory to write into.
spacy_dir = "../data/spacy_datasets"
os.makedirs(spacy_dir, exist_ok=True)
doc_bin_train.to_disk(f"{spacy_dir}/train_jsonl.spacy")
doc_bin_dev.to_disk(f"{spacy_dir}/dev_jsonl.spacy")

In [9]:
# Train the pipeline described by the config file; the overrides point the
# config's train/dev corpus paths at the serialized .spacy files.
train(
    "../config/spacy_config.cfg",
    output_path='../results/model/spacy_model_jsonl',
    overrides={
        "paths.train": "../data/spacy_datasets/train_jsonl.spacy",
        "paths.dev": "../data/spacy_datasets/dev_jsonl.spacy",
    },
)

[38;5;2m✔ Created output directory: ../results/model/spacy_model_jsonl[0m
[38;5;4mℹ Saving to output directory: ../results/model/spacy_model_jsonl[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --------  ------  ------  ------  ------
  0       0    442.52    0.00    0.00    0.00    0.00
  1     200  91153.98    0.00    0.00    0.00    0.00
  2     400  69055.74    0.00    0.00    0.00    0.00
  3     600  56950.08    0.00    0.00    0.00    0.00
  4     800  50837.89    0.00    0.00    0.00    0.00
  5    1000  48515.08    0.00    0.00    0.00    0.00
  6    1200  47593.28    0.41    4.93    0.21    0.00
  7    1400  45964.62    0.62    4.87    0.33    0.01
  9    1600  46005.80    3.39   15.53    1.90    0.03
 10    1800  45574.04    7.30   22.17    4.37    0.07
 11    2000  42813.43    9.11   25.61    5.54    0.09
 

In [3]:
# Load the `model-best` pipeline from the training output directory.
nlp = spacy.load("../results/model/spacy_model_jsonl/model-best")

In [5]:
# Run the loaded pipeline on a sample sentence and list the predicted entities
# as (text, label) pairs.
doc = nlp("Diacetyl content, mg/l  0.2 TCVN 6058:1995  A II.")
[(ent.text, ent.label_) for ent in doc.ents]

[('mg/l  0.2', 'Unit')]