In [9]:
import json
import warnings
from pathlib import Path

import spacy
from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm


In [10]:
# For the custom DLP NER, start from a blank Korean pipeline rather than news_lg.
nlp = spacy.blank("ko")


In [11]:
# Read the annotated training samples from the UTF-8 encoded JSON file.
with open("dlp_train_data.json", mode="r", encoding="utf-8") as train_file:
    TRAIN_DATA = json.load(train_file)

In [12]:
# Reuse the pipeline's existing "ner" component, or add one if it is missing.
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")


In [13]:
# Collect the distinct entity labels that appear in the training annotations.
# Each record is a (text, annotations) pair; annotations["entities"] holds
# (start, end, label) triples, of which only the label is needed here.
LABELS = {
    label
    for _text, ann in TRAIN_DATA
    for _start, _end, label in ann["entities"]
}

LABELS


{'ORG_INTERNAL',
 'PROJECT_BUDGET',
 'PROJECT_DATE',
 'PROJECT_MANAGER',
 'PROJECT_NAME',
 'PROJECT_TERM'}

In [14]:
# Register every entity label on the NER component.
# LABELS is a set of strings, whose iteration order is not stable across
# interpreter runs; sorting keeps the registration order reproducible.
for label in sorted(LABELS):
    ner.add_label(label)


In [15]:
from spacy.util import filter_spans

def make_examples(nlp, data):
    """Convert ``(text, annotations)`` pairs into spaCy ``Example`` objects.

    Args:
        nlp: Pipeline whose tokenizer (``nlp.make_doc``) tokenizes each text.
        data: Iterable of ``(text, ann)`` pairs, where ``ann["entities"]`` is a
            sequence of ``(start_char, end_char, label)`` triples.

    Returns:
        list: One ``Example`` per input text. Gold entities are stored on
        ``example.reference``; ``example.predicted`` is left unannotated.

    Warns:
        UserWarning: If any entity annotation was dropped, either because its
        character offsets could not be aligned to token boundaries or because
        ``filter_spans`` removed it as overlapping another span.
    """
    examples = []
    dropped = 0

    for text, ann in data:
        doc = nlp.make_doc(text)
        spans = []

        for start, end, label in ann["entities"]:
            # "contract" snaps the offsets inward to token boundaries;
            # char_span returns None when no aligned span can be formed.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if not span:
                dropped += 1
                continue
            spans.append(span)

        # Resolve overlaps, then count what was discarded.
        kept = filter_spans(spans)
        dropped += len(spans) - len(kept)

        # Hand the gold entities over as character offsets so they are set only
        # on the reference doc; an empty list is valid for texts with no entities.
        entities = [(span.start_char, span.end_char, span.label_) for span in kept]
        examples.append(Example.from_dict(doc, {"entities": entities}))

    if dropped:
        warnings.warn(
            f"make_examples: dropped {dropped} entity annotation(s) that were "
            "misaligned with token boundaries or overlapped another span",
            stacklevel=2,
        )

    return examples


In [16]:
# Build Example objects for the full dataset; the trailing expression
# displays how many were created.
examples = make_examples(nlp, TRAIN_DATA)
len(examples)


200

In [18]:
# Split the annotated samples into train (80%) and dev (20%) sets.
import random

# Shuffle a copy rather than TRAIN_DATA itself. The split then depends only on
# the seed, so re-running this cell yields the same TRAIN_SET / DEV_SET instead
# of reshuffling an already-shuffled list.
random.seed(42)
shuffled_data = list(TRAIN_DATA)
random.shuffle(shuffled_data)

split = int(len(shuffled_data) * 0.8)

TRAIN_SET = shuffled_data[:split]
DEV_SET = shuffled_data[split:]

len(TRAIN_SET), len(DEV_SET)


(160, 40)

In [19]:
from spacy.training import Example
from spacy.util import filter_spans

def make_examples(nlp, data):
    """Convert ``(text, annotations)`` pairs into spaCy ``Example`` objects.

    Note: ``make_examples`` is also defined in an earlier cell; this
    definition shadows it.

    Args:
        nlp: Pipeline whose tokenizer (``nlp.make_doc``) tokenizes each text.
        data: Iterable of ``(text, ann)`` pairs, where ``ann["entities"]`` is a
            sequence of ``(start_char, end_char, label)`` triples.

    Returns:
        list: One ``Example`` per input text. Gold entities are stored on
        ``example.reference``; ``example.predicted`` is left unannotated.

    Warns:
        UserWarning: If any entity annotation was dropped, either because its
        character offsets could not be aligned to token boundaries or because
        ``filter_spans`` removed it as overlapping another span.
    """
    examples = []
    dropped = 0

    for text, ann in data:
        doc = nlp.make_doc(text)
        spans = []

        for start, end, label in ann["entities"]:
            # "contract" snaps the offsets inward to token boundaries;
            # char_span returns None when no aligned span can be formed.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if not span:
                dropped += 1
                continue
            spans.append(span)

        # Resolve overlaps, then count what was discarded.
        kept = filter_spans(spans)
        dropped += len(spans) - len(kept)

        # Hand the gold entities over as character offsets so they are set only
        # on the reference doc; an empty list is valid for texts with no entities.
        entities = [(span.start_char, span.end_char, span.label_) for span in kept]
        examples.append(Example.from_dict(doc, {"entities": entities}))

    if dropped:
        warnings.warn(
            f"make_examples: dropped {dropped} entity annotation(s) that were "
            "misaligned with token boundaries or overlapped another span",
            stacklevel=2,
        )

    return examples


In [20]:
# Rebind nlp to a fresh blank Korean pipeline; make_examples only calls
# nlp.make_doc on it to tokenize the texts.
nlp = spacy.blank("ko")

# Build separate Example lists for the train and dev splits.
train_examples = make_examples(nlp, TRAIN_SET)
dev_examples = make_examples(nlp, DEV_SET)


In [21]:
from spacy.tokens import DocBin

def save_docbin(examples, path):
    """Serialize the reference docs of ``examples`` into a DocBin file.

    Args:
        examples: Iterable of objects exposing a ``reference`` doc
            (e.g. the ``Example`` objects returned by ``make_examples``).
        path: Destination file path (``str`` or path-like), e.g.
            ``"data/train.spacy"``.

    The parent directory of ``path`` is created when it does not exist,
    because ``DocBin.to_disk`` opens the file directly and would otherwise
    fail on a missing directory.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    db = DocBin()
    for ex in examples:
        # Only the gold-standard (reference) side of each example is stored.
        db.add(ex.reference)
    db.to_disk(path)


In [25]:
# Write the train and dev splits as DocBin files under data/.
# NOTE(review): presumably consumed by a separate spaCy training run that
# produces output/model-best — confirm.
save_docbin(train_examples, "data/train.spacy")
save_docbin(dev_examples, "data/dev.spacy")


In [None]:
# Test the trained model.


In [26]:
import spacy

# Load the pipeline saved at output/model-best and list the entities it finds
# in a sample sentence, one "text label" pair per line.
nlp = spacy.load("output/model-best")

text = "김하늘 매니저가 담당하는 프로젝트 아수스가요는 총 예산 100억 원이다."
doc = nlp(text)

for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")


김하늘 PROJECT_MANAGER
100억 원 PROJECT_BUDGET


In [27]:
# Run the saved pipeline on a second sample sentence and list the entities it
# finds, one "text label" pair per line.
nlp = spacy.load("output/model-best")

text = "김하늘 매니저가 담당하는 프로젝트 알파는 총 예산 100억 원이다."
doc = nlp(text)

for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

김하늘 PROJECT_MANAGER
알파 PROJECT_NAME
100억 원 PROJECT_BUDGET
