In [149]:
import spacy
from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm
import json


In [150]:
nlp = spacy.blank("ko")  # for the custom DLP NER model, start from a blank Korean pipeline, not a pretrained news_lg one

In [151]:
# Load the annotated dataset. Later cells iterate TD as (text, ann) pairs and
# read (start, end, label) triples from ann["entities"].
with open("data/complete_dataset_shuffled.json", "r", encoding="utf-8") as f:
    TD = json.load(f)

In [152]:
# Reuse the pipeline's existing NER component when there is one; otherwise add it.
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

In [153]:
# Collect every distinct entity label that appears in the dataset annotations.
LABELS = {
    label
    for _text, ann in TD
    for _start, _end, label in ann["entities"]
}

LABELS

{'ORG_INTERNAL',
 'PROJECT_BUDGET',
 'PROJECT_DATE',
 'PROJECT_MANAGER',
 'PROJECT_NAME',
 'PROJECT_TERM'}

In [154]:
# Register every label with the NER component.
# LABELS is a set of strings, whose iteration order changes with the
# interpreter's hash seed; sorting makes the registration order identical on
# every kernel run.
for label in sorted(LABELS):
    ner.add_label(label)

In [155]:
from spacy.training import Example
from spacy.util import filter_spans

def make_examples(nlp, data):
    """Convert ``(text, annotations)`` pairs into spaCy ``Example`` objects.

    Args:
        nlp: Pipeline whose tokenizer is applied through ``nlp.make_doc``.
        data: Iterable of ``(text, ann)`` pairs where ``ann["entities"]`` is a
            sequence of ``(start, end, label)`` character-offset triples.

    Returns:
        list: One ``Example`` per input pair. The gold entities are the
        de-duplicated, token-aligned and non-overlapping subset of the
        annotated entities; the predicted doc is left unannotated.
    """
    examples = []

    for text, ann in data:
        doc = nlp.make_doc(text)

        # De-duplicate while keeping annotation order. A set would iterate in
        # a hash-dependent order, which makes tie-breaking between equally
        # long overlapping spans differ from run to run.
        unique_entities = dict.fromkeys(
            (start, end, label) for start, end, label in ann["entities"]
        )

        # Snap each annotation to token boundaries; offsets that cannot be
        # aligned to any token yield no span and are dropped.
        spans = []
        for start, end, label in unique_entities:
            span = doc.char_span(
                start,
                end,
                label=label,
                alignment_mode="contract",
            )
            if span:
                spans.append(span)

        # Build the gold annotation from the *filtered* spans, using their own
        # (possibly contracted) offsets. Passing the raw offsets instead would
        # hand overlapping or misaligned entities to Example.from_dict.
        gold_entities = [
            (span.start_char, span.end_char, span.label_)
            for span in filter_spans(spans)
        ]

        # The predicted doc deliberately carries no entities: gold annotation
        # belongs on the reference side of the Example only.
        examples.append(Example.from_dict(doc, {"entities": gold_entities}))

    return examples


In [156]:
# Convert the full dataset as a sanity check and report how many Examples were built.
examples = make_examples(nlp, TD)
print(len(examples))




394


In [157]:
# Split the dataset into train and dev sets.
import random

SPLIT_SEED = 20260126
TRAIN_FRACTION = 0.5  # share of the samples assigned to TRAIN_SET

# Shuffle a copy rather than TD itself: shuffling TD in place would make the
# split depend on how many times this cell has been executed, because each
# re-run would reshuffle an already-shuffled list.
random.seed(SPLIT_SEED)
shuffled = list(TD)
random.shuffle(shuffled)

split = int(len(shuffled) * TRAIN_FRACTION)

TRAIN_SET = shuffled[:split]
DEV_SET = shuffled[split:]

len(TRAIN_SET), len(DEV_SET)

(197, 197)

In [158]:
from spacy.training import Example
from spacy.util import filter_spans

def make_examples(nlp, data):
    """Validate entity annotations and convert them into spaCy ``Example`` objects.

    Args:
        nlp: Pipeline whose tokenizer is applied through ``nlp.make_doc``.
        data: Iterable of ``(text, ann)`` pairs where ``ann["entities"]`` is a
            sequence of ``(start, end, label)`` character-offset triples.

    Returns:
        list: One ``Example`` per input pair. Malformed entities (wrong shape,
        non-integer offsets, ``start >= end``) are reported on stdout and
        skipped. The gold entities are the de-duplicated, token-aligned and
        non-overlapping subset of the remaining ones; the predicted doc is
        left unannotated.
    """
    examples = []
    skipped = 0

    for text, ann in data:
        doc = nlp.make_doc(text)

        # 1) Validate and de-duplicate, preserving annotation order.
        unique_entities = []
        seen = set()

        for item in ann["entities"]:
            # Each entity must be a 3-item list/tuple.
            if not isinstance(item, (list, tuple)) or len(item) != 3:
                print(f"⚠️ 잘못된 entity 형식: {item}")
                skipped += 1
                continue

            start, end, label = item

            # Offsets must be integers ...
            if not isinstance(start, int) or not isinstance(end, int):
                print(f"⚠️ start/end가 정수가 아님: {item}")
                skipped += 1
                continue

            # ... and describe a non-empty range.
            if start >= end:
                print(f"⚠️ start >= end: {item}")
                skipped += 1
                continue

            key = (start, end, label)
            if key not in seen:
                seen.add(key)
                unique_entities.append(key)

        # 2) Snap each annotation to token boundaries; offsets that cannot be
        #    aligned to any token yield no span and are dropped.
        spans = []
        for start, end, label in unique_entities:
            span = doc.char_span(
                start,
                end,
                label=label,
                alignment_mode="contract",
            )
            if span:
                spans.append(span)

        # 3) Drop overlapping spans and build the gold annotation from the
        #    survivors, using their own (possibly contracted) offsets. Passing
        #    the raw offsets instead would hand overlapping or misaligned
        #    entities to Example.from_dict.
        gold_entities = [
            (span.start_char, span.end_char, span.label_)
            for span in filter_spans(spans)
        ]

        # 4) Create the Example. The predicted doc deliberately carries no
        #    entities: gold annotation belongs on the reference side only.
        #    A text without entities naturally yields an empty entity list.
        examples.append(Example.from_dict(doc, {"entities": gold_entities}))

    if skipped > 0:
        print(f"⚠️ 총 {skipped}개의 잘못된 entity를 건너뛰었습니다.")

    return examples

In [159]:
# Start again from a fresh blank Korean pipeline (only its tokenizer is needed
# by make_examples) and convert each split separately.
nlp = spacy.blank("ko")

train_examples = make_examples(nlp, TRAIN_SET)
dev_examples = make_examples(nlp, DEV_SET)



In [160]:
from spacy.tokens import DocBin

def save_docbin(examples, path):
    """Write the reference (gold) doc of every Example to ``path`` as a DocBin.

    Args:
        examples: Iterable of objects exposing a ``reference`` doc.
        path: Destination file for the serialized DocBin.
    """
    gold_docs = [example.reference for example in examples]
    DocBin(docs=gold_docs).to_disk(path)

In [161]:
# Persist both splits in spaCy's binary DocBin format.
save_docbin(train_examples, "data/train.spacy")
save_docbin(dev_examples, "data/dev.spacy")

In [166]:
# Load the pipeline saved under output/model-last and print the entities it
# finds in a sample sentence.
# NOTE(review): the step that produces output/model-last is not visible in
# this part of the notebook — confirm it has been run before this cell.
nlp = spacy.load("output/model-last")

text = """김하늘 매니저가 담당하는 
프로젝트 알파는 총 예산 100억 원이다."""
doc = nlp(text)

# One line per predicted entity: surface text followed by its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

김하늘 PROJECT_MANAGER
알파 PROJECT_NAME


In [167]:
import spacy

# Reload the pipeline saved under output/model-last and try a second sample
# sentence with a different project name and budget.
nlp = spacy.load("output/model-last")

text = """김하늘 매니저가 담당하는 
프로젝트 아수스가요는 총 예산 100만
원이다."""
doc = nlp(text)

# One line per predicted entity: surface text followed by its label.
for ent in doc.ents:
    print(ent.text, ent.label_)


김하늘 PROJECT_MANAGER
