In [19]:
from IPython.display import display as ipy_display
from IPython.core.display import HTML
import rich

import spacy
from spacy.tokens import DocBin
from spacy.training import Example

In [2]:
# Load the most recent training checkpoint from the local output directory.
# (Plain string: there is nothing to interpolate, so no f-prefix is needed.)
nlp = spacy.load('spacy_outputs/model-last')

In [None]:
# Attach the rule-based entity matcher and load its patterns.
# Guarded so that re-running this cell reuses the existing component instead of
# failing because a pipe named "entity_ruler" is already in the pipeline.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")
# NOTE(review): whether from_disk replaces or appends to already-loaded patterns
# depends on the installed spaCy version — confirm if patterns.jsonl is edited
# between runs.
ruler = ruler.from_disk("patterns.jsonl")  # type: ignore

In [None]:
# Read one sample document and list every entity the pipeline finds in it.
# The encoding is pinned so decoding does not depend on the platform default.
# assumes the data file is UTF-8 — TODO confirm
with open('DATA/data_en/1.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# `text` and `doc` are reused by the rendering cell further down.
doc = nlp(text)
for e in doc.ents:
    print(e.label_, e.text, e.start_char, e.end_char)

In [None]:
# Score the pipeline against the annotated documents stored in test.spacy.
doc_bin = DocBin().from_disk('test.spacy')
docs = list(doc_bin.get_docs(nlp.vocab))

# One Example per annotated document, using its entity spans as the reference.
# Built with a comprehension so no loop variable leaks into the notebook
# namespace: a plain `for doc in docs` loop would rebind the `doc` created in
# the sample-document cell and silently break the rendering cell below.
examples = [
    Example.from_dict(
        gold_doc,
        {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in gold_doc.ents]},
    )
    for gold_doc in docs
]

results = nlp.evaluate(examples)
# Print the per-type dict itself; wrapping it in a generator expression would
# only show the generator object's repr.
print(f"""
    precision: {results['ents_p']:.2f}
    recall: {results['ents_r']:.2f}
    F1: {results['ents_f']:.2f}
    per type performance: {results['ents_per_type']}"""
)


    precision: 0.53")
    recall: 1.00")
    F1: 0.69")
    per type performance: {'Soft Skill': {'p': 0.4743886743886744, 'r': 1.0, 'f': 0.6435055865921787}, 'Hard Skill': {'p': 0.5556795634920635, 'r': 1.0, 'f': 0.714388202471104}}


In [10]:
# Summary of the evaluation scores, with one line per skill type.
# `results` comes from the nlp.evaluate(...) call in the evaluation cell.
print(f"""
    precision: {results['ents_p']:.2f}
    recall: {results['ents_r']:.2f}
    F1: {results['ents_f']:.2f}
    soft skill perf: {results['ents_per_type']['Soft Skill']}
    hard skill perf: {results['ents_per_type']['Hard Skill']}
""")


    precision: 0.53")
    recall: 1.00")
    F1: 0.69")
    soft skill perf: {'p': 0.4743886743886744, 'r': 1.0, 'f': 0.6435055865921787}
    hard skill perf: {'p': 0.5556795634920635, 'r': 1.0, 'f': 0.714388202471104}



In [22]:
def render_ents(text, ents): # substitution for displacy not working
    """Show ``text`` in the notebook with each entity span highlighted.

    Args:
        text: The string that the entity character offsets index into.
        ents: Iterable of objects exposing ``start_char``, ``end_char`` and
            ``label_`` (for example ``doc.ents``). They may be in any order.

    Returns:
        None. The sorted entity list is printed with ``rich`` and the
        highlighted markup is shown through IPython's display function.
    """
    from html import escape  # local import keeps this cell self-contained

    colors = {'Hard Skill': '#cfffdc','Soft Skill': '#68ba7f','Certification':'#253d2c'}
    fallback_color = '#dddddd'  # used for labels that have no entry in `colors`
    entities = sorted(ents, key=lambda x: x.start_char) # sort by spacy ent start index
    rich.print(entities)
    # build jupyter html
    parts = []
    last_idx = 0
    # Walk the *sorted* spans so the slices between them are taken left to right.
    for e in entities:
        start, end, label = e.start_char, e.end_char, e.label_
        if start < last_idx:
            # Overlaps a span that was already emitted; skipping it avoids
            # duplicating text in the output.
            continue
        # Escape everything that comes from `text` so characters such as
        # '<' or '&' are shown literally instead of being parsed as markup.
        parts.append(escape(text[last_idx:start])) # normal
        parts.append(
            f'<mark style="background-color: {colors.get(label, fallback_color)}">'
            f'{escape(text[start:end])}</mark>'
        )
        last_idx = end
    parts.append(escape(text[last_idx:]))
    # Use the alias this notebook imports from IPython.display.
    ipy_display(HTML(''.join(parts)))

In [26]:
# Highlight the entities found in the sample text.
# The entities are recomputed from `text` rather than read from the notebook-level
# `doc`, because the evaluation cell's `for doc in docs` loop rebinds that name
# to a test-set document whose offsets do not belong to `text`.
render_ents(text, nlp(text).ents)