In [None]:
from models.Document import Document, Label
from typing import List

import json, rich

from sklearn.model_selection import train_test_split

import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import filter_spans

from IPython.display import display as ipy_display
from IPython.core.display import HTML

##### previously used (openai for labeling)
- too many incorrectly labeled examples, but quicker than using an entity ruler for preannotation
```py
def clean(text):
    text = text.strip().replace("\n", " ").replace("\r", "").replace("\t", " ")
    text = re.sub('\\s+', ' ', text)
    return text


resumes = []
for file in tqdm(os.listdir(f'{os.getcwd()}/DATA/data_en')):
    with open(f'{os.getcwd()}/DATA/data_en/{file}','r') as f:
        contents = clean(f.read())
        resumes.append(contents)


labeled_resumes = []
for i in tqdm(range(len(resumes[:500]))):
    text = resumes[i]
    api_labels = await send_request(text)

    labels = []
    for l in api_labels or []:
        label = l['type']
        content = l['text']
        start = text.find(content)
        if start == -1:
            continue
        end = start + len(content)
        labels.append(Label(start,end,label,content).to_dict())

    doc = Document(i,text,labels).to_dict()
    labeled_resumes.append(doc)

```

##### data preprocessing

In [2]:
with open('DATA/docs_12.json','r') as j:
    labeled_resumes = json.loads(j.read())

In [None]:
cleaned_resumes = []
i = 0

for d in labeled_resumes:
    text, labels = d[0], d[1]
    text = text.replace("\n", "-n").replace("\r", "-r").replace("\t", "-t")
    clean_labels = []
    starts = []

    for l in labels:
        start, end, label, value = l[0], l[1], l[2], l[3]
        
        if (start in starts) or label=='Certification':
            continue

        starts.append(start)
        clean_labels.append(Label(start,end,label,value).to_dict())

    cleaned_resumes.append(Document(i,text,clean_labels).to_dict())
    i+=1

In [5]:
print(len(cleaned_resumes))

1348


In [6]:
train, test = train_test_split(cleaned_resumes, test_size=0.1, random_state=707)

In [7]:
def has_overlap(span1, span2):
    return not (span1.end <= span2.start or span2.end <= span1.start)

In [None]:
def preprocess(data: List[dict], output_name: str):
    invalid_count, ent_count = 0, 0
    nlp = spacy.blank('en')
    doc_bin = DocBin()

    for d in data:
        text, labels = d['content'],d["labels"]
        doc = nlp(text)
        ents = []
        for l in labels:
            start, end, label, value = l["start"], l["end"], l["label"], l["value"]
            span = doc.char_span(start, end, label=label,alignment_mode='expand')
            if span is None:
                invalid_count += 1
                continue
            elif any(has_overlap(span, existing_span) for existing_span in ents):
                span = doc.char_span(start, end, label=label,alignment_mode='contract')
                if span is None:
                    invalid_count += 1
                    continue
        
            ents.append(span)
                
        ent_count += len(ents)
        doc.ents = filter_spans(ents)
        doc_bin.add(doc) 
    print(invalid_count, ent_count)
    doc_bin.to_disk(f"{output_name}.spacy")

In [None]:
preprocess(train,'train')
preprocess(test,'test')

In [None]:
def filter_whitespace(input_path):
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_disk(input_path)
    filtered_doc_bin = DocBin()

    for doc in doc_bin.get_docs(nlp.vocab):
        filtered_ents = []
        for ent in doc.ents:
            if not (ent.text.startswith(" ") or ent.text.endswith(" ")):
                filtered_ents.append(ent)
        doc.ents = filtered_ents
        filtered_doc_bin.add(doc)

    filtered_doc_bin.to_disk(input_path)

filter_whitespace("train.spacy")
filter_whitespace("test.spacy")

In [16]:
!python -m spacy debug data config.cfg --paths.train ./train.spacy --paths.dev ./test.spacy

[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: tok2vec, ner
1212 training docs
135 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[38;5;3m⚠ Low number of examples to train a new pipeline (1212)[0m
[1m
[38;5;4mℹ 1152316 total word(s) in the data (41082 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 2 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities crossing sentence boundaries[0m
[1m
[38;5;2m✔ 7 checks passed[0m


In [18]:
!python -m spacy train config.cfg --output ./spacy_outputs --paths.train ./train.spacy --paths.dev ./test.spacy

[38;5;4mℹ Saving to output directory: spacy_outputs[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    419.10    0.76    0.86    0.68    0.01
  0     200        229.05  24592.31    0.00    0.00    0.00    0.00
  0     400        321.51  18028.61    1.09   46.05    0.55    0.01
  0     600        364.96  18220.97    9.90   45.58    5.55    0.10
  0     800        442.90  17772.90    6.85   51.10    3.67    0.07
  0    1000        543.13  16440.35    1.93   66.67    0.98    0.02
  0    1200        464.97  16134.82   11.91   52.15    6.72    0.12
  1    1400        562.18  16606.81   15.16   50.09    8.93    0.15
  1    1600        654.99  16978.79    9.66   60.04    5.25    0.10
  1    1800        706.21  17127.25    