In [None]:
from spacy.tokens import DocBin
import spacy
import json
from tqdm import tqdm
import random

In [None]:
#Function to create spaCy 3.0 format training data
# Blank English pipeline: create_training calls nlp.make_doc on it to turn each
# raw text into a Doc before attaching entity spans.
nlp = spacy.blank("en")
def create_training(TRAIN_DATA, pipeline=None):
    """Convert offset-annotated examples into a spaCy 3.0 ``DocBin``.

    Args:
        TRAIN_DATA: Iterable of ``(text, annot)`` pairs where
            ``annot["entities"]`` is a list of ``(start, end, label)`` items;
            ``start``/``end`` are passed to ``doc.char_span`` as character
            offsets into ``text``.
        pipeline: Optional object providing ``make_doc(text)``. When omitted,
            the notebook-level ``nlp`` is used, so existing
            ``create_training(data)`` calls behave as before.

    Returns:
        The ``DocBin`` containing one doc per example, with ``doc.ents`` set
        to every span that ``char_span`` could produce.
    """
    if pipeline is None:
        # Fall back to the global pipeline. Passing one explicitly avoids
        # depending on whichever object `nlp` is bound to when this runs.
        pipeline = nlp
    db = DocBin()
    for text, annot in tqdm(TRAIN_DATA):
        doc = pipeline.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # char_span produced no span for these offsets; report exactly
                # which annotation is being dropped instead of a bare ".".
                print(
                    f"Skipping entity ({start}, {end}, {label!r}) in {text!r}: "
                    "offsets do not align with token boundaries"
                )
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    return db

In [None]:
#Sample training, testing and validation set, based on the web annotation tool: http://agateteam.org/spacynerannotate/
# Each example is (text, {"entities": [(start, end, label), ...]}).
# "entities" must be a LIST of 3-tuples, even for a single entity: the code
# that consumes it unpacks every item as (start, end, label).
TRAIN_DATA = [
    ("Socrates walked to the store.", {"entities": [(0, 8, "PERSON")]}),
    ("Quine writes philosophy.", {"entities": [(0, 5, "PERSON")]}),
]

TEST_DATA = [
    ("Plato loves cookies.", {"entities": [(0, 5, "PERSON")]}),
    ("God is watching over him.", {"entities": [(0, 3, "PERSON")]}),
]

VALIDATION_DATA = [
    ("Ruben is writing sample sentences.", {"entities": [(0, 5, "PERSON")]}),
]

In [None]:
# spaCy 3.0 formatting of the training data: build the DocBin, then save it to disk.
# NOTE(review): "pathtofile" looks like a placeholder directory — replace it with a real path before running.
read_train = create_training(TRAIN_DATA)
read_train.to_disk("pathtofile/trainingdata.spacy")

In [None]:
# spaCy 3.0 formatting of the testing data: build the DocBin, then save it to disk.
# NOTE(review): "pathtofile" looks like a placeholder directory — replace it with a real path before running.
read_test = create_training(TEST_DATA)
read_test.to_disk("pathtofile/testingdata.spacy")

In [None]:
# spaCy 3.0 formatting of the validation data: build the DocBin, then save it to disk.
# NOTE(review): "pathtofile" looks like a placeholder directory — replace it with a real path before running.
read_val = create_training(VALIDATION_DATA)
read_val.to_disk("pathtofile/validationdata.spacy")

In [None]:
#Create BILUO format FLAIR train/test/val data
from spacy.training import offsets_to_biluo_tags
# Note: this rebinds the notebook-wide `nlp` that create_training also reads.
nlp = spacy.load("en_core_web_sm")


def write_flair_file(path, data):
    """Write `data` to `path` as "token tag" lines.

    Args:
        path: Destination text file; it is opened in "w" mode, so any
            existing content is replaced.
        data: Iterable of ``(sentence, {"entities": [...]})`` pairs; the
            entities list is handed to ``offsets_to_biluo_tags`` unchanged.

    Each token is written on its own line followed by its BILUO tag, and an
    empty line is written after every sentence.
    """
    # Explicit utf-8 so the output does not depend on the platform's default
    # encoding (non-ASCII tokens would otherwise fail or differ per machine).
    with open(path, "w", encoding="utf-8") as f:
        for sent, tags in data:
            doc = nlp(sent)
            biluo = offsets_to_biluo_tags(doc, tags['entities'])
            for word, tag in zip(doc, biluo):
                f.write(f"{word} {tag}\n")
            # Blank line marks the end of the sentence.
            f.write("\n")


# One output file per split, written in train / test / validation order.
for flair_path, flair_data in (
    ("data/flairtrain.txt", TRAIN_DATA),
    ("data/flairtest.txt", TEST_DATA),
    ("data/flairval.txt", VALIDATION_DATA),
):
    write_flair_file(flair_path, flair_data)