## Building a basic spacy model

In [34]:
import re

def convert_data_to_spacy(dict_deid_surrogate_test_all_groundtruth_version2,
                          deid_surrogate_test_all_groundtruth_version2):
    
    dict_labels, dict_text = dict_deid_surrogate_test_all_groundtruth_version2, deid_surrogate_test_all_groundtruth_version2
    sapcy_training_data = []
    for key in dict_labels:
        tags = dict_labels[key]
        list_of_tags = []
        for tag in tags:    
            pattern = re.search(tag[1], dict_text[key][0][1])
            if pattern:
                list_of_tags.append((pattern.span()[0], pattern.span()[1], tag[0]))
        sapcy_training_data.append((dict_text[key][0][1], {'entities': list_of_tags}))
        
    return sapcy_training_data

In [35]:
import xml.etree.ElementTree as ET

def xml_parse_deid_surrogate_test_all_groundtruth_version2(file):
    tree = ET.parse(file)
    root = tree.getroot() 
    return {child.attrib['ID']: [(i.attrib['TYPE'], i.text) for i in child[0]] for child in root}

def xml_parse_deid_surrogate_test_all_version2(file):
    tree = ET.parse(file)
    root = tree.getroot()
    return {child.attrib['ID']: [(i.tag, i.text) for i in child] for child in root}

In [36]:
dict_labels = xml_parse_deid_surrogate_test_all_groundtruth_version2("data/deid_surrogate_test_all_groundtruth_version2.xml")
dict_text = xml_parse_deid_surrogate_test_all_version2('data/deid_surrogate_test_all_version2.xml')

In [37]:
training_data = convert_data_to_spacy(dict_labels, dict_text)

In [42]:
import spacy
import random

def train_spacy(training_data, output_dir = './'):

    TRAIN_DATA = training_data
    nlp = spacy.load('en')  
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')
    
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])
            
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  
                    [annotations], 
                    drop=0.2, 
                    sgd=optimizer, 
                    losses=losses)
            print(losses)
            
    nlp.to_disk(output_dir)
            
    return nlp

In [None]:
new_nlp = train_spacy(training_data)

Statring iteration 0
{'ner': 30777.25510596777}
Statring iteration 1
{'ner': 29202.463073653787}
Statring iteration 2
{'ner': 29082.50368124411}
Statring iteration 3
{'ner': 29070.937848038582}
Statring iteration 4
{'ner': 28679.794244414363}
Statring iteration 5


## Building a basic CRF ( RASA NLU ) model