In [4]:
import functools
import random
import time

import numpy as np
import spacy
from spacy.util import minibatch, compounding

In [27]:
from os import path, mkdir, makedirs

# Create the directory the trained model is saved into. exist_ok=True makes
# this cell safe to re-run and removes the check-then-create race that a
# separate isdir() test followed by mkdir() has.
makedirs("models3/", exist_ok=True)

In [41]:

          
# Toy NER training set. Every example is [text, {'entities': [(start, end, label)]}],
# where start/end are character offsets into text (end is exclusive).
TRAIN_DATA = [
    [
        'Xadago are too tall and they pretend to care about your feelings',
        {'entities': [(0, 6, 'DRUG')]},
    ],
    # Same sentence and annotation as the first example.
    [
        'Xadago are too tall and they pretend to care about your feelings',
        {'entities': [(0, 6, 'DRUG')]},
    ],
    [
        'Xadago pretend to care about your feelings',
        {'entities': [(0, 6, 'DRUG')]},
    ],
    [
        'they pretend to care about your feelings, those Madago',
        {'entities': [(48, 54, 'MEDI')]},
    ],
    [
        'Jadago?',
        {'entities': [(0, 6, 'MEDI')]},
    ],
]

# Every entity label that appears in the annotations above.
LABELS = ['DRUG', 'MEDI']


In [42]:
# Show the training examples as this cell's output.
TRAIN_DATA

[['Xadago are too tall and they pretend to care about your feelings',
  {'entities': [(0, 6, 'DRUG')]}],
 ['Xadago are too tall and they pretend to care about your feelings',
  {'entities': [(0, 6, 'DRUG')]}],
 ['Xadago pretend to care about your feelings',
  {'entities': [(0, 6, 'DRUG')]}],
 ['they pretend to care about your feelings, those Madago',
  {'entities': [(48, 54, 'MEDI')]}],
 ['Jadago?', {'entities': [(0, 6, 'MEDI')]}]]

In [48]:
# A simple decorator to log function processing time
def timer(method):
    ''' Wrap a callable so that every call prints how long it took

    method : the callable to time

    Returns a wrapper that forwards all positional and keyword arguments to
    method, prints "Completed in N seconds" (N truncated to whole seconds)
    once the call returns, and hands back method's return value. If method
    raises, the exception propagates and nothing is printed.
    '''
    # functools.wraps copies __name__ / __doc__ from the wrapped function, so
    # help() and tracebacks keep referring to it instead of "timed".
    @functools.wraps(method)
    def timed(*args, **kw):
        # perf_counter is monotonic: the measured duration cannot go negative
        # or jump if the system clock is adjusted mid-call.
        ts = time.perf_counter()
        result = method(*args, **kw)
        te = time.perf_counter()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [(start, end, label)]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data

    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console;
                   0 or None turns the loss logging off

    Returns the trained spaCy Language object. train_data itself is left in
    its original order: shuffling happens on a private copy.
    '''
    # NOTE(review): create_pipe / add_pipe(component) / begin_training /
    # update(texts, annotations) follow the spaCy v2 training API - confirm
    # against the installed spaCy version before upgrading.
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # Keep `ner` bound even when the pipeline already contains the pipe.
        ner = nlp.get_pipe('ner')

    # Add entity labels to the NER pipeline
    for label in labels:
        ner.add_label(label)

    # Shuffle a shallow copy so the caller's list is not reordered in place.
    examples = list(train_data)

    # Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(examples) # shuffle the training data before each iteration
            losses = {} # accumulated by nlp.update over all batches of this iteration
            batches = minibatch(examples, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            # Guard against display_freq == 0, which would otherwise raise
            # ZeroDivisionError only after a full epoch of training.
            if display_freq and itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp

# Fit the NER model on the toy data for 20 iterations, then persist the
# resulting pipeline under models3/. Note that `ner` holds the whole spaCy
# pipeline object returned by train_spacy, not only the NER component.
ner = train_spacy(TRAIN_DATA, LABELS, iterations = 20)
ner.to_disk("models3/spacy_example")

Iteration 1 Loss: {'ner': 38.425674080848694}
Iteration 2 Loss: {'ner': 36.328261494636536}
Iteration 3 Loss: {'ner': 30.211132168769836}
Iteration 4 Loss: {'ner': 18.654891654849052}
Iteration 5 Loss: {'ner': 9.92096258699894}
Iteration 6 Loss: {'ner': 7.449768096801563}
Iteration 7 Loss: {'ner': 8.043009520855776}
Iteration 8 Loss: {'ner': 6.579349713846227}
Iteration 9 Loss: {'ner': 5.001892689098895}
Iteration 10 Loss: {'ner': 3.737800239570788}
Iteration 11 Loss: {'ner': 3.6278371312773743}
Iteration 12 Loss: {'ner': 2.085963350898055}
Iteration 13 Loss: {'ner': 1.8078734143759903}
Iteration 14 Loss: {'ner': 1.146323315635975}
Iteration 15 Loss: {'ner': 0.9286841761459457}
Iteration 16 Loss: {'ner': 0.5950170991061035}
Iteration 17 Loss: {'ner': 0.2879956646547648}
Iteration 18 Loss: {'ner': 0.17613519543920816}
Iteration 19 Loss: {'ner': 0.0034951271274366036}
Iteration 20 Loss: {'ner': 0.004195592513404733}
Completed in 3 seconds


In [57]:
from spacy import displacy

def load_model(model_path):
    ''' Loads a pre-trained model for prediction on new test sentences

    model_path : directory of model saved by spacy.to_disk

    Returns the loaded spaCy Language object; call it on a string to get a
    Doc whose .ents holds the predicted entities.
    '''
    # spacy.load rebuilds the pipeline from the metadata stored next to the
    # saved weights, so the language and component list do not have to be
    # re-declared (and kept in sync with the training code) by hand here.
    return spacy.load(model_path)

# Reload the saved pipeline from disk; this rebinds `ner` to the loaded model.
ner = load_model("models3/spacy_example")

# NOTE(review): disabled leftover - load_data_spacy is not defined in the visible cells, so no held-out test set is loaded.
#TEST_DATA, _ = load_data_spacy("data/test.txt")




In [58]:
# Run the loaded pipeline on one sentence; `m` is the processed document whose .ents are read in the next cell.
m=ner("Xadago pretend to care about your feelings")

In [59]:
# Show each predicted entity as a (surface text, label) pair
print('Entities', [(span.text, span.label_) for span in m.ents])

Entities [('Xadago', 'DRUG')]


In [60]:
# Keep only the raw sentence from every [sentence, annotations] example
test_sentences = [example[0] for example in TRAIN_DATA]

In [61]:
# Run the model on every sentence and render the predicted entities inline in the notebook.
# NOTE(review): test_sentences is built from TRAIN_DATA, so this shows fit on the training examples, not generalization.
for x in test_sentences:
    doc = ner(x)
    displacy.render(doc, jupyter = True, style = "ent")

In [62]:
def calc_precision(pred, true):
    ''' Fraction of predicted items that are correct (true positives / total pred)

    pred : list of predicted items (e.g. entity labels)
    true : list of gold items

    Each gold item can be matched by at most one prediction, so repeating a
    prediction does not inflate the score (pred=['A', 'A'], true=['A'] gives
    0.5, not 1.0). Returns 0.0 when pred is empty.
    '''
    if len(pred) == 0:
        return 0.0
    unmatched = list(true) # copy: gold items are consumed as they get matched
    true_positives = 0
    for item in pred:
        if item in unmatched:
            unmatched.remove(item)
            true_positives += 1
    return true_positives / float(len(pred))

def calc_recall(pred, true):
    ''' Fraction of gold items that were predicted (true positives / total test)

    pred : list of predicted items (e.g. entity labels)
    true : list of gold items

    Each prediction can account for at most one gold item, so a single
    prediction does not cover repeated gold items (pred=['A'],
    true=['A', 'A'] gives 0.5, not 1.0). Returns 0.0 when true is empty.
    '''
    if len(true) == 0:
        return 0.0
    unused = list(pred) # copy: predictions are consumed as they get matched
    true_positives = 0
    for item in true:
        if item in unused:
            unused.remove(item)
            true_positives += 1
    return true_positives / float(len(true))

def calc_f1(precision, recall):
    ''' Harmonic mean of precision and recall

    precision : precision score
    recall : recall score

    The 1e-20 term in the denominator keeps the division defined when both
    scores are 0, in which case the result is 0.0.
    '''
    product = precision * recall
    total = precision + recall + 1e-20
    return 2 * (product / total)


In [63]:
from itertools import chain

# run the predictions on each sentence in the test dataset, and return the spacy object
# NOTE(review): the sentences scored here are TRAIN_DATA itself, so the numbers
# below measure fit on the training examples, not generalization.
preds = [ner(x[0]) for x in TRAIN_DATA]

precisions, recalls, f1s = [], [], []

# iterate over predictions and gold data and calculate per-sentence precision, recall, and F1-score
for pred_doc, example in zip(preds, TRAIN_DATA):
    # example = [sentence, {'entities': [(start, end, label)]}]; keep only the labels
    true_labels = [entity[2] for entity in example[1]['entities']]
    pred_labels = [ent.label_ for ent in pred_doc.ents] # ent.label_ = predicted annotation label
    # calc_precision / calc_recall are declared as (pred, true): passing the
    # lists the other way round reports recall as precision and vice versa.
    precision = calc_precision(pred_labels, true_labels)
    precisions.append(precision)
    recall = calc_recall(pred_labels, true_labels)
    recalls.append(recall)
    f1s.append(calc_f1(precision, recall))

# Macro-average: mean of the per-sentence scores, rounded to 3 decimals
print("Precision: {} \nRecall: {} \nF1-score: {}".format(np.around(np.mean(precisions), 3),
                                                         np.around(np.mean(recalls), 3),
                                                         np.around(np.mean(f1s), 3)))


Precision: 1.0 
Recall: 1.0 
F1-score: 1.0
