In [1]:
import spacy
import random
import time
import numpy as np
from spacy.util import minibatch, compounding

In [2]:
from os import path, mkdir
if not path.isdir("data/"):
    mkdir("data/")
if not path.isdir("models/"):
    mkdir("models/")

In [3]:
!curl https://groups.csail.mit.edu/sls/downloads/movie/engtest.bio -o data/test.txt
!curl https://groups.csail.mit.edu/sls/downloads/movie/engtrain.bio -o data/train.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0
  0  246k    0     0    0     0      0      0 --:--:--  0:00:05 --:--:--     0
 12  246k   12 32768    0     0   5461      0  0:00:46  0:00:06  0:00:40  5363
 71  246k   71  176k    0     0  25746      0  0:00:09  0:00:07  0:00:02 29803
100  246k  100  246k    0     0  36090      0  0:00:07  0:00:07 --:--:-- 51327
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   T

In [4]:
def load_data_spacy(file_path):
    ''' Converts data from:
    label \t word \n label \t word \n \n label \t word
    to: sentence, {entities : [(start, end, label), (stard, end, label)]}
    '''
    file = open(file_path, 'r')
    training_data, entities, sentence, unique_labels = [], [], [], []
    current_annotation = None
    end = 0 # initialize counter to keep track of start and end characters
    for line in file:
        line = line.strip("\n").split("\t")
        # lines with len > 1 are words
        if len(line) > 1:
            label = line[0][2:]     # the .txt is formatted: label \t word, label[0:2] = label_type
            label_type = line[0][0] # beginning of annotations - "B", intermediate - "I"
            word = line[1]
            sentence.append(word)
            end += (len(word) + 1)  # length of the word + trailing space
           
            if label_type != 'I' and current_annotation:  # if at the end of an annotation
                entities.append((start, end - 2 - len(word), current_annotation))  # append the annotation
                current_annotation = None                 # reset the annotation
            if label_type == 'B':                         # if beginning new annotation
                start = end - len(word) - 1  # start annotation at beginning of word
                current_annotation = label   # append the word to the current annotation
            if label_type == 'I':            # if the annotation is multi-word
                current_annotation = label   # append the word
           
            if label != 'O' and label not in unique_labels:
                unique_labels.append(label)
 
        # lines with len == 1 are breaks between sentences
        if len(line) == 1:
            if current_annotation:
                entities.append((start, end - 1, current_annotation))
            sentence = " ".join(sentence)
            training_data.append([sentence, {'entities' : entities}])
            # reset the counters and temporary lists
            end = 0            
            entities, sentence = [], []
            current_annotation = None
    file.close()
    return training_data, unique_labels            
           
TRAIN_DATA, LABELS = load_data_spacy("data/train.txt")


In [13]:
TRAIN_DATA

[['what was the cast of the league of extraordinary gentlemen',
  {'entities': [(21, 58, 'TITLE')]}],
 ['list a 1960 s science fiction movie',
  {'entities': [(7, 11, 'YEAR'), (14, 29, 'GENRE')]}],
 ['is there a helena ruzickova 1940 movie about living dead',
  {'entities': [(11, 27, 'ACTOR'), (28, 32, 'YEAR'), (45, 56, 'PLOT')]}],
 ['what movies pair tom hanks and meg ryan together',
  {'entities': [(17, 26, 'ACTOR'), (31, 39, 'ACTOR')]}],
 ['is there a mystery film which director was robert zemeckis',
  {'entities': [(11, 18, 'GENRE'), (43, 58, 'DIRECTOR')]}],
 ['are there any g rated thrillers',
  {'entities': [(14, 21, 'RATING'), (22, 31, 'GENRE')]}],
 ['show me action movies with jude law', {'entities': [(27, 35, 'ACTOR')]}],
 ['who played ash in army of darkness',
  {'entities': [(11, 14, 'CHARACTER'), (18, 34, 'TITLE')]}],
 ['i am looking for documentaries about family',
  {'entities': [(17, 43, 'GENRE')]}],
 ['name a film with elton john in it', {'entities': [(17, 27, 'ACTOR')]

In [15]:
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [16]:
df=pd.DataFrame(TRAIN_DATA)

In [18]:
df.to_csv("test_data.csv")

In [14]:
LABELS

['',
 'ACTOR',
 'YEAR',
 'TITLE',
 'GENRE',
 'DIRECTOR',
 'SONG',
 'PLOT',
 'REVIEW',
 'CHARACTER',
 'RATING',
 'RATINGS_AVERAGE',
 'TRAILER']

In [5]:
[x[0] for x in TRAIN_DATA[1:10]]

['show me films with drew barrymore from the 1980s',
 'what movies starred both al pacino and robert deniro',
 'find me all of the movies that starred harold ramis and bill murray',
 'find me a movie with a quote about baseball in it',
 'what movies have mississippi in the title',
 'show me science fiction films directed by steven spielberg',
 'do you have any thrillers directed by sofia coppola',
 'what leonard cohen songs have been used in a movie',
 'show me films elvis films set in hawaii']

In [6]:
[x[1] for x in TRAIN_DATA[1:10]]

[{'entities': [(19, 33, 'ACTOR'), (43, 48, 'YEAR')]},
 {'entities': [(25, 34, 'ACTOR'), (39, 52, 'ACTOR')]},
 {'entities': [(39, 51, 'ACTOR'), (56, 67, 'ACTOR')]},
 {'entities': []},
 {'entities': [(17, 28, 'TITLE')]},
 {'entities': [(8, 29, 'GENRE'), (42, 58, 'DIRECTOR')]},
 {'entities': [(16, 25, 'GENRE'), (38, 51, 'DIRECTOR')]},
 {'entities': [(5, 24, 'SONG')]},
 {'entities': [(14, 19, 'ACTOR'), (26, 39, 'PLOT')]}]

In [7]:
import sys
!{sys.executable} -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.5-py3-none-any.whl size=12011743 sha256=11f79dcf40d1069ef4ac854fc4b0b763e669b08058d4d7488adfa56e4a1f4437
  Stored in directory: C:\Users\sarka\AppData\Local\Temp\pip-ephem-wheel-cache-3y0hmu1_\wheels\15\d1\af\882f77dfe853f3df1f661fc72934bfd8376d52f264f4223f6f
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.2.5
symbolic link created for d:\tf\lib\site-packages\spacy\data\en <<===>> d:\tf\lib\site-packages\en_core_web_sm
[+] Download and installation successful
You can now load the model via spacy.load('en_c

In [8]:
from spacy import displacy
import warnings
warnings.filterwarnings("ignore")
nlp = spacy.load('en')
TEST_DATA, _ = load_data_spacy("data/test.txt")

test_sentences = [x[0] for x in TEST_DATA[0:15]] # extract the sentences from [sentence, entity]
for x in test_sentences:
    doc = nlp(x)
    displacy.render(doc, jupyter = True, style = "ent")
warnings.filterwarnings("default")

In [9]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
   
    # Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)

    # Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(          
                    texts,
                    annotations,
                    drop = dropout,  
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp

# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,6)
ner.to_disk("models/spacy_example")

  return f(*args, **kwds)


Iteration 1 Loss: {'ner': 19337.692373875634}
Iteration 2 Loss: {'ner': 12994.5199235371}
Iteration 3 Loss: {'ner': 11303.230611235022}
Iteration 4 Loss: {'ner': 9950.893918391652}
Iteration 5 Loss: {'ner': 9036.169604277053}
Iteration 6 Loss: {'ner': 8518.114959996969}
Completed in 962 seconds


  srsly.json_dumps(self.meta)
  writer(path / key)


In [10]:
from spacy import displacy

def load_model(model_path):
    ''' Loads a pre-trained model for prediction on new test sentences
   
    model_path : directory of model saved by spacy.to_disk
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    ner = nlp.from_disk(model_path)
    return ner

ner = load_model("models/spacy_example")

TEST_DATA, _ = load_data_spacy("data/test.txt")

test_sentences = [x[0] for x in TEST_DATA[0:15]] # extract the sentences from [sentence, entity]
for x in test_sentences:
    doc = ner(x)
    displacy.render(doc, jupyter = True, style = "ent")


In [11]:
def calc_precision(pred, true):        
    precision = len([x for x in pred if x in true]) / (len(pred) + 1e-20) # true positives / total pred
    return precision

def calc_recall(pred, true):
    recall = len([x for x in true if x in pred]) / (len(true) + 1e-20)    # true positives / total test
    return recall

def calc_f1(precision, recall):
    f1 = 2 * ((precision * recall) / (precision + recall + 1e-20))
    return f1


In [12]:
from itertools import chain

# run the predictions on each sentence in the test dataset, and return the spacy object
preds = [ner(x[0]) for x in TEST_DATA]

precisions, recalls, f1s = [], [], []

# iterate over predictions and test data and calculate precision, recall, and F1-score
for pred, true in zip(preds, TEST_DATA):
    true = [x[2] for x in list(chain.from_iterable(true[1].values()))] # x[2] = annotation, true[1] = (start, end, annot)
    pred = [i.label_ for i in pred.ents] # i.label_ = annotation label, pred.ents = list of annotations
    precision = calc_precision(true, pred)
    precisions.append(precision)
    recall = calc_recall(true, pred)
    recalls.append(recall)
    f1s.append(calc_f1(precision, recall))
   
print("Precision: {} \nRecall: {} \nF1-score: {}".format(np.around(np.mean(precisions), 3),
                                                         np.around(np.mean(recalls), 3),
                                                         np.around(np.mean(f1s), 3)))


Precision: 0.873 
Recall: 0.882 
F1-score: 0.869


In [21]:
TEST_DATA

[['are there any good romantic comedies out right now',
  {'entities': [(19, 36, 'GENRE'), (41, 50, 'YEAR')]}],
 ['show me a movie about cars that talk', {'entities': [(22, 36, 'PLOT')]}],
 ['list the five star rated movies starring mel gibson',
  {'entities': [(9, 18, 'RATINGS_AVERAGE'), (41, 51, 'ACTOR')]}],
 ['what science fiction films have come out recently',
  {'entities': [(5, 20, 'GENRE'), (41, 49, 'YEAR')]}],
 ['did the same director make all of the harry potter movies',
  {'entities': [(38, 50, 'TITLE')]}],
 ['show me 1980s action movies',
  {'entities': [(8, 13, 'YEAR'), (14, 20, 'GENRE')]}],
 ['what is the name of the third movie in the star trek series',
  {'entities': [(43, 59, 'TITLE')]}],
 ['can you get a soundtrac for the harry potter films',
  {'entities': [(14, 23, 'SONG'), (32, 50, 'TITLE')]}],
 ['find me science fiction movies since 2005',
  {'entities': [(8, 23, 'GENRE'), (31, 41, 'YEAR')]}],
 ['what is the most current movie featuring mat damon',
  {'entities': [

In [27]:
xy=ner("hello how are you cristopher")

In [28]:
xy.ents

(hello how are you cristopher,)

In [29]:
print('Entities', [(ent.text, ent.label_) for ent in xy.ents])

Entities [('hello how are you cristopher', 'TITLE')]
