## Setup

In [4]:
from __future__ import unicode_literals, print_function

import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import pandas as pd
import pickle
import warnings
from spacy import displacy

## Read data

In [5]:
import pickle

# Load the pre-built NER training examples from disk.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open('../data/ner_jsonl_dataset.pickle', 'rb') as pickle_file:
    dataset = pickle.load(pickle_file)

In [6]:
# Wrap the loaded examples in a DataFrame for inspection; in the cells shown
# here, training reads `dataset` directly rather than `df`.
df = pd.DataFrame(dataset)

In [7]:
# Preview the first rows: column 0 holds the raw text, column 1 a dict with an
# 'entities' list of (start, end, label) tuples.
df.head()

Unnamed: 0,0,1
0,Thousands of demonstrators have marched throug...,"{'entities': [(48, 54, 'geo'), (77, 81, 'geo')..."
1,Iranian officials say they expect to get acces...,"{'entities': [(0, 7, 'gpe'), (87, 96, 'tim'), ..."
2,"In Beirut , a string of officials voiced their...","{'entities': [(3, 9, 'geo'), (68, 82, 'org'), ..."
3,She was a longtime member of the Zapatista mov...,"{'entities': [(33, 42, 'geo')]}"
4,Saturday 's violence came a day after Iraqi fo...,"{'entities': [(0, 8, 'tim'), (38, 43, 'gpe'), ..."


In [5]:
# (rows, columns) of the example table.
# NOTE(review): the recorded output (918, 5) does not match the two-column
# frame shown by df.head() - it looks stale from an earlier run; re-run to confirm.
df.shape

(918, 5)

## Create model

In [2]:
# Training configuration
model = None  # base spaCy model to load; None -> start from a blank "en" pipeline
new_model_name = "ner_model"  # name stored in the saved model's meta
output_dir = "../models/ner_model"  # where the trained model is written; None skips saving
n_iter = 100  # number of passes over the training data

In [11]:
random.seed(0)  # fix Python's RNG so the per-iteration shuffles are repeatable
if model is None:
    # no base model configured: start from an empty English pipeline
    nlp = spacy.blank("en")
    print("Created blank 'en' model")
else:
    # continue from an existing spaCy model
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

Created blank 'en' model


## Add NER pipeline to model

In [12]:
# Reuse the pipeline's entity recognizer when it already has one, so labels
# can be added to it; otherwise build the built-in "ner" component
# (nlp.create_pipe works for built-ins registered with spaCy) and append it
# as the last pipe.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

## Add labels

In [21]:
# Register the eight entity tags with the recognizer, in a fixed order.
for label in ("geo", "tim", "gpe", "per", "org", "art", "nat", "eve"):
    ner.add_label(label)

## Train the model

In [22]:
# Train the NER component for n_iter passes over `dataset`, printing the
# loss dict after every pass.
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]  # pipes that stay enabled
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
print(other_pipes)  # shows which pipes get disabled (recorded run: none)
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():  # only train NER
    
    # show each spaCy UserWarning once instead of on every batch; the filter
    # is scoped to this `with` block by catch_warnings()
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    
    # begin_training() is only called when starting from a blank pipeline;
    # a loaded model (model is not None) is updated as-is
    if model is None:
        nlp.begin_training()
    for itn in range(n_iter):
        # NOTE: shuffles the shared `dataset` list in place on every pass
        random.shuffle(dataset)
        losses = {}  # fresh per pass; handed to nlp.update below
        # batch up the examples using spaCy's minibatch
        # batch sizes come from compounding(4.0, 32.0, 1.001) - per the spaCy
        # docs a series growing from 4 towards 32 by a factor of 1.001 per batch
        batches = minibatch(dataset, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # assumes every example is a (text, annotations) pair, as in the
            # df.head() preview
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                losses=losses,
            )
        print("Losses", losses)

[]


  **kwargs
  **kwargs


Losses {'ner': 263921.4421927165}
Losses {'ner': 250241.47440917173}
Losses {'ner': 245974.20855903166}
Losses {'ner': 243044.41729112464}
Losses {'ner': 240475.7237344407}
Losses {'ner': 239396.98484679245}
Losses {'ner': 237281.7715465767}
Losses {'ner': 236557.4139251585}
Losses {'ner': 235453.03034034223}
Losses {'ner': 234552.29836780188}
Losses {'ner': 233040.7541645741}
Losses {'ner': 233640.52691602003}
Losses {'ner': 232447.65078567847}
Losses {'ner': 231313.78144394435}
Losses {'ner': 231634.7244849312}
Losses {'ner': 230373.0302320144}
Losses {'ner': 229843.67416371347}
Losses {'ner': 229590.31484822437}
Losses {'ner': 229444.5880639447}
Losses {'ner': 229136.5777237773}
Losses {'ner': 228924.77029033474}
Losses {'ner': 229455.6061231999}
Losses {'ner': 228028.6616101425}
Losses {'ner': 228312.4036155009}
Losses {'ner': 228207.69952982935}
Losses {'ner': 227154.39907296497}
Losses {'ner': 226683.67879137257}
Losses {'ner': 226812.369536981}
Losses {'ner': 226659.6400351938}


## Save the model

In [25]:
# save model to output directory
# Persist the trained pipeline to output_dir (skipped when output_dir is None).
if output_dir is not None:
    output_dir = Path(output_dir)
    # parents=True also creates missing ancestors such as "../models" (a plain
    # mkdir() raises FileNotFoundError there); exist_ok=True makes re-running
    # the cell a no-op and avoids the exists()/mkdir() race.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.meta["name"] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to ../models/ner_model


## Load the model

In [10]:
# test the saved model
# Reload the pipeline from output_dir into a new object, nlp2, leaving the
# in-memory `nlp` untouched.
# NOTE(review): unlike the save cell, there is no guard here for
# output_dir being None - confirm that case cannot occur.
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

Loading from ../models/ner_model


## It's your turn

Test out the model you just trained. Run the code cells below, replacing `sentence` with your own text. Have fun! 🎉

Here is some inspiration:

- Obama was the president of USA.
- The 1906 San Francisco earthquake was the biggest earthquake that has ever hit San Francisco on April 18, 1906
- Next Monday is Christmas!

Can you do better? Play around with the model hyperparameters!

In [11]:
# Example input for the reloaded model; replace with any sentence you like.
sentence = (
    "The 1906 San Francisco earthquake was the biggest earthquake "
    "that has ever hit San Francisco on April 18, 1906"
)

In [12]:
from ipywidgets import interact_manual
from ipywidgets import widgets

# Run the reloaded pipeline on the example sentence and render the result
# with displaCy's "ent" (entity) style.
doc2 = nlp2(sentence)
displacy.render(doc2, style="ent")