[View in Colaboratory](https://colab.research.google.com/github/prikulkarni/nlp/blob/master/spacy_ner_colab.ipynb)

### Install spacy

In [5]:
!pip install -U spacy
!python -m spacy download en

Requirement already up-to-date: spacy in /usr/local/lib/python3.6/dist-packages (2.0.12)

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')



### Get access to google drive

In [6]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

Preconfiguring packages ...
Selecting previously unselected package cron.
(Reading database ... 18408 files and directories currently installed.)
Preparing to unpack .../00-cron_3.0pl1-128ubuntu5_amd64.deb ...
Unpacking cron (3.0pl1-128ubuntu5) ...
Selecting previously unselected package libapparmor1:amd64.
Preparing to unpack .../01-libapparmor1_2.11.0-2ubuntu17.1_amd64.deb ...
Unpacking libapparmor1:amd64 (2.11.0-2ubuntu17.1) ...
Selecting previously unselected package libdbus-1-3:amd64.
Preparing to unpack .../02-libdbus-1-3_1.10.22-1ubuntu1_amd64.deb ...
Unpacking libdbus-1-3:amd64 (1.10.22-1ubuntu1) ...
Selecting previously unselected package dbus.
Preparing to unpack .../03-dbus_1.10.22-1ubuntu1_amd64.deb ...
Unpacking dbus (1.10.22-1ubuntu1) ...
Selecting previously unselected package dirmngr.
Preparing to unpack .../04-dirmngr_2.1.15-1ubuntu8.1_amd64.deb ...
Unpacking dirmngr (2.1.15-1ubuntu8.1) ...
Selecting previously unselected package distro-info-data.
Preparing to unpack .

In [0]:
!mkdir -p drive
!google-drive-ocamlfuse drive

In [8]:
!ls

adc.json  drive  sample_data


In [9]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [10]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 6975725286382621616, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11281989632
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 16304593802951983396
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"]

In [11]:
!python --version

Python 3.6.3


### Customised NER

In [22]:
import spacy
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random

def offseter(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)

#nlp = spacy.blank('en')
nlp = spacy.load('en')
#output_dir = "drive/models/angela"
#nlp = spacy.load(output_dir)
#print("Loading from", output_dir)

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

matcher = PhraseMatcher(nlp.vocab)

#ner.add_label(label)

label = 'CHANCEL'
for i in ['Angela Merkel', 'Angela', 'Merkel',]:
    matcher.add(label, None, nlp(i))


# label1 = 'PRESID'
# for i in ['Vladimir Putin', 'Vladimir', 'Putin',]:
#     matcher.add(label1, None, nlp(i))

ner.add_label(label)
#ner.add_label(label1)

res = []
to_train_ents = []
with open('drive/data/angela_merkel.txt') as am:
    line = True
    while line:
        line = am.readline()
        mnlp_line = nlp(line)
        matches = matcher(mnlp_line)
        res = [offseter(label, mnlp_line, x)
               for x
               in matches]
        to_train_ents.append((line, dict(entities=res)))
#         res1 = [offseter(label1, mnlp_line, x)
#                for x
#                in matches]
#         to_train_ents.append((line, dict(entities=res1)))
        
@plac.annotations(
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path))
def train(new_model_name='angela', output_dir=None):
  
    optimizer = nlp.begin_training()
    
    other_pipes = [pipe
                   for pipe
                   in nlp.pipe_names
                   if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(20):
            losses = {}
            random.shuffle(to_train_ents)
            for item in to_train_ents:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print(losses)

    if output_dir is None:
        output_dir = "drive/models/angela"
        
    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()
        
    nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
        
    random.shuffle(to_train_ents)

    test_text = to_train_ents[0][0]
    #test_text = "Merkel had a meeting with Putin."
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

train()

{'ner': 173.74445107543806}
{'ner': 102.9416562527794}
{'ner': 93.00138173953404}
{'ner': 92.33353682925804}
{'ner': 99.63629841558858}
{'ner': 101.31497825984094}
{'ner': 98.86415659651185}
{'ner': 105.24894067756175}
{'ner': 93.71357046286384}
{'ner': 92.93112548411149}
{'ner': 80.39163961015095}
{'ner': 87.70076541640069}
{'ner': 99.08991635996188}
{'ner': 106.26035018736168}
{'ner': 102.69900018218374}
{'ner': 79.65151813520843}
{'ner': 93.38510100399553}
{'ner': 98.30610358305337}
{'ner': 97.47421664615672}
{'ner': 97.6742594244663}
Entities in 'Chancellor Angela Merkel said Saturday she expected "no special" outcomes from her largely private "working meeting" with Putin at Meseberg Palace, a German state guest house an hour's drive north of Berlin.
'
CHANCEL Merkel


In [21]:
import spacy
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random

def offseter(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)

output_dir = "drive/models/angela"
nlp = spacy.load(output_dir)
print("Loading from", output_dir)

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

matcher = PhraseMatcher(nlp.vocab)

# label1 = 'PRESID'
# for i in ['Vladimir Putin', 'Vladimir', 'Putin',]:
#     matcher.add(label1, None, nlp(i))

# ner.add_label(label1)

res = []
to_train_ents = []
with open('drive/data/angela_merkel.txt') as am:
    line = True
    while line:
        line = am.readline()
        mnlp_line = nlp(line)
        matches = matcher(mnlp_line)
        res = [offseter(label, mnlp_line, x)
               for x
               in matches]
        to_train_ents.append((line, dict(entities=res)))
        res1 = [offseter(label1, mnlp_line, x)
               for x
               in matches]
        to_train_ents.append((line, dict(entities=res1)))
        
# @plac.annotations(
#     new_model_name=("New model name for model meta.", "option", "nm", str),
#     output_dir=("Optional output directory", "option", "o", Path))
def train():
  
    optimizer = nlp.begin_training()
    
    other_pipes = [pipe
                   for pipe
                   in nlp.pipe_names
                   if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(20):
            losses = {}
            random.shuffle(to_train_ents)
            for item in to_train_ents:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print(losses)

    output_dir = "drive/models/angela"
        
#     noutput_dir = Path(output_dir)
#     if not noutput_dir.exists():
#         noutput_dir.mkdir()
        
    #nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
        
    random.shuffle(to_train_ents)

    test_text = to_train_ents[0][0]
    #test_text = "Merkel had a meeting with Putin."
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

train()

Loading from drive/models/angela
{'ner': 3.4321834452984623}
{'ner': 1.7412152510469334e-10}
{'ner': 9.065124844037724e-13}
{'ner': 2.514983265468342e-12}
{'ner': 4.412116847037798e-12}
{'ner': 7.144525218460551e-10}
{'ner': 2.662367437908956e-12}
{'ner': 7.563513025217692e-09}
{'ner': 5.762896914477426e-15}
{'ner': 1.6698920223583376e-11}
{'ner': 5.895119745754372e-12}
{'ner': 9.33615474315164e-12}
{'ner': 3.609067579151486e-11}
{'ner': 1.9425484491820584e-13}
{'ner': 2.0662835726337913e-10}
{'ner': 1.0116392631618534e-12}
{'ner': 3.170202931342149e-09}
{'ner': 2.0620950342310205e-14}
{'ner': 4.0795843403088606e-13}
{'ner': 1.305186358867908e-13}
Entities in '
'


In [22]:
import spacy 
#from spacy.lang.en import English
#nlp = English().from_disk('drive/models/angela')
output_dir = "drive/models/angela"
print("Loading from", output_dir)
nlp = spacy.load(output_dir)
#nlp.entity.cfg['extra_labels']
print(nlp.entity.move_names)
test_text = "German Chancellor Angela Merkel has a meeting with Russian President Putin"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
  print(ent.label_, ent.text)


Loading from drive/models/angela
['B-PERSON', 'B-CARDINAL', 'B-ORG', 'B-GPE', 'B-FAC', 'B-MONEY', 'B-NORP', 'B-DATE', 'B-TIME', 'B-ORDINAL', 'B-PERCENT', 'B-PRODUCT', 'B-LANGUAGE', 'B-LOC', 'B-QUANTITY', 'B-WORK_OF_ART', 'B-EVENT', 'B-LAW', 'I-PERSON', 'I-CARDINAL', 'I-ORG', 'I-GPE', 'I-FAC', 'I-MONEY', 'I-NORP', 'I-DATE', 'I-TIME', 'I-ORDINAL', 'I-PERCENT', 'I-PRODUCT', 'I-LANGUAGE', 'I-LOC', 'I-QUANTITY', 'I-WORK_OF_ART', 'I-EVENT', 'I-LAW', 'L-PERSON', 'L-CARDINAL', 'L-ORG', 'L-GPE', 'L-FAC', 'L-MONEY', 'L-NORP', 'L-DATE', 'L-TIME', 'L-ORDINAL', 'L-PERCENT', 'L-PRODUCT', 'L-LANGUAGE', 'L-LOC', 'L-QUANTITY', 'L-WORK_OF_ART', 'L-EVENT', 'L-LAW', 'U-PERSON', 'U-CARDINAL', 'U-ORG', 'U-GPE', 'U-FAC', 'U-MONEY', 'U-NORP', 'U-DATE', 'U-TIME', 'U-ORDINAL', 'U-PERCENT', 'U-PRODUCT', 'U-LANGUAGE', 'U-LOC', 'U-QUANTITY', 'U-WORK_OF_ART', 'U-EVENT', 'U-LAW', 'O', 'B-CHANCEL', 'I-CHANCEL', 'L-CHANCEL', 'U-CHANCEL', 'B-PRESID', 'I-PRESID', 'L-PRESID', 'U-PRESID']
Entities in 'German Chancellor An

### Spacy Documentation code

In [19]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy


# training data
TRAIN_DATA = [
    ('Who is Shakira?', {
        'entities': [(7, 14, 'PERSON')]
    }),
    ('I like Europe and Asia.', {
        'entities': [(7, 13, 'LOC'), (18, 22, 'LOC')]
    })
]

#testing data
TEST_DATA = 'Shakira is a singer. She is popular in both Europe and Asia'


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
        
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
            
    # test the trained model
    #for text, _ in TEST_DATA:
    text = TEST_DATA
    doc = nlp(text)
    print(nlp.entity.move_names)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        #for text, _ in TEST_DATA:
        doc = nlp2(text)
        print(nlp2.entity.move_names)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

            
main()

Created blank 'en' model
{'ner': 8.464841950684786}
{'ner': 9.539180871332064}
{'ner': 14.378742635250092}
{'ner': 8.862236209874027}
{'ner': 8.156067713886658}
{'ner': 8.783401463919915}
{'ner': 8.757857236324076}
{'ner': 9.635891117231788}
{'ner': 3.963297861298565}
{'ner': 5.525518167817382}
{'ner': 4.989305564356019}
{'ner': 10.916562724253527}
{'ner': 7.2199440604531615}
{'ner': 0.04891543866688441}
{'ner': 0.0004499736704655606}
{'ner': 2.0074987859583078}
{'ner': 2.000294718656887}
{'ner': 1.4683032412422963e-05}
{'ner': 1.9680268730233648}
{'ner': 1.0932478362961655}
{'ner': 3.7398193811467184}
{'ner': 3.064042616382443}
{'ner': 1.9851628033032405}
{'ner': 0.31719812750816384}
{'ner': 0.19036422763488567}
{'ner': 1.988214451646076}
{'ner': 4.1466865657052136e-05}
{'ner': 0.012779633874253803}
{'ner': 2.4967281278496412e-05}
{'ner': 7.154414071209226e-06}
{'ner': 1.3157078476760944}
{'ner': 3.9352342894508817e-13}
{'ner': 0.02238365009328181}
{'ner': 7.624213809763367e-06}
{'ner

### Dataturks annotated data - NER

In [11]:

import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None


import spacy
################### Train Spacy NER.###########
def train_spacy():

    TRAIN_DATA = convert_dataturks_to_spacy("drive/data/traindata.json")
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    #test the model and evaluate it
    examples = convert_dataturks_to_spacy("drive/data/testdata.json")
    tp=0
    tr=0
    tf=0

    ta=0
    c=0        
    for text,annot in examples:

        f=open("resume"+str(c)+".txt","w")
        doc_to_test=nlp(text)
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[]
        for ent in doc_to_test.ents:
            d[ent.label_].append(ent.text)

        for i in set(d.keys()):

            f.write("\n\n")
            f.write(i +":"+"\n")
            for j in set(d[i]):
                f.write(j.replace('\n','')+"\n")
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[0,0,0,0,0,0]
        for ent in doc_to_test.ents:
            doc_gold_text= nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
            y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]  
            if(d[ent.label_][0]==0):
                #f.write("For Entity "+ent.label_+"\n")   
                #f.write(classification_report(y_true, y_pred)+"\n")
                (p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
                a=accuracy_score(y_true,y_pred)
                d[ent.label_][0]=1
                d[ent.label_][1]+=p
                d[ent.label_][2]+=r
                d[ent.label_][3]+=f
                d[ent.label_][4]+=a
                d[ent.label_][5]+=1
        c+=1
    for i in d:
        print("\n For Entity "+i+"\n")
        print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
        print("Precision : "+str(d[i][1]/d[i][5]))
        print("Recall : "+str(d[i][2]/d[i][5]))
        print("F-score : "+str(d[i][3]/d[i][5]))
train_spacy()

Statring iteration 0
{'ner': 1638.9786479917047}
Statring iteration 1
{'ner': 605.3991683064176}
Statring iteration 2
{'ner': 535.7001255856526}
Statring iteration 3
{'ner': 431.86820218492323}
Statring iteration 4
{'ner': 365.46321266512564}
Statring iteration 5
{'ner': 259.6570975410603}
Statring iteration 6
{'ner': 251.85436714328213}
Statring iteration 7
{'ner': 211.92943803246865}
Statring iteration 8
{'ner': 226.48755159647297}
Statring iteration 9
{'ner': 146.140918776508}


  'recall', 'true', average, warn_for)



 For Entity Name

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Location

Accuracy : 99.46524064171123%
Precision : 0.9946810644748985
Recall : 0.9946524064171123
F-score : 0.9931741475187439

 For Entity Designation

Accuracy : 98.6096256684492%
Precision : 0.9862899851551261
Recall : 0.986096256684492
F-score : 0.9808288914349038

 For Entity Graduation Year

Accuracy : 99.89304812834224%
Precision : 1.0
Recall : 0.9989304812834224
F-score : 0.9994649545211343

 For Entity College Name

Accuracy : 97.54010695187165%
Precision : 1.0
Recall : 0.9754010695187165
F-score : 0.9875473741201949

 For Entity Companies worked at

Accuracy : 99.25133689839572%
Precision : 0.9925694786382531
Recall : 0.9925133689839573
F-score : 0.9896159815431554

 For Entity Degree

Accuracy : 99.46524064171123%
Precision : 0.994681033791753
Recall : 0.9946524064171123
F-score : 0.9927497276175877
