https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7

In [2]:
import spacy
import pandas as pd


In [8]:
dataFrame = pd.read_csv("ner_dataset.csv", encoding='latin-1')

In [29]:
df = dataFrame.head(56)

In [30]:
def process_data(df):
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, tag

In [31]:
process_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


(array([list(['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']),
        list(['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined', 'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans', 'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop', 'the', 'Bombings', '.', '"']),
        list(['They', 'marched'])], dtype=object),
 array([list(['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']),
        list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
        list(['O', 'O'])], dtype=object))

In [28]:
dataFrame.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [32]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

In [33]:
TRAIN_DATA = [
    ('Who is Nishanth?', {
        'entities': [(7, 15, 'PERSON')]
    }),
     ('Who is Kamal Khumar?', {
        'entities': [(7, 19, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

In [34]:
model = None
output_dir=Path("ner")
n_iter=100

In [35]:
#load the model

if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

#set up the pipeline

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

Created blank 'en' model


In [36]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 48.22it/s]
100%|██████████| 3/3 [00:00<00:00, 74.95it/s]
100%|██████████| 3/3 [00:00<00:00, 80.89it/s]
100%|██████████| 3/3 [00:00<00:00, 76.86it/s]
100%|██████████| 3/3 [00:00<00:00, 81.45it/s]
100%|██████████| 3/3 [00:00<00:00, 80.48it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 13.317753493785858}
{'ner': 12.119819581508636}
{'ner': 11.142244428396225}
{'ner': 10.148663520812988}
{'ner': 7.97254104167223}
{'ner': 7.1288202833384275}


100%|██████████| 3/3 [00:00<00:00, 70.71it/s]
100%|██████████| 3/3 [00:00<00:00, 58.75it/s]
100%|██████████| 3/3 [00:00<00:00, 74.45it/s]
100%|██████████| 3/3 [00:00<00:00, 64.27it/s]
100%|██████████| 3/3 [00:00<00:00, 70.46it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.6652601305395365}
{'ner': 6.044148804736324}
{'ner': 6.832673414144665}
{'ner': 6.5430102195823565}
{'ner': 6.306748951901682}


100%|██████████| 3/3 [00:00<00:00, 60.78it/s]
100%|██████████| 3/3 [00:00<00:00, 69.17it/s]
100%|██████████| 3/3 [00:00<00:00, 75.32it/s]
100%|██████████| 3/3 [00:00<00:00, 64.60it/s]
100%|██████████| 3/3 [00:00<00:00, 69.17it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.776247605797835}
{'ner': 5.425092695048079}
{'ner': 4.591697333846241}
{'ner': 3.6688542225165293}
{'ner': 5.114266253964161}


100%|██████████| 3/3 [00:00<00:00, 78.01it/s]
100%|██████████| 3/3 [00:00<00:00, 75.51it/s]
100%|██████████| 3/3 [00:00<00:00, 79.00it/s]
100%|██████████| 3/3 [00:00<00:00, 79.97it/s]
100%|██████████| 3/3 [00:00<00:00, 79.70it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.245966509217396}
{'ner': 3.4251580655118232}
{'ner': 3.6462248960888246}
{'ner': 5.7501596358197276}
{'ner': 4.427834743284052}


100%|██████████| 3/3 [00:00<00:00, 59.43it/s]
100%|██████████| 3/3 [00:00<00:00, 71.94it/s]
100%|██████████| 3/3 [00:00<00:00, 79.43it/s]
100%|██████████| 3/3 [00:00<00:00, 81.88it/s]
100%|██████████| 3/3 [00:00<00:00, 71.94it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.047945205211363}
{'ner': 3.871515908014999}
{'ner': 3.919908962029524}
{'ner': 2.293411441628905}
{'ner': 2.2003575233782726}


100%|██████████| 3/3 [00:00<00:00, 78.47it/s]
100%|██████████| 3/3 [00:00<00:00, 82.18it/s]
100%|██████████| 3/3 [00:00<00:00, 79.86it/s]
100%|██████████| 3/3 [00:00<00:00, 80.27it/s]
100%|██████████| 3/3 [00:00<00:00, 77.32it/s]
100%|██████████| 3/3 [00:00<00:00, 75.23it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.182798518684706}
{'ner': 1.3132185123467934}
{'ner': 1.8508750866267718}
{'ner': 1.5298049741852189}
{'ner': 1.4373662451419666}
{'ner': 1.2785954473490992}


100%|██████████| 3/3 [00:00<00:00, 72.57it/s]
100%|██████████| 3/3 [00:00<00:00, 76.77it/s]
100%|██████████| 3/3 [00:00<00:00, 82.92it/s]
100%|██████████| 3/3 [00:00<00:00, 81.78it/s]
100%|██████████| 3/3 [00:00<00:00, 69.65it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.330505256601522}
{'ner': 0.7372974952192792}
{'ner': 0.8946849017561925}
{'ner': 1.2896130412129194}
{'ner': 0.5467738652193446}


100%|██████████| 3/3 [00:00<00:00, 66.12it/s]
100%|██████████| 3/3 [00:00<00:00, 71.34it/s]
100%|██████████| 3/3 [00:00<00:00, 81.06it/s]
100%|██████████| 3/3 [00:00<00:00, 80.75it/s]
100%|██████████| 3/3 [00:00<00:00, 81.71it/s]
100%|██████████| 3/3 [00:00<00:00, 78.98it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.5461249794129919}
{'ner': 1.8683437221252204}
{'ner': 0.8823877909976808}
{'ner': 2.449752368150347}
{'ner': 1.4724401673417098}
{'ner': 0.1890140961215873}


100%|██████████| 3/3 [00:00<00:00, 72.73it/s]
100%|██████████| 3/3 [00:00<00:00, 71.88it/s]
100%|██████████| 3/3 [00:00<00:00, 80.22it/s]
100%|██████████| 3/3 [00:00<00:00, 76.52it/s]
100%|██████████| 3/3 [00:00<00:00, 79.65it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.0235889843135275}
{'ner': 0.7491107530193777}
{'ner': 0.005551131952855893}
{'ner': 0.35833845343612386}
{'ner': 0.013424686670155336}


100%|██████████| 3/3 [00:00<00:00, 78.25it/s]
100%|██████████| 3/3 [00:00<00:00, 83.10it/s]
100%|██████████| 3/3 [00:00<00:00, 79.64it/s]
100%|██████████| 3/3 [00:00<00:00, 86.95it/s]
100%|██████████| 3/3 [00:00<00:00, 82.24it/s]
100%|██████████| 3/3 [00:00<00:00, 81.84it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0003207497052871136}
{'ner': 0.013376293522545149}
{'ner': 0.0277798999572299}
{'ner': 0.007128732277459584}
{'ner': 0.0002743818835364948}
{'ner': 0.0003179615532773738}


100%|██████████| 3/3 [00:00<00:00, 82.27it/s]
100%|██████████| 3/3 [00:00<00:00, 83.55it/s]
100%|██████████| 3/3 [00:00<00:00, 80.80it/s]
100%|██████████| 3/3 [00:00<00:00, 77.54it/s]
100%|██████████| 3/3 [00:00<00:00, 79.67it/s]
100%|██████████| 3/3 [00:00<00:00, 80.53it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.0115581959413447e-05}
{'ner': 0.0015211673554762157}
{'ner': 1.9664212346691426e-05}
{'ner': 0.0004162777862635708}
{'ner': 0.002779375233501777}
{'ner': 0.0007966912252734382}


100%|██████████| 3/3 [00:00<00:00, 76.51it/s]
100%|██████████| 3/3 [00:00<00:00, 78.38it/s]
100%|██████████| 3/3 [00:00<00:00, 82.44it/s]
100%|██████████| 3/3 [00:00<00:00, 79.89it/s]
100%|██████████| 3/3 [00:00<00:00, 80.67it/s]
100%|██████████| 3/3 [00:00<00:00, 86.42it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0847514820156729}
{'ner': 1.2742057094235788e-06}
{'ner': 0.0023378104959059943}
{'ner': 1.0780765785097596e-05}
{'ner': 1.2021903894631077e-07}
{'ner': 2.2627411731187325e-06}


100%|██████████| 3/3 [00:00<00:00, 82.03it/s]
100%|██████████| 3/3 [00:00<00:00, 85.43it/s]
100%|██████████| 3/3 [00:00<00:00, 83.04it/s]
100%|██████████| 3/3 [00:00<00:00, 62.16it/s]
100%|██████████| 3/3 [00:00<00:00, 53.06it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.4045629820977998e-05}
{'ner': 7.432769053871193e-05}
{'ner': 0.005003195414424603}
{'ner': 0.0171814400690817}
{'ner': 2.8613166353281385e-05}


100%|██████████| 3/3 [00:00<00:00, 57.90it/s]
100%|██████████| 3/3 [00:00<00:00, 62.79it/s]
100%|██████████| 3/3 [00:00<00:00, 63.10it/s]
100%|██████████| 3/3 [00:00<00:00, 84.88it/s]
100%|██████████| 3/3 [00:00<00:00, 78.52it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0012137735595901524}
{'ner': 2.228268114784656e-06}
{'ner': 4.200158453346693e-08}
{'ner': 0.00014564561535326248}
{'ner': 2.836488098308766e-06}


100%|██████████| 3/3 [00:00<00:00, 71.53it/s]
100%|██████████| 3/3 [00:00<00:00, 77.55it/s]
100%|██████████| 3/3 [00:00<00:00, 72.93it/s]
100%|██████████| 3/3 [00:00<00:00, 76.19it/s]
100%|██████████| 3/3 [00:00<00:00, 81.31it/s]
100%|██████████| 3/3 [00:00<00:00, 80.12it/s]

{'ner': 2.6336718143419938e-05}
{'ner': 3.5643257486381937e-06}
{'ner': 5.547387742392952e-05}
{'ner': 0.002224006405558955}
{'ner': 3.0145070988877903e-05}



100%|██████████| 3/3 [00:00<00:00, 74.66it/s]
100%|██████████| 3/3 [00:00<00:00, 81.53it/s]
100%|██████████| 3/3 [00:00<00:00, 81.11it/s]
100%|██████████| 3/3 [00:00<00:00, 82.52it/s]
100%|██████████| 3/3 [00:00<00:00, 83.63it/s]
100%|██████████| 3/3 [00:00<00:00, 81.78it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.2222338219929534e-06}
{'ner': 0.00031975260760609307}
{'ner': 5.691784379430317e-08}
{'ner': 0.0003252306964599801}
{'ner': 1.5267682444378834e-05}
{'ner': 1.6868878884342997e-07}
{'ner': 2.405963902392367e-07}


100%|██████████| 3/3 [00:00<00:00, 77.51it/s]
100%|██████████| 3/3 [00:00<00:00, 80.75it/s]
100%|██████████| 3/3 [00:00<00:00, 83.18it/s]
100%|██████████| 3/3 [00:00<00:00, 81.81it/s]
100%|██████████| 3/3 [00:00<00:00, 82.90it/s]
100%|██████████| 3/3 [00:00<00:00, 83.08it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.788262474326089e-07}
{'ner': 8.089289319441023e-07}
{'ner': 1.0511700901597741e-08}
{'ner': 1.8575960879516996e-10}
{'ner': 1.1304989067051499e-09}
{'ner': 9.736886482698609e-07}


100%|██████████| 3/3 [00:00<00:00, 79.24it/s]
100%|██████████| 3/3 [00:00<00:00, 77.69it/s]
100%|██████████| 3/3 [00:00<00:00, 81.18it/s]
100%|██████████| 3/3 [00:00<00:00, 83.37it/s]
100%|██████████| 3/3 [00:00<00:00, 82.37it/s]
100%|██████████| 3/3 [00:00<00:00, 80.68it/s]

{'ner': 1.1801765026897642e-08}
{'ner': 0.001168346081720464}
{'ner': 1.8647931893978814e-07}
{'ner': 0.00039354127274736643}
{'ner': 8.090516324911744e-06}
{'ner': 1.4073367038472735e-08}





In [37]:

for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Entities [('Kamal Khumar', 'PERSON')]
Entities [('Nishanth', 'PERSON')]


In [38]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to ner
