# Using NLTK tagging:

In [4]:
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk import pos_tag
from nltk import word_tokenize
from nltk.chunk import ne_chunk


import nltk
# Fetch the 'punkt' tokenizer data that NLTK needs before word_tokenize can run.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Download the remaining NLTK data packages needed by the cells below.
for resource_name in ("averaged_perceptron_tagger", "maxent_ne_chunker"):
  nltk.download(resource_name)
# Kept as the final expression so the cell still displays the last download's result.
nltk.download("words")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [5]:
# Example sentence that is tokenized, POS-tagged and chunked with NLTK.
sentence = "Clement and Mathieu are working at Apple."

In [12]:
# Tokenize, POS-tag and chunk the sentence, one step at a time.
tokens = word_tokenize(sentence)
tagged_tokens = pos_tag(tokens)
ne_tree = ne_chunk(tagged_tokens)

In [14]:
# Flatten the chunk tree into (word, POS tag, IOB entity tag) triples and show them.
iob_tagged = tree2conlltags(ne_tree)
print(iob_tagged)

[('Clement', 'NN', 'B-GPE'), ('and', 'CC', 'O'), ('Mathieu', 'NNP', 'B-PERSON'), ('are', 'VBP', 'O'), ('working', 'VBG', 'O'), ('at', 'IN', 'O'), ('Apple', 'NNP', 'B-ORGANIZATION'), ('.', '.', 'O')]


In [15]:
# Rebuild a chunk tree from the IOB triples (the inverse conversion) and print it.
# Note: this rebinds `ne_tree`, replacing the tree produced by ne_chunk.
ne_tree = conlltags2tree(iob_tagged)
print(ne_tree)

(S
  (GPE Clement/NN)
  and/CC
  (PERSON Mathieu/NNP)
  are/VBP
  working/VBG
  at/IN
  (ORGANIZATION Apple/NNP)
  ./.)


# Stanford Named Entity Recognizer (NER)

Stanford's tagger uses the same algorithm. It is written in Java, so to use it you must first
download the JAR files (you can find them on the Stanford NER website); NLTK then offers us a
Python interface to access the tagger.

In [21]:
from nltk.tag import StanfordNERTagger

# Locations of the serialized 7-class model and of the Stanford NER jar.
stanford_model_path = '/content/english.muc.7class.distsim.crf.ser.gz'
stanford_jar_path = '/content/stanford-ner.jar'
st = StanfordNERTagger(stanford_model_path, stanford_jar_path, encoding='utf-8')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


In [22]:
# Split the example sentence on whitespace, then tag the resulting tokens.
stanford_tokens = 'Baptiste Capdeville is studying at Columbia University in NY'.split()
st.tag(stanford_tokens)

CRFClassifier invoked on Mon Aug 10 16:27:36 UTC 2020 with arguments:
   -loadClassifier /content/english.muc.7class.distsim.crf.ser.gz -textFile /tmp/tmpnyrkd4pf -outputFormat slashTags -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions "tokenizeNLs=false" -encoding utf-8
tokenizerFactory=edu.stanford.nlp.process.WhitespaceTokenizer
Unknown property: |tokenizerFactory|
tokenizerOptions="tokenizeNLs=false"
Unknown property: |tokenizerOptions|
loadClassifier=/content/english.muc.7class.distsim.crf.ser.gz
encoding=utf-8
Unknown property: |encoding|
textFile=/tmp/tmpnyrkd4pf
outputFormat=slashTags
Loading classifier from /content/english.muc.7class.distsim.crf.ser.gz ... Error deserializing /content/english.muc.7class.distsim.crf.ser.gz
Exception in thread "main" java.lang.RuntimeException: java.lang.ClassCastException: class java.util.ArrayList cannot be cast to class [Ledu.stanford.nlp.util.Index; (java.util.ArrayList is in module java.base of loader 'boots

OSError: ignored

# NER-tagging with spaCy

In [23]:
import spacy
# Load the spaCy model registered under the 'en' name; `nlp` is the pipeline used below.
nlp = spacy.load('en')

In [24]:
# Example sentences, parsed in order into sent_0 .. sent_3 by the loaded pipeline.
example_texts = (
  'Donald Trump visited at the government headquarters in France today.',
  'Emmanuel Jean-Michel Frédéric Macron is a French politician serving as President of France and ex officio Co-Prince of Andorra since 14 May 2017.',
  "He studied philosophy at Paris Nanterre University, completed a Master's of Public Affairs at Sciences Po, and graduated from the École nationale d'administration (ÉNA) in 2004.",
  'He worked at the Inspectorate General of Finances, and later became an investment banker at Rothschild & Cie Banque.',
)
sent_0, sent_1, sent_2, sent_3 = map(nlp, example_texts)

In [29]:
# One line per token: its text, entity type (empty when not in an entity) and coarse POS tag.
for tok in sent_0:
  print(f"{tok.text} {tok.ent_type_} {tok.pos_}")

Donald PERSON PROPN
Trump PERSON PROPN
visited  VERB
at  ADP
the  DET
government  NOUN
headquarters  NOUN
in  ADP
France GPE PROPN
today DATE NOUN
.  PUNCT


In [30]:
# One line per recognized entity span: its text and label.
for span in sent_0.ents:
  print(f"{span.text} {span.label_}")

Donald Trump PERSON
France GPE
today DATE


In [31]:
# Sentence 1: token-level entity types first, then the merged entity spans.
for tok in sent_1:
  print(f"{tok.text} {tok.ent_type_}")

for span in sent_1.ents:
  print(f"{span.text} {span.label_}")

Emmanuel PERSON
Jean PERSON
- PERSON
Michel PERSON
Frédéric PERSON
Macron PERSON
is 
a 
French NORP
politician 
serving 
as 
President 
of 
France GPE
and 
ex 
officio 
Co PERSON
- PERSON
Prince PERSON
of 
Andorra ORG
since 
14 DATE
May DATE
2017 DATE
. 
Emmanuel Jean-Michel Frédéric Macron PERSON
French NORP
France GPE
Co-Prince PERSON
Andorra ORG
14 May 2017 DATE


# Training our own NER-taggers

In [49]:
import plac
import spacy
from pathlib import Path
import random
from spacy.util import minibatch, compounding
import warnings

In [33]:
# Training examples: each pairs a text with the (start, end, label) character
# spans of the entities it contains. Kept as a list because it is shuffled in place.
TRAIN_DATA = [
  (
    "who is Shaka Khan?",
    {"entities": [(7, 17, "PERSON")]},
  ),
  (
    "I like London and Berlin.",
    {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]},
  ),
]

In [34]:
# Command-line option descriptions for the training entry point's parameters.
# NOTE(review): the decorator returned by plac.annotations is not applied to any
# function in this cell, so the call only displays the decorator object —
# confirm whether it was meant to decorate the training function.
cli_options = {
  "model": ("Model name. Defaults to blank 'en' model.", "option", "m", str),
  "output_dir": ("Optional output directory", "option", "o", Path),
  "n_iter": ("Number of training iterations", "option", "n", int),
}
plac.annotations(**cli_options)

<function plac_core.annotations.<locals>.annotate>

In [58]:
def _print_entities(pipeline, examples):
  """Run `pipeline` on every example text and print the entities and token tags it finds."""
  for text, _ in examples:
    doc = pipeline(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


def main(model = None, output_dir=None, n_iter=100):
  """Load the model, set up the pipeline and train the entity recognizer.

  Args:
    model: Name or path of an existing spaCy model to keep training. When
      None, a blank 'en' model is created instead.
    output_dir: Directory the trained model is saved to and then reloaded
      from. When None, '/content/ner' is used, which is where this cell has
      always written its model.
    n_iter: Number of passes over TRAIN_DATA.

  Side effects: prints the loss after every pass, prints the predictions of
  the trained and of the reloaded model, and writes the model to disk.
  """

  if model is not None:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
  else:
    nlp = spacy.blank('en')
    print('created blank model')

  # create the built-in pipeline components and add them to the pipeline
  # nlp.create_pipe works for built-ins that are registered with spaCy
  if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
  else:
    # the loaded model already has an entity recognizer: fetch it from the pipeline
    ner = nlp.get_pipe("ner")

  # add the labels
  for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities', []):
      ner.add_label(ent[2])

  # get the other pipes so they can be disabled during training
  pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
  other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

  # shuffle a private copy so the module-level TRAIN_DATA keeps its order
  train_data = list(TRAIN_DATA)

  with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    # show warnings for misaligned entity spans once
    warnings.filterwarnings("once", category=UserWarning, module='spacy')

    # only a freshly created model needs its weights initialised
    if model is None:
      nlp.begin_training()
    for _ in range(n_iter):
      random.shuffle(train_data)
      losses = {}
      batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
      for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, drop=0.5, losses=losses)
      print("losses", losses)

  # test the trained model
  _print_entities(nlp, TRAIN_DATA)

  # save model to output directory; honour the caller's choice and only fall
  # back to the notebook's usual location when none was given
  if output_dir is None:
    output_dir = '/content/ner'
  output_dir = Path(output_dir)
  # always write the freshly trained model, even when the directory already
  # exists, so the reload below never picks up a stale model
  output_dir.mkdir(parents=True, exist_ok=True)
  nlp.to_disk(output_dir)
  print("Saved model to", output_dir)

  # test the saved model
  print("Loading from", output_dir)
  nlp2 = spacy.load(output_dir)
  _print_entities(nlp2, TRAIN_DATA)

In [59]:
# Run the training with its default arguments when this cell executes as the main module.
if __name__ == "__main__":
    main()

created blank model
losses {'ner': 9.899998903274536}
losses {'ner': 9.633561372756958}
losses {'ner': 9.289416193962097}
losses {'ner': 9.011696577072144}
losses {'ner': 8.505022048950195}
losses {'ner': 8.523720264434814}
losses {'ner': 7.158397316932678}
losses {'ner': 6.730583012104034}
losses {'ner': 7.2702717781066895}
losses {'ner': 6.4985533356666565}
losses {'ner': 5.960755467414856}
losses {'ner': 5.4764240980148315}
losses {'ner': 5.272995352745056}
losses {'ner': 4.963782429695129}
losses {'ner': 4.344571650028229}
losses {'ner': 4.490230165421963}
losses {'ner': 4.492963269352913}
losses {'ner': 5.344881422817707}
losses {'ner': 4.682546824216843}
losses {'ner': 3.957813311368227}
losses {'ner': 4.53189567476511}
losses {'ner': 4.746835498139262}
losses {'ner': 5.312202986795455}
losses {'ner': 4.440175414085388}
losses {'ner': 3.305656388401985}
losses {'ner': 4.392327409237623}
losses {'ner': 3.8977370988577604}
losses {'ner': 3.834113746881485}
losses {'ner': 3.69621737

https://spacy.io/usage/training#section-ner