In [None]:
# Setup nltk NER and check if everything works
!pip install spacy
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
sent = nltk.corpus.treebank.tagged_sents()[22]
ner_data = nltk.ne_chunk(sent)

class ListTable(list):
    """List of rows that an IPython Notebook displays as an HTML table.

    Every element is expected to be an iterable of cell values, e.g.
    ``ListTable([[1, 2, 3], [4, 5, 6]])``. Cell values are formatted with
    ``str.format`` and placed into the markup as-is (no HTML escaping), so
    cells may themselves contain HTML.
    """

    def _repr_html_(self):
        """Return the ``<table>`` markup for the rows stored in this list."""
        rendered_rows = (
            "<tr>" + "".join("<td>{0}</td>".format(cell) for cell in row) + "</tr>"
            for row in self
        )
        return "<table>" + "".join(rendered_rows) + "</table>"

In [142]:
import nltk.corpus
import os.path
import nltk.chunk.util

# Need for the visualizations
from spacy.tokens import Span, Doc
from spacy.vocab import Vocab
from spacy import displacy
from IPython.core.display import display

#
# Helper function that renders NER-tagged sentences as beautiful HTML
# This function uses spacy visualizers hacked to handle the format used by NLTK
#  tokens -  List of tokens (can be created using nltk.word_tokenize)
#  tags   -  NER tags (can be created using nltk.ne_chunk)
#
def draw_rte(tokens, tags):
    """Render an NLTK NE-chunked sentence as displaCy entity HTML.

    Args:
        tokens: List of token strings for the sentence (can be created using
            ``nltk.word_tokenize``). Entity spans are addressed by token
            index, so this must line up one-to-one with the leaves of ``tags``.
        tags: NE chunk tree for the same tokens (can be created using
            ``nltk.ne_chunk``). It is flattened to (word, pos, IOB-tag)
            triples with ``nltk.chunk.util.tree2conlltags``.

    Returns:
        The result of ``displacy.render(doc, style="ent", ...)`` when at
        least one entity was found; otherwise the tokens joined by single
        spaces.
    """
    doc = Doc(Vocab(strings=[]), words=tokens)
    ents = []
    last_iob_tag = 'O'
    # Go through the IOB tagged tokens for the sentence
    for index, conll in enumerate(nltk.chunk.util.tree2conlltags(tags)):
        # "B-PERSON" -> ["B", "PERSON"]; a plain "O" stays ["O"]
        iob_parts = conll[2].split('-')
        prefix = iob_parts[0]
        if prefix != 'O':
            label = iob_parts[-1]
            # Rewrite the tags to match tags used by Spacy
            if label == "ORGANIZATION":
                label = "ORG"

            # An I- token continues the open entity whether the previous
            # token was its B- start or an earlier I- token. Testing only for
            # 'B' would split entities of three or more tokens.
            if prefix == 'I' and last_iob_tag != 'O' and ents:
                open_span = ents[-1]
                ents[-1] = Span(doc, open_span.start, index + 1, label)
            else:
                # Begin new entity (also covers a stray I- with nothing open)
                ents.append(Span(doc, index, index + 1, label))
        last_iob_tag = prefix

    # Without entities there is nothing to highlight: return plain text
    if not ents:
        return ' '.join(tokens)

    # This part renders the tagged sentence using spacy visualizers (displacy)
    doc.ents = ents
    return displacy.render(doc, style="ent", jupyter=False, page=False)

# Load the corpus
# Note the abspath hack:
#  __file__ is not defined in Jupyter, so abspath('') is an easy way to get
#  the current path
corpus = nltk.corpus.rte.pairs(os.path.join(os.path.abspath(''), 'data/dev.xml'))

# Output table for every sentence, seeded with its header row
l = ListTable([["No", "Type", "Sentence"]])

# Second table holding only the TEXT sentence of the first 10 pairs
l_10 = ListTable()

# Each corpus pair contributes one HYP row and one TEXT row
for i, pair in enumerate(corpus):
    row_no = str(i + 1)

    # Tokenization of the hypothesis and of the text of the pair
    tokens_hyp = nltk.word_tokenize(pair.hyp)
    tokens_text = nltk.word_tokenize(pair.text)

    # POS tagging followed by named entity recognition
    tags_hyp = nltk.ne_chunk(nltk.pos_tag(tokens_hyp))
    tags_text = nltk.ne_chunk(nltk.pos_tag(tokens_text))

    # Render both sentences; the TEXT rendering is reused for the short table
    drawn_rte = draw_rte(tokens_text, tags_text)
    l.extend([
        [row_no, "HYP", draw_rte(tokens_hyp, tags_hyp)],
        [row_no, "TEXT", drawn_rte],
    ])
    if i < 10:
        l_10.append([row_no, drawn_rte])

display(l_10)

0,1
1,"Crude  GPE  oil for April delivery traded at $ 37.80 a barrel , down 28 cents"
2,Oracle  PERSON  had fought to keep the forms from being released
3,iTunes software has seen strong sales in Europe  GPE  .
4,"All genetically modified food , including soya or maize oil produced from GM  ORG  soya and maize , and food ingredients , must be labelled ."
5,Researchers at the Harvard School  ORG  of Public  ORG  Health say that people who drink coffee may be doing a lot more than keeping themselves awake - this kind of consumption apparently also can help reduce the risk of diseases .
6,Eating lots of foods that are a good source of fiber may keep your blood glucose from rising too fast after you eat .
7,The Yankees  ORG  split Hollywood  PERSON  with something to feel OK about after last night 's 5-4 loss to the Dodgers  ORG  .
8,Scientists at the Genome Institute  ORG  of Singapore  GPE  ( GIS  ORG  ) have discovered the complete genetic sequence of a coronavirus isolated from a Singapore  GPE  patient with SARS  ORG  .
9,Phish  GPE  disbands after a final concert in Vermont  GPE  on Aug. 15
10,Euro-Scandinavian  GPE  media cheer Denmark  PERSON  v Sweden  PERSON  draw .


## Report

Recognized entities from the first 10 sentences (in TEXT category) are categorized into 4 categories:
1. GOOD - Valid NE
2. MISS - Named Entities that were undetected
3. FAIL - Tokens that are not NE but were recognized as NE, or NE that were assigned the wrong entity type
4. PART - Invalid NE (partial detection or separation)

Crude - FAIL
Oracle - FAIL
Europe - GOOD
GM - FAIL
Harvard School (Public) - PART
Yankees - GOOD
Hollywood - FAIL
Dodgers - GOOD
Genome Institute - GOOD
Singapore - GOOD
GIS - GOOD
SARS - FAIL
Phish - FAIL
Vermont - GOOD
Euro-Scandinavian - GOOD
Denmark - FAIL
Sweden - FAIL

Summary:
_______________

    GOOD: 8
    MISS: 0
    FAIL: 8
    PART: 1
    TOTAL: 17
_______________

## Rest of the sentences

Below you can see all of the other tagged sentences:

In [143]:
# Show the complete table: the header row plus the HYP and TEXT rows of every pair
display(l)

0,1,2
No,Type,Sentence
1,HYP,Crude  GPE  oil prices rose to $ 37.80 per barrel
1,TEXT,"Crude  GPE  oil for April delivery traded at $ 37.80 a barrel , down 28 cents"
2,HYP,Oracle  PERSON  released a confidential document
2,TEXT,Oracle  PERSON  had fought to keep the forms from being released
3,HYP,Strong  GPE  sales for iTunes  ORG  in Europe  GPE  .
3,TEXT,iTunes software has seen strong sales in Europe  GPE  .
4,HYP,Companies selling genetically modified foods do n't need labels .
4,TEXT,"All genetically modified food , including soya or maize oil produced from GM  ORG  soya and maize , and food ingredients , must be labelled ."
5,HYP,Coffee  GPE  drinking has health benefits .
