<a href="https://colab.research.google.com/github/steveazzolin/NLU-second-assignment/blob/main/code/NLU_second_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## NLU second assignment

Steve Azzolin

In [8]:
%%capture
!pip install spacy==3.0.3
!python -m spacy download en_core_web_sm
!git clone https://github.com/steveazzolin/NLU-second-assignment.git

In [2]:
import spacy
from spacy.tokens import Span
nlp = spacy.load('en_core_web_sm')

from sklearn.metrics import classification_report
from sklearn import preprocessing
from tqdm import tqdm
import numpy as np
import pandas as pd
from collections import defaultdict

#### conll.py

In the following cell I included the conll script as provided by prof. Stepanov, with a slight modification in the function `read_corpus_conll`. In particular, I removed the line containing `DOCSTRING` from the output.

In [3]:
import re

"""
Modified version of https://pypi.org/project/conlleval/
"""

def stats():
    return {'cor': 0, 'hyp': 0, 'ref': 0}


def evaluate(ref, hyp, otag='O'):
    # evaluation for NLTK
    aligned = align_hyp(ref, hyp)
    return conlleval(aligned, otag=otag)


def align_hyp(ref, hyp):
    # align references and hypotheses for evaluation
    # add last element of token tuple in hyp to ref
    if len(ref) != len(hyp):
        raise ValueError("Size Mismatch: ref: {} & hyp: {}".format(len(ref), len(hyp)))

    out = []
    for i in range(len(ref)):
        if len(ref[i]) != len(hyp[i]):
            raise ValueError("Size Mismatch: ref: {} & hyp: {}".format(len(ref), len(hyp)))
        out.append([(*ref[i][j], hyp[i][j][-1]) for j in range(len(ref[i]))])
    return out


def conlleval(data, otag='O'):
    # token, segment & class level counts for TP, TP+FP, TP+FN
    tok = stats()
    seg = stats()
    cls = {}

    for sent in data:

        prev_ref = otag      # previous reference label
        prev_hyp = otag      # previous hypothesis label
        prev_ref_iob = None  # previous reference label IOB
        prev_hyp_iob = None  # previous hypothesis label IOB

        in_correct = False  # currently processed chunks is correct until now

        for token in sent:

            hyp_iob, hyp = parse_iob(token[-1])
            ref_iob, ref = parse_iob(token[-2])

            ref_e = is_eoc(ref, ref_iob, prev_ref, prev_ref_iob, otag)
            hyp_e = is_eoc(hyp, hyp_iob, prev_hyp, prev_hyp_iob, otag)

            ref_b = is_boc(ref, ref_iob, prev_ref, prev_ref_iob, otag)
            hyp_b = is_boc(hyp, hyp_iob, prev_hyp, prev_hyp_iob, otag)

            if not cls.get(ref) and ref:
                cls[ref] = stats()

            if not cls.get(hyp) and hyp:
                cls[hyp] = stats()

            # segment-level counts
            if in_correct:
                if ref_e and hyp_e and prev_hyp == prev_ref:
                    in_correct = False
                    seg['cor'] += 1
                    cls[prev_ref]['cor'] += 1

                elif ref_e != hyp_e or hyp != ref:
                    in_correct = False

            if ref_b and hyp_b and hyp == ref:
                in_correct = True

            if ref_b:
                seg['ref'] += 1
                cls[ref]['ref'] += 1

            if hyp_b:
                seg['hyp'] += 1
                cls[hyp]['hyp'] += 1

            # token-level counts
            if ref == hyp and ref_iob == hyp_iob:
                tok['cor'] += 1

            tok['ref'] += 1

            prev_ref = ref
            prev_hyp = hyp
            prev_ref_iob = ref_iob
            prev_hyp_iob = hyp_iob

        if in_correct:
            seg['cor'] += 1
            cls[prev_ref]['cor'] += 1

    return summarize(seg, cls)


def parse_iob(t):
    m = re.match(r'^([^-]*)-(.*)$', t)
    return m.groups() if m else (t, None)


def is_boc(lbl, iob, prev_lbl, prev_iob, otag='O'):
    """
    is beginning of a chunk

    supports: IOB, IOBE, BILOU schemes
        - {E,L} --> last
        - {S,U} --> unit

    :param lbl: current label
    :param iob: current iob
    :param prev_lbl: previous label
    :param prev_iob: previous iob
    :param otag: out-of-chunk label
    :return:
    """
    boc = False

    boc = True if iob in ['B', 'S', 'U'] else boc
    boc = True if iob in ['E', 'L'] and prev_iob in ['E', 'L', 'S', otag] else boc
    boc = True if iob == 'I' and prev_iob in ['S', 'L', 'E', otag] else boc

    boc = True if lbl != prev_lbl and iob != otag and iob != '.' else boc

    # these chunks are assumed to have length 1
    boc = True if iob in ['[', ']'] else boc

    return boc


def is_eoc(lbl, iob, prev_lbl, prev_iob, otag='O'):
    """
    is end of a chunk

    supports: IOB, IOBE, BILOU schemes
        - {E,L} --> last
        - {S,U} --> unit

    :param lbl: current label
    :param iob: current iob
    :param prev_lbl: previous label
    :param prev_iob: previous iob
    :param otag: out-of-chunk label
    :return:
    """
    eoc = False

    eoc = True if iob in ['E', 'L', 'S', 'U'] else eoc
    eoc = True if iob == 'B' and prev_iob in ['B', 'I'] else eoc
    eoc = True if iob in ['S', 'U'] and prev_iob in ['B', 'I'] else eoc

    eoc = True if iob == otag and prev_iob in ['B', 'I'] else eoc

    eoc = True if lbl != prev_lbl and iob != otag and prev_iob != '.' else eoc

    # these chunks are assumed to have length 1
    eoc = True if iob in ['[', ']'] else eoc

    return eoc


def score(cor_cnt, hyp_cnt, ref_cnt):
    # precision
    p = 1 if hyp_cnt == 0 else cor_cnt / hyp_cnt
    # recall
    r = 0 if ref_cnt == 0 else cor_cnt / ref_cnt
    # f-measure (f1)
    f = 0 if p+r == 0 else (2*p*r)/(p+r)
    return {"p": p, "r": r, "f": f, "s": ref_cnt}


def summarize(seg, cls):
    # class-level
    res = {lbl: score(cls[lbl]['cor'], cls[lbl]['hyp'], cls[lbl]['ref']) for lbl in set(cls.keys())}
    # micro
    res.update({"total": score(seg.get('cor', 0), seg.get('hyp', 0), seg.get('ref', 0))})
    return res


def read_corpus_conll(corpus_file, fs=" "):
    """
    read corpus in CoNLL format
    :param corpus_file: corpus in conll format
    :param fs: field separator
    :return: corpus
    """
    featn = None  # number of features for consistency check
    sents = []  # list to hold words list sequences
    words = []  # list to hold feature tuples

    for line in open(corpus_file):
        if "-DOCSTART-" in line:
          continue
        line = line.strip()
        if len(line.strip()) > 0:
            feats = tuple(line.strip().split(fs))
            if not featn:
                featn = len(feats)
            elif featn != len(feats) and len(feats) != 0:
                raise ValueError("Unexpected number of columns {} ({})".format(len(feats), featn))

            words.append(feats)
        else:
            if len(words) > 0:
                sents.append(words)
                words = []
    return sents


def get_chunks(corpus_file, fs="\t", otag="O"):
    sents = read_corpus_conll(corpus_file, fs=fs)
    return set([parse_iob(token[-1])[1] for sent in sents for token in sent if token[-1] != otag])


Since the goal of the following exercise is to evaluate the performances of Spacy on conll2003, I'll read only the test set, ignoring both the train and validation set. However, including them in the analysis would be straightforward.

In [4]:
test = read_corpus_conll("NLU-second-assignment/data/test.txt")

## ex01

>Evaluate spaCy NER on CoNLL 2003 data (provided)
>
>- report token-level performance (per class and total)
  - accuracy of correctly recognizing all tokens that belong to named entities (i.e. tag-level accuracy)
- report CoNLL chunk-level performance (per class and total);
  - precision, recall, f-measure of correctly recognizing all the named entities in a chunk per class and total

---


One important caveat of this exercise was the conversion between the [Spacy's annotation scheme](https://spacy.io/models/en) and the [Conll's annotation scheme](https://www.clips.uantwerpen.be/conll2003/ner/annotation.txt). To resolve this inconsistency, I simply translate the two sets of tags with a mapper. In doing this an ambiguity emerged, related to how to handle some particular tags of Spacy, such as *DATE* and *TIME*. In the annotation scheme linked above, dates are not mentioned, suggesting to don't consider this tag at all. But, since this would result in an incomplete mapping between Spacy and Conll I decided to map *DATE* and *TIME* to *O*, since is possible to find in the dataset examples of dates and hours labelled with that token. A similar reasoning can be applied also to other tags mapped to *O*.


Moreover, an additional difficulty was to take care of the different tokenization approach. In this case I enforced Spacy to use the conll's tokenization, by overloading the default tokenizer of Spacy. This may result in performances losses, however I noticed that in this dataset this is not the case. An alternative approach would be to re-join tokens, as provided by Spacy, by analyzing the attribute `whitespace_` of `Token`.

For token-level performances, I used [classification_report](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) since provides a handy way for summarizing the most popular classification metrics, whereas for chunk-level performances I used the evaluate script provided by *conll.py*

In [5]:
def compose_string_prediction(doc, mapper):
  """
  Convert Doc object into list of NE tags.
  Thanks to 'mapper' convert also from Spacy's annotation scheme to Conll's annotation scheme
  """
  assert mapper is not None

  ret = []
  for t in doc:
    tmp = mapper[t.ent_type_]
    if t.ent_type_ != "" and tmp != "O": #if iob tag is O then ent_type_ is ""
      tmp = t.ent_iob_ + "-" + tmp 
    ret.append(tmp)
  return ret

In [6]:
# mapper to convert from spacy's tags to conll's tags
mapper = { 
    'PERSON':'PER', 
    'NORP':'MISC', 
    'LOC':'LOC', 
    'FAC':'LOC', 
    'GPE':'LOC', 
    'ORG':'ORG', 
    'PRODUCT':'MISC', 
    'EVENT':'MISC', 
    'WORK_OF_ART':'O', 
    'LAW':'O', 
    'LANGUAGE':'MISC', 
    'DATE':'O', 
    'TIME':'O', 
    'PERCENT':'O', 
    'MONEY':'O', 
    'QUANTITY':'O', 
    'ORDINAL':'O', 
    'CARDINAL':'O', 
    '':'O'
}


def my_tokenizer(txt=""):
  """
  Overload the spacy's tokenization in order to comply to the tokenization of conll.
  An alternative approach is to reconstruct the tokens provided by Spacy by analyzing
  the whitespace attribute. Here not implemented for timing issues
  """
  return spacy.tokens.Doc(nlp.vocab, my_tokenization)

nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = my_tokenizer

pred , true , dataset = [] , [] , []
conll_pred , conll_true = [] , []
for sentence in tqdm(test[:]):
  y = [s[3] for s in sentence]
  x = " ".join([s[0] for s in sentence])
  my_tokenization = [s[0] for s in sentence]

  doc = nlp(x)

  #compose prediction/gt vectors for token-level evaluation
  pred.extend(compose_string_prediction(doc, mapper))
  true.extend(y)

  #compose prediction/gt vectors for chunk-level evaluation
  preds = compose_string_prediction(doc, mapper)
  conll_pred.append([])
  conll_true.append([])
  for j in range(len(y)):
    conll_pred[-1].append((sentence[j][0] , preds[j]))
    conll_true[-1].append((sentence[j][0] , y[j]))

100%|██████████| 3453/3453 [00:32<00:00, 106.65it/s]


In [7]:
le = preprocessing.LabelEncoder() #to convert from 'string' to 'int' the prediction vector
le.fit(true)

print("Overall performances per-class:")
print(classification_report(le.transform(true), le.transform(pred), target_names=le.classes_))
print("\n\n")

print("Chunk-level performances:")
pd_tbl = pd.DataFrame().from_dict(evaluate(conll_true, conll_pred), orient='index')
pd_tbl.round(decimals=2)

Overall performances per-class:
              precision    recall  f1-score   support

       B-LOC       0.77      0.71      0.74      1668
      B-MISC       0.77      0.55      0.64       702
       B-ORG       0.50      0.30      0.38      1661
       B-PER       0.79      0.61      0.69      1617
       I-LOC       0.58      0.66      0.61       257
      I-MISC       0.59      0.34      0.43       216
       I-ORG       0.42      0.52      0.46       835
       I-PER       0.82      0.76      0.78      1156
           O       0.95      0.98      0.96     38323

    accuracy                           0.91     46435
   macro avg       0.69      0.60      0.63     46435
weighted avg       0.90      0.91      0.90     46435




Chunk-level performances:


Unnamed: 0,p,r,f,s
MISC,0.76,0.54,0.63,702
PER,0.76,0.59,0.66,1617
ORG,0.45,0.27,0.34,1661
LOC,0.76,0.7,0.73,1668
total,0.69,0.52,0.59,5648


## ex02

> Grouping of Entities. Write a function to group recognized named entities using `noun_chunks` method of spaCy. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together).

---

In this exercise I had to group the named entities using noun chunks, with the caveat that some named entities may not be part of any noun chunks. Indeed, my code tries to *intersect* the output of `doc.noun_chunks` and `doc.ents`, in such a way that all named entities are present in the output.

After that, I simply run a frequency count over all groups, by simply incrementing the entry of a dictionary depending whether a certain combination is present or not.

An example of result is shown below:

input: `Four Africans said to vie for top U.N. post.`
 
grouping: `[['CARDINAL', 'NORP'], ['ORG']]`

counting: `{'CARDINAL_NORP': 1, 'ORG':1}`

In the following output cell you can see the 15 most frequent groups.

In [None]:
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = my_tokenizer

def group(sentence:str):
  """
  Function to loop over all entities and noun_chunks, in order to group entities without filtering them
  """
  doc = nlp(sentence)

  ret = [[]]
  last_j = -1
  for i , entity in enumerate(doc.ents):
    found = False
    for j , chunk in enumerate(doc.noun_chunks):    
      if found: continue
      for t in chunk.ents:
        if found: continue
        if t == entity:
          if last_j == j - 1:
            ret[-1].append(t.label_)
          else:
            ret.append([])
            ret[-1].append(t.label_)
          found = True
          last_j = j - 1
    if found == False:
      ret.append([])
      ret[-1].append(entity.label_)
  # remove eventual [] at the beginning of the ret object
  l = []
  for r in ret:
    if r != []:
      l.append(r)
  return l

def frequency_group(groups, freq):
  """
  Frequency computation over groups
  """
  for group in groups:
    key = "_".join(group)
    freq[key] += 1

groups = []
freq = defaultdict(int)
for sentence in tqdm(test[:]):
  x = " ".join([s[0] for s in sentence])
  my_tokenization = [s[0] for s in sentence]
  g = group(x)
  frequency_group(g, freq)
  groups.append(g)

pd_tbl = pd.DataFrame().from_dict(freq, orient='index', columns=["Count"]).sort_values("Count", ascending=False)
pd_tbl.round(decimals=3).head(10)

100%|██████████| 3453/3453 [00:33<00:00, 102.92it/s]


Unnamed: 0,Count
CARDINAL,1395
GPE,1255
PERSON,961
DATE,904
ORG,887
NORP,288
MONEY,147
CARDINAL_PERSON,122
ORDINAL,108
TIME,75


### ex03

> One of the possible post-processing steps is to fix segmentation errors. Write a function that extends the entity span to cover the full noun-compounds. Make use of compound dependency relation

The tecnique was presented in [this](https://www.aclweb.org/anthology/2020.ecnlp-1.1.pdf) paper. Basically what needs to be done is a simple join between tokens adjacent to a named entity in a compund relation with it, in order to  extend the entity span to cover the full noun-compound.

An example of joining is presented in the following output cell.

In [None]:
def my_tokenizer(txt=""):
  tokens = my_tokenization
  return spacy.tokens.Doc(nlp.vocab, tokens)

def join_ne_compounds(doc):
  new_ne = []
  indexes_included = set() #to handle cases in which a token is in compound relation with multiple entities
  for t in doc.ents:
    i = t.start
    j = t.end
    if t.label != "":
      found = False
      for z in t:
        if ((doc[max(i-1, 0)].head == z and doc[i-1].dep_ == "compound" and doc[max(i-1, 0)].ent_type_ == "")
          or
          (doc[max(i-1, 0)] == z.head and z.dep_ == "compound" and doc[max(i-1, 0)].ent_type_ == "")):
          if i-1 not in indexes_included:
            found = True
            new_ne.append(Span(doc, i-1, j, t.label_))
            indexes_included.add(i-1) #otherways I may write doc[i-1].ent_type_ = t.ent_type to embed the check naturally in the conditions already defined, but I preferred a different way to don't manually update the fields of the Token object
            break
        elif ((doc[min(j, len(doc)-1)].head == z and doc[min(j, len(doc)-1)].dep_ == "compound" and doc[min(j, len(doc)-1)].ent_type_ == "")
            or
            (doc[min(j, len(doc)-1)] == z.head and z.dep_ == "compound" and doc[min(j, len(doc)-1)].ent_type_ == "")):
          found = True
          new_ne.append(Span(doc, i, j+1, t.label_))
          indexes_included.add(j)
          break
      if not found:
        new_ne.append(Span(doc, i, j, t.label_))
  doc.set_ents(new_ne)
  return doc


nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = my_tokenizer

x = "Other facts : As a qualifier for the 1993 World Cup finals through Europa Cup results."
my_tokenization = x.split(" ")
doc = nlp(x)

print("Dependecy graph of the input sentence:\n")
displacy.render(doc, style="dep", jupyter=True)

print("\n\nOriginal named entities:")
displacy.render(doc, style="ent", jupyter=True)

#apply the algorithm
doc = join_ne_compounds(doc)

print("\n\nUpdated named entities:")
displacy.render(doc, style="ent", jupyter=True)

Dependecy graph of the input sentence:





Original named entities:




Updated named entities:


Basically, since `finals` is in a *compound* relation with `World Cup`, the resulting named entity will be `World Cup finals`.

To implement this algorithm I used the function `doc.set_ents` provided by Spacy, passing all the named entities found in the Doc object. Standard named entities are passed as they are (`Span(doc, ne.start, ne.end, ne.label_)`), whereas for named entities that needs to be *augmented* (as described previously) I passed instead an enlarged Span (either `Span(doc, ne.start-1, ne.end, ne.label_)` or `Span(doc, ne.start, ne.end+1, ne.label_)`). Some specific sanity checks are put in place in order to avoid tokens belonging to multiple Spans, something that would result in a Spacy exception.

After that, to check whether this technique was beneficial, I re-evaluated the spacy model with the conll script. Results are reported below.

In [None]:
conll_pred , conll_true , ret = [] , [] , []
for sentence in tqdm(test[:]):
  y = [s[3] for s in sentence]
  x = " ".join([s[0] for s in sentence])
  my_tokenization = [s[0] for s in sentence]

  doc = join_ne_compounds(nlp(x))
  ret.append(doc)

  #evaluation
  preds = compose_string_prediction(doc, mapper)
  conll_pred.append([])
  conll_true.append([])
  for j in range(len(y)):
    conll_pred[-1].append((sentence[j][0] , preds[j]))
    conll_true[-1].append((sentence[j][0] , y[j]))

100%|██████████| 3453/3453 [00:32<00:00, 107.27it/s]


In [None]:
print("Original chunk-level performances:\n")
pd_tbl.round(decimals=2)

Original chunk-level performances:



Unnamed: 0,p,r,f,s
PER,0.76,0.59,0.66,1617
LOC,0.76,0.7,0.73,1668
MISC,0.76,0.54,0.63,702
ORG,0.45,0.27,0.34,1661
total,0.69,0.52,0.59,5648


In [None]:
print("Updated chunk-level performances:\n")
pd_tbl2 = pd.DataFrame().from_dict(evaluate(conll_true, conll_pred), orient='index')
pd_tbl2.round(decimals=2)

Updated chunk-level performances:



Unnamed: 0,p,r,f,s
PER,0.65,0.51,0.57,1617
LOC,0.69,0.64,0.67,1668
MISC,0.69,0.49,0.58,702
ORG,0.36,0.22,0.27,1661
total,0.6,0.46,0.52,5648


Overall, applying the technique presented in [this](https://www.aclweb.org/anthology/2020.ecnlp-1.1.pdf) paper does not seem beneficial, at least considered the evaluation performed by the *conll* script.