<a href="https://colab.research.google.com/github/steveazzolin/NLU-first-assignment/blob/main/code/NLU_first_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLU assignment 01
## Steve Azzolin

#### preparation

In [None]:
!git clone https://github.com/steveazzolin/nltk.git
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!pip uninstall nltk
!pip install spacy==2.2.4

In [24]:
!mkdir ../data

In [25]:
# Copy the 50-dimensional GloVe file into ../data
# (presumably where the modified NLTK parser looks for it when use_glove=True -- confirm)
!cp glove.6B.50d.txt ../data

#### import modules

In [11]:
import spacy
assert spacy.__version__
from spacy import displacy

from collections import defaultdict
import matplotlib.pyplot as plt

import time
import sys
from tqdm import tqdm
from pathlib import Path

spacy_nlp = spacy.load('en_core_web_sm')
sys.path.insert(1, 'nltk/') #to import the local modified version of NLTK

import nltk
from nltk.parse.transitionparser import TransitionParser, DependencyEvaluator
from nltk.corpus import dependency_treebank
nltk.download('dependency_treebank');

[nltk_data] Downloading package dependency_treebank to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package dependency_treebank is already up-to-date!


#### supporting functions

In [12]:
def plotDepGraph(spacy_doc):
  """
  Draw the dependency parse of ``spacy_doc`` inline in the notebook.

    - spacy_doc: the parsed object handed to displaCy (a spaCy Doc in this notebook)
    - returns: nothing; the graph is rendered as a side effect
  """
  render_options = {"style": "dep", "jupyter": True}
  displacy.render(spacy_doc, **render_options)

#### ex01

In [15]:
def es1(sentence:str, debug=False):
  """
  For every token of the parsed sentence, list the dependency labels met on
  the way from the sentence ROOT down to that token.

    - sentence: raw text; it is parsed with the module-level spaCy pipeline
    - debug: when True, draw the dependency graph and print one line per token
      (index, text, labels of the token's ancestors)
    - returns: one list of labels per token, in document order; every list
      starts with the ROOT label and ends with the token's own label
  """
  doc = spacy_nlp(sentence)
  if debug: # visual check of the parse
    plotDepGraph(doc)

  paths = []
  for sent in doc.sents:
    for tok in sent:
      # the ancestors come nearest-first, so flip them to start from ROOT
      above = [anc.dep_ for anc in tok.ancestors]
      above.reverse()
      paths.append(above + [tok.dep_])
      if debug:
        print("{}\t{:15s}\t{}".format(tok.i, tok.text, " ".join(above)))
  return paths


# Demo: ROOT-to-token dependency-label paths for a sample sentence.
example = "I saw the man with a telescope."
es1(example)

[['ROOT', 'nsubj'],
 ['ROOT'],
 ['ROOT', 'dobj', 'det'],
 ['ROOT', 'dobj'],
 ['ROOT', 'dobj', 'prep'],
 ['ROOT', 'dobj', 'prep', 'pobj', 'det'],
 ['ROOT', 'dobj', 'prep', 'pobj'],
 ['ROOT', 'punct']]

#### ex02

In [16]:
def es2(sentence:str, debug=False):
  """
  For every token of the parsed sentence, collect the words of its subtree.

    - sentence: raw text; it is parsed with the module-level spaCy pipeline
    - debug: when True, draw the dependency graph and print one line per token
      (index, text, words of its subtree)
    - returns: one list of words per token, in document order; each list is
      built from the token's ``subtree`` as spaCy yields it
  """
  doc = spacy_nlp(sentence)
  if debug: # visual check of the parse
    plotDepGraph(doc)

  subtrees = []
  for sent in doc.sents:
    for tok in sent:
      words = [node.text for node in tok.subtree]
      subtrees.append(words)
      if debug:
        print("{}\t{:15s}\t{}".format(tok.i, tok.text, " ".join(words)))
  return subtrees


# Demo: subtree words of every token of a sample sentence.
example = "I saw the man with a telescope."
es2(example)

[['I'],
 ['I', 'saw', 'the', 'man', 'with', 'a', 'telescope', '.'],
 ['the'],
 ['the', 'man', 'with', 'a', 'telescope'],
 ['with', 'a', 'telescope'],
 ['a'],
 ['a', 'telescope'],
 ['.']]

#### ex03

In [20]:
def es3(sentence:str, subtree:list, debug=False):
  """
  Tell whether a list of words is exactly the subtree of some token.

    - sentence: raw text; it is parsed with the module-level spaCy pipeline
    - subtree: candidate words, in sentence order, spelled as spaCy tokenises
      them (e.g. the final "." is a separate item)
    - debug: when True, draw the dependency graph
    - returns: True when at least one token's subtree matches the given words
      one-to-one, False otherwise (including when the words do not occur in
      the sentence at all)

  No substring precondition is checked on purpose: joining the words with
  spaces does not reproduce the original text when tokens are not separated by
  whitespace ("telescope" + "." vs "telescope."), so such a check would
  reject valid subtrees.
  """
  doc = spacy_nlp(sentence)
  if debug: # visual check of the parse
    plotDepGraph(doc)

  wanted = list(subtree)
  return any(
    [node.text for node in tok.subtree] == wanted
    for sent in doc.sents
    for tok in sent
  )


# Demo: the first word list is checked with and the second without its leading "the"
# (see the recorded output below for the two answers).
example = "I saw the man with a telescope."
es3(example, ["the", "man", "with", "a", "telescope"]) , es3(example, ["man", "with", "a", "telescope"])

(True, False)

#### ex04

In [21]:
def es4(words:str, debug=False):
  """
  Find the head of a span of text.

    - words: a sequence of words as one string (not necessarily a sentence);
      it is parsed with the module-level spaCy pipeline
    - debug: when True, draw the dependency graph
    - returns: the ``root`` of the first sentence found in the parsed text
      (only the first sentence is considered)
  """
  doc = spacy_nlp(words)
  if debug: # visual check of the parse
    plotDepGraph(doc)
  sentences = list(doc.sents)
  first_sentence = sentences[0]
  return first_sentence.root


# Demo: head of the sample sentence.
example = "I saw the man with a telescope."
es4(example)

saw

#### ex05

In [22]:
def es5(words:str, debug=False):
  """
  Extract the subject, direct-object and indirect-object spans of a sentence.

    - words: raw text; it is parsed with the module-level spaCy pipeline
    - debug: when True, draw the dependency graph
    - returns: dict with the keys "nsubj", "dobj" and "iobj"; each value is a
      list of spans, one per matching token, where a span is the token's
      subtree joined with single spaces (empty list when nothing matches)

  spaCy's English models label indirect objects "dative" rather than "iobj",
  so both labels are collected under the "iobj" key.
  """
  doc = spacy_nlp(words)
  if debug: # visual check of the parse
    plotDepGraph(doc)

  # dependency label -> key of the returned dict
  # NOTE(review): "dative" may also be assigned to a prepositional "to ..."
  # phrase; such phrases would then be reported as indirect objects -- confirm
  # this is the desired behaviour.
  label_to_key = {"nsubj": "nsubj", "dobj": "dobj", "iobj": "iobj", "dative": "iobj"}

  ret = {"nsubj":[], "dobj":[], "iobj":[]}
  for sent in doc.sents:
    for tok in sent:
      key = label_to_key.get(tok.dep_)
      if key is not None:
        ret[key].append(" ".join(node.text for node in tok.subtree))
  return ret


# Demo: subject / direct-object / indirect-object spans of the sample sentence.
example = "I saw the man with a telescope."
es5(example)

{'dobj': ['the man with a telescope'], 'iobj': [], 'nsubj': ['I']}

#### extra point

In [None]:
%%time

def extra_point(train_sizes=(100, 300), n_test=20):
  """
  Run the transition-parser experiments and plot the outcome.

  1. Train on the first 100 treebank sentences with and without GloVe
     features and print LAS / UAS for both.
  2. For each classifier setting (linear_svm=True / False, plotted as
     "LinearSVC" / "SVC") and each training size, train without GloVe,
     record the training time and the LAS, then plot both against the
     number of training sentences.

    - train_sizes: numbers of training sentences to try in step 2
      (the larger values 500 and 1000 were noted in the original cell but left disabled)
    - n_test: how many sentences, taken from the end of the treebank, are
      parsed and scored
    - returns: nothing; results are printed and plotted

  Every run writes its model to 'tp.model', overwriting the previous one.
  `use_glove` and `linear_svm` are options of the locally modified NLTK
  TransitionParser used by this notebook.
  """
  def _train_and_evaluate(n_train, use_glove, linear_svm):
    """Train on the first n_train sentences; return (las, uas, seconds spent building and training)."""
    start = time.time()
    tp = TransitionParser('arc-standard', use_glove=use_glove, linear_svm=linear_svm)
    tp.train(dependency_treebank.parsed_sents()[:n_train], 'tp.model')
    train_seconds = time.time() - start  # parsing/evaluation is not timed

    parses = tp.parse(dependency_treebank.parsed_sents()[-n_test:], 'tp.model')
    de = DependencyEvaluator(parses, dependency_treebank.parsed_sents()[-n_test:])
    las, uas = de.eval()
    return las, uas, train_seconds

  # --- step 1: effect of the GloVe features ---
  for use_glove, tag in [(False, "without"), (True, "with")]:
    las, uas, _ = _train_and_evaluate(100, use_glove=use_glove, linear_svm=False)
    print("\nLAS={} UAS={} {} GLOVE\n".format(round(las,2), round(uas,2), tag))

  # --- step 2: classifier type vs. amount of training data ---
  train_sizes = list(train_sizes)
  classifiers = [("LinearSVC", True), ("SVC", False)]  # (plot label, linear_svm flag)
  las_scores = {label: [] for label, _ in classifiers}
  train_times = {label: [] for label, _ in classifiers}

  # the context manager closes the bar even if a training run raises
  with tqdm(total=len(train_sizes) * len(classifiers)) as pbar:
    for label, linear in classifiers:
      for n_train in train_sizes:
        las, _, seconds = _train_and_evaluate(n_train, use_glove=False, linear_svm=linear)
        las_scores[label].append(las)
        train_times[label].append(seconds)
        pbar.update(1)

  ax1 = plt.subplot(211)
  for label, _ in classifiers:
    ax1.plot(train_sizes, las_scores[label], label=label)
  ax1.set(xlabel='n° training samples', ylabel='LAS', title='LAS scores')
  ax1.legend()

  ax2 = plt.subplot(212, sharex=ax1)
  for label, _ in classifiers:
    ax2.plot(train_sizes, train_times[label], label=label)
  ax2.set(xlabel='n° training samples', ylabel='Training time\n (s)', title='Training times')
  ax2.legend()

  plt.tight_layout()
  plt.show()



# Run the parser experiments defined above (the cell is timed by the %%time magic at its top).
extra_point()

 Number of training examples : 100
 Number of valid (projective) examples : 100

LAS=0.77 USA=0.77 without GLOVE

Reading GLOVE....
GLOVE read
 Number of training examples : 100
 Number of valid (projective) examples : 100


  0%|          | 0/4 [00:00<?, ?it/s]


LAS=0.09 USA=0.09 with GLOVE

 Number of training examples : 100
 Number of valid (projective) examples : 100
