# Dependency parsing with spaCy


spaCy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree. The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or “chunks”. You can check whether a `Doc` object has been parsed by calling `doc.has_annotation("DEP")`, which checks whether the attribute `Token.dep` has been set and returns a boolean value. If the result is `False`, the default sentence iterator will raise an exception.

# 1. Libraries and Setup

In [1]:
!pip install chart-studio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting chart-studio
  Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 KB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting retrying>=1.3.3
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying, chart-studio
Successfully installed chart-studio-1.1.0 retrying-1.3.4


In [2]:
import spacy
from IPython.display import HTML
import warnings
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
import numpy as np
import sys



In [3]:
from google.colab import drive
# Mount Google Drive under /content/drive (Colab only) so that later cells
# can write their output files to it.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Recycled from the POS-tagging helper: uses dep_ instead of pos_
def rep_sentence(text, display_pos = True):
    """Render ``text`` as an HTML table with one column per token.

    The first row holds the token texts. When ``display_pos`` is true, a
    second row holds each token's dependency label (``dep_``) on a background
    colour that is shared by every occurrence of that label.

    The sentence is parsed with the notebook-level ``nlp`` pipeline, which
    must be defined before this function is called.

    Args:
        text: Sentence to parse and display.
        display_pos: Whether to add the row of dependency labels.

    Returns:
        str: HTML markup (inline ``<style>`` rules followed by a ``<table>``),
        suitable for wrapping in ``IPython.display.HTML``.
    """
    from html import escape  # local import keeps this cell self-contained

    html_colors = ['SkyBlue'
               ,'red'
               ,'YellowGreen'
               ,'yellow'
               ,'orange'
               ,'pink'
               ,'brown'
               ,'purple'
               , 'CadetBlue'
                ,'DarkKhaki'
                ,'DarkSalmon'
                ,'Gold'
              ]
    doc = nlp(text)

    # Keep labels in order of first appearance (rather than set order) so the
    # label -> colour mapping is the same on every run.
    unique_deps = list(dict.fromkeys(token.dep_ for token in doc))
    # Wrap around the palette: a sentence with more distinct labels than
    # colours reuses colours instead of raising IndexError.
    dep_to_color = {
        dep: html_colors[position % len(html_colors)]
        for position, dep in enumerate(unique_deps)
    }

    parts = ["<table width=100%>", "<style>.word{font-weight:bold;}</style>"]
    # One CSS rule per label; the label itself is used as the class name.
    # NOTE(review): assumes labels are valid CSS class names (plain letters,
    # as in the labels printed in this notebook) - confirm for other models.
    parts.extend(
        '<style>.{}{{background-color:{};}}</style>'.format(dep, shade)
        for dep, shade in dep_to_color.items()
    )

    parts.append("<tr>")
    # Escape the token text so characters such as "<" or "&" are shown
    # literally instead of being interpreted as markup.
    parts.extend(
        "<td><span class='word'>{0}</span></td>".format(escape(token.orth_))
        for token in doc
    )
    parts.append("</tr>")

    if display_pos:
        parts.append("<tr>")
        parts.extend(
            "<td><span class='{0}'>{0}</span></td>".format(token.dep_)
            for token in doc
        )
        parts.append("</tr>")

    # Close the table so the markup is well-formed.
    parts.append("</table>")
    return "".join(parts)

In [5]:
# Load the en_core_web_sm pipeline; later cells call it as `nlp`
nlp = spacy.load('en_core_web_sm')

In [6]:
# Example sentence parsed in the following cells
text = "I will wear a white shirt on Monday."

In [7]:
# Run the pipeline on the sentence to obtain the Doc object
doc = nlp(text)

In [8]:
# Print every token next to its dependency label, then display the
# colour-coded table built by rep_sentence
for token in doc:
    print(token.text, token.dep_, sep=' => ')
HTML(rep_sentence(text))

I => nsubj
will => aux
wear => ROOT
a => det
white => amod
shirt => dobj
on => prep
Monday => pobj
. => punct


0,1,2,3,4,5,6,7,8
I,will,wear,a,white,shirt,on,Monday,.
nsubj,aux,ROOT,det,amod,dobj,prep,pobj,punct


In [9]:
# Importing visualizer
from spacy import displacy

In [10]:
# Draw the dependency tree inline in the notebook ("dep" is displacy's default style)
displacy.render(doc, style="dep", jupyter=True)

In [11]:
# Show the syntactic head (parent) of each token
for token in doc:
    print(token.text, token.head.text, sep=' => ')

I => wear
will => wear
wear => wear
a => shirt
white => shirt
shirt => wear
on => wear
Monday => on
. => wear


In [None]:
# token.children is a generator, so printing it directly shows only the
# generator object rather than the child tokens
for token in doc:
    print(token.text, token.children, sep=' => ')

I => <generator object at 0x7f7d25a12370>
will => <generator object at 0x7f7d25a12370>
wear => <generator object at 0x7f7d25a12370>
a => <generator object at 0x7f7d25a12370>
white => <generator object at 0x7f7d25a12370>
shirt => <generator object at 0x7f7d25a12370>
on => <generator object at 0x7f7d25a12370>
Monday => <generator object at 0x7f7d25a12370>
. => <generator object at 0x7f7d25a12370>


In [12]:
# Consume the children generator to list the text of each direct child
for token in doc:
    print(token.text, [kid.text for kid in token.children], sep=' => ')

I => []
will => []
wear => ['I', 'will', 'shirt', 'on', '.']
a => []
white => []
shirt => ['a', 'white']
on => ['Monday']
Monday => []
. => []


In [None]:
# lefts and rights are generators too: printed directly they show only the
# generator objects
for token in doc:
    print(token.text, token.lefts, token.rights, sep=' => ')

# Display the type of the attribute for the last token of the loop
type(token.lefts)

I => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>
will => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>
wear => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>
a => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>
white => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>
shirt => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>
on => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>
Monday => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>
. => <generator object at 0x7f7d25a120f0> => <generator object at 0x7f7d25a12910>


generator

In [13]:
# Number of left children of each token, followed by the children themselves
for token in doc:
    print(token.text, token.n_lefts, list(token.lefts), sep=' => ')

I => 0 => []
will => 0 => []
wear => 2 => [I, will]
a => 0 => []
white => 0 => []
shirt => 2 => [a, white]
on => 0 => []
Monday => 0 => []
. => 0 => []


In [None]:
# Number of right children of each token, followed by the children themselves
for token in doc:
    print(token.text, token.n_rights, list(token.rights), sep=' => ')

# Saving Images

In [17]:
import spacy
from spacy import displacy
from pathlib import Path


In [15]:

# Output folder on the mounted Google Drive. Written as an absolute path
# (the drive is mounted at /content/drive) so that saving does not depend on
# the current working directory being /content.
path1='/content/drive/My Drive/Colab Notebooks/trinity/data_trinity/'

In [18]:
# Parse two short sentences, render them, and save the first tree as an SVG.
doc1 = nlp("This is a sentence.")
doc2 = nlp("This is another sentence.")
# jupyter=False forces displacy to return the markup as a string; without it,
# displacy displays the result and returns None whenever it detects a
# notebook, which would make the write below fail.
html = displacy.render([doc1, doc2], style="dep", page=True, jupyter=False)
svg = displacy.render(doc1, style="dep", jupyter=False)
output_path = Path(path1+"sentence.svg")
# write_text opens, writes and closes the file in one call and returns the
# number of characters written (shown as the cell output).
output_path.write_text(svg, encoding="utf-8")

3024

In [19]:
# Render each sentence's dependency tree and save it as its own SVG file.
nlp = spacy.load("en_core_web_sm")
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
    doc = nlp(sent)
    # jupyter=False makes displacy return the SVG markup as a string
    svg = displacy.render(doc, style="dep", jupyter=False)
    # File name is the sentence's non-punctuation tokens joined by "-"
    file_name = '-'.join([w.text for w in doc if not w.is_punct]) + ".svg"
    output_path = Path(path1 + file_name)
    # write_text closes the file after writing, so no handle is left open
    output_path.write_text(svg, encoding="utf-8")