<a href="https://colab.research.google.com/github/sourcecode369/deep-natural-language-processing/blob/master/spaCy/guide/Linguistic_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Part - of - speech tagging

In [0]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello my name is sourcecode369 and i live in delhi, India")

In [0]:
for token in doc:
  print(token.text, token.lemma_, token.pos_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

Hello hello INTJ intj Xxxxx True False
my -PRON- DET poss xx True True
name name NOUN nsubj xxxx True True
is be AUX ROOT xx True True
sourcecode369 sourcecode369 PROPN attr xxxxddd False False
and and CCONJ cc xxx True True
i i PRON nsubj x True True
live live VERB ROOT xxxx True False
in in ADP prep xx True True
delhi delhi PROPN pobj xxxx True False
, , PUNCT punct , False False
India India PROPN appos Xxxxx True False


### Dependency Parsing

#### Noun Chunks

In [0]:
# ext: The original noun chunk text.
# Root text: The original text of the word connecting the noun chunk to the rest of the parse.
# Root dep: Dependency relation connecting the root to its head.
# Root head text: The text of the root token’s head.

In [0]:
for chunk in doc.noun_chunks:
  print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

my name name nsubj is
sourcecode369 sourcecode369 attr is
i i nsubj live
delhi delhi pobj in
India India appos delhi


#### Navigataing the parse tree

In [0]:
# Text: The original token text.
# Dep: The syntactic relation connecting child to head.
# Head text: The original text of the token head.
# Head POS: The part-of-speech tag of the token head.
# Children: The immediate syntactic dependents of the token.

In [0]:
for token in doc:
  print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children])

Hello intj is AUX []
my poss name NOUN []
name nsubj is AUX [my]
is ROOT is AUX [Hello, name, sourcecode369]
sourcecode369 attr is AUX []
and cc live VERB []
i nsubj live VERB []
live ROOT live VERB [and, i, in]
in prep live VERB [delhi]
delhi pobj in ADP [,, India]
, punct delhi PROPN []
India appos delhi PROPN []


In [0]:
from spacy.symbols import nsubj, VERB

verbs = set()
for possible_subject in doc:
  if possible_subject.dep == nsubj and possible_subject.head.pos==VERB:
    verbs.add(possible_subject.head)
print(verbs)

{live}


#### Iterating around the local tree

In [0]:
print([token.text for token in doc[2].lefts])
print([token.text for token in doc[2].rights])
print(doc[2].n_lefts)
print(doc[2].n_rights)

['my']
[]
1
0


In [0]:
### You can get a whole phrase by its syntactic head using the Token.subtree attribute. 
### This returns an ordered sequence of tokens. 
### You can walk up the tree with the Token.ancestors attribute, 
### and check dominance with Token.is_ancestor


doc = nlp("Credit and mortgage account holders must submit their requests")
root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
  assert subject is descendant or subject.is_ancestor(descendant)
  print(descendant.text, descendant.dep_, descendant.n_lefts,
            descendant.n_rights,
            [ancestor.text for ancestor in descendant.ancestors])

Credit nmod 0 2 ['account', 'holders', 'submit']
and cc 0 0 ['Credit', 'account', 'holders', 'submit']
mortgage conj 0 0 ['Credit', 'account', 'holders', 'submit']
account compound 1 0 ['holders', 'submit']
holders nsubj 1 0 ['submit']


In [0]:
# Finally, the .left_edge and .right_edge attributes can be especially useful, 
# because they give you the first and last token of the subtree. 
# This is the easiest way to create a Span object for a syntactic phrase. 
# Note that .right_edge gives a token within the subtree — 
# so if you use it as the end-point of a range, don’t forget to +1!

span = doc[doc[4].left_edge.i:doc[4].right_edge.i+1]
with doc.retokenize() as retokenizer:
  retokenizer.merge(span)

for token in doc:
  print(token.text, token.pos_, token.dep_, token.head.text)

Credit and mortgage account holders NOUN nsubj submit
must VERB aux submit
submit VERB ROOT submit
their DET poss requests
requests NOUN dobj submit


#### Vizualizing dependencies

In [0]:
displacy.render(doc, style='dep',jupyter=True)

NameError: ignored

#### disabling the parser

In [0]:
from spacy.lang.en import English

In [0]:
nlp = spacy.load('en_core_web_sm', disable=['parser'])
doc = nlp('I dont want parser.', disable=['parser'])

### Named entity Recognition

In [0]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("Google is buying DeepMind startup for $500 billion.")

for ent in doc.ents:
  print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [0]:
doc = nlp("San Francisco considers banning sidewalk delivery robots")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

In [0]:
# I – Token is inside an entity.
# O – Token is outside an entity.
# B – Token is the beginning of an entity.

ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

#### Setting entity annotations

In [0]:
doc = nlp("fb is hiring a new vice president of global policy")
[(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
from spacy.tokens import Span
fb_ent = Span(doc, 0, 1, "ORG")
doc.ents = list(doc.ents) + [fb_ent]
[(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

#### Setting entity annotations from array

In [0]:
import numpy 
import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE

In [0]:
nlp = spacy.load('en_core_web_sm')
doc = nlp.make_doc("London is a big city in United Kindom.")
print("Before: ",doc.ents)

In [0]:
header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)))
attr_array[0,0]  = 3
attr_array[0,1] = doc.vocab.strings["GPE"]
doc.from_array(header, attr_array)
print("After:", doc.ents)

#### Vizualizing Entities

In [0]:
import spacy
from spacy import displacy

text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.render(doc, style='ent',jupyter=True)

### Entity Linking

In [0]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Ada Lovelace was born in London")


ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
print(ents)  

ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_]
ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_]
ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_]
print(ent_ada_0)  
print(ent_ada_1)  
print(ent_london_5)

### Tokenization

![tokenizer](https://spacy.io/language_data-ef63e6a58b7ec47c073fb59857a76e5f.svg)


In [0]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text)

#### Add Special Case Tokenization Rules 

In [0]:
import spacy
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_sm")
doc = nlp("gimme that")  
print([w.text for w in doc])  

special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

print([w.text for w in nlp("gimme that")])

In [0]:
from spacy.lang.en import English

nlp = English()
text = '''"Let's go!"'''
doc = nlp(text)
tok_exp = nlp.tokenizer.explain(text)
assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
for t in tok_exp:
    print(t[1], "\t", t[0])

#### Customizing spaCy's Tokenizer class

In [0]:
import re
import spacy
from spacy.tokenizer import Tokenizer

special_cases = {":)": [{"ORTH": ":)"}]}
prefix_re = re.compile(r'''^[[("']''')
suffix_re = re.compile(r'''[])"']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, rules=special_cases,
                                prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=simple_url_re.match)

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp("hello-world. :)")
print([t.text for t in doc]) 

#### Modifying existing rule sets

In [0]:
import spacy
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

# default tokenizer
nlp = spacy.load("en_core_web_sm")
doc = nlp("mother-in-law")
print([t.text for t in doc]) # ['mother', '-', 'in', '-', 'law']

# modify tokenizer infix patterns
infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # EDIT: commented out regex that splits on hyphens between letters:
        #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)

infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer
doc = nlp("mother-in-law")
print([t.text for t in doc]) # ['mother-in-law']

#### Hooking an arbitrary tokenizer into the pipeline

In [0]:
import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([t.text for t in doc])

#### Bringing own annotations

In [0]:
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English

nlp = English()
doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
          spaces=[False, True, False, False])
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])

In [0]:
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English

nlp = English()
bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"])
good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
                  spaces=[False, True, False, False])

print(bad_spaces.text)   # 'Hello , world !'
print(good_spaces.text)  # 'Hello, world!'

#### Aligning Tokenization

In [0]:
from spacy.gold import align

other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
print("Misaligned tokens:", cost)  # 2
print("One-to-one mappings a -> b", a2b)  # array([0, 1, 2, 3, -1, -1, 5, 6])
print("One-to-one mappings b -> a", b2a)  # array([0, 1, 2, 3, 5, 6, 7])
print("Many-to-one mappings a -> b", a2b_multi)  # {4: 4, 5: 4}
print("Many-to-one mappings b-> a", b2a_multi)  # {}

### Merging and Splitting

In [0]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I live in New York")
print("Before:", [token.text for token in doc])

with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "new york"})
print("After:", [token.text for token in doc])

In [0]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I live in NewYork")
print("Before:", [token.text for token in doc])
displacy.render(doc, jupyter=True)  # displacy.serve if you're not in a Jupyter environment

with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
print("After:", [token.text for token in doc])
displacy.render(doc, jupyter=True) 

#### Overwriting custom extension attribute

In [0]:
import spacy
from spacy.tokens import Token

# Register a custom token attribute, token._.is_musician
Token.set_extension("is_musician", default=False)

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like David Bowie")
print("Before:", [(token.text, token._.is_musician) for token in doc])

with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[2:4], attrs={"_": {"is_musician": True}})
print("After:", [(token.text, token._.is_musician) for token in doc])

### Sentence Segmentation