In [1]:
import spacy
from spacy.matcher import Matcher


#Pattern interactive https://explosion.ai/demos/matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Write a pattern that matches a form of "download" plus proper noun

pattern = [{"LEMMA": "download"}, {}]  #wild card

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", match_id, doc[start:end].text)

Total matches found: 3
Match found: 1475109908168048428 downloaded Fortnite
Match found: 1475109908168048428 downloading Minecraft
Match found: 1475109908168048428 download Winzip


In [None]:
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "VERB", "OP": "+"},
   # {"POS": "NOUN"}
]
matcher = Matcher(nlp2.vocab)

matcher.add("love pattern", None, pattern)
doc2 = nlp('I loved cats but I also love loving')

matches = matcher(doc2)


In [4]:
[*itertools.chain(['a', 'b', 'c'], (1, 2, 3))]

['a', 'b', 'c', 1, 2, 3]

# Building blocks: chunks, sentences, tokens, trees, spans

In [3]:
import spacy

# Load a language model and parse a document.
nlp = spacy.load('en_core_web_sm')
doc = nlp("Aliens come to earth to escape reality TV")

# Print all noun chunks.
# These are contiguous noun phrases.
for chunk in doc.noun_chunks:
    print(chunk)

Aliens
earth
reality TV


In [4]:
# Print the head word of each sentence.
# This is the grammatically most informative word.
for sentence in doc.sents:
    print(sentence.root)

come


In [5]:
# Print the dependency subtree of each token.
# These are the words operated upon by the token.
for token in doc:
    print(token, [*token.subtree])

Aliens [Aliens]
come [Aliens, come, to, earth, to, escape, reality, TV]
to [to, earth]
earth [earth]
to [to]
escape [to, escape, reality, TV]
reality [reality]
TV [reality, TV]


In [11]:
# The easiest way is to find the head of the subtree you want, and then use
    # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
    # is the one that does what you're asking for most directly:
for word in doc:
    if word.dep_ in ("advcl", "dobj"):
        print("".join(w.text_with_ws for w in word.subtree))

to escape reality TV
reality TV


In [12]:
doc[1]

come

In [10]:
[{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS": "ADJ"}]


[{'LOWER': 'facebook'},
 {'LEMMA': 'be'},
 {'POS': 'ADV', 'OP': '*'},
 {'POS': 'ADJ'}]

In [14]:
def expand_person_entities(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                new_ents.append(new_ent)
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc

In [15]:
#Spans from scratch
# Import the Doc and Span classes
from spacy.tokens import Doc, Span

# The words and spaces to create the doc from
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

# Create a span manually
span = Span(doc, 0, 2)

# Create a span with a label
span_with_label = Span(doc, 0, 2, label="GREETING")

# Add span to the doc.ents
doc.ents = [span_with_label]

NameError: name 'nlp' is not defined

In [16]:
doc.ents
Doc.similarity(), Span.similarity()  Token.similarity()

SyntaxError: invalid syntax (<ipython-input-16-ed2543cbf811>, line 2)

In [14]:
# Initialize with the shared vocab
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Patterns are lists of dictionaries describing the tokens
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}]
matcher.add("LOVE_CATS", None, pattern)

# Operators can specify how often a token should be matched
pattern = [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}]
matcher.add("VERY_HAPPY", None, pattern)

# Calling matcher on doc returns list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

In [15]:
[match for match in matches]

[(9137535031263442622, 1, 3),
 (2447047934687575526, 7, 9),
 (2447047934687575526, 6, 9)]

In [16]:
matcher = Matcher(nlp.vocab)
matcher.add("DOG", None, [{"LOWER": "golden"}, {"LOWER": "retriever"}])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)
    # Get the span's root token and root head token
    print("Root token:", span.root.text)
    print("Root head token:", span.root.head.text)
    # Get the previous token and its POS tag
    print("Previous token:", doc[start - 1].text, doc[start - 1].pos_)

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET


In [17]:
from nlp_utils import visualise_doc
visualise_doc(doc)

In [18]:
import spacy
from spacy.pipeline import merge_entities
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [t for t in prep.children if t.ent_type_ == "ORG"]
                aux = [token for token in head.children if token.dep_ == "aux"]                
                past_aux = any(t.tag_ == "VBD" for t in aux)                
                past = head.tag_ == "VBD" or head.tag_ == "VBG" and past_aux                
                print({'person': ent, 'orgs': orgs, 'past': past})
    return doc

# To make the entities easier to work with, we'll merge them into single tokens
nlp.add_pipe(merge_entities)
nlp.add_pipe(extract_person_orgs)

doc = nlp("Alex Smith worked at Acme Corp Inc.")
# If you're not in a Jupyter / IPython environment, use displacy.serve
displacy.render(doc, options={'fine_grained': True})

{'person': Alex Smith, 'orgs': [Acme Corp Inc.], 'past': True}
