In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')
s = "John watched an old movie at the cinema."
doc = nlp(s)
#https://www.nlpdemystified.org/course/advanced-preprocessing

In [2]:
[(t.text, t.pos_) for t in doc] # spacy.explain('PROPN')

[('John', 'PROPN'),
 ('watched', 'VERB'),
 ('an', 'DET'),
 ('old', 'ADJ'),
 ('movie', 'NOUN'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('cinema', 'NOUN'),
 ('.', 'PUNCT')]

In [3]:
[(t.text, t.tag_) for t in doc] # print(spacy.explain('NNP'))

[('John', 'NNP'),
 ('watched', 'VBD'),
 ('an', 'DT'),
 ('old', 'JJ'),
 ('movie', 'NN'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('cinema', 'NN'),
 ('.', '.')]

In [4]:
# 1 to 1 relation
s = "Volkswagen is developing an electric sedan which could potentially come to America next fall."
doc = nlp(s)

[(t.text, t.ent_type_) for t in doc] # spacy.explain('GPE')

[('Volkswagen', 'ORG'),
 ('is', ''),
 ('developing', ''),
 ('an', ''),
 ('electric', ''),
 ('sedan', ''),
 ('which', ''),
 ('could', ''),
 ('potentially', ''),
 ('come', ''),
 ('to', ''),
 ('America', 'GPE'),
 ('next', 'DATE'),
 ('fall', 'DATE'),
 ('.', '')]

In [5]:
# things that are not blank often have a nonzero value associated
print([(t.text, t.ent_type_) for t in doc if t.ent_type != 0])

[('Volkswagen', 'ORG'), ('America', 'GPE'), ('next', 'DATE'), ('fall', 'DATE')]


In [6]:
# other way to parse through doc.ents itself
[(ent.text, ent.label_) for ent in doc.ents]

[('Volkswagen', 'ORG'), ('America', 'GPE'), ('next fall', 'DATE')]

In [8]:

from spacy import displacy
# jupyter to true else provides html

# doesnt work cuz spacy wants to import it from ipython.core.display when i think modenr versions is from IPython.core import display

displacy.render(doc, style='ent', jupyter=True)

ImportError: cannot import name 'display' from 'IPython.core.display' (/opt/anaconda3/envs/torch/lib/python3.11/site-packages/IPython/core/display.py)

In [9]:
s = "She enrolled in the course at the university."
doc = nlp(s)
[(t.text, t.dep_, t.head.text) for t in doc] # spacy.explain('nsubj')
# has the word, what the relation to the head is. 

[('She', 'nsubj', 'enrolled'),
 ('enrolled', 'ROOT', 'enrolled'),
 ('in', 'prep', 'enrolled'),
 ('the', 'det', 'course'),
 ('course', 'pobj', 'in'),
 ('at', 'prep', 'enrolled'),
 ('the', 'det', 'university'),
 ('university', 'pobj', 'at'),
 ('.', 'punct', 'enrolled')]

In [10]:
# matching in a sentence to find patterns
from spacy.matcher import Matcher

# initialize with a vocab
matcher = Matcher(nlp.vocab)

s = "I want to book a hotel room."
doc = nlp(s)

# pattern is what type of sequence we are looking for
# we are looking for book, followed by (?) zero or one determinants, followed by one or more nouns
pattern = [
{'TEXT': 'book'},
{'POS': 'DET', 'OP': '?'},
{'POS': 'NOUN', 'OP': '+'},
]

matcher.add('#Pattern Name', [pattern])
matches = matcher(doc)

print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['book a hotel', 'book a hotel room']


In [11]:
# for more robust matchers, use Name Entity Recognition or Part of Speech
doc = nlp("I want to book a flight and hotel room in Berlin.")
for noun_phrase in doc.noun_chunks:
    print(noun_phrase)
for noun_phrase in doc.noun_chunks:
    print("phrase: {}, root head: {}".format(noun_phrase, noun_phrase.root.head))

I
a flight and hotel room
Berlin
phrase: I, root head: want
phrase: a flight and hotel room, root head: book
phrase: Berlin, root head: in


In [12]:
def yodize(s: str):
  doc = nlp(s)
  for t in doc:
    if t.dep_ == "ROOT":
        print(t.text)
      # Assuming our sentence is of the form subject-verb-object, we take 
      # everything after the root (likely verb) and put it in front, and 
      # likewise take everything before the root, and put it after.
        seq = [doc[t.i + 1: -1].text, doc[0: t.i].text, t.text + '.']
        seq[0] = seq[0].capitalize()
        print(' '.join(seq))

In [13]:
yodize("I will fly to Texas.")

fly
To texas I will fly.


In [14]:
#
# EXERCISE: using doc.ents, identify and print the dates in this sentence.
# Expected output: ['Feb 13th', 'Feb 24th']
#
s = "We'll be in Osaka on Feb 13th and leave on Feb 24th."
doc = nlp(s)
[(ent.text, ent.label_) for ent in doc.ents] # i could just sort by dates

[('Osaka', 'GPE'), ('Feb 13th', 'DATE'), ('Feb 24th', 'DATE')]

In [15]:
pattern = [
    {'ENT_TYPE': 'DATE'},
]

matcher.add('#Pattern Name', [pattern])
matches = matcher(doc)

print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['Feb', '13th', 'Feb', '24th']
