<a href="https://colab.research.google.com/github/stepthom/NLP_course/blob/main/classification/slides_docclass_rules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Document Classification Slides - Rule-based

Using spaCy to define classification rules. For more info and examples, see spaCy's [documentation](https://spacy.io/usage/rule-based-matching).

- Stephen W. Thomas
- Used for MMAI 891 and MMA/GMMA 865

In [26]:
import spacy
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher
from spacy import displacy

# Load the spaCy pipeline once; the cells below reuse `nlp` both to parse
# text into Doc objects and to supply `nlp.vocab` to the matchers.
nlp = spacy.load("en_core_web_sm")

## Simple Token Matching

In [27]:
# Token-level rule: the exact token "Michael" immediately followed by the
# exact token "Jordan". Each dict in the list describes one token.
pattern = [
    {"TEXT": "Michael"},
    {"TEXT": "Jordan"},
]

# The matcher must share the pipeline's vocab; the rule is registered as "MJ".
matcher = Matcher(nlp.vocab)
matcher.add("MJ", [pattern])

In [28]:
# Three sample texts: one containing the full "Michael Jordan" mention, one
# unrelated sentence, and one with a lone "Michael" (which the two-token
# rule does not match).
doc1 = nlp("I love Michael Jordan")
doc2 = nlp("This sushi tasts")
doc3 = nlp("Michael, what is going on?")
docs = [doc1, doc2, doc3]

for doc in docs:
    # Show the text and its tokenization so the match offsets can be read off.
    print(doc)
    print([tok.text for tok in doc])

    matches = matcher(doc)
    if matches:
        print("Match!")

    # Each match is (match_id, start, end); start/end are token offsets.
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # hash -> rule name ("MJ")
        span = doc[start:end]  # the matched tokens
        print(match_id, string_id, start, end, span.text)

I love Michael Jordan
['I', 'love', 'Michael', 'Jordan']
Match!
5769313117714300985 MJ 2 4 Michael Jordan
This sushi tasts
['This', 'sushi', 'tasts']
Michael, what is going on?
['Michael', ',', 'what', 'is', 'going', 'on', '?']


## Dependency Matching

In [29]:
matcher = DependencyMatcher(nlp.vocab)

# The first dict declares the anchor token. Every later dict attaches a new
# token (RIGHT_ID) to an already-declared one (LEFT_ID); REL_OP ">" means the
# LEFT token is the immediate syntactic head of the RIGHT token.
pattern = [
    # Anchor: the verb "founded".
    {
        "RIGHT_ID": "anchor_founded",
        "RIGHT_ATTRS": {"ORTH": "founded"}
    },
    # founded -> its subject.
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    # founded -> its direct object.
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
    # direct object -> an adjectival or compound modifier.
    {
        "LEFT_ID": "founded_object",
        "REL_OP": ">",
        "RIGHT_ID": "founded_object_modifier",
        "RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
    }
]

matcher.add("FOUNDED", [pattern])
doc = nlp("Lee, an experienced CEO, has founded two AI startups.")
matches = matcher(doc)

# "founded" is token 7 here (the commas are tokens too), so the expected
# result is [(4851363122962674176, [7, 0, 10, 9])].
print(matches)

# token_ids is aligned with the pattern: token_ids[k] is the token matched by
# pattern[k]. Looping over `matches` (rather than indexing matches[0]) keeps
# the cell from raising IndexError when the parse yields no match.
for match_id, token_ids in matches:
    for node, token_id in zip(pattern, token_ids):
        print(node["RIGHT_ID"] + ":", doc[token_id].text)


[(4851363122962674176, [7, 0, 10, 9])]
anchor_founded: founded
founded_subject: Lee
founded_object: startups
founded_object_modifier: AI


In [34]:
# Re-parse the example sentence and draw its dependency tree inline, so the
# arcs used by the dependency pattern can be inspected visually.
doc = nlp("Lee, an experienced CEO, has founded two AI startups.")

displacy.render(
    doc,
    style="dep",                # dependency-arc visualization
    jupyter=True,               # render directly in the notebook output
    options={"distance": 100},  # spacing between words in the drawing
)

In [36]:
text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
doc = nlp(text)

# A multi-sentence Doc rendered as a single parse is one extremely wide
# drawing. Split it into sentence spans and render those instead, so each
# sentence gets its own dependency tree. (Previously `sentence_spans` was
# built but never used and the whole `doc` was rendered.)
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style='dep', jupyter=True, options={'distance': 100})