# Matching

Rule-based matching with `spacy`

### Interactive matcher explorer

https://explosion.ai/demos/matcher

In [1]:
import spacy

In [2]:
# Load the small English pipeline; used below for tokenising, tagging and lemmatising.
nlp = spacy.load("en_core_web_sm")

In [8]:
def load_data(p, encoding="utf-8"):
    """Read a text file and return its contents flattened onto one line.

    Args:
        p: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents as a single string in which each pair of consecutive
        newlines, and then each remaining single newline, is replaced by one
        space.
    """
    with open(p, encoding=encoding) as f:
        x = f.read()
    # Collapse blank-line separators first so each becomes one space, not two.
    return x.replace("\n\n", " ").replace("\n", " ")

In [30]:
# Relative path to the plain-text input file (presumably the abstracts to analyse).
p = "../data/abstracts_manual.txt"
# Read the file with all newlines flattened to spaces (see load_data).
text = load_data(p)
# Run the loaded pipeline over the whole text; the result is sliced and
# passed to the matchers in the cells below.
abstracts = nlp(text)

# Match Playgrounds

In [12]:
from spacy.matcher import Matcher

### Example: a verb lemma followed by a noun

In [68]:
# Token pattern: one dict per token, matched in sequence.
pattern = [
    {"LEMMA": "love", "POS": "VERB"},  # a verb whose lemma is "love" (e.g. "loved", "love")
    {"POS": "NOUN"}                    # immediately followed by a noun
]

# Small example sentence to run the pattern against.
doc = nlp("I loved dogs but now I love cats more.")

In [69]:
# Build a token matcher on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)
# Register the pattern under the key "amor"; add() takes a list of patterns per key.
matcher.add("amor", [pattern])

# Each match is a 3-tuple; the last two items are used below as start/end
# token indices into doc.
matches = matcher(doc)

In [71]:
# Show the full sentence, followed by a blank line.
print(doc, "\n")
# extract the matched text; the first tuple item (the match id) is not needed here
for _, start, end in matches:
    matched_span = doc[start:end]  # slice of the Doc covering the matched tokens
    print(matched_span.text)

I loved dogs but now I love cats more. 

loved dogs
love cats


### Example: "cancer" followed by a noun

In [62]:
# first, confirm appropriate lemma
word = "cancer"

lemma = nlp(word)[0].lemma_

word, lemma

('cancer', 'cancer')

In [63]:
# set up pattern
pattern = [{"TEXT": lemma}, {"POS": "NOUN"}]

matcher = Matcher(nlp.vocab)
matcher.add("my_patten", [pattern])

# process matches
matches = matcher(abstracts)

# extract the matched text
for _, start, end in matches:
    matched_span = abstracts[start:end]
    print(matched_span.text)

cancer patients
cancer treatment
cancer treatment
cancer immunoediting
cancer immunosurveillance
cancer prognoses
cancer therapy
cancer effects
cancer development
cancer cells
cancer treatment
cancer treatment
cancer treatment
cancer patients
cancer therapy
cancer cell
cancer cells
cancer stem
cancer tissue
cancer characteristics
cancer treatment
cancer cells
cancer tumorigenesis
cancer initiation
cancer progression
cancer chemoprevention
cancer cases
cancer deaths
cancer deaths
cancer deaths
cancer cells
cancer cells
cancer patients
cancer killer
cancer treatment
cancer mouse
cancer agents


### Example: a lemma, an optional determiner, then a noun

In [64]:
# first, confirm appropriate lemma
word = "summarize"

lemma = nlp(word)[0].lemma_

word, lemma

('summarize', 'summarize')

In [65]:
# set up pattern
pattern = [
    {"LEMMA": lemma},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}
]

# process matches
matcher = Matcher(nlp.vocab)
matcher.add("my_patten", [pattern])
matches = matcher(abstracts)

# extract the matched text
for _, start, end in matches:
    matched_span = abstracts[start:end]
    print(matched_span.text)

summarize hallmarks


# Dependency matching

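Dependency matching is more complex than token-based matching: patterns are defined over the dependency parse rather than over sequences of adjacent tokens.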

https://spacy.io/usage/rule-based-matching#dependencymatcher

In [66]:
from spacy.matcher import DependencyMatcher

In [None]:
# Placeholder: the dependency pattern has not been written yet, so this is an empty list.
pattern = []