In [21]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import pandas as pd
# Load spaCy's small English pipeline; the rules in this notebook match on
# the dependency labels (DEP) that it assigns to each token.
nlp = spacy.load("en_core_web_sm")

### Let's check our rule on a larger corpus

In [22]:
# Read the paired active/passive sentences (path is relative to this notebook)
# and preview the first two rows.
active_passive = pd.read_csv('../Dataset/active_passive.csv')
active_passive.head(2)

Unnamed: 0,Active,Passive
0,He reads a novel.,A novel is read.
1,He does not cook food.,Food is not cooked by him.


In [23]:
# Sanity-check the corpus size: (number of sentence pairs, number of columns).
active_passive.shape

(40, 2)

In [24]:
# Split the frame into its two sentence columns so each voice can be scored separately.
active, passive = active_passive['Active'], active_passive['Passive']

### Create the rule

In [25]:
# A one-token pattern: match any token whose dependency label is 'nsubjpass'
# (spaCy's label for a passive nominal subject).
passive_rule = [{'DEP':'nsubjpass'}]
# The matcher must share the pipeline's vocab.
matcher = Matcher(nlp.vocab)
# Register the pattern under the key 'Rule'; add() takes a list of patterns.
matcher.add('Rule',[passive_rule])

In [26]:
def is_passive(doc, matcher):
    """Report whether ``matcher`` finds at least one match in ``doc``.

    Args:
        doc: The parsed document to test; it is passed to ``matcher`` unchanged.
        matcher: A callable that returns a sized collection of matches
            when called with ``doc``.

    Returns:
        bool: ``True`` if the matcher produced one or more matches,
        ``False`` otherwise.
    """
    # The comparison already evaluates to a bool, so no if/else is needed.
    return len(matcher(doc)) > 0

### Check rule on active voice sentences

In [27]:
# Tally the active-voice sentences that the rule leaves unflagged
# (ideally every one of them comes back as "not passive").
cnt = sum(1 for sent in active if not is_passive(nlp(sent), matcher))
print(cnt)

40


### Check rule on passive voice sentences

In [28]:
# Tally the passive-voice sentences that the rule flags as passive.
cnt = sum(1 for sent in passive if is_passive(nlp(sent), matcher))
print(cnt)

39


### Let's troubleshoot

In [29]:
# Score the passive sentences again, this time keeping the parsed docs
# that the rule fails to flag so they can be inspected.
missed = []
cnt = 0
for sent in passive:
    doc = nlp(sent)
    if not is_passive(doc, matcher):
        missed.append(doc)
        continue
    cnt += 1
print(cnt)

39


In [30]:
# Show the first passive sentence that the rule failed to flag.
missed[0]

Is a table being bought by Ritika?

In [31]:
# NOTE(review): this cell repeats the previous cell's look at missed[0];
# it appears to be a leftover duplicate and is probably safe to delete.
missed[0]

Is a table being bought by Ritika?

### Let's visualize their dependency trees

In [32]:
# Draw the dependency parse of each missed sentence to see which labels
# the parser assigned instead of the one the rule looks for.
for doc in missed:
    displacy.render(doc, style="dep")

In [33]:
# Look up the human-readable meaning of the 'auxpass' dependency label.
spacy.explain("auxpass")

'auxiliary (passive)'

[Dependencies](https://universaldependencies.org/docs/en/dep/)

### Update our rule
[Reference](https://spacy.io/usage/rule-based-matching)

In [34]:
# Broaden the pattern: a token now matches if its dependency label is either
# 'nsubjpass' or 'auxpass' (the "IN" operator tests membership in the list).
# Presumably this catches passive sentences, such as the question missed by
# the earlier rule, that carry a passive auxiliary without an 'nsubjpass' token.
passive_rule = [{'DEP':{"IN":['nsubjpass','auxpass']}}]
# Build a fresh matcher so only the updated pattern is registered.
matcher = Matcher(nlp.vocab)
matcher.add('Rule',[passive_rule])

In [35]:
# Check the active-voice sentences against the matcher currently in scope:
# count how many are still (correctly) reported as not passive.
cnt = sum(1 for sent in active if not is_passive(nlp(sent), matcher))
print(cnt)

40


In [36]:
# Score the passive-voice sentences against the matcher currently in scope,
# collecting any parsed docs that are still not flagged.
parsed = [nlp(sent) for sent in passive]
missed = [doc for doc in parsed if not is_passive(doc, matcher)]
cnt = len(parsed) - len(missed)
print(cnt)

40


## Summary
 - Always test your rules and heuristics on a larger corpus to see how effective they are
 - One can write intricate matching rules using the `Matcher` object

In [47]:
# Parse a single passive example sentence and draw its dependency tree.
doc = nlp("Dole was defeated by Clinton")
displacy.render(doc, style="dep")