In [1]:
# Import libraries
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import pandas as pd
# Load the spaCy pipeline named "en_core_web_sm"; every later cell parses
# sentences through this shared `nlp` object and reuses its vocab.
nlp = spacy.load("en_core_web_sm")

### Let's check our rule on a larger corpus

In [2]:
# Read the active/passive sentence pairs from the CSV file.
# NOTE(review): this is an absolute path (presumably a Colab runtime
# location) - adjust it when running elsewhere.
active_passive = pd.read_csv(
    filepath_or_buffer='/content/sample_data/active_passive.csv'
)

In [3]:
# Show the dataframe's (rows, columns) shape as the cell output.

active_passive.shape


(40, 2)

In [5]:
# Pull the 'Active' and 'Passive' columns out of the dataframe so each
# voice can be iterated over on its own.
active, passive = active_passive['Active'], active_passive['Passive']

### Create the rule

In [6]:
# Single-token pattern: match any token whose dependency label (DEP) is
# 'nsubjpass'.
passive_rule = [{'DEP': 'nsubjpass'}]
# Build a matcher on the pipeline's vocab and register the pattern under
# the key 'Rule'.
matcher = Matcher(nlp.vocab)
matcher.add('Rule', [passive_rule])

In [7]:
def is_passive(doc, matcher):
  """Return True when `matcher` finds at least one match in `doc`.

  Args:
    doc: the object handed to `matcher` (the notebook passes parsed docs).
    matcher: a callable returning a sized collection of matches for `doc`.

  Returns:
    True if the collection of matches is non-empty, otherwise False.
  """
  matches = matcher(doc)
  return len(matches) > 0

### Check rule on active voice sentences

In [10]:
# Count how many active-voice sentences the rule flags as passive.
cnt = 0

for sent in active:
  doc = nlp(sent)
  # The boolean verdict adds 1 for a match and 0 otherwise.
  cnt += int(is_passive(doc, matcher))

print(cnt)

0


### Check rule on passive voice sentences

In [11]:
# Count how many passive-voice sentences the rule flags as passive.
cnt = 0

for sent in passive:
  doc = nlp(sent)
  # The boolean verdict adds 1 for a match and 0 otherwise.
  cnt += int(is_passive(doc, matcher))

print(cnt)

38


### Let's troubleshoot

In [12]:
# Re-run the check on the passive sentences, keeping the ones the rule
# fails to flag so they can be inspected afterwards.
cnt, missed = 0, []

for sent in passive:
  doc = nlp(sent)
  if not is_passive(doc, matcher):
    missed.append(sent)
    continue
  cnt += 1

print(cnt)

38


In [13]:
# Display the passive sentences that the rule did not match.
missed

['Are books being purchased by him?', 'Is a table being bought by Ritika?']

### Let's visualize their dependency trees

In [15]:
# Draw the dependency tree of every sentence the rule missed.
for sentence in missed:
  doc = nlp(sentence)
  displacy.render(doc, style='dep', jupyter=True)

[Dependencies](https://universaldependencies.org/docs/en/dep/)

### Update our rule
[Reference](https://spacy.io/usage/rule-based-matching)

In [17]:
# Updated single-token pattern: match a token whose dependency label is
# either 'nsubjpass' or 'auxpass'. The subject label must be spelled
# 'nsubjpass' (the spelling used by the original rule); a misspelled label
# never matches and would silently reduce the rule to 'auxpass' only.
passive_rule = [{'DEP': {'IN': ['nsubjpass', 'auxpass']}}]
# Rebuild the matcher so only the updated pattern is registered under 'Rule'.
matcher = Matcher(nlp.vocab)
matcher.add('Rule', [passive_rule])

In [18]:
# Repeat the passive-sentence check with the updated rule, again keeping
# any sentences that are still not flagged.
cnt, missed = 0, []

for sent in passive:
  doc = nlp(sent)
  if not is_passive(doc, matcher):
    missed.append(sent)
    continue
  cnt += 1

print(cnt)

40


## Summary
 - Always test your rules and heuristics on a larger corpus to see how effective they are
 - One can write intricate matching rules using spaCy's `Matcher` object

In [26]:
# Scratch sentences (questions and a reporting-verb passive) for trying
# out the rule by hand.
s1, s2, s3, s4 = (
    "What is your name?",
    "Is coffee serving here?",
    "Is she being promoted as new assistant?",
    "Women are said to live longer than men.",
)

In [39]:
# Scratch sentences (plain statements) for trying out the rule by hand;
# these replace the previous values of s1..s4.
s1, s2, s3, s4 = (
    "Sofia is learning NLP.",
    "Eggs are laid by Hens.",
    "Mouse is eaten by a black cat.",
    "She has done her job productively.",
)

In [41]:
# Replace s4 with a sentence containing a relative clause ("which was
# already late") to inspect how it is parsed.
s4 = "JetAirways cancelled the flight this morning which was already late."

In [42]:

# Parse s4 and render it with displaCy's dependency ('dep') style.
doc = nlp(s4)
displacy.render(doc, style='dep')