## Identification of active and passive voice in a given text corpus

In [None]:
# import libraries
import nltk
from nltk.corpus import stopwords
import spacy
from spacy import displacy
import pandas as pd

In [None]:
# Hand-written sample sentences used to sanity-check the voice detector:
# each entry in `passive` is the passive-voice rewrite of the entry at the
# same position in `active`.
# NOTE: the trailing full stop is not applied consistently across the samples.
active = ['Hens lay eggs.',
         'Birds build nests.',
         'The batter hit the ball.',
         'The computer transmitted a copy of the manual']

passive = ['Eggs are laid by hens',
           'Nests are built by birds',
           'The ball was hit by the batter',
           'A copy of the manual was transmitted by the computer.']

In [None]:
# Load the spaCy pipeline `en_core_web_sm`; the resulting `nlp` object is
# reused by the rest of the notebook to parse sentences.

nlp = spacy.load('en_core_web_sm')

In [None]:
# Visualize the dependency parse of each active-voice sample so the labels
# assigned to its tokens can be inspected.

for sentence in active:
  tokens = nlp(sentence)  # parse the raw text with the loaded pipeline
  displacy.render(tokens)  # draw the parse with displaCy

In [None]:
# Visualize the dependency parse of each passive-voice sample, for comparison
# with the parses of the active-voice samples.

for sentence in passive:
  tokens = nlp(sentence)  # parse the raw text with the loaded pipeline
  displacy.render(tokens)  # draw the parse with displaCy

In [None]:
# A simple rule: if the dependency tag `nsubjpass` is present in the parse tree,
# the sentence is in passive voice; otherwise it is in active voice.
# To implement this rule we will use spaCy's `Matcher`.

from spacy.matcher import Matcher

In [None]:
# define the function which will identify active or passive voice

def is_passive_voice(sentence):
  """Tell whether `sentence` is classified as passive voice.

  The text is parsed with the notebook-level `nlp` pipeline and scanned with
  a spaCy `Matcher` holding one single-token pattern: a token whose
  dependency label is `nsubjpass`.

  Args:
    sentence: the text to classify; it is passed to `nlp` unchanged.

  Returns:
    True if the matcher reports at least one match, False otherwise.
  """
  doc = nlp(sentence)
  passive_matcher = Matcher(nlp.vocab)
  # One pattern made of one token spec: any token labelled `nsubjpass`.
  passive_matcher.add('Rule', [[{'DEP': 'nsubjpass'}]])
  return len(passive_matcher(doc)) > 0

In [None]:
# Sanity check on the active-voice samples: none should be reported as passive.
for sentence in active:
  print(is_passive_voice(sentence))

False
False
False
False


In [None]:
# Sanity check on the passive-voice samples: all should be reported as passive.
for sentence in passive:
  print(is_passive_voice(sentence))

True
True
True
True


In [None]:
# Evaluate the rule on a larger dataset.
# Load the CSV of sentence pairs; later cells read its `Active` and `Passive`
# columns.
# NOTE(review): the absolute path looks like a Colab location — adjust it when
# running elsewhere.

data = pd.read_csv('/content/sample_data/active_passive.csv')
data.head()  # preview the first rows

Unnamed: 0,Active,Passive
0,He reads a novel.,A novel is read.
1,He does not cook food.,Food is not cooked by him.
2,Does he purchase books?,Are books being purchased by him?
3,They grow plants.,Plants are grown by them.
4,She teaches me.,I am taught by her.


In [None]:
# (rows, columns) of the loaded DataFrame
data.shape

(40, 2)

In [None]:
# Count how many sentences in the `Active` column the rule classifies as
# active, i.e. is_passive_voice() does not flag them.
total_active = 0

for sentence in data['Active']:
  # is_passive_voice() already returns a bool, so test it directly instead of
  # comparing against False.
  if not is_passive_voice(sentence):
    total_active += 1

print(total_active)

40


In [None]:
# Count how many sentences in the `Passive` column the rule classifies as
# passive, and collect the ones it fails to flag so they can be inspected.
total_passive = 0
missing = []

for sentence in data['Passive']:
  # is_passive_voice() already returns a bool, so test it directly instead of
  # comparing against True.
  if is_passive_voice(sentence):
    total_passive += 1
  else:
    missing.append(sentence)

print(total_passive)

39


In [None]:
# Show the passive sentences that the rule did not flag.
missing

['Is a table being bought by Ritika?']

In [None]:
# Render the parse of the first missed sentence to see which dependency
# labels its tokens received.
displacy.render(nlp(missing[0]))

In [None]:
# extend the rule above: a token tagged `auxpass` now also marks the sentence as passive

def is_passive_voice2(sentence):
  """Tell whether `sentence` is classified as passive voice.

  The text is parsed with the notebook-level `nlp` pipeline and scanned with
  a spaCy `Matcher` holding one single-token pattern: a token whose
  dependency label is either `nsubjpass` or `auxpass`.

  Args:
    sentence: the text to classify; it is passed to `nlp` unchanged.

  Returns:
    True if the matcher reports at least one match, False otherwise.
  """
  doc = nlp(sentence)
  passive_matcher = Matcher(nlp.vocab)
  # Either label on any token is enough to count the sentence as passive.
  passive_labels = ['nsubjpass', 'auxpass']
  passive_matcher.add('Rule', [[{'DEP': {"IN": passive_labels}}]])
  return len(passive_matcher(doc)) > 0

In [None]:
# Count how many sentences in the `Passive` column are classified as passive
# by the extended rule in is_passive_voice2().
total_passive = 0

for sentence in data['Passive']:
  # is_passive_voice2() already returns a bool, so test it directly instead of
  # comparing against True.
  if is_passive_voice2(sentence):
    total_passive += 1

print(total_passive)

40
