In [48]:
#load libraries and environments
import spacy
from nltk.corpus import gutenberg
# Load the spaCy "en_core_web_lg" English pipeline once; the rule functions
# defined in later cells call this shared `nlp` object to parse each sentence.
# NOTE(review): presumably the model package has to be installed beforehand
# for spacy.load to find it -- confirm in the environment setup.
nlp = spacy.load("en_core_web_lg")

In [49]:
#get book names from the Gutenberg corpus
# `files` holds the corpus file ids (one per book); later cells iterate over
# it to list the books and to read their sentences.
files = gutenberg.fileids()

In [50]:
#list the sample books, numbered starting from 1
print("Names of the sample books from Gutenberg are:\n")
for position, book_name in enumerate(files, start=1):
    print(f"{position}. {book_name}")

Names of the sample books from Gutenberg are:

1. austen-emma.txt
2. austen-persuasion.txt
3. austen-sense.txt
4. bible-kjv.txt
5. blake-poems.txt
6. bryant-stories.txt
7. burgess-busterbrown.txt
8. carroll-alice.txt
9. chesterton-ball.txt
10. chesterton-brown.txt
11. chesterton-thursday.txt
12. edgeworth-parents.txt
13. melville-moby_dick.txt
14. milton-paradise.txt
15. shakespeare-caesar.txt
16. shakespeare-hamlet.txt
17. shakespeare-macbeth.txt
18. whitman-leaves.txt


In [51]:
#rule 1: Has a verb in its lemma (base) form
def verb_in_lemma(sent):
    """Return whether ``sent`` contains any verb written in its lemma (base) form.

    Args:
        sent: Sentence text; it is parsed with the module-level spaCy
            pipeline ``nlp``.

    Returns:
        bool: True if at least one token has ``pos_ == 'VERB'`` and its
        lower-cased text equals its lemma, otherwise False (always an
        explicit bool, never None).
    """
    return any(
        token.pos_ == 'VERB' and token.text.lower() == token.lemma_
        for token in nlp(sent)
    )

In [52]:
#rule 2: Has a verb in its lemma (base) form and is the root
def root_verb_in_lemma(sent, verbose=False):
    """Return whether the dependency root of ``sent`` is a base-form verb.

    Args:
        sent: Sentence text; it is parsed with the module-level spaCy
            pipeline ``nlp``.
        verbose: When True, print the matching root token's attributes
            (text, lemma, POS, tag, dep, shape, is_alpha, is_stop) before
            returning. Off by default so callers in a loop do not flood
            the output.

    Returns:
        bool: True if some token has ``dep_ == 'ROOT'``, ``pos_ == 'VERB'``
        and lower-cased text equal to its lemma; otherwise False (always an
        explicit bool, never None).
    """
    for token in nlp(sent):
        if token.dep_ == 'ROOT' and token.pos_ == 'VERB' and token.text.lower() == token.lemma_:
            if verbose:
                print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
            return True
    return False

In [53]:
#rule 3: Has a verb in its lemma (base) form and is the root and does not have any subject child in its dependency structure
def root_verb_in_lemma_nosubj(sent):
    """Heuristic imperative check: a base-form root verb with no subject child.

    Args:
        sent: Sentence text; it is parsed with the module-level spaCy
            pipeline ``nlp``.

    Returns:
        bool: True if some token has ``dep_ == 'ROOT'``, ``pos_ == 'VERB'``,
        lower-cased text equal to its lemma, and none of its direct
        children carries a subject dependency label; otherwise False.
    """
    # Subject labels looked for among the root's direct children. Only the
    # root's own children are inspected, so a subject that belongs to an
    # embedded clause ("Tell me what you want") does not disqualify the
    # sentence.
    subject_deps = ('nsubj', 'nsubjpass', 'csubj', 'csubjpass')
    for token in nlp(sent):
        is_base_form_root_verb = (
            token.dep_ == 'ROOT'
            and token.pos_ == 'VERB'
            and token.text.lower() == token.lemma_
        )
        if not is_base_form_root_verb:
            continue
        if not any(child.dep_ in subject_deps for child in token.children):
            return True
    return False

In [56]:
#filter sentences from gutenberg ebooks
# For each book, keep the first MAX_MATCHES_PER_BOOK sentences that satisfy
# rule 3, then move on to the next book. The matches are stored in
# `imperatives_by_book` (file id -> list of sentence strings) so they can be
# inspected afterwards instead of being counted and thrown away.
MAX_MATCHES_PER_BOOK = 10
imperatives_by_book = {}
for f in files:
    matches = []
    for s in gutenberg.sents(f):
        # gutenberg.sents yields token lists; re-join them into one string
        # because the rule functions take sentence text.
        sent = " ".join(s)
        if root_verb_in_lemma_nosubj(sent):
            matches.append(sent)
            if len(matches) == MAX_MATCHES_PER_BOOK:
                break
    imperatives_by_book[f] = matches

In [57]:
#sample sentences to test rules
# imperative examples: a bare command, and the same command preceded by a name
sent1 = "Book me an appointment with my doctor!"
sent2 = "Ana, book me an appointment with my doctor!"

# non-imperative examples: a question and a declarative statement
sent3 = "Did you book an appointment with my doctor?"
sent4 = "Doctor cancelled my appointment"

In [61]:
# Run rule 3 on every sample sentence and report the verdict, numbered from 1.
for sample_no, sample_text in enumerate((sent1, sent2, sent3, sent4), start=1):
    verdict = root_verb_in_lemma_nosubj(sample_text)
    print(f"Sent {sample_no} is imperative? {verdict}")

Sent 1 is imperative? True
Sent 2 is imperative? True
Sent 3 is imperative? False
Sent 4 is imperative? False
