In [44]:
#load libraries and environments
import spacy
from nltk.corpus import gutenberg
from spacy import displacy
import json
import os
import time
import re
import pandas as pd
import sklearn as sk
import random

In [5]:
###global variable declarations###

#for rule 1
#----
nlp = spacy.load("en_core_web_lg")

#for rule 2
#----
# Define the keywords that will be used to extract the imperative sentences
# Rules are identified from the paper - https://www.aclweb.org/anthology/W14-2117.pdf
# 1. "find those sentences with a verb (in its base form) as the root 
#     in the phrase structure and this particular verb has no 
#.    subject child in the dependency structure" 
#.    Example: You must first discuss the matter there, and you need to be specific”
# 2. "recognize the use of a personal pronoun or noun (e.g., “you”, “we”,
#.     or a username) followed by a modal verb (e.g., “should”, “must”, “need”) as an imperative"


# Source: https://www.myenglishpages.com/site_php_files/grammar-lesson-modals.php
modal_verbs = ['can', 'could', 'may', 'might', 'will', 'would', 'shall','should', 'must', 'ought', 'dare', 'had better', 'ask']
modal_verbs = ['can', 'may', 'might', 'will', 'would', 'shall','should', 'must', 'ought', 'dare', 'had better', 'ask']

# Source: https://grammar.yourdictionary.com/parts-of-speech/pronouns/list-of-personal-pronouns.html
# personal_pronoun = ['I', 'me', 'we', 'us', 'you', 'he', 'she', 'her', 'him', 'it', 'they', 'them']
personal_pronoun = ['us', 'you', 'he', 'she', 'her', 'him', 'it', 'they', 'them']

# Create combinations of possibilities for a sentence to be imperative
combinations = []
# form combinations of words and put them as a list
for pronoun in personal_pronoun:
    for verb in modal_verbs:
        combinations.append(pronoun + ' ' + verb)
        
print('Number of combinations for rule 2 =', len(combinations))

Number of combinations for rule 2 = 108


In [7]:
# rule 1: Has a verb in its lemma (base) form and is the root and does not have any subject child in it's dependency structure
def rule1(sent):
    doc = nlp(sent)
    cond1 = False
    cond2 = True
    for token in doc:
        if token.dep_=='ROOT' and token.pos_=='VERB' and token.text.lower() == token.lemma_:
            cond1 = True
            #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
        if token.dep_ == 'nsubj':
            cond2 = False   
    return (cond1 and cond2)

In [6]:
# rule 2:
def rule2(combinations, sentence):
    if any(combo in sentence.lower() for combo in combinations):
        return True
    return False

In [8]:
#load kevins dataset

In [22]:
imp_sents = []
nimp_sents = []
for fname in os.listdir("./data/kevin"):
    if "data" in fname:
        print ("./data/kevin/"+fname)
        f = open("./data/kevin/"+fname, "r")
        for line in f.readlines():
            dline = line.strip().split("\t")
            if int(dline[0])==2:
                imp_sents.append(dline[1])
            else:
                nimp_sents.append(dline[1])
        #data = pd.read_csv("./data/kevin/"+fname, sep='\s+', engine = 'python')

./data/kevin/tst_data
./data/kevin/trn_data


In [52]:
neg_samples = random.sample(nimp_sents, len(imp_sents))
neg_labels = [0]*len(neg_samples)
pos_samples = imp_sents
pos_labels = [1]*len(pos_samples)
all_samples = neg_samples+pos_samples
all_labels = neg_labels+pos_labels

In [81]:
all_df = pd.DataFrame({'labels': all_labels, 'sentences': all_samples})
all_df.to_pickle("./data/kevin_imp_data.pkl")

In [82]:
preds = []
for sent in all_samples:
    r1_cond = False
    r2_cond = False
    r1_cond = rule1(sent)
    r2_cond = rule2(combinations, sent)
    if r1_cond or r2_cond:
        preds.append(1)
    else:
        preds.append(0)

In [83]:
sk.metrics.accuracy_score(all_labels, preds)

0.7770114942528735

In [84]:
sk.metrics.precision_recall_fscore_support(all_labels, preds, average='binary')

(0.887459807073955, 0.6344827586206897, 0.7399463806970511, None)

In [85]:
pd.DataFrame(data=sk.metrics.precision_recall_fscore_support(all_labels, preds), index=['precision', 'recall', 'fscore', 'size'], columns=['Non Imperative', 'Imperative'])

Unnamed: 0,Non Imperative,Imperative
precision,0.715564,0.88746
recall,0.91954,0.634483
fscore,0.804829,0.739946
size,435.0,435.0
